Exemple #1
0
    def test_create_entry_from_json_min_answer(self):
        """Verifies the entry created from the minimal-answer JSON fixture."""
        # The expected context contains the passage markers generated by
        # TyDiTokenizer.get_passage_marker; this checks that they are inserted
        # by preproc.create_entry_from_json.
        context_with_markers = (
            "\ue006 The zebra finch is the most common estrildid finch. "
            "The bird has been introduced to Puerto Rico. "
            "\ue007 The body temperature (as measured from the cloaca) "
            "of the zebra finch may vary from 38 to 44 °C. "
            "\ue008 The zebra finch was first collected in 1801 during "
            "Nicolas Baudin's expedition to Australia. It was described in "
            "1817 by Louis Jean Pierre Vieillot in his Nouveau Dictionnaire "
            "d'Histoire Naturelle. "
            "\ue009 Morphological differences between the subspecies. "
            "Males do not have the fine barring found on the throat and upper "
            "breast. "
            "\ue00a Symmetry of both plumage, like chest bands, and "
            "artificial features, like leg bands, are preferred by the female. "
            "\ue00b Nest predators of the zebra finch include the tiger "
            "snake.")

        entry = preproc.create_entry_from_json(
            _JSON_MIN_ANSWER,
            tokenizer=make_tokenizer(),
            max_passages=45,
            max_position=45,
            fail_on_invalid=True)

        self.assertCreateEntryFromJsonResult(
            json_dict=_JSON_MIN_ANSWER,
            result=entry,
            expected_context=context_with_markers,
            expected_answer_type="minimal",
            expected_passage_answer_index=3,
            expected_min_span_start=507,
            expected_min_span_end=530,
            expected_min_span_text="throat and upper breast")
Exemple #2
0
def read_entries(input_jsonl_pattern,
                 tokenizer,
                 max_passages, max_position, fail_on_invalid):
  """Reads TyDi QA examples from JSONL files.

  Args:
    input_jsonl_pattern: Glob of the gzipped JSONL files to read.
    tokenizer: Used to create special marker symbols to insert into the text.
    max_passages: see FLAGS.max_passages.
    max_position: see FLAGS.max_position.
    fail_on_invalid: Immediately stop if an error is found?

  Yields:
    tuple:
      input_path: str, path of the file the entry was read from.
      line_no: int, 1-based line number of the entry within that file.
      tydi_entry: "TyDiEntry"s, dicts as returned by `create_entry_from_json`,
        one per line of the input JSONL files. When the returned entry is
        falsy and `fail_on_invalid` is false, it is still yielded as-is.
      debug_info: Dict containing debugging data.

  Raises:
    ValueError: If no file matches `input_jsonl_pattern`, or if an entry is
      falsy while `fail_on_invalid` is true.
  """
  matches = tf.gfile.Glob(input_jsonl_pattern)
  if not matches:
    raise ValueError(f"No files matched: {input_jsonl_pattern}")
  for input_path in matches:
    # `GzipFile.close()` does not close a `fileobj` it was handed, so the
    # underlying file gets its own `with` to guarantee it is released.
    with tf.gfile.Open(input_path, "rb") as raw_file:
      with gzip.GzipFile(fileobj=raw_file) as input_file:  # pytype: disable=wrong-arg-types
        for line_no, line in enumerate(input_file, 1):
          json_elem = json.loads(
              line, object_pairs_hook=collections.OrderedDict)
          entry = preproc.create_entry_from_json(
              json_elem,
              tokenizer=tokenizer,
              max_passages=max_passages,
              max_position=max_position,
              fail_on_invalid=fail_on_invalid)

          if not entry:
            # `.get` + `%s` keep this log line from failing when the id is
            # missing or is not an integer.
            tf.logging.info("Invalid Example %s", json_elem.get("example_id"))
            if fail_on_invalid:
              raise ValueError(
                  f"Invalid example at {input_path}:{line_no}")

          # Return a `debug_info` dict that methods throughout the codebase
          # append to with debugging information.
          debug_info = {"json": json_elem}
          yield input_path, line_no, entry, debug_info
Exemple #3
0
    def test_create_entry_from_json_no_answer(self):
        """Verifies the entry created from the no-answer JSON fixture."""
        # The expected context contains the passage markers generated by
        # TyDiTokenizer.get_passage_marker; this checks that they are inserted
        # by preproc.create_entry_from_json.
        context_with_markers = (
            "\ue006 The zebra finch is the most common estrildid finch. "
            "\ue007 The body temperature may vary from 38 to 44 °C.")

        entry = preproc.create_entry_from_json(
            _JSON_NO_ANSWER,
            tokenizer=make_tokenizer(),
            max_passages=45,
            max_position=45,
            fail_on_invalid=True)

        self.assertCreateEntryFromJsonResult(
            json_dict=_JSON_NO_ANSWER,
            result=entry,
            expected_context=context_with_markers,
            expected_answer_type="passage",
            expected_passage_answer_index=-1,
            expected_min_span_start=-1,
            expected_min_span_end=-1,
            expected_min_span_text="")