Example #1
 def test_tokenizer(self):
   testdata = os.path.join(
       FLAGS.test_srcdir, ".//"
       "small_gold_annotation.jsonl")
   test_examples = preproc.read_tydi_examples(
       testdata,
       is_training=True,
       max_passages=45,
       max_position=45,
       fail_on_invalid=False,
       open_fn=tf_io.gopen)
   vocab_file = self._get_vocab_file()
   tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
   for tydi_example in test_examples:
     features = preproc.convert_single_example(
         tydi_example,
         tokenizer,
         is_training=True,
         max_question_length=64,
         max_seq_length=512,
         doc_stride=128,
         include_unknowns=1.0,
         errors=[],
         debug_info={})
     self.assertEqual(len(set([f.language_id for f in features])), 1)
     for feature in features:
       if feature.end_position <= 0:
         self.assertEqual(feature.start_position, 0)
Example #2
 def __init__(self, is_training, max_question_length, max_seq_length,
              doc_stride, include_unknowns, vocab_file):
     self.is_training = is_training
     self.tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
     self.max_question_length = max_question_length
     self.max_seq_length = max_seq_length
     self.doc_stride = doc_stride
     self.include_unknowns = include_unknowns
     self.vocab = self.tokenizer.vocab  # used by callers
Example #3
  def test_tokenizer_simple(self):
    vocab_file = self._get_vocab_file()
    tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
    text = "[CLS] [ContextId=0] This is a test."
    tokens, _, _, _ = tokenizer.tokenize_with_offsets(text)

    # Create reverse vocab lookup.
    reverse_vocab_table = {
        word_id: word for word, word_id in tokenizer.vocab.items()
    }
    output_tokens = [reverse_vocab_table[i] for i in tokens]
    self.assertEqual(output_tokens,
                     ["[CLS]", "[ContextId=0]", "This", "is", "a", "test", "."])
Example #4
 def test_tokenizer_val(self):
   testdata = os.path.join(
       FLAGS.test_srcdir, ".//"
       "small_gold_annotation.jsonl")
   train_examples = preproc.read_tydi_examples(
       testdata,
       is_training=True,
       max_passages=45,
       max_position=45,
       fail_on_invalid=False,
       open_fn=tf_io.gopen)
   dev_examples = preproc.read_tydi_examples(
       testdata,
       is_training=False,
       max_passages=45,
       max_position=45,
       fail_on_invalid=False,
       open_fn=tf_io.gopen)
   vocab_file = self._get_vocab_file()
   tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
   for tr_ex, dev_ex in zip(train_examples, dev_examples):
     train_feats = preproc.convert_single_example(
         tr_ex,
         tokenizer,
         is_training=True,
         max_question_length=64,
         max_seq_length=512,
         doc_stride=128,
         include_unknowns=1.0,
         errors=[],
         debug_info={})
     dev_feats = preproc.convert_single_example(
         dev_ex,
         tokenizer,
         is_training=False,
         max_question_length=64,
         max_seq_length=512,
         doc_stride=128,
         include_unknowns=1.0,
         errors=[],
         debug_info={})
     for train_f, dev_f in zip(train_feats, dev_feats):
       if train_f.answer_text:
         st_ = train_f.start_position
         ed_ = train_f.end_position
         st_offset = dev_f.wp_start_offset[st_]
         end_offset = dev_f.wp_end_offset[ed_]
         self.assertGreaterEqual(end_offset, st_offset)
Example #5
  def test_tokenizer_korean(self):
    vocab_file = self._get_vocab_file()
    tokenizer = tokenization.TyDiTokenizer(
        vocab_file=vocab_file, fail_on_mismatch=True)
    text = "[Q] 작가는 만화를 그리기 시작했나요?"
    tokens, _, _, _ = tokenizer.tokenize_with_offsets(text)

    # Create reverse vocab lookup.
    reverse_vocab_table = {
        word_id: word for word, word_id in tokenizer.vocab.items()
    }
    output_tokens = [reverse_vocab_table[i] for i in tokens]
    self.assertEqual(output_tokens, [
        "[Q]", u"\uc791", u"##\uac00\ub294", u"\ub9cc", u"##\ud654\ub97c",
        u"\uadf8", u"##\ub9ac", u"##\uae30", u"\uc2dc", u"##\uc791",
        u"##\ud588", u"##\ub098", u"##\uc694", "?"
    ])
Example #6
  def test_offset_wp_mapping(self):
    """Test the mapping from wordpiece to plaintext offsets."""
    testdata = os.path.join(
        FLAGS.test_srcdir, ".//"
        "small_gold_annotation.jsonl")
    vocab_file = self._get_vocab_file()
    examples = preproc.read_tydi_examples(
        testdata,
        is_training=False,
        max_passages=45,
        max_position=45,
        fail_on_invalid=False,
        open_fn=tf_io.gopen)
    vocab_file = self._get_vocab_file()
    tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
    for tydi_example in examples:
      wordpieces, start_offsets, end_offsets, offset_to_wp = (
          tokenizer.tokenize_with_offsets(tydi_example.contexts))

      # Check invariants.
      for i in start_offsets:
        if i > 0:
          self.assertLess(
              i, len(tydi_example.context_to_plaintext_offset),
              "Expected start offset {} to be in `context_to_plaintext_offset` "
              "byte_len(contexts)={} Context@{}='{}' Have={}".format(
                  i, data.byte_len(tydi_example.contexts), i,
                  data.byte_slice(
                      tydi_example.contexts, i, i + 100,
                      errors="ignore").encode("utf8"),
                  tydi_example.context_to_plaintext_offset))
      for i in end_offsets:
        if i > 0:
          self.assertLess(
              i, len(tydi_example.context_to_plaintext_offset),
              "Expected end offset {} to be in `context_to_plaintext_offset` "
              "byte_len(contexts)={} Have={}".format(
                  i, data.byte_len(tydi_example.contexts),
                  tydi_example.context_to_plaintext_offset))

      wp_start_offsets, wp_end_offsets = (
          preproc.create_mapping(start_offsets, end_offsets,
                                 tydi_example.context_to_plaintext_offset))
      wp_count = 0
      for wp_s, wp_e in zip(wp_start_offsets, wp_end_offsets):
        # Spot-check only the first 20 mapped wordpieces.
        if (wp_s >= 0 or wp_e >= 0) and wp_count < 20:
          wp_txt = wordpieces[wp_count]
          if isinstance(wp_txt, str):
            if "##" not in wp_txt and wp_txt != "[UNK]":
              self.assertEqual(tydi_example.plaintext[wp_s:wp_e + 1], wp_txt)
        wp_count += 1

      for offset in offset_to_wp:
        self.assertLess(offset, data.byte_len(tydi_example.contexts))
        self.assertGreaterEqual(offset, 0)
        matching_wp = offset_to_wp[offset]
        if matching_wp == -1:
          continue
        if wp_end_offsets[matching_wp] == -1:
          continue
        if wp_start_offsets[matching_wp] == -1:
          continue
        self.assertGreaterEqual(wp_end_offsets[matching_wp],
                                wp_start_offsets[matching_wp])
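
The invariants above suggest the shape of the mapping that preproc.create_mapping is expected to return: one plaintext byte offset per wordpiece, with -1 marking wordpieces (such as special tokens) that have no plaintext counterpart. The following is only an illustrative sketch of that idea under those assumptions, not the library's actual implementation.

from typing import List, Sequence, Tuple


def map_wordpiece_offsets_to_plaintext(
    start_offsets: Sequence[int],
    end_offsets: Sequence[int],
    context_to_plaintext_offset: Sequence[int]) -> Tuple[List[int], List[int]]:
  """Sketch: maps per-wordpiece context byte offsets to plaintext byte offsets.

  Offsets that are negative or out of range are mapped to -1, mirroring the
  checks performed by the test above.
  """

  def lookup(offset):
    if 0 <= offset < len(context_to_plaintext_offset):
      return context_to_plaintext_offset[offset]
    return -1

  wp_start_offsets = [lookup(i) for i in start_offsets]
  wp_end_offsets = [lookup(i) for i in end_offsets]
  return wp_start_offsets, wp_end_offsets

With this shape, the final loop of the test simply checks that any wordpiece reachable from offset_to_wp has a non-inverted plaintext span.
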
Example #7
def convert_examples_to_features(tydi_examples, is_training, vocab_file,
                                 max_question_length, max_seq_length,
                                 doc_stride, include_unknowns, output_fn):
  """Converts `TyDiExample`s into `InputFeatures` and sends them to `output_fn`.

  Each entry is split into multiple `InputFeatures`, which contain windowed
  spans of the article text with no more than N wordpieces each, so as to
  fit within BERT's context window.

  This function assigns a `unique_id` to each feature, which allows us to
  identify which example a given window of text corresponds to.

  Args:
    tydi_examples: generator of `TyDiExample`s generated by
      `read_tydi_examples`.
    is_training: boolean flag; True when generating training features.
    vocab_file: path to WordPiece vocabulary file.
    max_question_length: see FLAGS.max_question_length.
    max_seq_length: see FLAGS.max_seq_length.
    doc_stride: see FLAGS.doc_stride.
    include_unknowns: see FLAGS.include_unknowns.
    output_fn: output function to be applied to the features generated from
      examples.

  Returns:
    num_spans_to_ids: a dictionary mapping the number of features produced
      for an example to the list of example ids with that many features.
    num_examples: Number of examples from the `tydi_examples` generator.
  """
  num_spans_to_ids = collections.defaultdict(list)
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  num_examples = 0
  for tydi_example in tydi_examples:
    example_index = tydi_example.example_id
    # Each TyDi entry is split into multiple features, each of
    # FLAGS.max_seq_length word pieces.
    errors = []
    features = convert_single_example(
        tydi_example,
        tokenizer=tokenizer,
        is_training=is_training,
        max_question_length=max_question_length,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        include_unknowns=include_unknowns,
        errors=errors)
    num_examples += 1

    num_spans_to_ids[len(features)].append(tydi_example.example_id)

    for feature in features:
      feature.example_index = example_index
      # This integer `unique_id` is used by `compute_predictions`
      # to merge features with examples. Both `example_index` and
      # `doc_span_index` are integers, so this works primarily by virtue of
      # the `example_index` values being uniformly distributed with many
      # unoccupied indices between them, which makes collisions unlikely.
      feature.unique_id = feature.example_index + feature.doc_span_index
      output_fn(feature)

  return num_spans_to_ids, num_examples
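
For orientation, here is a minimal, hypothetical driver showing how this function might be called. It assumes `convert_examples_to_features` lives in the same `preproc` module as `read_tydi_examples`; the input path, the vocab path, and the in-memory `collect_feature` callback are placeholders.

import preproc  # assumption: same module layout as in the tests above
import tf_io


collected_features = []


def collect_feature(feature):
  # Placeholder output_fn: keep features in memory. A real pipeline would
  # more likely serialize each feature, e.g. as a tf.train.Example.
  collected_features.append(feature)


examples = preproc.read_tydi_examples(
    "small_gold_annotation.jsonl",  # placeholder input path
    is_training=True,
    max_passages=45,
    max_position=45,
    fail_on_invalid=False,
    open_fn=tf_io.gopen)

num_spans_to_ids, num_examples = preproc.convert_examples_to_features(
    tydi_examples=examples,
    is_training=True,
    vocab_file="vocab.txt",  # placeholder vocab path
    max_question_length=64,
    max_seq_length=512,
    doc_stride=128,
    include_unknowns=1.0,
    output_fn=collect_feature)

print(num_examples, {n: len(ids) for n, ids in num_spans_to_ids.items()})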