def test_tokenizer(self):
  testdata = os.path.join(FLAGS.test_srcdir, ".//"
                          "small_gold_annotation.jsonl")
  test_examples = preproc.read_tydi_examples(
      testdata,
      is_training=True,
      max_passages=45,
      max_position=45,
      fail_on_invalid=False,
      open_fn=tf_io.gopen)
  vocab_file = self._get_vocab_file()
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  for tydi_example in test_examples:
    features = preproc.convert_single_example(
        tydi_example,
        tokenizer,
        is_training=True,
        max_question_length=64,
        max_seq_length=512,
        doc_stride=128,
        include_unknowns=1.0,
        errors=[],
        debug_info={})
    # All features produced from a single example must share one language id.
    self.assertEqual(len(set([f.language_id for f in features])), 1)
    for feature in features:
      # A feature with no answer end should also have no answer start.
      if feature.end_position <= 0:
        self.assertEqual(feature.start_position, 0)
def __init__(self, is_training, max_question_length, max_seq_length,
             doc_stride, include_unknowns, vocab_file):
  self.is_training = is_training
  self.tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  self.max_question_length = max_question_length
  self.max_seq_length = max_seq_length
  self.doc_stride = doc_stride
  self.include_unknowns = include_unknowns
  self.vocab = self.tokenizer.vocab  # used by callers
def test_tokenizer_simple(self):
  vocab_file = self._get_vocab_file()
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  text = "[CLS] [ContextId=0] This is a test."
  tokens, _, _, _ = tokenizer.tokenize_with_offsets(text)

  # Create reverse vocab lookup.
  reverse_vocab_table = {
      word_id: word for word, word_id in tokenizer.vocab.items()
  }
  output_tokens = [reverse_vocab_table[i] for i in tokens]
  self.assertEqual(
      output_tokens,
      ["[CLS]", "[ContextId=0]", "This", "is", "a", "test", "."])
def test_tokenizer_val(self):
  testdata = os.path.join(FLAGS.test_srcdir, ".//"
                          "small_gold_annotation.jsonl")
  train_examples = preproc.read_tydi_examples(
      testdata,
      is_training=True,
      max_passages=45,
      max_position=45,
      fail_on_invalid=False,
      open_fn=tf_io.gopen)
  dev_examples = preproc.read_tydi_examples(
      testdata,
      is_training=False,
      max_passages=45,
      max_position=45,
      fail_on_invalid=False,
      open_fn=tf_io.gopen)
  vocab_file = self._get_vocab_file()
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  for tr_ex, dev_ex in zip(train_examples, dev_examples):
    train_feats = preproc.convert_single_example(
        tr_ex,
        tokenizer,
        is_training=True,
        max_question_length=64,
        max_seq_length=512,
        doc_stride=128,
        include_unknowns=1.0,
        errors=[],
        debug_info={})
    dev_feats = preproc.convert_single_example(
        dev_ex,
        tokenizer,
        is_training=False,
        max_question_length=64,
        max_seq_length=512,
        doc_stride=128,
        include_unknowns=1.0,
        errors=[],
        debug_info={})
    for train_f, dev_f in zip(train_feats, dev_feats):
      if train_f.answer_text:
        # Look up the training answer span in the dev features' wordpiece
        # offsets and check that it yields a well-formed (end >= start) span.
        st_ = train_f.start_position
        ed_ = train_f.end_position
        st_offset = dev_f.wp_start_offset[st_]
        end_offset = dev_f.wp_end_offset[ed_]
        self.assertGreaterEqual(end_offset, st_offset)
def test_tokenizer_korean(self):
  vocab_file = self._get_vocab_file()
  tokenizer = tokenization.TyDiTokenizer(
      vocab_file=vocab_file, fail_on_mismatch=True)
  text = "[Q] 작가는 만화를 그리기 시작했나요?"
  tokens, _, _, _ = tokenizer.tokenize_with_offsets(text)

  # Create reverse vocab lookup.
  reverse_vocab_table = {
      word_id: word for word, word_id in tokenizer.vocab.items()
  }
  output_tokens = [reverse_vocab_table[i] for i in tokens]
  self.assertEqual(output_tokens, [
      "[Q]", u"\uc791", u"##\uac00\ub294", u"\ub9cc", u"##\ud654\ub97c",
      u"\uadf8", u"##\ub9ac", u"##\uae30", u"\uc2dc", u"##\uc791",
      u"##\ud588", u"##\ub098", u"##\uc694", "?"
  ])
def test_offset_wp_mapping(self):
  """Test the mapping from wordpiece to plaintext offsets."""
  testdata = os.path.join(FLAGS.test_srcdir, ".//"
                          "small_gold_annotation.jsonl")
  vocab_file = self._get_vocab_file()
  examples = preproc.read_tydi_examples(
      testdata,
      is_training=False,
      max_passages=45,
      max_position=45,
      fail_on_invalid=False,
      open_fn=tf_io.gopen)
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  for tydi_example in examples:
    wordpieces, start_offsets, end_offsets, offset_to_wp = (
        tokenizer.tokenize_with_offsets(tydi_example.contexts))

    # Check invariants: every positive wordpiece offset must index into
    # `context_to_plaintext_offset`.
    for i in start_offsets:
      if i > 0:
        self.assertLess(
            i, len(tydi_example.context_to_plaintext_offset),
            "Expected start offset {} to be in `context_to_plaintext_offset` "
            "byte_len(contexts)={} Context@{}='{}' Have={}".format(
                i, data.byte_len(tydi_example.contexts), i,
                data.byte_slice(
                    tydi_example.contexts, i, i + 100,
                    errors="ignore").encode("utf8"),
                tydi_example.context_to_plaintext_offset))
    for i in end_offsets:
      if i > 0:
        self.assertLess(
            i, len(tydi_example.context_to_plaintext_offset),
            "Expected end offset {} to be in `context_to_plaintext_offset` "
            "byte_len(contexts)={} Have={}".format(
                i, data.byte_len(tydi_example.contexts),
                tydi_example.context_to_plaintext_offset))

    # Check the mapping from wordpiece offsets back to plaintext offsets.
    wp_start_offsets, wp_end_offsets = (
        preproc.create_mapping(start_offsets, end_offsets,
                               tydi_example.context_to_plaintext_offset))
    wp_count = 0
    for wp_s, wp_e in zip(wp_start_offsets, wp_end_offsets):
      # Note: `and` binds tighter than `or`, so a wordpiece is checked when it
      # has a valid start offset, or when it has a valid end offset and is
      # among the first 20 wordpieces.
      if wp_s >= 0 or (wp_e >= 0 and wp_count < 20):
        wp_txt = wordpieces[wp_count]
        if isinstance(wp_txt, str):
          # Whole-word, in-vocab wordpieces should match the plaintext slice.
          if "##" not in wp_txt and wp_txt != "[UNK]":
            self.assertEqual(tydi_example.plaintext[wp_s:wp_e + 1], wp_txt)
      wp_count += 1

    for offset in offset_to_wp:
      self.assertLess(offset, data.byte_len(tydi_example.contexts))
      self.assertGreaterEqual(offset, 0)
      matching_wp = offset_to_wp[offset]
      if matching_wp == -1:
        continue
      if wp_end_offsets[matching_wp] == -1:
        continue
      if wp_start_offsets[matching_wp] == -1:
        continue
      self.assertGreaterEqual(wp_end_offsets[matching_wp],
                              wp_start_offsets[matching_wp])
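# A minimal usage sketch (not part of the original tests) of the wordpiece /
# offset round trip checked above. It assumes a vocab file path ("vocab.txt"
# is a placeholder), and it assumes the start/end offsets returned by
# `tokenize_with_offsets` are byte-based with inclusive ends, as the
# assertions above suggest; `example_offset_round_trip` is a hypothetical
# helper name introduced here for illustration.
def example_offset_round_trip(vocab_file="vocab.txt"):
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  text = "This is a test."
  token_ids, starts, ends, _ = tokenizer.tokenize_with_offsets(text)
  text_bytes = text.encode("utf-8")
  pieces = []
  for token_id, start, end in zip(token_ids, starts, ends):
    if start >= 0 and end >= 0:
      # End offsets are treated as inclusive, hence the `end + 1` slice bound.
      piece_text = text_bytes[start:end + 1].decode("utf-8", errors="ignore")
      pieces.append((token_id, piece_text))
  return pieces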
def convert_examples_to_features(tydi_examples, is_training, vocab_file,
                                 max_question_length, max_seq_length,
                                 doc_stride, include_unknowns, output_fn):
  """Converts `TyDiExample`s into `InputFeatures` and sends them to `output_fn`.

  Each entry is split into multiple `InputFeatures`, each of which contains a
  windowed span of the article text holding no more than N wordpieces so as
  to fit within BERT's context window. This function assigns `unique_ids` to
  features, which allow us to identify which example a shorter window of text
  corresponds to.

  Args:
    tydi_examples: generator of `TyDiExample`s generated by
      `read_tydi_examples`.
    is_training: boolean flag.
    vocab_file: path to WordPiece vocabulary file.
    max_question_length: see FLAGS.max_question_length.
    max_seq_length: see FLAGS.max_seq_length.
    doc_stride: see FLAGS.doc_stride.
    include_unknowns: see FLAGS.include_unknowns.
    output_fn: output function to be applied to the features generated from
      examples.

  Returns:
    num_spans_to_ids: a dictionary mapping from number of features to a list
      of example ids that have that number of features.
    num_examples: number of examples from the `tydi_examples` generator.
  """
  num_spans_to_ids = collections.defaultdict(list)
  tokenizer = tokenization.TyDiTokenizer(vocab_file=vocab_file)
  num_examples = 0
  for tydi_example in tydi_examples:
    example_index = tydi_example.example_id
    # Each TyDi entry is split into multiple features, each of
    # FLAGS.max_seq_length word pieces.
    errors = []
    features = convert_single_example(
        tydi_example,
        tokenizer=tokenizer,
        is_training=is_training,
        max_question_length=max_question_length,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        include_unknowns=include_unknowns,
        errors=errors)
    num_examples += 1
    num_spans_to_ids[len(features)].append(tydi_example.example_id)

    for feature in features:
      feature.example_index = example_index
      # This integer `unique_id` is used by `compute_predictions` to merge
      # features with their example. Both `example_index` and
      # `doc_span_index` are integers, so this works primarily by virtue of
      # the `example_index`s being uniformly distributed with many unoccupied
      # indices between them, making collisions unlikely.
      feature.unique_id = feature.example_index + feature.doc_span_index
      output_fn(feature)

  return num_spans_to_ids, num_examples
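# A minimal sketch (not from the original module) of driving
# `convert_examples_to_features` end to end from caller code, reusing the
# read_tydi_examples arguments and flag values seen in the tests above.
# `example_feature_conversion`, `input_file`, and `collected` are placeholder
# names introduced here for illustration.
def example_feature_conversion(input_file, vocab_file):
  examples = preproc.read_tydi_examples(
      input_file,
      is_training=True,
      max_passages=45,
      max_position=45,
      fail_on_invalid=False,
      open_fn=tf_io.gopen)
  # Collect the generated features into a list via `output_fn`.
  collected = []
  num_spans_to_ids, num_examples = preproc.convert_examples_to_features(
      examples,
      is_training=True,
      vocab_file=vocab_file,
      max_question_length=64,
      max_seq_length=512,
      doc_stride=128,
      include_unknowns=1.0,
      output_fn=collected.append)
  return collected, num_spans_to_ids, num_examples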