def __init__(self,
             config: HotpotQAInputConfig,
             spm_model_file: Optional[str] = None,
             vocab_file: Optional[str] = None) -> None:
  """Initializes tokenization helpers for HotpotQA input generation.

  Exactly one of `spm_model_file` or `vocab_file` must be provided; it
  selects SentencePiece vs. WordPiece tokenization respectively.

  Args:
    config: Input-generation configuration.
    spm_model_file: Path to a SentencePiece model file (ALBERT-style).
    vocab_file: Path to a WordPiece vocab file (BERT-style).

  Raises:
    ValueError: If neither `spm_model_file` nor `vocab_file` is given.
  """
  self._config = config
  if spm_model_file:
    # SentencePiece tokenization: vocab file / lower-casing are unused.
    self._use_wordpiece = False
    self._tokenizer = tokenization.FullTokenizer(None, None, spm_model_file)
    self._get_tokenized_text = functools.partial(
        data_utils.get_sentencepiece_tokenized_text, tokenizer=self._tokenizer)
    self._find_answer_spans = data_utils.find_answer_spans_sentencepiece
  elif vocab_file:
    # WordPiece tokenization.
    self._use_wordpiece = True
    self._tokenizer = tokenization.FullTokenizer(vocab_file)
    self._get_tokenized_text = functools.partial(
        data_utils.get_wordpiece_tokenized_text, tokenizer=self._tokenizer)
    self._find_answer_spans = functools.partial(
        data_utils.find_answer_spans_wordpiece, tokenizer=self._tokenizer)
  else:
    # Fixed message: the original implicit concatenation produced
    # "...create atokenizer." and referenced a nonexistent 'sp_model' arg.
    raise ValueError(
        "Either a 'spm_model_file' or a 'vocab_file' needs to be specified "
        "to create a tokenizer.")
def setup(self):
  """Builds the tokenizer from either an SPM model or a BERT vocab."""
  spm_path = self._config.spm_model_path
  if spm_path:
    # SentencePiece model takes precedence; vocab and casing are unused then.
    self._tokenizer = tokenization.FullTokenizer(
        None, do_lower_case=None, spm_model_file=spm_path)
    return
  self._tokenizer = tokenization.FullTokenizer(
      self._config.bert_vocab_path,
      do_lower_case=self._config.do_lower_case)
def _get_tokenizer():
  """Gets tokenizer and whether WordPiece tokenizer is used.

  Returns:
    A tuple `(tokenizer, use_wordpiece)` where `use_wordpiece` is True for a
    vocab-based (BERT) tokenizer and False for a SentencePiece one.

  Raises:
    ValueError: If neither `--spm_model_file` nor `--vocab_file` is set.
  """
  if FLAGS.spm_model_file:
    use_wordpiece = False
    tokenizer = tokenization.FullTokenizer(None, None, FLAGS.spm_model_file)
  elif FLAGS.vocab_file:
    use_wordpiece = True
    tokenizer = tokenization.FullTokenizer(FLAGS.vocab_file)
  else:
    # Fixed message: the original implicit concatenation produced
    # "...create atokenizer." and referenced a nonexistent 'sp_model' flag.
    raise ValueError(
        "Either a 'spm_model_file' or a 'vocab_file' needs to be specified "
        "to create a tokenizer.")
  return tokenizer, use_wordpiece
def setup(self):
  """Instantiates the tokenizer selected by --tokenizer_type.

  Raises:
    ValueError: If the flag combination is inconsistent or the type unknown.
  """
  super().setup()
  tokenizer_type = FLAGS.tokenizer_type
  if tokenizer_type == "BERT":
    # Sanity-check casing against the checkpoint before building.
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    if not FLAGS.vocab_file:
      raise ValueError("vocab_file should be specified when using "
                       "BERT tokenizer.")
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  elif tokenizer_type == "ALBERT":
    if not FLAGS.spm_model_path:
      raise ValueError("spm_model_path should be specified when using "
                       "ALBERT tokenizer.")
    # With an SPM model, vocab_file/do_lower_case are ignored by the
    # tokenizer, so pass None explicitly.
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file=None,
        do_lower_case=None,
        spm_model_file=FLAGS.spm_model_path)
  else:
    raise ValueError(
        "Unexpected tokenizer_type found: {}".format(tokenizer_type))
def main(argv):
  """Reads examples from argv[1], runs eval, writes predictions to argv[2]."""
  if len(argv) != 3:
    raise tf.app.UsageError("Exactly two arguments expected.")
  input_json_filepath, output_json_filepath = argv[1].strip(), argv[2].strip()

  with tf.gfile.Open(input_json_filepath, "r") as test_data:
    json_examples = json.load(test_data)

  # SentencePiece-based tokenizer; vocab/casing arguments are unused.
  tokenizer = tokenization.FullTokenizer(
      vocab_file=None, do_lower_case=None, spm_model_file=SPM_MODEL_VOCAB)
  predictions = generate_eval_output(
      tokenizer=tokenizer,
      json_examples=json_examples,
      model_dir_path=MODEL_PATH)

  with tf.gfile.GFile(output_json_filepath, "w") as output_writer:
    json.dump(predictions, output_writer)
def test_etc_features_fixed_global_blocks(self):
  """Checks ETC featurization with `fixed_block_len=4`.

  NOTE(review): with fixed-length blocks the per-VDOM-element global
  feature lists below are all expected to be empty — presumably global
  tokens then correspond to fixed-size token blocks rather than VDOM
  elements; confirm against `to_etc_features`.
  """
  example = lib.OpenKpExample(
      url=
      'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
      text=
      'Star Trek Discovery Season 1 Jason Isaacs Jason Isaacs and Doug',
      vdom=[
          lib.VdomElement(
              id=0,
              text='Star Trek Discovery Season 1 Jason',
              features=lib.VdomFeatures(
                  x_coord=44.0,
                  width=728.0,
                  y_coord=78.0,
                  height=45.0,
                  is_block=True,
                  is_inline=False,
                  is_heading=True,
                  is_leaf=False,
                  font_size=20,
                  is_bold=False),
              parent_features=lib.VdomFeatures(
                  x_coord=44.0,
                  width=728.0,
                  y_coord=78.0,
                  height=45.0,
                  is_block=True,
                  is_inline=False,
                  is_heading=True,
                  is_leaf=False,
                  font_size=20,
                  is_bold=False),
              start_idx=0,
              end_idx=6),
          lib.VdomElement(
              id=0,
              text='Isaacs Jason Isaacs and Doug',
              features=lib.VdomFeatures(
                  x_coord=208.0,
                  width=49.0,
                  y_coord=138.0,
                  height=15.0,
                  is_block=False,
                  is_inline=False,
                  is_heading=False,
                  is_leaf=False,
                  font_size=12,
                  is_bold=True),
              parent_features=lib.VdomFeatures(
                  x_coord=198.0,
                  width=564.0,
                  y_coord=138.0,
                  height=15.0,
                  is_block=True,
                  is_inline=False,
                  is_heading=False,
                  is_leaf=False,
                  font_size=12,
                  is_bold=True),
              start_idx=6,
              end_idx=11)
      ],
      key_phrases=[
          lib.KeyPhrase(['Star', 'Trek']),
          lib.KeyPhrase(['Jason', 'Isaacs'])
      ])
  bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                 VOCAB_PATH)
  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=80,
      bert_vocab_path=bert_vocab_path,
      do_lower_case=True,
      fixed_block_len=4)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)
  etc_features = example.to_etc_features(tokenizer, config)
  expected = lib.OpenKpEtcFeatures(
      # Code points of the URL, padded to `url_max_code_points` (80) with -1.
      url_code_points=[
          104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
          108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97, 116,
          99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115, 116,
          97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115, 99, 111, 118,
          101, 114, 121, 45, 115, 101, 97, 115, 111, 110, 45, 49, 46, 104,
          116, 109, 108, -1, -1, -1, -1, -1, -1, -1
      ],
      label_start_idx=[5, 0, -1],
      label_phrase_len=[2, 2, -1],
      long_token_ids=[3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 0, 0, 0],
      long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 0, 0, 0],
      long_vdom_idx=[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0],
      long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      long_word_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 1, 1, 1],
      # No per-element global features are expected in fixed-block mode.
      global_x_coords=[],
      global_y_coords=[],
      global_widths=[],
      global_heights=[],
      global_font_ids=[],
      global_block_indicator=[],
      global_inline_indicator=[],
      global_heading_indicator=[],
      global_leaf_indicator=[],
      global_bold_indicator=[],
      global_parent_x_coords=[],
      global_parent_y_coords=[],
      global_parent_widths=[],
      global_parent_heights=[],
      global_parent_font_ids=[],
      global_parent_heading_indicator=[],
      global_parent_leaf_indicator=[],
      global_parent_bold_indicator=[])
  self.assertEqual(expected, etc_features)
def test_etc_features_with_long_overflow(self):
  """Checks featurization when text overflows `long_max_length`.

  The second VDOM element pushes the long input past 16 tokens; the
  expected features below show it is dropped entirely (masks and vdom
  indices cover only the first element's 5 words).
  """
  text = 'Star Wars and not Trek ' + ' '.join(['star'] * 12)
  vdom = [
      lib.VdomElement(
          id=0,
          text='Star Wars and not Trek',
          features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          parent_features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          start_idx=0,
          end_idx=5),
      lib.VdomElement(
          id=0,
          text=' '.join(['star'] * 99),
          features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          parent_features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          start_idx=5,
          end_idx=17)
  ]
  example = lib.OpenKpExample(
      url=
      'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
      text=text,
      vdom=vdom,
      key_phrases=[
          lib.KeyPhrase(['Star', 'Wars']),
          lib.KeyPhrase(['Trek']),
      ])
  bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                 VOCAB_PATH)
  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=80,
      bert_vocab_path=bert_vocab_path,
      do_lower_case=True)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)
  etc_features = example.to_etc_features(tokenizer, config)
  expected = lib.OpenKpEtcFeatures(
      # Code points of the URL, padded to `url_max_code_points` (80) with -1.
      url_code_points=[
          104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
          108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97, 116,
          99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115, 116,
          97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115, 99, 111, 118,
          101, 114, 121, 45, 115, 101, 97, 115, 111, 110, 45, 49, 46, 104,
          116, 109, 108, -1, -1, -1, -1, -1, -1, -1
      ],
      label_start_idx=[0, 4, -1],
      label_phrase_len=[2, 1, -1],
      long_token_ids=[3, 14, 11, 15, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_word_idx=[0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_vdom_idx=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_input_mask=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_word_input_mask=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE3,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 0, 0, 0],
      global_x_coords=[44.0, 0, 0, 0],
      global_y_coords=[78.0, 0, 0, 0],
      global_widths=[728.0, 0, 0, 0],
      global_heights=[45.0, 0, 0, 0],
      global_font_ids=[13, 0, 0, 0],
      global_block_indicator=[1, 0, 0, 0],
      global_inline_indicator=[0, 0, 0, 0],
      global_heading_indicator=[1, 0, 0, 0],
      global_leaf_indicator=[0, 0, 0, 0],
      global_bold_indicator=[0, 0, 0, 0],
      global_parent_x_coords=[44.0, 0, 0, 0],
      global_parent_y_coords=[78.0, 0, 0, 0],
      global_parent_widths=[728.0, 0, 0, 0],
      global_parent_heights=[45.0, 0, 0, 0],
      global_parent_font_ids=[13, 0, 0, 0],
      global_parent_heading_indicator=[1, 0, 0, 0],
      global_parent_leaf_indicator=[0, 0, 0, 0],
      global_parent_bold_indicator=[0, 0, 0, 0])
  self.assertEqual(expected, etc_features)
def test_etc_features_with_vdom_overflow(self):
  """Checks featurization when there are more VDOM elements than fit.

  Builds 2 real elements plus 12 one-word 'foo' elements; with
  `global_max_length=4` only the first few elements receive global slots,
  as reflected in the expected global feature lists below.
  """
  vdom = [
      lib.VdomElement(
          id=0,
          text='Star Trek Discovery Season 1 Jason',
          features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          parent_features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          start_idx=0,
          end_idx=5),
      lib.VdomElement(
          id=0,
          text='Isaacs Jason Isaacs and Doug',
          features=lib.VdomFeatures(
              x_coord=208.0,
              width=49.0,
              y_coord=138.0,
              height=15.0,
              is_block=False,
              is_inline=False,
              is_heading=False,
              is_leaf=False,
              font_size=12,
              is_bold=True),
          parent_features=lib.VdomFeatures(
              x_coord=198.0,
              width=564.0,
              y_coord=138.0,
              height=15.0,
              is_block=True,
              is_inline=False,
              is_heading=False,
              is_leaf=False,
              font_size=12,
              is_bold=True),
          start_idx=5,
          end_idx=8)
  ]
  text = 'Star Trek Discovery Season 1 Director Jason Isaacs'
  # Append 12 filler words so the word count reaches 20.
  text += ' foo' * (20 - 8)
  # One single-word VDOM element per filler word.
  vdom.extend([
      lib.VdomElement(
          id=0,
          text='foo',
          features=lib.VdomFeatures(
              x_coord=208.0,
              width=49.0,
              y_coord=138.0,
              height=15.0,
              is_block=False,
              is_inline=False,
              is_heading=False,
              is_leaf=True,
              font_size=12,
              is_bold=True),
          parent_features=lib.VdomFeatures(
              x_coord=3110.0,
              width=92.0,
              y_coord=123.0,
              height=75.0,
              is_block=True,
              is_inline=False,
              is_heading=False,
              is_leaf=True,
              font_size=13,
              is_bold=True),
          start_idx=start_idx,
          end_idx=start_idx + 1) for start_idx in range(8, 20)
  ])
  example = lib.OpenKpExample(
      url=
      'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
      text=text,
      vdom=vdom,
      key_phrases=[
          lib.KeyPhrase(['Star', 'Trek']),
          lib.KeyPhrase(['Jason', 'Isaacs']),
      ])
  bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                 VOCAB_PATH)
  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=80,
      bert_vocab_path=bert_vocab_path,
      do_lower_case=True)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)
  etc_features = example.to_etc_features(tokenizer, config)
  expected = lib.OpenKpEtcFeatures(
      # Code points of the URL, padded to `url_max_code_points` (80) with -1.
      url_code_points=[
          104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
          108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97, 116,
          99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115, 116,
          97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115, 99, 111, 118,
          101, 114, 121, 45, 115, 101, 97, 115, 111, 110, 45, 49, 46, 104,
          116, 109, 108, -1, -1, -1, -1, -1, -1, -1
      ],
      label_start_idx=[0, 7, -1],
      label_phrase_len=[2, 2, -1],
      long_token_ids=[3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 13, 13, 0],
      long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 0],
      long_vdom_idx=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 0],
      long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
      long_word_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE2,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 1, 1, 1],
      global_x_coords=[44.0, 208.0, 208.0, 208.0],
      global_y_coords=[78.0, 138.0, 138.0, 138.0],
      global_widths=[728.0, 49.0, 49.0, 49.0],
      global_heights=[45.0, 15.0, 15.0, 15.0],
      global_font_ids=[13, 5, 5, 5],
      global_block_indicator=[1, 0, 0, 0],
      global_inline_indicator=[0, 0, 0, 0],
      global_heading_indicator=[1, 0, 0, 0],
      global_leaf_indicator=[0, 0, 1, 1],
      global_bold_indicator=[0, 1, 1, 1],
      global_parent_x_coords=[44.0, 198.0, 3110.0, 3110.0],
      global_parent_y_coords=[78.0, 138.0, 123.0, 123.0],
      global_parent_widths=[728.0, 564.0, 92.0, 92.0],
      global_parent_heights=[45.0, 15.0, 75.0, 75.0],
      global_parent_font_ids=[13, 5, 6, 6],
      global_parent_heading_indicator=[1, 0, 0, 0],
      global_parent_leaf_indicator=[0, 0, 1, 1],
      global_parent_bold_indicator=[0, 1, 1, 1])
  self.assertEqual(expected, etc_features)
def __init__(self,
             stride,
             seq_len,
             global_seq_len,
             question_len,
             vocab_file,
             do_lower_case,
             predict_la_when_no_sa,
             include_unknown_rate,
             include_unknown_rate_for_unanswerable,
             include_html_tokens,
             global_token_types,
             spm_model_path,
             tokenizer_type,
             is_train,
             fixed_blocks=False,
             fixed_block_size=27,
             global_size_counter=None,
             long_size_counter=None,
             global_size_threshold_counters=None,
             global_sentence_counter=None,
             long_sentence_tokens_counter=None):
  """Initializes the example preprocessor and its tokenizer/special-token ids.

  Args:
    stride: Stride for sliding over long documents.
    seq_len: Maximum long-input sequence length.
    global_seq_len: Maximum global-input sequence length.
    question_len: Maximum question length in tokens.
    vocab_file: BERT vocab path (ignored for ALBERT when `spm_model_path`
      is set).
    do_lower_case: Whether to lower-case (BERT only; ignored for ALBERT).
    predict_la_when_no_sa: Whether to predict a long answer when there is no
      short answer.
    include_unknown_rate: Sampling rate for examples without an answer.
    include_unknown_rate_for_unanswerable: Sampling rate for unanswerable
      examples; defaults to `4 * include_unknown_rate` when None.
    include_html_tokens: Whether to keep HTML tokens in the long input.
    global_token_types: Sequence of 3 ids: [sentence, cls, question] global
      token types.
    spm_model_path: SentencePiece model path (ALBERT only).
    tokenizer_type: Either "BERT" or "ALBERT".
    is_train: Whether examples are generated for training.
    fixed_blocks: Whether to use fixed-size blocks instead of sentences.
    fixed_block_size: Tokens per block when `fixed_blocks` is True.
    global_size_counter: Optional counter for global input sizes.
    long_size_counter: Optional counter for long input sizes.
    global_size_threshold_counters: Optional list of threshold counters.
    global_sentence_counter: Optional counter of global sentences.
    long_sentence_tokens_counter: Optional counter of long sentence tokens.

  Raises:
    ValueError: If `tokenizer_type` is not "BERT" or "ALBERT".
  """
  if tokenizer_type == "BERT":
    # Use BERT tokenization:
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    # Special-token ids are looked up directly in the WordPiece vocab;
    # sos/eos reuse reserved "[unusedNNN]" slots.
    self.sos_id = self.tokenizer.vocab["[unused102]"]
    self.eos_id = self.tokenizer.vocab["[unused103]"]
    self.pad_id = self.tokenizer.vocab["[PAD]"]
    self.cls_id = self.tokenizer.vocab["[CLS]"]
    self.sep_id = self.tokenizer.vocab["[SEP]"]
  elif tokenizer_type == "ALBERT":
    # Use ALBERT SentencePiece tokenization:
    # Notice that 'vocab_file' and 'do_lower_case' are ignored when
    # 'spm_model_path' is not None
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file, do_lower_case, spm_model_path)
    # ALBERT's SentencePiece vocab uses different reserved-piece names.
    self.sos_id = self.tokenizer.vocab["<unused_35>"]
    self.eos_id = self.tokenizer.vocab["<unused_36>"]
    self.pad_id = self.tokenizer.vocab["<pad>"]
    self.cls_id = self.tokenizer.vocab["<unused_63>"]
    self.sep_id = self.tokenizer.vocab["<unused_2>"]
  else:
    raise ValueError("Only 'BERT' and 'ALBERT' are supported: %s" %
                     (tokenizer_type))
  # Numeric encoding of NQ answer types.
  self.answer_type_enum = {
      "NULL": 0,
      "YES": 1,
      "NO": 2,
      "LONG": 3,
      "SHORT": 4
  }
  self.seq_len = seq_len
  self.question_len = question_len
  self.stride = stride
  self.predict_la_when_no_sa = predict_la_when_no_sa
  self.include_unknown_rate = include_unknown_rate
  if include_unknown_rate_for_unanswerable is None:
    # Default: sample unanswerable examples 4x more often than unknowns.
    self.include_unknown_rate_for_unanswerable = include_unknown_rate * 4
  else:
    self.include_unknown_rate_for_unanswerable = (
        include_unknown_rate_for_unanswerable)
  self.include_html_tokens = include_html_tokens
  self.global_seq_len = global_seq_len
  # Unpack the three global token type ids: [sentence, cls, question].
  self.gt_type_sentence = global_token_types[0]
  self.gt_type_cls = global_token_types[1]
  self.gt_type_question = global_token_types[2]
  self.is_train = is_train
  self.fixed_blocks = fixed_blocks
  self.fixed_block_size = fixed_block_size
  self.question_ids_in_long = True
  self.cls_in_long = True
  self.global_cls_id = self.cls_id
  # 35 corresponds to "unused34" both in BERT (uncased) and ALBERT vocabs,
  # it will be "unused35" in BERT cased.
  self.global_question_id = 35
  self.global_sentence_id = 1
  self._global_size_counter = global_size_counter
  self._long_size_counter = long_size_counter
  if global_size_threshold_counters is None:
    self._global_size_threshold_counters = []
  else:
    self._global_size_threshold_counters = global_size_threshold_counters
  self.global_sentence_counter = global_sentence_counter
  self.long_sentence_tokens_counter = long_sentence_tokens_counter