def __init__(self,
             config: HotpotQAInputConfig,
             spm_model_file: Optional[str] = None,
             vocab_file: Optional[str] = None) -> None:
  """Initializes tokenization helpers for HotpotQA input generation.

  Exactly one of `spm_model_file` or `vocab_file` must be provided; it
  selects SentencePiece vs. WordPiece tokenization respectively.

  Args:
    config: Input-generation configuration.
    spm_model_file: Path to a SentencePiece model file (ALBERT-style).
    vocab_file: Path to a WordPiece vocab file (BERT-style).

  Raises:
    ValueError: If neither `spm_model_file` nor `vocab_file` is given.
  """
  self._config = config
  if spm_model_file:
    # SentencePiece tokenization: vocab file / lower-casing are unused.
    self._use_wordpiece = False
    self._tokenizer = tokenization.FullTokenizer(None, None, spm_model_file)
    self._get_tokenized_text = functools.partial(
        data_utils.get_sentencepiece_tokenized_text, tokenizer=self._tokenizer)
    self._find_answer_spans = data_utils.find_answer_spans_sentencepiece
  elif vocab_file:
    # WordPiece tokenization.
    self._use_wordpiece = True
    self._tokenizer = tokenization.FullTokenizer(vocab_file)
    self._get_tokenized_text = functools.partial(
        data_utils.get_wordpiece_tokenized_text, tokenizer=self._tokenizer)
    self._find_answer_spans = functools.partial(
        data_utils.find_answer_spans_wordpiece, tokenizer=self._tokenizer)
  else:
    # Fixed message: the original implicit concatenation produced
    # "...create atokenizer." and referenced a nonexistent 'sp_model' arg.
    raise ValueError(
        "Either a 'spm_model_file' or a 'vocab_file' needs to be specified "
        "to create a tokenizer.")
def setup(self):
  """Builds the tokenizer from either an SPM model or a BERT vocab."""
  spm_path = self._config.spm_model_path
  if spm_path:
    # SentencePiece model takes precedence; vocab and casing are unused then.
    self._tokenizer = tokenization.FullTokenizer(
        None, do_lower_case=None, spm_model_file=spm_path)
    return
  self._tokenizer = tokenization.FullTokenizer(
      self._config.bert_vocab_path,
      do_lower_case=self._config.do_lower_case)
def _get_tokenizer():
  """Gets tokenizer and whether WordPiece tokenizer is used.

  Returns:
    A tuple `(tokenizer, use_wordpiece)` where `use_wordpiece` is True for a
    vocab-based (BERT) tokenizer and False for a SentencePiece one.

  Raises:
    ValueError: If neither `--spm_model_file` nor `--vocab_file` is set.
  """
  if FLAGS.spm_model_file:
    use_wordpiece = False
    tokenizer = tokenization.FullTokenizer(None, None, FLAGS.spm_model_file)
  elif FLAGS.vocab_file:
    use_wordpiece = True
    tokenizer = tokenization.FullTokenizer(FLAGS.vocab_file)
  else:
    # Fixed message: the original implicit concatenation produced
    # "...create atokenizer." and referenced a nonexistent 'sp_model' flag.
    raise ValueError(
        "Either a 'spm_model_file' or a 'vocab_file' needs to be specified "
        "to create a tokenizer.")
  return tokenizer, use_wordpiece
def setup(self):
  """Instantiates the tokenizer selected by --tokenizer_type.

  Raises:
    ValueError: If the flag combination is inconsistent or the type unknown.
  """
  super().setup()
  tokenizer_type = FLAGS.tokenizer_type
  if tokenizer_type == "BERT":
    # Sanity-check casing against the checkpoint before building.
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    if not FLAGS.vocab_file:
      raise ValueError("vocab_file should be specified when using "
                       "BERT tokenizer.")
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
  elif tokenizer_type == "ALBERT":
    if not FLAGS.spm_model_path:
      raise ValueError("spm_model_path should be specified when using "
                       "ALBERT tokenizer.")
    # With an SPM model, vocab_file/do_lower_case are ignored by the
    # tokenizer, so pass None explicitly.
    self._tokenizer = tokenization.FullTokenizer(
        vocab_file=None,
        do_lower_case=None,
        spm_model_file=FLAGS.spm_model_path)
  else:
    raise ValueError(
        "Unexpected tokenizer_type found: {}".format(tokenizer_type))
def main(argv):
  """Reads examples from argv[1], runs eval, writes predictions to argv[2]."""
  if len(argv) != 3:
    raise tf.app.UsageError("Exactly two arguments expected.")
  input_json_filepath, output_json_filepath = argv[1].strip(), argv[2].strip()

  with tf.gfile.Open(input_json_filepath, "r") as test_data:
    json_examples = json.load(test_data)

  # SentencePiece-based tokenizer; vocab/casing arguments are unused.
  tokenizer = tokenization.FullTokenizer(
      vocab_file=None, do_lower_case=None, spm_model_file=SPM_MODEL_VOCAB)
  predictions = generate_eval_output(
      tokenizer=tokenizer,
      json_examples=json_examples,
      model_dir_path=MODEL_PATH)

  with tf.gfile.GFile(output_json_filepath, "w") as output_writer:
    json.dump(predictions, output_writer)
def test_etc_features_fixed_global_blocks(self):
  """Checks ETC featurization with `fixed_block_len=4`.

  NOTE(review): with fixed-length blocks the per-VDOM-element global
  feature lists below are all expected to be empty — presumably global
  tokens then correspond to fixed-size token blocks rather than VDOM
  elements; confirm against `to_etc_features`.
  """
  example = lib.OpenKpExample(
      url=
      'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
      text=
      'Star Trek Discovery Season 1 Jason Isaacs Jason Isaacs and Doug',
      vdom=[
          lib.VdomElement(
              id=0,
              text='Star Trek Discovery Season 1 Jason',
              features=lib.VdomFeatures(
                  x_coord=44.0,
                  width=728.0,
                  y_coord=78.0,
                  height=45.0,
                  is_block=True,
                  is_inline=False,
                  is_heading=True,
                  is_leaf=False,
                  font_size=20,
                  is_bold=False),
              parent_features=lib.VdomFeatures(
                  x_coord=44.0,
                  width=728.0,
                  y_coord=78.0,
                  height=45.0,
                  is_block=True,
                  is_inline=False,
                  is_heading=True,
                  is_leaf=False,
                  font_size=20,
                  is_bold=False),
              start_idx=0,
              end_idx=6),
          lib.VdomElement(
              id=0,
              text='Isaacs Jason Isaacs and Doug',
              features=lib.VdomFeatures(
                  x_coord=208.0,
                  width=49.0,
                  y_coord=138.0,
                  height=15.0,
                  is_block=False,
                  is_inline=False,
                  is_heading=False,
                  is_leaf=False,
                  font_size=12,
                  is_bold=True),
              parent_features=lib.VdomFeatures(
                  x_coord=198.0,
                  width=564.0,
                  y_coord=138.0,
                  height=15.0,
                  is_block=True,
                  is_inline=False,
                  is_heading=False,
                  is_leaf=False,
                  font_size=12,
                  is_bold=True),
              start_idx=6,
              end_idx=11)
      ],
      key_phrases=[
          lib.KeyPhrase(['Star', 'Trek']),
          lib.KeyPhrase(['Jason', 'Isaacs'])
      ])
  bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                 VOCAB_PATH)
  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=80,
      bert_vocab_path=bert_vocab_path,
      do_lower_case=True,
      fixed_block_len=4)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)
  etc_features = example.to_etc_features(tokenizer, config)
  expected = lib.OpenKpEtcFeatures(
      # Code points of the URL, padded to `url_max_code_points` (80) with -1.
      url_code_points=[
          104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
          108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97, 116,
          99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115, 116,
          97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115, 99, 111, 118,
          101, 114, 121, 45, 115, 101, 97, 115, 111, 110, 45, 49, 46, 104,
          116, 109, 108, -1, -1, -1, -1, -1, -1, -1
      ],
      label_start_idx=[5, 0, -1],
      label_phrase_len=[2, 2, -1],
      long_token_ids=[3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 0, 0, 0],
      long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 0, 0, 0],
      long_vdom_idx=[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0],
      long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      long_word_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 1, 1, 1],
      # No per-element global features are expected in fixed-block mode.
      global_x_coords=[],
      global_y_coords=[],
      global_widths=[],
      global_heights=[],
      global_font_ids=[],
      global_block_indicator=[],
      global_inline_indicator=[],
      global_heading_indicator=[],
      global_leaf_indicator=[],
      global_bold_indicator=[],
      global_parent_x_coords=[],
      global_parent_y_coords=[],
      global_parent_widths=[],
      global_parent_heights=[],
      global_parent_font_ids=[],
      global_parent_heading_indicator=[],
      global_parent_leaf_indicator=[],
      global_parent_bold_indicator=[])
  self.assertEqual(expected, etc_features)
def test_etc_features_with_long_overflow(self):
  """Checks featurization when text overflows `long_max_length`.

  The second VDOM element pushes the long input past 16 tokens; the
  expected features below show it is dropped entirely (masks and vdom
  indices cover only the first element's 5 words).
  """
  text = 'Star Wars and not Trek ' + ' '.join(['star'] * 12)
  vdom = [
      lib.VdomElement(
          id=0,
          text='Star Wars and not Trek',
          features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          parent_features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          start_idx=0,
          end_idx=5),
      lib.VdomElement(
          id=0,
          text=' '.join(['star'] * 99),
          features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          parent_features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          start_idx=5,
          end_idx=17)
  ]
  example = lib.OpenKpExample(
      url=
      'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
      text=text,
      vdom=vdom,
      key_phrases=[
          lib.KeyPhrase(['Star', 'Wars']),
          lib.KeyPhrase(['Trek']),
      ])
  bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                 VOCAB_PATH)
  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=80,
      bert_vocab_path=bert_vocab_path,
      do_lower_case=True)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)
  etc_features = example.to_etc_features(tokenizer, config)
  expected = lib.OpenKpEtcFeatures(
      # Code points of the URL, padded to `url_max_code_points` (80) with -1.
      url_code_points=[
          104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
          108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97, 116,
          99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115, 116,
          97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115, 99, 111, 118,
          101, 114, 121, 45, 115, 101, 97, 115, 111, 110, 45, 49, 46, 104,
          116, 109, 108, -1, -1, -1, -1, -1, -1, -1
      ],
      label_start_idx=[0, 4, -1],
      label_phrase_len=[2, 1, -1],
      long_token_ids=[3, 14, 11, 15, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_word_idx=[0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_vdom_idx=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_input_mask=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_word_input_mask=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE3,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 0, 0, 0],
      global_x_coords=[44.0, 0, 0, 0],
      global_y_coords=[78.0, 0, 0, 0],
      global_widths=[728.0, 0, 0, 0],
      global_heights=[45.0, 0, 0, 0],
      global_font_ids=[13, 0, 0, 0],
      global_block_indicator=[1, 0, 0, 0],
      global_inline_indicator=[0, 0, 0, 0],
      global_heading_indicator=[1, 0, 0, 0],
      global_leaf_indicator=[0, 0, 0, 0],
      global_bold_indicator=[0, 0, 0, 0],
      global_parent_x_coords=[44.0, 0, 0, 0],
      global_parent_y_coords=[78.0, 0, 0, 0],
      global_parent_widths=[728.0, 0, 0, 0],
      global_parent_heights=[45.0, 0, 0, 0],
      global_parent_font_ids=[13, 0, 0, 0],
      global_parent_heading_indicator=[1, 0, 0, 0],
      global_parent_leaf_indicator=[0, 0, 0, 0],
      global_parent_bold_indicator=[0, 0, 0, 0])
  self.assertEqual(expected, etc_features)
def test_etc_features_with_vdom_overflow(self):
  """Checks featurization when there are more VDOM elements than fit.

  Builds 2 real elements plus 12 one-word 'foo' elements; with
  `global_max_length=4` only the first few elements receive global slots,
  as reflected in the expected global feature lists below.
  """
  vdom = [
      lib.VdomElement(
          id=0,
          text='Star Trek Discovery Season 1 Jason',
          features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          parent_features=lib.VdomFeatures(
              x_coord=44.0,
              width=728.0,
              y_coord=78.0,
              height=45.0,
              is_block=True,
              is_inline=False,
              is_heading=True,
              is_leaf=False,
              font_size=20,
              is_bold=False),
          start_idx=0,
          end_idx=5),
      lib.VdomElement(
          id=0,
          text='Isaacs Jason Isaacs and Doug',
          features=lib.VdomFeatures(
              x_coord=208.0,
              width=49.0,
              y_coord=138.0,
              height=15.0,
              is_block=False,
              is_inline=False,
              is_heading=False,
              is_leaf=False,
              font_size=12,
              is_bold=True),
          parent_features=lib.VdomFeatures(
              x_coord=198.0,
              width=564.0,
              y_coord=138.0,
              height=15.0,
              is_block=True,
              is_inline=False,
              is_heading=False,
              is_leaf=False,
              font_size=12,
              is_bold=True),
          start_idx=5,
          end_idx=8)
  ]
  text = 'Star Trek Discovery Season 1 Director Jason Isaacs'
  # Append 12 filler words so the word count reaches 20.
  text += ' foo' * (20 - 8)
  # One single-word VDOM element per filler word.
  vdom.extend([
      lib.VdomElement(
          id=0,
          text='foo',
          features=lib.VdomFeatures(
              x_coord=208.0,
              width=49.0,
              y_coord=138.0,
              height=15.0,
              is_block=False,
              is_inline=False,
              is_heading=False,
              is_leaf=True,
              font_size=12,
              is_bold=True),
          parent_features=lib.VdomFeatures(
              x_coord=3110.0,
              width=92.0,
              y_coord=123.0,
              height=75.0,
              is_block=True,
              is_inline=False,
              is_heading=False,
              is_leaf=True,
              font_size=13,
              is_bold=True),
          start_idx=start_idx,
          end_idx=start_idx + 1) for start_idx in range(8, 20)
  ])
  example = lib.OpenKpExample(
      url=
      'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
      text=text,
      vdom=vdom,
      key_phrases=[
          lib.KeyPhrase(['Star', 'Trek']),
          lib.KeyPhrase(['Jason', 'Isaacs']),
      ])
  bert_vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                 VOCAB_PATH)
  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=80,
      bert_vocab_path=bert_vocab_path,
      do_lower_case=True)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)
  etc_features = example.to_etc_features(tokenizer, config)
  expected = lib.OpenKpEtcFeatures(
      # Code points of the URL, padded to `url_max_code_points` (80) with -1.
      url_code_points=[
          104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
          108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97, 116,
          99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115, 116,
          97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115, 99, 111, 118,
          101, 114, 121, 45, 115, 101, 97, 115, 111, 110, 45, 49, 46, 104,
          116, 109, 108, -1, -1, -1, -1, -1, -1, -1
      ],
      label_start_idx=[0, 7, -1],
      label_phrase_len=[2, 2, -1],
      long_token_ids=[3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 13, 13, 0],
      long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 0],
      long_vdom_idx=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 0],
      long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
      long_word_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
      long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE2,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 1, 1, 1],
      global_x_coords=[44.0, 208.0, 208.0, 208.0],
      global_y_coords=[78.0, 138.0, 138.0, 138.0],
      global_widths=[728.0, 49.0, 49.0, 49.0],
      global_heights=[45.0, 15.0, 15.0, 15.0],
      global_font_ids=[13, 5, 5, 5],
      global_block_indicator=[1, 0, 0, 0],
      global_inline_indicator=[0, 0, 0, 0],
      global_heading_indicator=[1, 0, 0, 0],
      global_leaf_indicator=[0, 0, 1, 1],
      global_bold_indicator=[0, 1, 1, 1],
      global_parent_x_coords=[44.0, 198.0, 3110.0, 3110.0],
      global_parent_y_coords=[78.0, 138.0, 123.0, 123.0],
      global_parent_widths=[728.0, 564.0, 92.0, 92.0],
      global_parent_heights=[45.0, 15.0, 75.0, 75.0],
      global_parent_font_ids=[13, 5, 6, 6],
      global_parent_heading_indicator=[1, 0, 0, 0],
      global_parent_leaf_indicator=[0, 0, 1, 1],
      global_parent_bold_indicator=[0, 1, 1, 1])
  self.assertEqual(expected, etc_features)
def __init__(self,
             stride,
             seq_len,
             global_seq_len,
             question_len,
             vocab_file,
             do_lower_case,
             predict_la_when_no_sa,
             include_unknown_rate,
             include_unknown_rate_for_unanswerable,
             include_html_tokens,
             global_token_types,
             spm_model_path,
             tokenizer_type,
             is_train,
             fixed_blocks=False,
             fixed_block_size=27,
             global_size_counter=None,
             long_size_counter=None,
             global_size_threshold_counters=None,
             global_sentence_counter=None,
             long_sentence_tokens_counter=None):
  """Initializes the example preprocessor and its tokenizer/special-token ids.

  Args:
    stride: Stride for sliding over long documents.
    seq_len: Maximum long-input sequence length.
    global_seq_len: Maximum global-input sequence length.
    question_len: Maximum question length in tokens.
    vocab_file: BERT vocab path (ignored for ALBERT when `spm_model_path`
      is set).
    do_lower_case: Whether to lower-case (BERT only; ignored for ALBERT).
    predict_la_when_no_sa: Whether to predict a long answer when there is no
      short answer.
    include_unknown_rate: Sampling rate for examples without an answer.
    include_unknown_rate_for_unanswerable: Sampling rate for unanswerable
      examples; defaults to `4 * include_unknown_rate` when None.
    include_html_tokens: Whether to keep HTML tokens in the long input.
    global_token_types: Sequence of 3 ids: [sentence, cls, question] global
      token types.
    spm_model_path: SentencePiece model path (ALBERT only).
    tokenizer_type: Either "BERT" or "ALBERT".
    is_train: Whether examples are generated for training.
    fixed_blocks: Whether to use fixed-size blocks instead of sentences.
    fixed_block_size: Tokens per block when `fixed_blocks` is True.
    global_size_counter: Optional counter for global input sizes.
    long_size_counter: Optional counter for long input sizes.
    global_size_threshold_counters: Optional list of threshold counters.
    global_sentence_counter: Optional counter of global sentences.
    long_sentence_tokens_counter: Optional counter of long sentence tokens.

  Raises:
    ValueError: If `tokenizer_type` is not "BERT" or "ALBERT".
  """
  if tokenizer_type == "BERT":
    # Use BERT tokenization:
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    # Special-token ids are looked up directly in the WordPiece vocab;
    # sos/eos reuse reserved "[unusedNNN]" slots.
    self.sos_id = self.tokenizer.vocab["[unused102]"]
    self.eos_id = self.tokenizer.vocab["[unused103]"]
    self.pad_id = self.tokenizer.vocab["[PAD]"]
    self.cls_id = self.tokenizer.vocab["[CLS]"]
    self.sep_id = self.tokenizer.vocab["[SEP]"]
  elif tokenizer_type == "ALBERT":
    # Use ALBERT SentencePiece tokenization:
    # Notice that 'vocab_file' and 'do_lower_case' are ignored when
    # 'spm_model_path' is not None
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file, do_lower_case, spm_model_path)
    # ALBERT's SentencePiece vocab uses different reserved-piece names.
    self.sos_id = self.tokenizer.vocab["<unused_35>"]
    self.eos_id = self.tokenizer.vocab["<unused_36>"]
    self.pad_id = self.tokenizer.vocab["<pad>"]
    self.cls_id = self.tokenizer.vocab["<unused_63>"]
    self.sep_id = self.tokenizer.vocab["<unused_2>"]
  else:
    raise ValueError("Only 'BERT' and 'ALBERT' are supported: %s" %
                     (tokenizer_type))
  # Numeric encoding of NQ answer types.
  self.answer_type_enum = {
      "NULL": 0,
      "YES": 1,
      "NO": 2,
      "LONG": 3,
      "SHORT": 4
  }
  self.seq_len = seq_len
  self.question_len = question_len
  self.stride = stride
  self.predict_la_when_no_sa = predict_la_when_no_sa
  self.include_unknown_rate = include_unknown_rate
  if include_unknown_rate_for_unanswerable is None:
    # Default: sample unanswerable examples 4x more often than unknowns.
    self.include_unknown_rate_for_unanswerable = include_unknown_rate * 4
  else:
    self.include_unknown_rate_for_unanswerable = (
        include_unknown_rate_for_unanswerable)
  self.include_html_tokens = include_html_tokens
  self.global_seq_len = global_seq_len
  # Unpack the three global token type ids: [sentence, cls, question].
  self.gt_type_sentence = global_token_types[0]
  self.gt_type_cls = global_token_types[1]
  self.gt_type_question = global_token_types[2]
  self.is_train = is_train
  self.fixed_blocks = fixed_blocks
  self.fixed_block_size = fixed_block_size
  self.question_ids_in_long = True
  self.cls_in_long = True
  self.global_cls_id = self.cls_id
  # 35 corresponds to "unused34" both in BERT (uncased) and ALBERT vocabs,
  # it will be "unused35" in BERT cased.
  self.global_question_id = 35
  self.global_sentence_id = 1
  self._global_size_counter = global_size_counter
  self._long_size_counter = long_size_counter
  if global_size_threshold_counters is None:
    self._global_size_threshold_counters = []
  else:
    self._global_size_threshold_counters = global_size_threshold_counters
  self.global_sentence_counter = global_sentence_counter
  self.long_sentence_tokens_counter = long_sentence_tokens_counter