Example #1
  def test_special_tokens_partial(self):
    vocab_file = self._make_vocab_file(["[PAD]", "[CLS]", "[SEP]"])
    bert_tokenize = text_layers.BertTokenizer(vocab_file=vocab_file,
                                              lower_case=True)
    self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                         dict(padding_id=0,
                              start_of_sequence_id=1,
                              end_of_segment_id=2))  # No mask_id.
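These snippets exercise the BertTokenizer layer from the TensorFlow Model Garden's text_layers module and rely on a _make_vocab_file test helper that is not shown on this page. A minimal sketch of such a helper, assuming the test class extends tf.test.TestCase and that import os and import tensorflow as tf are in scope, simply writes one wordpiece per line (BertTokenizer assigns ids by line order, which is why [PAD] on line 0 yields padding_id=0 above):

  def _make_vocab_file(self, vocab, filename="vocab.txt"):
    # Illustrative helper: one wordpiece per line; token ids are line numbers.
    path = os.path.join(self.get_temp_dir(), filename)
    with tf.io.gfile.GFile(path, "w") as f:
      f.write("\n".join(vocab) + "\n")
    return path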
Example #2
 def test_special_tokens_complete(self):
   vocab_file = self._make_vocab_file(
       ["foo", "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "xy"])
   bert_tokenize = text_layers.BertTokenizer(
       vocab_file=vocab_file, lower_case=True)
   self.assertDictEqual(bert_tokenize.get_special_tokens_dict(),
                        dict(padding_id=1,
                             start_of_sequence_id=3,
                             end_of_segment_id=4,
                             mask_id=5,
                             vocab_size=7))
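The ids in this dict are zero-based line numbers of the vocab file, which is why "[PAD]" on line 1 gives padding_id=1 and "[MASK]" on line 5 gives mask_id=5, with vocab_size equal to the number of lines. A standalone sketch of the same check outside the test harness, assuming the Model Garden import path official.nlp.modeling.layers.text_layers and an illustrative /tmp path:

import tensorflow as tf
from official.nlp.modeling.layers import text_layers  # assumed import path

vocab = ["foo", "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "xy"]
vocab_file = "/tmp/vocab.txt"  # illustrative location
with tf.io.gfile.GFile(vocab_file, "w") as f:
  f.write("\n".join(vocab) + "\n")

tokenizer = text_layers.BertTokenizer(vocab_file=vocab_file, lower_case=True)
special = tokenizer.get_special_tokens_dict()
assert special["padding_id"] == vocab.index("[PAD]")  # 1
assert special["mask_id"] == vocab.index("[MASK]")    # 5
assert special["vocab_size"] == len(vocab)            # 7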
Example #3
 def test_cased(self):
   vocab_file = self._make_vocab_file(
       ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "ABC"])
   bert_tokenize = text_layers.BertTokenizer(
       vocab_file=vocab_file, lower_case=False, tokenize_with_offsets=True)
   inputs = tf.constant(["abc def", "ABC DEF"])
   token_ids, start_offsets, limit_offsets = bert_tokenize(inputs)
   self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                      [[7], [1]]]))
   self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                          [[0], [4]]]))
   self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                          [[3], [7]]]))
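With tokenize_with_offsets=True the layer also returns start_offsets and limit_offsets, which are byte offsets into each input string, so slicing the original text recovers the surface form of every wordpiece (without the "##" continuation marker). A quick illustration of the offsets asserted above, using plain Python slicing (byte and character offsets coincide for this ASCII input):

sentence = "abc def"
# "abc" maps to wordpiece "abc" (id 6); "def" splits into "d" (4) + "##ef" (5).
assert sentence[0:3] == "abc"   # start 0, limit 3
assert sentence[4:5] == "d"     # start 4, limit 5
assert sentence[5:7] == "ef"    # start 5, limit 7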
Example #4
 def test_uncased(self):
   vocab_file = self._make_vocab_file(
       ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "d", "##ef", "abc", "xy"])
   bert_tokenize = text_layers.BertTokenizer(
       vocab_file=vocab_file, lower_case=True)
   inputs = tf.constant(["abc def", "ABC DEF d"])
   token_ids = bert_tokenize(inputs)
   self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]],
                                                      [[6], [4, 5], [4]]]))
   bert_tokenize.tokenize_with_offsets = True
   token_ids_2, start_offsets, limit_offsets = bert_tokenize(inputs)
   self.assertAllEqual(token_ids, token_ids_2)
   self.assertAllEqual(start_offsets, tf.ragged.constant([[[0], [4, 5]],
                                                          [[0], [4, 5], [8]]]))
   self.assertAllEqual(limit_offsets, tf.ragged.constant([[[3], [5, 7]],
                                                          [[3], [5, 7], [9]]]))
   self.assertEqual(bert_tokenize.vocab_size.numpy(), 8)
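Without offsets the layer returns a single 3-D RaggedTensor of shape [batch, words, wordpieces]. Models usually want one flat wordpiece sequence per example; BertPackInputs takes care of that (plus special tokens and padding), but the inner dimensions can also be merged directly. A small sketch, not tied to this test:

import tensorflow as tf

# token_ids as asserted above: [batch, words, wordpieces].
token_ids = tf.ragged.constant([[[6], [4, 5]],
                                [[6], [4, 5], [4]]])
# Collapse the per-word dimension into one wordpiece sequence per input.
flat_ids = token_ids.merge_dims(1, 2)
print(flat_ids)  # <tf.RaggedTensor [[6, 4, 5], [6, 4, 5, 4]]>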
Example #5
 def input_fn():
   with tf.init_scope():
     self.assertFalse(tf.executing_eagerly())
   # Build a preprocessing Model.
   sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
   bert_tokenizer = text_layers.BertTokenizer(
       vocab_file=vocab_file, lower_case=True)
   special_tokens_dict = bert_tokenizer.get_special_tokens_dict()
   for k, v in special_tokens_dict.items():
     self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
   tokens = bert_tokenizer(sentences)
   packed_inputs = text_layers.BertPackInputs(
       4, special_tokens_dict=special_tokens_dict)(tokens)
   preprocessing = tf.keras.Model(sentences, packed_inputs)
   # Map the dataset.
   ds = tf.data.Dataset.from_tensors(
       (tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
   ds = ds.map(lambda features, labels: (preprocessing(features), labels))
   return ds
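The tf.init_scope check asserts that the function body is traced into a graph with no eager outer context (as it would be inside a TF1-style Estimator input_fn), and the loop over special_tokens_dict verifies that get_special_tokens_dict() still returns plain Python ints in that setting. A hedged sketch of the same preprocessing used eagerly, without the test assertions, again assuming the Model Garden import path official.nlp.modeling.layers.text_layers and a vocab file like the ones written above:

import tensorflow as tf
from official.nlp.modeling.layers import text_layers  # assumed import path

vocab_file = "/tmp/vocab.txt"  # one wordpiece per line, as written above
sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
tokenizer = text_layers.BertTokenizer(vocab_file=vocab_file, lower_case=True)
tokens = tokenizer(sentences)
packed = text_layers.BertPackInputs(
    4, special_tokens_dict=tokenizer.get_special_tokens_dict())(tokens)
preprocessing = tf.keras.Model(sentences, packed)

features = preprocessing(tf.constant(["abc", "DEF"]))
# BertPackInputs emits a dict of [batch, seq_length] int32 tensors:
# input_word_ids, input_mask and input_type_ids.
print(features["input_word_ids"].shape)  # (2, 4)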