Example #1
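Wraps the SentencepieceTokenizer in a small Keras model and exports it as a SavedModel (with an empty signatures dict).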
def test_saving(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)
    inputs = tf.keras.layers.Input([], dtype=tf.string)
    outputs = sentencepiece_tokenizer(inputs)
    model = tf.keras.Model(inputs, outputs)
    export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
    model.save(export_path, signatures={})
Example #2
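Checks the special-token ids reported by get_special_tokens_dict() for the test SentencePiece model (vocab size 16).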
def test_special_tokens(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)
    self.assertDictEqual(sentencepiece_tokenizer.get_special_tokens_dict(),
                         dict(padding_id=0,
                              start_of_sequence_id=2,
                              end_of_segment_id=3,
                              mask_id=4,
                              vocab_size=16))
Example #3
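Verifies that tokenize_with_offsets=True combined with strip_diacritics=True is rejected, both at construction time and when the flag is flipped afterwards and the layer is called.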
def test_fail_on_tokenize_with_offsets_and_strip_diacritics(self):
    # Raise an error in init().
    with self.assertRaises(ValueError):
        text_layers.SentencepieceTokenizer(model_file_path=self._spm_path,
                                           tokenize_with_offsets=True,
                                           lower_case=True,
                                           nbest_size=0,
                                           strip_diacritics=True)

    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path,
        lower_case=True,
        nbest_size=0,
        strip_diacritics=True)
    sentencepiece_tokenizer.tokenize_with_offsets = True

    # Raise an error in call():
    inputs = tf.constant(["abc def", "ABC DEF d", "Äffin"])
    with self.assertRaises(ValueError):
        sentencepiece_tokenizer(inputs)
Example #4
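Confirms that with strip_diacritics=True, accented and unaccented inputs produce identical token ids.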
def test_strip_diacritics(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path,
        lower_case=True,
        nbest_size=0,
        strip_diacritics=True)
    inputs = tf.constant(["a b c d e", "ă ḅ č ḓ é"])
    token_ids = sentencepiece_tokenizer(inputs)
    self.assertAllEqual(
        token_ids,
        tf.ragged.constant([[7, 9, 10, 11, 13], [7, 9, 10, 11, 13]]))
Example #5
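Round-trips the layer through get_config()/from_config() and checks that both tokenizers produce the same ids (currently skipped, see b/170480226).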
def test_serialize_deserialize(self):
    self.skipTest("b/170480226")
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path,
        lower_case=False,
        nbest_size=0,
        tokenize_with_offsets=False,
        name="sentencepiece_tokenizer_layer")
    config = sentencepiece_tokenizer.get_config()
    new_tokenizer = text_layers.SentencepieceTokenizer.from_config(config)
    self.assertEqual(config, new_tokenizer.get_config())
    inputs = tf.constant(["abc def", "ABC DEF d"])
    token_ids = sentencepiece_tokenizer(inputs)
    token_ids_2 = new_tokenizer(inputs)
    self.assertAllEqual(token_ids, token_ids_2)
Example #6
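Tokenizes with lower_case=True, then enables tokenize_with_offsets and checks the returned token ids plus start/limit offsets, as well as vocab_size.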
def test_uncased(self):
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)

    inputs = tf.constant(["abc def", "ABC DEF d"])
    token_ids = sentencepiece_tokenizer(inputs)
    self.assertAllEqual(token_ids,
                        tf.ragged.constant([[8, 12], [8, 12, 11]]))
    sentencepiece_tokenizer.tokenize_with_offsets = True
    token_ids_2, start_offsets, limit_offsets = sentencepiece_tokenizer(
        inputs)
    self.assertAllEqual(token_ids, token_ids_2)
    self.assertAllEqual(start_offsets,
                        tf.ragged.constant([[0, 3], [0, 3, 7]]))
    self.assertAllEqual(limit_offsets,
                        tf.ragged.constant([[3, 7], [3, 7, 9]]))
    self.assertEqual(sentencepiece_tokenizer.vocab_size.numpy(), 16)
Example #7
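Builds a preprocessing Keras model (tokenizer plus BertPackInputs) inside a tf.data input_fn, asserts inside tf.init_scope() that eager execution is off, and maps the model over a dataset.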
def input_fn():
    with tf.init_scope():
        self.assertFalse(tf.executing_eagerly())
    # Build a preprocessing Model.
    sentences = tf.keras.layers.Input(shape=[], dtype=tf.string)
    sentencepiece_tokenizer = text_layers.SentencepieceTokenizer(
        model_file_path=self._spm_path, lower_case=True, nbest_size=0)
    special_tokens_dict = sentencepiece_tokenizer.get_special_tokens_dict()
    for k, v in special_tokens_dict.items():
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
    tokens = sentencepiece_tokenizer(sentences)
    packed_inputs = text_layers.BertPackInputs(
        4, special_tokens_dict=special_tokens_dict)(tokens)
    preprocessing = tf.keras.Model(sentences, packed_inputs)
    # Map the dataset.
    ds = tf.data.Dataset.from_tensors(
        (tf.constant(["abc", "DEF"]), tf.constant([0, 1])))
    ds = ds.map(lambda features, labels: (preprocessing(features), labels))
    return ds
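
All of the examples above are methods (or, in Example #7, a helper closure) of one tokenizer test case and rely on scaffolding the listing does not show. The sketch below illustrates that assumed context: the import path (official.nlp.modeling.layers from the TensorFlow Model Garden), the class name SentencepieceTokenizerTest, and the way self._spm_path is provided are assumptions for illustration, not part of the original snippets.

import tempfile

import tensorflow as tf

# Assumed import path (TensorFlow Model Garden); adjust to your checkout.
from official.nlp.modeling.layers import text_layers


class SentencepieceTokenizerTest(tf.test.TestCase):
    """Hypothetical scaffolding for the snippets shown above."""

    def setUp(self):
        super().setUp()
        # The examples expect a small SentencePiece model whose ids match the
        # asserted values (vocab_size=16, padding_id=0, etc.); how that model
        # is produced is not shown in the listing.
        self._spm_path = "/path/to/test_sentencepiece.model"

    # ... paste the test methods from the examples above here ...


if __name__ == "__main__":
    tf.test.main()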