def testToWordsWithSpacer(self, tokens, expected):
  """Checks grouping of spacer-marked subwords into words (dense form)."""
  expected_tensor = tf.constant(expected)
  token_tensor = tf.constant(tokens)
  word_tensor = text.tokens_to_words(
      token_tensor, subword_token="▁", is_spacer=True).to_tensor()
  actual, reference = self.evaluate([word_tensor, expected_tensor])
  self.assertAllEqual(actual, reference)
def testToWordsWithJoiner(self, tokens, expected):
  """Checks grouping of joiner-marked subwords into words (dense form)."""
  expected_tensor = tf.constant(expected)
  token_tensor = tf.constant(tokens)
  word_tensor = text.tokens_to_words(token_tensor).to_tensor()
  actual, reference = self.evaluate([word_tensor, expected_tensor])
  self.assertAllEqual(actual, reference)
def __call__(self, tokens, sequence_length=None, keep_shape=False):
  """Applies noise on :obj:`tokens`.

  Args:
    tokens: A string ``tf.Tensor`` or batch of string ``tf.Tensor``.
    sequence_length: When :obj:`tokens` is ND, the length of each sequence in
      the batch.
    keep_shape: Ensure that the shape is kept. Otherwise, fit the shape to
      the new lengths.

  Returns:
    A tuple with the noisy version of :obj:`tokens` and the new lengths.

  Raises:
    ValueError: if :obj:`tokens` has rank >= 2 and :obj:`sequence_length` is
      not set.
  """
  rank = tokens.shape.ndims
  if rank == 1:
    # Base case: a single 1D sequence of tokens.
    input_length = tf.shape(tokens)[0]
    if sequence_length is not None:
      tokens = tokens[:sequence_length]
    else:
      # No explicit length: empty strings are treated as padding, so the
      # number of non-empty entries is the effective sequence length.
      tokens = tokens[:tf.math.count_nonzero(tokens)]
    # Group subword tokens into words so that noise is applied word-wise.
    words = text.tokens_to_words(
        tokens,
        subword_token=self.subword_token,
        is_spacer=self.is_spacer)
    words = words.to_tensor()
    # Each noise module transforms the dense [num_words, max_subwords] matrix.
    for noise in self.noises:
      words = noise(words)
    # Flatten words back into a token sequence, dropping "" padding cells.
    outputs = tf.RaggedTensor.from_tensor(words, padding="").flat_values
    output_length = tf.shape(outputs)[0]
    if keep_shape:
      # Pad with empty strings back to the original length. NOTE(review):
      # assumes noise never makes the sequence longer than the input —
      # otherwise the pad amount would be negative; confirm against the
      # noise implementations.
      outputs = tf.pad(outputs, [[0, input_length - output_length]])
    return outputs, output_length
  elif rank == 2:
    if sequence_length is None:
      raise ValueError("sequence_length must be passed for 2D inputs")
    # Apply the 1D case to every sequence in the batch. keep_shape=True is
    # required here so map_fn produces same-shaped outputs per element.
    tokens, sequence_length = tf.map_fn(
        lambda arg: self(*arg, keep_shape=True),
        (tokens, sequence_length),
        back_prop=False)
    if not keep_shape:
      # Trim the batch to the longest post-noise sequence.
      tokens = tokens[:, :tf.reduce_max(sequence_length)]
    return tokens, sequence_length
  else:
    if sequence_length is None:
      raise ValueError("sequence_length must be passed for ND inputs")
    # Rank > 2: collapse the leading dimensions into one batch dimension,
    # recurse into the 2D case, then restore the original leading shape.
    original_shape = misc.shape_list(tokens)
    tokens = tf.reshape(tokens, [-1, original_shape[-1]])
    sequence_length = tf.reshape(sequence_length, [-1])
    tokens, sequence_length = self(
        tokens, sequence_length, keep_shape=keep_shape)
    tokens = tf.reshape(tokens, original_shape[:-1] + [-1])
    sequence_length = tf.reshape(sequence_length, original_shape[:-1])
    return tokens, sequence_length
def _call(self, tokens, sequence_length, keep_shape):
  """Recursively applies the configured noises on :obj:`tokens`.

  Args:
    tokens: A string ``tf.Tensor`` of rank 1, 2, or higher.
    sequence_length: The length of each sequence, required for rank >= 2.
    keep_shape: If ``True``, pad the output back to the input shape;
      otherwise fit the shape to the new lengths.

  Returns:
    A tuple of the noisy tokens and the new lengths.

  Raises:
    ValueError: if :obj:`tokens` has rank >= 2 and :obj:`sequence_length` is
      ``None``.
  """
  rank = tokens.shape.ndims
  if rank == 1:
    # Base case: a single 1D sequence of tokens.
    input_length = tf.shape(tokens)[0]
    if sequence_length is not None:
      tokens = tokens[:sequence_length]
    else:
      # No explicit length: empty strings are treated as padding, so the
      # number of non-empty entries is the effective sequence length.
      tokens = tokens[:tf.math.count_nonzero(tokens)]
    # Group subword tokens into words so that noise is applied word-wise.
    words = text.tokens_to_words(
        tokens,
        subword_token=self.subword_token,
        is_spacer=self.is_spacer)
    words = words.to_tensor()
    # Each noise module transforms the dense [num_words, max_subwords] matrix.
    for noise in self.noises:
      words = noise(words)
    # Flatten words back into a token sequence, dropping "" padding cells.
    outputs = tf.RaggedTensor.from_tensor(words, padding="").flat_values
    output_length = tf.shape(outputs)[0]
    if keep_shape:
      # Pad with empty strings back to the original length. NOTE(review):
      # assumes noise never makes the sequence longer than the input —
      # otherwise the pad amount would be negative; confirm against the
      # noise implementations.
      outputs = tf.pad(outputs, [[0, input_length - output_length]])
    return outputs, output_length
  elif rank == 2:
    if sequence_length is None:
      raise ValueError("sequence_length must be passed for 2D inputs")
    # Apply the 1D case to every sequence in the batch. keep_shape=True is
    # required here so map_fn produces same-shaped outputs per element.
    tokens, sequence_length = tf.map_fn(
        lambda arg: self._call(*arg, keep_shape=True),
        (tokens, sequence_length),
        back_prop=False)
    if not keep_shape:
      # Trim the batch to the longest post-noise sequence.
      tokens = tokens[:, :tf.reduce_max(sequence_length)]
    return tokens, sequence_length
  else:
    if sequence_length is None:
      raise ValueError("sequence_length must be passed for ND inputs")
    # Rank > 2: collapse the leading dimensions into one batch dimension,
    # recurse into the 2D case, then restore the original leading shape.
    original_shape = misc.shape_list(tokens)
    tokens = tf.reshape(tokens, [-1, original_shape[-1]])
    sequence_length = tf.reshape(sequence_length, [-1])
    tokens, sequence_length = self._call(
        tokens, sequence_length, keep_shape=keep_shape)
    tokens = tf.reshape(tokens, original_shape[:-1] + [-1])
    sequence_length = tf.reshape(sequence_length, original_shape[:-1])
    return tokens, sequence_length
def testToWordsWithSpacer(self, tokens, expected):
  """Checks grouping of spacer-marked subwords into words (ragged form)."""
  words = text.tokens_to_words(
      tf.constant(tokens), subword_token="▁", is_spacer=True)
  reference = tf.nest.map_structure(tf.compat.as_bytes, expected)
  self.assertAllEqual(words.to_list(), reference)
def testToWordsWithJoiner(self, tokens, expected):
  """Checks grouping of joiner-marked subwords into words (ragged form)."""
  words = text.tokens_to_words(tf.constant(tokens))
  reference = tf.nest.map_structure(tf.compat.as_bytes, expected)
  self.assertAllEqual(words.to_list(), reference)