def _unicode_decode(input, input_encoding, errors, replacement_char,
                    replace_control_characters, with_offsets):
  """Decodes each string into a sequence of codepoints."""
  input = ragged_tensor.convert_to_tensor_or_ragged_tensor(input, name="input")
  input_ndims = input.shape.ndims
  if input_ndims is None:
    raise ValueError("Rank of `input` must be statically known.")

  if input_ndims > 1:
    # Convert to a ragged tensor with ragged_rank = input_ndims - 1.
    if not ragged_tensor.is_ragged(input):
      input = ragged_conversion_ops.from_tensor(
          input, ragged_rank=input_ndims - 1)
    elif input.ragged_rank < input_ndims - 1:
      # Note: the flat_values must end up with ragged_rank
      # input_ndims - input.ragged_rank - 1 so the total ragged_rank is
      # input_ndims - 1.
      input = input.with_flat_values(
          ragged_conversion_ops.from_tensor(
              input.flat_values,
              ragged_rank=input_ndims - input.ragged_rank - 1))

  # Reshape the input to a flat vector, and apply the gen_string_ops op.
  if ragged_tensor.is_ragged(input):
    flat_input = array_ops.reshape(input.flat_values, [-1])
  else:
    flat_input = array_ops.reshape(input, [-1])

  if with_offsets:
    decode_op = gen_string_ops.unicode_decode_with_offsets
  else:
    decode_op = gen_string_ops.unicode_decode
  flat_result = decode_op(
      input=flat_input,
      input_encoding=input_encoding,
      errors=errors,
      replacement_char=replacement_char,
      replace_control_characters=replace_control_characters)

  if input_ndims == 0:
    codepoints = flat_result.char_values
    if with_offsets:
      offsets = flat_result.char_to_byte_starts
  else:
    codepoints = ragged_tensor.RaggedTensor.from_row_splits(
        flat_result.char_values, flat_result.row_splits)
    if input_ndims > 1:
      codepoints = input.with_flat_values(codepoints)
    if with_offsets:
      offsets = ragged_tensor.RaggedTensor.from_row_splits(
          flat_result.char_to_byte_starts, flat_result.row_splits)
      if input_ndims > 1:
        offsets = input.with_flat_values(offsets)

  if with_offsets:
    return codepoints, offsets
  else:
    return codepoints
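# Usage sketch: the behavior _unicode_decode backs, seen through the public
# `tf.strings.unicode_decode` API (TF 2.x names assumed; values in comments
# are illustrative).
import tensorflow as tf

# Each string decodes to a variable-length codepoint sequence, so the result
# is a RaggedTensor: [[71, 246, 246], [128522]].
codepoints = tf.strings.unicode_decode(["Göö", "😊"], input_encoding="UTF-8")

# The *_with_offsets variant also returns the byte offset at which each
# codepoint starts; 'ö' is two bytes in UTF-8, so offsets are [[0, 1, 3]].
codepoints, offsets = tf.strings.unicode_decode_with_offsets(
    ["Göö"], input_encoding="UTF-8")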
def tokenize_with_offsets(self, input, name=None):  # pylint: disable=redefined-builtin
  """Tokenizes a tensor of UTF-8 strings.

  Args:
    input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.
    name: The name argument that is passed to the op function.

  Returns:
    A tuple `(tokens, start_offsets, limit_offsets)` where:

    * `tokens`: A `RaggedTensor` of tokenized text. Its shape is the shape of
      the input tensor with an added ragged dimension for the tokens of each
      string.
    * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offsets.
    * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offsets.
  """
  with ops.name_scope(name, "SentenceTokenizer", [input, self]):
    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      # Recursively process the values of the ragged tensor.
      (tokens, starts, limits) = self.tokenize_with_offsets(
          input_tensor.flat_values)
      tokens = input_tensor.with_flat_values(tokens)
      starts = input_tensor.with_flat_values(starts)
      limits = input_tensor.with_flat_values(limits)
      return (tokens, starts, limits)
    else:
      if input_tensor.shape.ndims > 1:
        # Convert the input tensor to ragged and process it.
        return self.tokenize_with_offsets(
            ragged_conversion_ops.from_tensor(input_tensor))
      elif input_tensor.shape.ndims == 0:
        (tokens, starts, limits) = self.tokenize_with_offsets(
            array_ops.stack([input_tensor]))
        tokens = tokens.values
        starts = starts.values
        limits = limits.values
        return (tokens, starts, limits)
      else:
        # Our rank 1 tensor is the correct shape, so we can process it as
        # normal.
        (output_values, output_splits, output_offset_starts,
         output_offset_limits) = (
             gen_sentencepiece_tokenizer.
             sentencepiece_tokenize_with_offsets_op(
                 self._resource_handle, input_tensor, self.nbest_size,
                 self.alpha, self.add_bos, self.add_eos, self.reverse,
                 self.out_type))
        tokens = RaggedTensor.from_nested_row_splits(
            flat_values=output_values,
            nested_row_splits=[output_splits],
            validate=False)
        starts = RaggedTensor.from_nested_row_splits(
            flat_values=output_offset_starts,
            nested_row_splits=[output_splits],
            validate=False)
        limits = RaggedTensor.from_nested_row_splits(
            flat_values=output_offset_limits,
            nested_row_splits=[output_splits],
            validate=False)
        return (tokens, starts, limits)
def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
  """Tokenizes a tensor of UTF-8 strings on Unicode script boundaries.

  The strings are split when a change in the Unicode script is detected
  between sequential tokens. The script codes used correspond to
  International Components for Unicode (ICU) UScriptCode values. See:
  http://icu-project.org/apiref/icu4c/uscript_8h.html

  ICU-defined whitespace characters are dropped, unless the `keep_whitespace`
  option was specified at construction time.

  Args:
    input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

  Returns:
    A tuple `(tokens, start_offsets, limit_offsets)` where:

    * `tokens`: A `RaggedTensor` of tokenized text.
    * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offsets.
    * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offsets.
  """
  name = None
  with ops.name_scope(name, "UnicodeScriptTokenize", [input]):
    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      if input_tensor.flat_values.shape.ndims > 1:
        # If the flat_values of our ragged tensor is multi-dimensional, we
        # can process it separately and our output will have the same nested
        # splits as our input.
        (tokens, starts, limits) = self.tokenize_with_offsets(
            input_tensor.flat_values)
        return (input_tensor.with_flat_values(tokens),
                input_tensor.with_flat_values(starts),
                input_tensor.with_flat_values(limits))
      else:
        # Recursively process the values of the ragged tensor.
        (tokens, starts, limits) = self.tokenize_with_offsets(
            input_tensor.values)
        return (input_tensor.with_values(tokens),
                input_tensor.with_values(starts),
                input_tensor.with_values(limits))
    else:
      if input_tensor.shape.ndims > 1:
        # Convert the input tensor to ragged and process it.
        return self.tokenize_with_offsets(
            ragged_conversion_ops.from_tensor(input_tensor))
      elif input_tensor.shape.ndims == 0:
        (tokens, starts, limits) = self.tokenize_with_offsets(
            array_ops.stack([input_tensor]))
        return tokens.values, starts.values, limits.values
      else:
        # Our rank 1 tensor is the correct shape, so we can process it as
        # normal.
        return self._tokenize_with_offsets_encode_decode_wrapper(input_tensor)
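# Usage sketch for the tokenizer above, via the public `tensorflow_text`
# package (assumed installed; outputs in comments are illustrative).
import tensorflow_text as tf_text

script_tokenizer = tf_text.UnicodeScriptTokenizer()
tokens, starts, limits = script_tokenizer.tokenize_with_offsets(
    ["everything not saved will be lost."])
# tokens[0] -> [b'everything', b'not', b'saved', b'will', b'be', b'lost', b'.']
# starts[0]/limits[0] -> byte offsets where each token begins/ends.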
def _increase_ragged_rank_to(rt_input, ragged_rank):
  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
  if ragged_rank > 0:
    if not ragged_tensor.is_ragged(rt_input):
      rt_input = ragged_conversion_ops.from_tensor(rt_input)
    if rt_input.ragged_rank < ragged_rank:
      rt_input = rt_input.with_values(
          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
  return rt_input
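# Sketch of the idea behind _increase_ragged_rank_to, using only public
# `tf.RaggedTensor` constructors: wrapping `values` raises the ragged rank
# without changing the underlying values.
import tensorflow as tf

dense = tf.ones([2, 3, 4])
rt1 = tf.RaggedTensor.from_tensor(dense)                        # ragged_rank 1
rt2 = rt1.with_values(tf.RaggedTensor.from_tensor(rt1.values))  # ragged_rank 2
assert rt2.ragged_rank == 2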
def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
  """Tokenizes a tensor of UTF-8 strings on whitespaces.

  The strings are split on ICU-defined whitespace characters. These
  whitespace characters are dropped.

  Args:
    input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

  Returns:
    A tuple `(tokens, start_offsets, end_offsets)` where:

    * `tokens`: A `RaggedTensor` of tokenized text.
    * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offsets.
    * `end_offsets`: A `RaggedTensor` of the tokens' ending byte offsets.
  """
  name = None
  with ops.name_scope(name, "WhitespaceTokenize", [input]):
    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      if input_tensor.flat_values.shape.ndims > 1:
        # If the flat_values of our ragged tensor is multi-dimensional, we
        # can process it separately and our output will have the same nested
        # splits as our input.
        (tokens, starts, ends) = self.tokenize_with_offsets(
            input_tensor.flat_values)
        return (input_tensor.with_flat_values(tokens),
                input_tensor.with_flat_values(starts),
                input_tensor.with_flat_values(ends))
      else:
        # Recursively process the values of the ragged tensor.
        (tokens, starts, ends) = self.tokenize_with_offsets(
            input_tensor.values)
        return (input_tensor.with_values(tokens),
                input_tensor.with_values(starts),
                input_tensor.with_values(ends))
    else:
      if input_tensor.shape.ndims > 1:
        # Convert the input tensor to ragged and process it.
        return self.tokenize_with_offsets(
            ragged_conversion_ops.from_tensor(input_tensor))
      elif input_tensor.shape.ndims == 0:
        (tokens, starts, ends) = self.tokenize_with_offsets(
            array_ops.stack([input_tensor]))
        return tokens.values, starts.values, ends.values
      else:
        # Our rank 1 tensor is the correct shape, so we can process it as
        # normal.
        return self._whitespace_tokenize_with_offsets_encode_decode_wrapper(
            input_tensor)
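# Usage sketch via the public `tensorflow_text` API (assumed installed).
import tensorflow_text as tf_text

ws_tokenizer = tf_text.WhitespaceTokenizer()
tokens, starts, ends = ws_tokenizer.tokenize_with_offsets(
    ["What you know you can't explain"])
# tokens[0] -> [b'What', b'you', b'know', b'you', b"can't", b'explain']
# The whitespace separators themselves are dropped from the output.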
def detokenize(self, input, name=None):  # pylint: disable=redefined-builtin
  """Detokenizes tokens into preprocessed text.

  Args:
    input: A `RaggedTensor` or `Tensor` of UTF-8 string tokens with a rank of
      at least 1.
    name: The name argument that is passed to the op function.

  Returns:
    An N-1 dimensional string `Tensor` or `RaggedTensor` of the detokenized
    text.
  """
  with ops.name_scope(name, "SentenceTokenizer", [input, self]):
    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if input_tensor.shape.ndims == 0:
      raise ValueError("Rank of input_tensor must be at least 1.")
    if ragged_tensor.is_ragged(input_tensor):
      if input_tensor.flat_values.shape.ndims > 1:
        # If the flat_values of our ragged tensor is multi-dimensional, we
        # can process it separately and our output will have the same nested
        # splits as our input.
        tokens = self.detokenize(input_tensor.flat_values)
        return input_tensor.with_flat_values(tokens)
      elif input_tensor.ragged_rank > 1:
        # Recursively process the values of the ragged tensor.
        tokens = self.detokenize(input_tensor.values)
        return input_tensor.with_values(tokens)
      else:
        return gen_sentencepiece_tokenizer.sentencepiece_detokenize_op(
            self._model_resource.resource_handle, input_tensor.flat_values,
            input_tensor.row_splits, self.add_bos, self.add_eos, self.reverse)
    else:
      if input_tensor.shape.ndims > 1:
        # Convert the input tensor to ragged and process it.
        return self.detokenize(
            ragged_conversion_ops.from_tensor(input_tensor))
      else:
        tokens = self.detokenize(array_ops.stack([input_tensor]))
        return array_ops.reshape(tokens, [])
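# Round-trip sketch for tokenize/detokenize. The model path is hypothetical;
# any serialized SentencePiece model proto works.
import tensorflow as tf
import tensorflow_text as tf_text

sp_model = tf.io.gfile.GFile("m.model", "rb").read()  # hypothetical file
sp_tokenizer = tf_text.SentencepieceTokenizer(model=sp_model)

ids = sp_tokenizer.tokenize(["hello world"])  # RaggedTensor of subword ids
text = sp_tokenizer.detokenize(ids)           # 1-D string Tensor: the inputs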
def _ragged_tile_axis(rt_input, axis, repeats):
  """Tile a dimension of a RaggedTensor to match a ragged shape."""
  assert axis > 0  # Outermost dimension may not be ragged.

  if not ragged_tensor.is_ragged(rt_input):
    rt_input = ragged_conversion_ops.from_tensor(rt_input, ragged_rank=1)

  if axis > 1:
    return rt_input.with_values(
        _ragged_tile_axis(rt_input.values, axis - 1, repeats))
  else:
    src_row_splits = rt_input.nested_row_splits
    src_row_lengths = rt_input.nested_row_lengths()
    splits = src_row_splits[0]

    dst_row_lengths = [repeats]
    for i in range(1, len(src_row_lengths)):
      dst_row_lengths.append(
          ragged_util.repeat_ranges(src_row_lengths[i], splits, repeats))
      splits = array_ops.gather(src_row_splits[i], splits)
    dst_values = ragged_util.repeat_ranges(rt_input.flat_values, splits,
                                           repeats)
    return ragged_tensor.RaggedTensor.from_nested_row_lengths(
        dst_values, dst_row_lengths)
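# Intuition sketch: tiling a ragged axis repeats each row's values within
# that row. For repeats=2 on axis 1, concatenating a RaggedTensor with
# itself along axis 1 produces the same result (illustrative only; the
# helper above also handles deeper nesting and per-row repeat counts).
import tensorflow as tf

rt = tf.ragged.constant([[1, 2], [3]])
tiled = tf.concat([rt, rt], axis=1)  # [[1, 2, 1, 2], [3, 3]]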
def test_passing_simple_from_dense(self, input_list, squeeze_ranks=None):
  dt = constant_op.constant(input_list)
  rt = ragged_conversion_ops.from_tensor(dt)
  rt_s = ragged_squeeze_op.squeeze(rt, squeeze_ranks)
  dt_s = array_ops.squeeze(dt, squeeze_ranks)
  self.assertRaggedEqual(ragged_conversion_ops.to_tensor(rt_s), dt_s)
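# The dense half of the parity property this test asserts (illustrative
# values): squeezing the same ranks of a dense tensor and of its ragged view
# must produce equal results.
import tensorflow as tf

dt = tf.constant([[[1], [2]]])   # shape [1, 2, 1]
dt_s = tf.squeeze(dt, [0, 2])    # shape [2]: [1, 2]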
def boolean_mask(data, mask, keepdims=False, name=None):
  """Applies a boolean mask to `data`.

  Returns a potentially ragged tensor that is formed by retaining the elements
  in `data` where the corresponding value in `mask` is `True`.

  If `keepdims` is true then outer dimensions (corresponding to the `mask`
  dimensions) are preserved, and:

  * `output[a1...aA, i, b1...bB] = data[a1...aA, j, b1...bB]`

    Where `j` is the `i`th `True` entry of `mask[a1...aA]`.

  If `keepdims` is false, then the outer dimensions are collapsed (similar to
  the behavior of `tf.boolean_mask`), and:

  * `output[i, b1...bB] = data[a1...aA, b1...bB]`

    Where `(a1...aA)` is the `i`th `True` entry of `mask` (in row-major
    order).

  Args:
    data: A potentially ragged tensor.
    mask: A potentially ragged boolean tensor.  `mask`'s shape must be a
      prefix of `data`'s shape.  `rank(mask)` must be known statically.
    keepdims: Whether to preserve the outer dimensions (`keepdims=True`) or
      flatten them (`keepdims=False`).
    name: A name prefix for the returned tensor (optional).

  Returns:
    A potentially ragged tensor that is formed by retaining the elements in
    `data` where the corresponding value in `mask` is `True`.

    If `keepdims` is false:

    * `rank(output) = rank(data) - rank(mask) + 1`.
    * `output.ragged_rank = max(data.ragged_rank - rank(mask) + 1, 0)`.

    If `keepdims` is true:

    * `rank(output) = rank(data)`.
    * `output.ragged_rank = max(data.ragged_rank, rank(mask) - 1)`.

  Raises:
    ValueError: if `rank(mask)` is not known statically; or if `mask.shape` is
      not a prefix of `data.shape`.

  #### Examples:
    ```python
    >>> # Aliases for True & False so data and mask line up.
    >>> T, F = (True, False)

    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Flatten outer dims.
    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
    ...     keepdims=False).tolist()
    [1, 3, 7]

    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Preserve outer dims.
    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
    ...     keepdims=True).tolist()
    [[1, 3], [], [7]]

    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Flatten outer dims.
    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
    ...     keepdims=False).tolist()
    [3, 5, 6]

    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Preserve outer dims.
    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
    ...     keepdims=True).tolist()
    [[3], [], [5, 6]]

    >>> tf.ragged.boolean_mask(  # Mask rows of a 2D RaggedTensor.
    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
    ...     tf.ragged.constant([True, False, True]),
    ...     keepdims=True).tolist()
    [[1, 2, 3], [5, 6]]
    ```
  """
  with ops.name_scope(name, 'RaggedMask', [data, mask]):
    # Convert inputs to tensors.
    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
    mask = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        mask, dtypes.bool, name='mask')

    # Get static rank of mask.
    if mask.shape.ndims is None:
      raise ValueError('mask.shape.ndims must be known statically.')
    elif mask.shape.ndims == 0:
      raise ValueError('mask cannot be scalar.')

    # If mask is ragged, then recurse with a non-ragged mask.
    if ragged_tensor.is_ragged(mask):
      if not ragged_tensor.is_ragged(data):
        data = ragged_conversion_ops.from_tensor(
            data, ragged_rank=mask.ragged_rank)
      # Check that mask.nested_row_splits is a prefix of
      # data.nested_row_splits.
      splits_list = [
          mask.nested_row_splits, data.nested_row_splits[:mask.ragged_rank]
      ]
      with ops.control_dependencies(
          ragged_util.assert_splits_match(splits_list)):
        # Strip off ragged `splits` until `mask` is non-ragged.  Keep the
        # splits that we strip off in `splits`, so we can add them back on
        # after we recursively mask the non-ragged data.
        splits = []
        while ragged_tensor.is_ragged(mask):
          if mask.shape.ndims > 2:
            splits.append(mask.row_splits)
          else:
            # Count the number of True mask values in each row to find the
            # lengths of the filtered rows; then convert to splits.
            int_mask = ragged_functional_ops.map_flat_values(
                math_ops.cast, mask, dtype=dtypes.int64)
            masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
            splits.append(ragged_util.lengths_to_splits(masked_row_lengths))
          mask = mask.values
          data = data.values

        # Recursively apply the nested non-ragged mask to the nested data.
        masked_values = boolean_mask(data, mask, keepdims)

        # Add the ragged `splits` back to the result.
        if keepdims:
          masked_values = ragged_tensor.RaggedTensor.from_nested_row_splits(
              masked_values, splits)

        return masked_values

    # If mask is non-ragged and has rank 1, and data is ragged, then build a
    # ragged tensor with the indicated rows.
    elif ragged_tensor.is_ragged(data) and mask.shape.ndims == 1:
      # Get the masked splits: first get the length of each row, then filter
      # out the rows that we are deleting, and convert that filtered set of
      # masks back to a splits tensor.
      lengths = data.row_lengths()
      masked_lengths = array_ops.boolean_mask(lengths, mask)
      masked_splits = ragged_util.lengths_to_splits(masked_lengths)

      # Get the masked values: first get row ids corresponding to each
      # value, then use tf.gather to build a boolean mask that's false for
      # values that come from rows that we are deleting, and use that mask to
      # construct the masked values tensor.
      segment_ids = segment_id_ops.row_splits_to_segment_ids(data.row_splits)
      segment_mask = array_ops.gather(mask, segment_ids)
      masked_values = boolean_mask(data.values, segment_mask, keepdims=False)

      return ragged_tensor.RaggedTensor.from_row_splits(masked_values,
                                                        masked_splits)

    # If mask is non-ragged and has rank>1, then convert it to be ragged,
    # with a ragged rank matching data.
    if ragged_tensor.is_ragged(data):
      mask = ragged_conversion_ops.from_tensor(
          mask, ragged_rank=min(data.ragged_rank, mask.shape.ndims - 1))
      return boolean_mask(data, mask, keepdims)

    # Otherwise, data and mask are both `Tensor`s.
    else:
      # Apply `boolean_mask` to get the masked values.
      masked_values = array_ops.boolean_mask(data, mask)

      if mask.shape.ndims >= 2 and keepdims:
        # Add the innermost ragged dimension.  For each innermost cell, get
        # the number of values it contains.  Then flatten that to get a list
        # of cell lengths, and convert it to splits.  Finally, combine the
        # splits and values to get the innermost ragged tensor.
        masked_lengths = math_ops.count_nonzero(mask, axis=-1)
        flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
        masked_values = ragged_tensor.RaggedTensor.from_row_lengths(
            masked_values, flattened_masked_lengths)

        # Wrap remaining ragged dimensions.
        if mask.shape.ndims > 2 and keepdims:
          mask_shape = array_ops.shape(mask, out_type=dtypes.int64)
          split_size = math_ops.cumprod(mask_shape) + 1
          for dim in range(mask.shape.ndims - 3, -1, -1):
            elt_size = mask_shape[dim + 1]
            masked_splits = math_ops.range(split_size[dim]) * elt_size
            masked_values = ragged_tensor.RaggedTensor.from_row_splits(
                masked_values, masked_splits)

      return masked_values
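# Usage sketch for boolean_mask. Note: in current TF 2.x the public
# `tf.ragged.boolean_mask` always preserves the outer dimensions (the
# `keepdims=True` behavior above) and has no `keepdims` argument.
import tensorflow as tf

data = tf.ragged.constant([[1, 2, 3], [4], [5, 6]])
mask = tf.ragged.constant([[False, False, True], [False], [True, True]])
tf.ragged.boolean_mask(data, mask)  # [[3], [], [5, 6]]

# A rank-1 mask selects whole rows instead of individual elements.
tf.ragged.boolean_mask(data, tf.constant([True, False, True]))
# [[1, 2, 3], [5, 6]]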
def unicode_encode(input,
                   output_encoding,
                   errors="replace",
                   replacement_char=65533,
                   name=None):
  r"""Encodes each sequence of Unicode code points in `input` into a string.

  `result[i1...iN]` is the string formed by concatenating the Unicode
  codepoints `input[i1...iN, :]`, encoded using `output_encoding`.

  Args:
    input: An `N+1` dimensional potentially ragged integer tensor with shape
      `[D1...DN, num_chars]`.
    output_encoding: Unicode encoding that should be used to encode each
      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
    errors: Specifies the response when an invalid codepoint is encountered
      (optional). One of:
            * `'replace'`: Replace invalid codepoint with the
              `replacement_char`. (default)
            * `'ignore'`: Skip invalid codepoints.
            * `'strict'`: Raise an exception for any invalid codepoint.
    replacement_char: The replacement character codepoint to be used in place
      of any invalid input when `errors='replace'`. Any valid unicode
      codepoint may be used. The default value is the default unicode
      replacement character, which is 65533 (U+FFFD).
    name: A name for the operation (optional).

  Returns:
    An `N` dimensional `string` tensor with shape `[D1...DN]`.

  #### Example:
    ```python
    >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
    >>> unicode_encode(input, 'UTF-8')
    ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
    ```
  """
  with ops.name_scope(name, "UnicodeEncode", [input]):
    input_tensor = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      if input_tensor.inner_values.shape.ndims > 1:
        # If the inner_values of our ragged tensor is multi-dimensional, we
        # can process it separately and our output will have the same nested
        # splits as our input.
        return input_tensor.with_inner_values(
            unicode_encode(input_tensor.inner_values, output_encoding, errors,
                           replacement_char))
      elif input_tensor.ragged_rank > 1:
        # Recursively process the values of the ragged tensor.
        return input_tensor.with_values(
            unicode_encode(input_tensor.values, output_encoding, errors,
                           replacement_char))
      else:
        # Our ragged tensor is of the correct shape (rank 1 inner_values
        # tensor with ragged_rank of 1) so we can process it as normal.
        return gen_string_ops.unicode_encode(
            input_values=input_tensor.values,
            input_splits=input_tensor.row_splits,
            output_encoding=output_encoding,
            errors=errors,
            replacement_char=replacement_char)
    else:
      if input_tensor.shape.ndims == 2:
        # The input tensor is of the correct 2-D shape, it's just not ragged.
        return unicode_encode(
            ragged_conversion_ops.from_tensor(input_tensor), output_encoding,
            errors, replacement_char)
      elif input_tensor.shape.ndims > 2:
        # We need to initially flatten the input tensor to 2-D, and then can
        # reshape the output of our processed flattened tensor.
        flat_input_tensor = array_ops.reshape(
            input_tensor,
            array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
        flat_output_tensor = unicode_encode(flat_input_tensor, output_encoding,
                                            errors, replacement_char)
        return array_ops.reshape(flat_output_tensor, input_tensor.shape[:-1])
      elif input_tensor.shape.ndims == 0:
        raise ValueError("input_tensor's rank must be at least 1.")
      else:
        # Our input tensor is rank 1, so we create a ragged tensor with an
        # added dimension to create the correct input shape & type, and then
        # remove the additional dimension from the output and return the
        # string scalar.
        ragged_input_tensor = ragged_factory_ops.from_row_splits(
            input_tensor,
            array_ops.stack(
                [0, array_ops.shape(input_tensor, out_type=dtypes.int64)[0]]))
        output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
                                       errors, replacement_char)
        return array_ops.reshape(output_tensor, [])
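# Usage sketch: unicode_encode is the inverse of unicode_decode. Public API
# names from TF 2.x assumed.
import tensorflow as tf

codepoints = tf.ragged.constant([[71, 246, 246], [128522]])
encoded = tf.strings.unicode_encode(codepoints, output_encoding="UTF-8")
# [b'G\xc3\xb6\xc3\xb6', b'\xf0\x9f\x98\x8a'] -- i.e. 'Göö' and the emoji.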
def batch_gather(params, indices, name=None):
  """Gathers slices from `params` according to `indices` with batch dims.

  This operation is similar to `gather`, but it assumes that the leading `N`
  dimensions of `indices` and `params` are batch dimensions, and performs a
  gather within each batch.  In particular, when using this operation with `N`
  batch dimensions `B1...BN`:

  * `indices` has shape `[B1...BN, I]`
  * `params` has shape `[B1...BN, P1...PM]`.
  * `result` has shape `[B1...BN, I, P2...PM]`.
  * `result[b1...bN, i, p2...pM] =
    params[b1...bN, indices[b1...bN, i], p2...pM]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]`
      (`N>=0`, `M>0`).
    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.

  #### Example:
    ```python
    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
    >>> tf.batch_gather(params, indices)
    [['b', 'c', 'a'], [], [], ['e', 'e']]
    ```
  """
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.batch_gather(params, indices, name)

  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    indices_ndims = indices.shape.ndims
    if indices_ndims is None:
      raise ValueError(
          'batch_gather does not allow indices with unknown shape.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')

    if ragged_tensor.is_ragged(indices):
      # If the outermost ragged dimension is a batch dimension, recurse.
      if indices_ndims > 2:
        if not ragged_tensor.is_ragged(params):
          raise ValueError('batch shape from indices does '
                           'not match params shape')
        checks = [
            check_ops.assert_equal(params.row_splits, indices.row_splits)
        ]
        with ops.control_dependencies(checks):
          return ragged_tensor.RaggedTensor.from_row_splits(
              batch_gather(params.values, indices.values), indices.row_splits)

      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
      else:
        # Ensure that `params` is ragged and has at least 2 dimensions.
        if not ragged_tensor.is_ragged(params):
          if params.shape.ndims is not None and params.shape.ndims < 2:
            raise ValueError('batch shape from indices does '
                             'not match params shape')
          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)

        # Adjust indices from within-batch to global (in params.values), and
        # then use ragged.gather to gather them.
        num_indices = indices.row_lengths()
        params_starts = params.row_starts()
        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
        return ragged_tensor.RaggedTensor.from_row_splits(
            ragged_gather_ops.gather(params.values, adjusted_index_values),
            indices.row_splits)

    else:  # params is a RaggedTensor and indices is a Tensor.
      if indices_ndims == 1:
        return ragged_gather_ops.gather(params, indices)
      elif indices_ndims == 2:
        # Adjust indices from batch-local to global (in params.values).
        adjustments = array_ops.expand_dims(params.row_starts(), 1)
        adjusted_indices = math_ops.to_int64(indices) + adjustments
        return ragged_gather_ops.gather(params.values, adjusted_indices)
      else:
        raise ValueError('batch shape from indices does not match params '
                         'shape')
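# Usage sketch. `tf.batch_gather` is the TF 1.x entry point; in TF 2.x the
# same ragged dispatch is reached with `tf.gather(..., batch_dims=1)`.
import tensorflow as tf

params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
result = tf.gather(params, indices, batch_dims=1)
# [[b'b', b'c', b'a'], [], [], [b'e', b'e']]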
def _broadcast_to_ragged_shape(rt_input, dst_shape, broadcast_inner_dimensions):
  """Broadcasts rt_input to the ragged shape `dst_shape`."""
  # dst_shape's rank and ragged_rank must be greater than or equal to
  # rt_input's.
  if rt_input.shape.ndims is None or dst_shape.rank is None:
    raise ValueError('Unable to broadcast: unknown rank')
  if rt_input.shape.ndims > dst_shape.rank:
    raise ValueError('Incompatible with shape: rank mismatch')
  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
      rt_input.ragged_rank >= dst_shape.num_partitioned_dimensions):
    raise ValueError('Incompatible with shape: ragged rank mismatch')

  src_shape = RaggedTensorDynamicShape.from_tensor(rt_input)
  src_shape = src_shape.broadcast_to_rank(dst_shape.rank)

  # Add dimensions to rt_input so its rank and ragged_rank matches dst_shape.
  if dst_shape.rank > rt_input.shape.ndims:
    if rt_input.shape.ndims < dst_shape.num_inner_dimensions + 1:
      rt_input = array_ops.reshape(
          rt_input, array_ops.concat([[-1], dst_shape.inner_dim_sizes],
                                     axis=0))
    for _ in range(dst_shape.rank - rt_input.shape.ndims):
      if ragged_tensor.is_ragged(rt_input):
        nrows = rt_input.nrows()
      else:
        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])

  # Add ragged dimensions to match dst_shape.
  if ragged_tensor.is_ragged(rt_input):
    inner_rank_diff = (
        rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
    if inner_rank_diff > 0:
      rt_input = rt_input.with_flat_values(
          ragged_conversion_ops.from_tensor(
              rt_input.flat_values, ragged_rank=inner_rank_diff))
  else:
    rt_input = ragged_conversion_ops.from_tensor(
        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)

  # Do broadcasting for any dimensions that will remain uniform.  We can do
  # these all at once, since they're independent of one another.
  multiples = [1] * dst_shape.rank
  for axis in range(dst_shape.num_partitioned_dimensions):
    if not src_shape.is_ragged(axis) and not dst_shape.is_ragged(axis):
      src_size = src_shape.dimension_size(axis)
      dst_size = dst_shape.dimension_size(axis)
      if ((tensor_util.constant_value(src_size) in (1, None)) and
          (tensor_util.constant_value(dst_size) != 1)):
        multiples[axis] = array_ops.where(
            math_ops.equal(src_size, 1), dst_size, 1)
  if not all(isinstance(v, int) and v == 1 for v in multiples):
    multiples = array_ops.stack(multiples, axis=0)
    rt_input = ragged_array_ops.tile(rt_input, multiples)

  if broadcast_inner_dimensions:
    rt_input = rt_input.with_flat_values(
        array_ops.reshape(
            rt_input.flat_values,
            array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0)))

  # Do broadcasting for dimensions that become ragged.  We must do these from
  # outermost to innermost.
  for axis in range(dst_shape.num_partitioned_dimensions):
    if not src_shape.is_ragged(axis) and dst_shape.is_ragged(axis):
      dst_size = dst_shape.dimension_size(axis)
      rt_input = _ragged_tile_axis(rt_input, axis, dst_size)

  return rt_input
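# The broadcasting this helper implements is what lets elementwise ops mix
# dense and ragged operands; a small sketch with public ops.
import tensorflow as tf

rt = tf.ragged.constant([[1, 2, 3], [4]])
a = rt + 10                         # scalar:        [[11, 12, 13], [14]]
b = rt + tf.constant([[10], [20]])  # [2, 1] column: [[11, 12, 13], [24]]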
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
  """Helper function to concatenate or stack ragged tensors.

  Args:
    rt_inputs: A list of RaggedTensors or Tensors to combine.
    axis: The axis along which to concatenate or stack.
    stack_values: A boolean -- if true, then stack values; otherwise,
      concatenate them.

  Returns:
    A RaggedTensor.

  Raises:
    ValueError: If rt_inputs is empty, or if axis is out of range.
  """
  # Validate parameters.
  if not rt_inputs:
    raise ValueError('rt_inputs may not be empty.')

  # Convert input tensors.
  rt_inputs = [
      ragged_tensor.convert_to_tensor_or_ragged_tensor(
          rt_input, name='rt_input') for rt_input in rt_inputs
  ]

  # Special case: if there's only one input, then return it as-is.
  if len(rt_inputs) == 1:
    if stack_values:
      return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
    else:
      return rt_inputs[0]

  # Check the rank (number of dimensions) of the input tensors.
  ndims = None
  for rt in rt_inputs:
    if ndims is None:
      ndims = rt.shape.ndims
    else:
      rt.shape.assert_has_rank(ndims)

  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
  axis = ragged_util.get_positive_axis(axis, out_ndims)

  # If all the inputs are Tensors, and we're combining the final dimension,
  # then we can delegate to the tf.stack/tf.concat operation, and return a
  # Tensor.
  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
      if stack_values:
        return array_ops.stack(rt_inputs, axis)
      else:
        return array_ops.concat(rt_inputs, axis)

  # Convert any Tensor inputs to RaggedTensors.  This makes it
  # possible to concatenate Tensors and RaggedTensors together.
  for i in range(len(rt_inputs)):
    if not ragged_tensor.is_ragged(rt_inputs[i]):
      rt_inputs[i] = ragged_conversion_ops.from_tensor(
          rt_inputs[i], ragged_rank=1)

  # Convert the input tensors to all have the same ragged_rank.
  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]

  if axis == 0:
    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
  elif axis == 1:
    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
  else:  # axis > 1: recurse.
    values = [rt.values for rt in rt_inputs]
    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
      return ragged_tensor.RaggedTensor.from_row_splits(
          _ragged_stack_concat_helper(values, axis - 1, stack_values),
          splits[0][0])
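# The public entry points for this helper are the ragged-aware `tf.concat`
# and `tf.stack` dispatches; a sketch of the combining modes.
import tensorflow as tf

a = tf.ragged.constant([[1, 2], [3]])
b = tf.ragged.constant([[4], [5, 6]])
c0 = tf.concat([a, b], axis=0)  # [[1, 2], [3], [4], [5, 6]]
c1 = tf.concat([a, b], axis=1)  # [[1, 2, 4], [3, 5, 6]]
s0 = tf.stack([a, b], axis=0)   # [[[1, 2], [3]], [[4], [5, 6]]]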
def gather_nd(params, indices, name=None):
  """Gather slices from `params` using `n`-dimensional indices.

  This operation is similar to `gather`, but it uses the innermost dimension
  of `indices` to define a slice into `params`.  In particular, if:

  * `indices` has shape `[A1...AN, I]`
  * `params` has shape `[B1...BM]`

  Then:

  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
  * `result[a1...aN] = params[indices[a1...aN, :]]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BM]`.
    indices: A potentially ragged tensor with shape `[A1...AN, I]`.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.

  #### Examples:
    ```python
    >>> params = tf.ragged.constant_value(
    ...     [ [ ['000', '001'], ['010'              ]          ],
    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
    ...       [ [             ], ['210'              ]          ] ])

    >>> # Gather 2D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2], [0]])
    [ [ [             ], ['210'] ],
      [ ['000', '001'], ['010'] ] ]

    >>> # Gather 1D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
    [['210'], ['000', '001']]

    >>> # Gather scalars from a 3D tensor
    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
    ['001', '112']
    ```
  """
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.gather_nd(params, indices, name)

  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    indices_shape = indices.shape
    indices_ndims = indices_shape.ndims
    if indices_ndims is None:
      raise ValueError('indices.rank must be statically known.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')
    if (ragged_tensor.is_ragged(indices) and
        indices_ndims == indices.ragged_rank + 1):
      raise ValueError('The innermost dimension of indices may not be ragged')

    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
    # that each index slices into.
    index_size = tensor_shape.dimension_value(indices_shape[-1])
    if index_size is None:
      raise ValueError('indices.shape[-1] must be statically known.')

    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
    # dense, then we convert it to ragged before recursing, and then convert
    # the result back to `dense` if appropriate.
    if indices_ndims > 2:
      indices_is_dense = not ragged_tensor.is_ragged(indices)
      if indices_is_dense:
        indices = ragged_conversion_ops.from_tensor(
            indices, ragged_rank=indices_ndims - 2)
      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
      if (indices_is_dense and ragged_tensor.is_ragged(result) and
          result.ragged_rank == indices_ndims - 2):
        result = ragged_conversion_ops.to_tensor(result)
      return result

    # indices_ndims <= 2, and the innermost dimension of indices may not be
    # ragged, so `indices` must not be ragged.
    assert not ragged_tensor.is_ragged(indices)
    assert ragged_tensor.is_ragged(params)

    # Handle corner case: An empty index tuple selects the entire `params`
    # value.  So if `index_size` is zero, then tile `params`.
    if index_size == 0:
      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
      for dim in range(indices_ndims - 1):
        params = ragged_array_ops.expand_dims(params, axis=0)
      multiples = array_ops.concat([
          array_ops.shape(indices)[:-1],
          array_ops.ones([params_ndims], dtypes.int32)
      ], axis=0)
      return ragged_array_ops.tile(params, multiples)

    # When index_size=1, we can just flatten the index tuples and use gather.
    elif index_size == 1:
      flattened_index_tuples = array_ops.reshape(indices, [-1])
      return gather(params, flattened_index_tuples)

    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
    # Flatten both the index tuples and the params, such that the flattened
    # index tuples point to the correct values in the flattened params; and
    # then use ragged.gather on the flattened index tuples & params.
    else:
      indices = math_ops.cast(indices, dtypes.int64)

      # Flatten the outermost 2 dimensions of the index tuples & params.
      flattened_index_tuples = array_ops.gather(params.row_splits,
                                                indices[..., 0])
      flattened_index_tuples += indices[..., 1]
      flattened_params = params.values

      # Flatten any remaining dimensions.
      for dim in range(2, index_size):
        if not ragged_tensor.is_ragged(flattened_params):
          flattened_index_tuples = array_ops.expand_dims(
              flattened_index_tuples, axis=1)
          flattened_index_tuples = array_ops.concat(
              [flattened_index_tuples, indices[..., dim:]], axis=1)
          return array_ops.gather_nd(flattened_params, flattened_index_tuples)

        flattened_index_tuples = array_ops.gather(
            flattened_params.row_starts(), flattened_index_tuples)
        flattened_index_tuples += indices[..., dim]
        flattened_params = flattened_params.values

      # Gather using the flattened index tuples and params.
      return gather(flattened_params, flattened_index_tuples)
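
# Illustration only (not part of the library): a minimal pure-Python sketch of
# the row-splits arithmetic that `gather_nd` uses above to turn a 2-element
# index tuple `[i0, i1]` into a single index on `params.values`.  The names
# `_toy_flatten_2d_index`, `row_splits`, and `index_tuples` are hypothetical.
def _toy_flatten_2d_index(row_splits, index_tuples):
  """Maps each [i0, i1] index pair to a flat index into the ragged values."""
  # Row i0 starts at row_splits[i0] in the flattened values; i1 offsets
  # into that row.
  return [row_splits[i0] + i1 for (i0, i1) in index_tuples]

# For the ragged value [[a, b], [c], [d, e, f]] we have
# values = [a, b, c, d, e, f] and row_splits = [0, 2, 3, 6], so the index
# tuple [2, 1] selects values[3 + 1] == e:
#   _toy_flatten_2d_index([0, 2, 3, 6], [[2, 1]])  # -> [4]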
def tokenize_with_offsets(self, input, name=None):  # pylint: disable=redefined-builtin
  """Tokenizes a tensor of UTF-8 strings.

  Args:
    input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.
    name: The name argument that is passed to the op function.

  Returns:
    A tuple `(tokens, start_offsets, end_offsets)` where:

    * `tokens` is an N+1-dimensional UTF-8 string or integer `Tensor` or
      `RaggedTensor`.
    * `start_offsets` is an N+1-dimensional integer `Tensor` or
      `RaggedTensor` containing the starting indices of each token (byte
      indices for input strings).
    * `end_offsets` is an N+1-dimensional integer `Tensor` or
      `RaggedTensor` containing the exclusive ending indices of each token
      (byte indices for input strings).
  """
  with ops.name_scope(name, "SentenceTokenizer", [input, self]):
    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      # Recursively process the values of the ragged tensor.
      (tokens, starts, ends) = self.tokenize_with_offsets(
          input_tensor.flat_values)
      tokens = input_tensor.with_flat_values(tokens)
      starts = input_tensor.with_flat_values(starts)
      ends = input_tensor.with_flat_values(ends)
      return (tokens, starts, ends)
    else:
      if input_tensor.shape.ndims > 1:
        # Convert the input tensor to ragged and process it.
        return self.tokenize_with_offsets(
            ragged_conversion_ops.from_tensor(input_tensor))
      elif input_tensor.shape.ndims == 0:
        (tokens, starts, ends) = self.tokenize_with_offsets(
            array_ops.stack([input_tensor]))
        tokens = tokens.values
        starts = starts.values
        ends = ends.values
        return (tokens, starts, ends)
      else:
        # Our rank 1 tensor is the correct shape, so we can process it as
        # normal.
        (output_values, output_splits, output_offset_starts,
         output_offset_ends) = (
             gen_sentencepiece_tokenizer
             .sentencepiece_tokenize_with_offsets_op(
                 self._model_resource.resource_handle, input_tensor,
                 self.nbest_size, self.alpha, self.add_bos, self.add_eos,
                 self.reverse, self.enable_sampling, self.out_type))
        tokens = RaggedTensor.from_nested_row_splits(
            flat_values=output_values,
            nested_row_splits=[output_splits],
            validate=False)
        starts = RaggedTensor.from_nested_row_splits(
            flat_values=output_offset_starts,
            nested_row_splits=[output_splits],
            validate=False)
        ends = RaggedTensor.from_nested_row_splits(
            flat_values=output_offset_ends,
            nested_row_splits=[output_splits],
            validate=False)
        return (tokens, starts, ends)
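
# Illustration only: a pure-Python sketch of how the (start, end) byte offsets
# returned by `tokenize_with_offsets` relate to the input string.  `ends` is
# exclusive, so `input_bytes[start:end]` recovers each token's source span.
# The name `_toy_spans_from_offsets` is hypothetical.
def _toy_spans_from_offsets(input_bytes, starts, ends):
  """Recovers each token's byte span from its (start, end) offset pair."""
  return [input_bytes[s:e] for (s, e) in zip(starts, ends)]

# _toy_spans_from_offsets(b"hello world", [0, 6], [5, 11])
#   -> [b'hello', b'world']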
def find_source_offsets(offsets_map, input_offsets, name=None):
  """Maps the input post-normalized string offsets to pre-normalized offsets.

  Returns the source (i.e. pre-normalized) string offsets mapped from the
  input post-normalized string offsets using the input offsets_map, which is
  an output from the `normalize_utf8_with_offsets_map` op. offsets_map can be
  indexed or sliced along with the input_offsets.

  Example usage:

  >>> post_normalized_str, offsets_map = normalize_utf8_with_offsets_map(
  ...     ["株式会社", "ＫＡＤＯＫＡＷＡ"])
  >>> find_source_offsets(offsets_map, [[0, 1, 2], [0, 1, 2]])
  <tf.Tensor: shape=(2, 3), dtype=int64, numpy=array([[0, 1, 2], [0, 3, 6]])>
  >>> find_source_offsets(offsets_map[1], [[0, 1, 2]])  # indexed offsets_map
  <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[0, 3, 6]])>

  Args:
    offsets_map: A `Tensor` or `RaggedTensor` of type `variant`, used to map
      the post-normalized string offsets to pre-normalized string offsets.
      offsets_map is an output from the `normalize_utf8_with_offsets_map`
      function.
    input_offsets: A `Tensor` or `RaggedTensor` of type int64 representing
      the post-normalized string offsets.
    name: The name for this op (optional).

  Returns:
    results: A `Tensor` or `RaggedTensor` of type int64, with pre-normalized
      string offsets.
  """
  with ops.name_scope(name, "FindSourceOffsets",
                      [offsets_map, input_offsets]):
    offsets_map_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        offsets_map, dtype=dtypes.variant)
    input_offsets_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input_offsets, dtype=dtypes.int64)

    if ragged_tensor.is_ragged(input_offsets_tensor):
      if ragged_tensor.is_ragged(offsets_map_tensor):
        offsets_map_values = offsets_map_tensor.flat_values
      else:
        offsets_map_values = array_ops.reshape(offsets_map_tensor, [-1])
      output_values = gen_normalize_ops.find_source_offsets(
          offsets_map=offsets_map_values,
          input_offsets_values=input_offsets_tensor.flat_values,
          input_offsets_splits=input_offsets_tensor.nested_row_splits[-1])
      return input_offsets_tensor.with_flat_values(output_values)
    else:
      if input_offsets_tensor.shape.ndims > 1:
        output_offsets = find_source_offsets(
            offsets_map,
            ragged_conversion_ops.from_tensor(
                input_offsets_tensor,
                ragged_rank=input_offsets_tensor.shape.ndims - 1))
        return ragged_conversion_ops.to_tensor(output_offsets)
      elif input_offsets_tensor.shape.ndims == 0:
        output_offsets = find_source_offsets(
            offsets_map, array_ops.expand_dims(input_offsets_tensor, 0))
        return output_offsets[0]
      else:
        output_offsets = find_source_offsets(
            offsets_map, array_ops.expand_dims(input_offsets_tensor, 0))
        return array_ops.squeeze(output_offsets, [0])
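
# Illustration only: a toy model of the offsets map consumed above.  The real
# map lives in an opaque `variant` tensor produced by
# `normalize_utf8_with_offsets_map`; here it is a hypothetical dense list
# where entry i holds the pre-normalization byte offset for post-normalization
# byte offset i.  The name `_toy_find_source_offsets` is hypothetical.
def _toy_find_source_offsets(offsets_map, input_offsets):
  """Looks post-normalized offsets up in a pre-normalized offset table."""
  return [offsets_map[i] for i in input_offsets]

# Fullwidth "ＫＡＤＯＫＡＷＡ" normalizes to ASCII "KADOKAWA": each source
# character is 3 UTF-8 bytes and each normalized character is 1 byte, so the
# map begins [0, 3, 6, ...] and post-normalized offsets [0, 1, 2] map back to
# [0, 3, 6], matching the docstring example above:
#   _toy_find_source_offsets([0, 3, 6, 9, 12, 15, 18, 21], [0, 1, 2])
#   -> [0, 3, 6]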
def _broadcast_to_ragged_shape(rt_input, dst_shape,
                               broadcast_inner_dimensions):
  """Broadcasts rt_input to the ragged shape `dst_shape`."""
  # dst_shape's rank and ragged_rank must be greater than or equal to
  # rt_input's.
  if rt_input.shape.ndims is None or dst_shape.rank is None:
    raise ValueError('Unable to broadcast: unknown rank')
  if rt_input.shape.ndims > dst_shape.rank:
    raise ValueError('Incompatible with shape: rank mismatch')
  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
      rt_input.ragged_rank >= dst_shape.num_partitioned_dimensions):
    raise ValueError('Incompatible with shape: ragged rank mismatch')

  src_shape = RaggedTensorDynamicShape.from_tensor(rt_input)
  src_shape = src_shape.broadcast_to_rank(dst_shape.rank)

  # Add dimensions to rt_input so its rank and ragged_rank matches dst_shape.
  if dst_shape.rank > rt_input.shape.ndims:
    if rt_input.shape.ndims < dst_shape.num_inner_dimensions + 1:
      rt_input = array_ops.reshape(
          rt_input, array_ops.concat([[-1], dst_shape.inner_dim_sizes],
                                     axis=0))
    for _ in range(dst_shape.rank - rt_input.shape.ndims):
      if ragged_tensor.is_ragged(rt_input):
        nrows = rt_input.nrows()
      else:
        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])

  # Add ragged dimensions to match dst_shape.
  if ragged_tensor.is_ragged(rt_input):
    inner_rank_diff = (
        rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
    if inner_rank_diff > 0:
      rt_input = rt_input.with_flat_values(
          ragged_conversion_ops.from_tensor(
              rt_input.flat_values, ragged_rank=inner_rank_diff))
  else:
    rt_input = ragged_conversion_ops.from_tensor(
        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)

  # Do broadcasting for any dimensions that will remain uniform.  We can do
  # these all at once, since they're independent of one another.
  multiples = [1] * dst_shape.rank
  for axis in range(dst_shape.num_partitioned_dimensions):
    if not src_shape.is_ragged(axis) and not dst_shape.is_ragged(axis):
      src_size = src_shape.dimension_size(axis)
      dst_size = dst_shape.dimension_size(axis)
      if ((tensor_util.constant_value(src_size) in (1, None)) and
          (tensor_util.constant_value(dst_size) != 1)):
        multiples[axis] = array_ops.where(
            math_ops.equal(src_size, 1), dst_size, 1)

  if not all(isinstance(v, int) and v == 1 for v in multiples):
    multiples = array_ops.stack(multiples, axis=0)
    rt_input = ragged_array_ops.tile(rt_input, multiples)

  if broadcast_inner_dimensions:
    rt_input = rt_input.with_flat_values(
        array_ops.reshape(
            rt_input.flat_values,
            array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0)))

  # Do broadcasting for dimensions that become ragged.  We must do these from
  # outermost to innermost.
  for axis in range(dst_shape.num_partitioned_dimensions):
    if not src_shape.is_ragged(axis) and dst_shape.is_ragged(axis):
      dst_size = dst_shape.dimension_size(axis)
      rt_input = _ragged_tile_axis(rt_input, axis, dst_size)

  return rt_input
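
# Illustration only: the final uniform-to-ragged broadcast above, sketched
# with plain Python lists.  Broadcasting a [3, 1] input against target ragged
# row lengths [2, 0, 3] repeats each row's single element out to that row's
# length.  The name `_toy_broadcast_to_ragged_rows` is hypothetical, and this
# simplifies what `_ragged_tile_axis` does along one axis.
def _toy_broadcast_to_ragged_rows(rows, row_lengths):
  """Tiles each single-element row to the corresponding target row length."""
  return [row * n for (row, n) in zip(rows, row_lengths)]

# _toy_broadcast_to_ragged_rows([['a'], ['b'], ['c']], [2, 0, 3])
#   -> [['a', 'a'], [], ['c', 'c', 'c']]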