def testRaggedExpandDims(self, rt_input, axis, expected, ragged_rank=None, expected_shape=None): rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank) expanded = ragged_array_ops.expand_dims(rt, axis=axis) self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1) if expected_shape is not None: self.assertEqual(expanded.shape.as_list(), expected_shape) self.assertAllEqual(expanded, expected)
def testRaggedExpandDims(self, rt_input, axis, expected, ragged_rank=None, expected_shape=None): rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank) expanded = ragged_array_ops.expand_dims(rt, axis=axis) self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1) if expected_shape is not None: self.assertEqual(expanded.shape.as_list(), expected_shape) self.assertRaggedEqual(expanded, expected)
def unicode_split(input, input_encoding, errors="replace", replacement_char=0xFFFD, name=None): r"""Splits each string in `input` into a sequence of Unicode code points. `result[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its `j`th character, when decoded using `input_encoding`. Args: input: An `N` dimensional potentially ragged `string` tensor with shape `[D1...DN]`. `N` must be statically known. input_encoding: String name for the unicode encoding that should be used to decode each string. errors: Specifies the response when an input string can't be converted using the indicated encoding. One of: * `'strict'`: Raise an exception for any illegal substrings. * `'replace'`: Replace illegal substrings with `replacement_char`. * `'ignore'`: Skip illegal substrings. replacement_char: The replacement codepoint to be used in place of invalid substrings in `input` when `errors='replace'`. name: A name for the operation (optional). Returns: A `N+1` dimensional `int32` tensor with shape `[D1...DN, (num_chars)]`. The returned tensor is a `tf.Tensor` if `input` is a scalar, or a `tf.RaggedTensor` otherwise. #### Example: ```python >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')] >>> tf.strings.unicode_split(input, 'UTF-8').tolist() [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'], ['\xf0\x9f\x98\x8a']] ``` """ with ops.name_scope(name, "UnicodeSplit", [input]): codepoints = _unicode_decode(input, input_encoding, errors, replacement_char, False, with_offsets=False) return unicode_encode(ragged_array_ops.expand_dims(codepoints, -1), output_encoding=input_encoding, errors=errors, replacement_char=replacement_char)
def unicode_split(input, input_encoding, errors="replace", replacement_char=0xFFFD, name=None): r"""Splits each string in `input` into a sequence of Unicode code points. `result[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its `j`th character, when decoded using `input_encoding`. Args: input: An `N` dimensional potentially ragged `string` tensor with shape `[D1...DN]`. `N` must be statically known. input_encoding: String name for the unicode encoding that should be used to decode each string. errors: Specifies the response when an input string can't be converted using the indicated encoding. One of: * `'strict'`: Raise an exception for any illegal substrings. * `'replace'`: Replace illegal substrings with `replacement_char`. * `'ignore'`: Skip illegal substrings. replacement_char: The replacement codepoint to be used in place of invalid substrings in `input` when `errors='replace'`. name: A name for the operation (optional). Returns: A `N+1` dimensional `int32` tensor with shape `[D1...DN, (num_chars)]`. The returned tensor is a `tf.Tensor` if `input` is a scalar, or a `tf.RaggedTensor` otherwise. #### Example: ```python >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')] >>> tf.strings.unicode_split(input, 'UTF-8').tolist() [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'], ['\xf0\x9f\x98\x8a']] ``` """ with ops.name_scope(name, "UnicodeSplit", [input]): codepoints = _unicode_decode(input, input_encoding, errors, replacement_char, False, with_offsets=False) return unicode_encode( ragged_array_ops.expand_dims(codepoints, -1), output_encoding=input_encoding, errors=errors, replacement_char=replacement_char)
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values): """Helper function to concatenate or stack ragged tensors. Args: rt_inputs: A list of RaggedTensors or Tensors to combine. axis: The axis along which to concatenate or stack. stack_values: A boolean -- if true, then stack values; otherwise, concatenate them. Returns: A RaggedTensor. Raises: ValueError: If rt_inputs is empty, or if axis is out of range. """ # Validate parameters. if not rt_inputs: raise ValueError('rt_inputs may not be empty.') # Convert input tensors. rt_inputs = [ ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input, name='rt_input') for rt_input in rt_inputs ] row_splits_dtype, rt_inputs = ragged_tensor.match_row_splits_dtypes( *rt_inputs, return_dtype=True) rt_inputs = list(rt_inputs) # Special case: if there's only one input, then return it as-is. if len(rt_inputs) == 1: if stack_values: return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis) else: return rt_inputs[0] # Check the rank (number of dimensions) of the input tensors. ndims = None for rt in rt_inputs: if ndims is None: ndims = rt.shape.ndims else: rt.shape.assert_has_rank(ndims) out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1 axis = ragged_util.get_positive_axis(axis, out_ndims) # If all the inputs are Tensors, and we're combining the final dimension, # then we can delegate to the tf.stack/tf.concat operation, and return a # Tensor. if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs): if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1): if stack_values: return array_ops.stack(rt_inputs, axis) else: return array_ops.concat(rt_inputs, axis) # Convert any Tensor inputs to RaggedTensors. This makes it # possible to concatenate Tensors and RaggedTensors together. for i in range(len(rt_inputs)): if not ragged_tensor.is_ragged(rt_inputs[i]): rt_inputs[i] = ragged_tensor.RaggedTensor.from_tensor( rt_inputs[i], ragged_rank=1, row_splits_dtype=row_splits_dtype) # Convert the input tensors to all have the same ragged_rank. ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1) rt_inputs = [ _increase_ragged_rank_to(rt, ragged_rank, row_splits_dtype) for rt in rt_inputs ] if axis == 0: return _ragged_stack_concat_axis_0(rt_inputs, stack_values) elif axis == 1: return _ragged_stack_concat_axis_1(rt_inputs, stack_values) else: # axis > 1: recurse. values = [rt.values for rt in rt_inputs] splits = [[rt_input.row_splits] for rt_input in rt_inputs] with ops.control_dependencies(ragged_util.assert_splits_match(splits)): return ragged_tensor.RaggedTensor.from_row_splits( _ragged_stack_concat_helper(values, axis - 1, stack_values), splits[0][0], validate=False)
def gather_nd(params, indices, batch_dims=0, name=None): """Gather slices from `params` using `n`-dimensional indices. This operation is similar to `gather`, but it uses the innermost dimension of `indices` to define a slice into `params`. In particular, if: * `indices` has shape `[A1...AN, I]` * `params` has shape `[B1...BM]` Then: * `result` has shape `[A1...AN, B_{I+1}...BM]`. * `result[a1...aN] = params[indices[a1...aN, :]]` Args: params: A potentially ragged tensor with shape `[A1...AN, I]`. indices: A potentially ragged tensor with shape `[B1...BM]`. batch_dims: Must be zero. name: A name for the operation (optional). Returns: A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`. #### Examples: ```python >>> params = tf.compat.v1.ragged.constant_value( ... [ [ ['000', '001'], ['010' ] ], ... [ ['100' ], ['110', '111', '112'], ['120'] ], ... [ [ ], ['210' ] ] ]) >>> # Gather 2D slices from a 3D tensor >>> ragged.gather_nd(params, [[2], [0]]) [ [ [ ], ['210'] ] [ ['000', '001'], ['010'] ] ] >>> # Gather 1D slices from a 3D tensor >>> ragged.gather_nd(params, [[2, 1], [0, 0]]) [['210'], ['000', '001']] >>> # Gather scalars from a 3D tensor >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]]) ['001', '112'] ``` """ if not isinstance(batch_dims, int) or batch_dims != 0: raise ValueError('batch_dims != 0 is not supported for ragged gather yet.') if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)): return array_ops.gather_nd(params, indices, name) with ops.name_scope(name, 'RaggedGatherNd', [params, indices]): params = ragged_tensor.convert_to_tensor_or_ragged_tensor( params, name='params') indices = ragged_tensor.convert_to_tensor_or_ragged_tensor( indices, name='indices') params, indices = ragged_tensor.match_row_splits_dtypes(params, indices) indices_shape = indices.shape indices_ndims = indices_shape.ndims if indices_ndims is None: raise ValueError('indices.rank be statically known.') if indices_ndims == 0: raise ValueError('indices.rank must be at least 1.') if (ragged_tensor.is_ragged(indices) and indices_ndims == indices.ragged_rank + 1): raise ValueError('The innermost dimension of indices may not be ragged') # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions # that each index slices into. index_size = tensor_shape.dimension_value(indices_shape[-1]) if index_size is None: raise ValueError('indices.shape[-1] must be statically known.') # If `indices` has more than 2 dimensions, then recurse. If `indices` is # dense, then we convert it to ragged before recursing, and then convert # the result back to `dense` if appropriate. if indices_ndims > 2: indices_is_dense = not ragged_tensor.is_ragged(indices) if indices_is_dense: indices = ragged_tensor.RaggedTensor.from_tensor( indices, ragged_rank=indices_ndims - 2, row_splits_dtype=params.row_splits.dtype) result = indices.with_flat_values(gather_nd(params, indices.flat_values)) if (indices_is_dense and ragged_tensor.is_ragged(result) and result.ragged_rank == indices_ndims - 2): result = ragged_tensor.RaggedTensor.to_tensor(result) return result # indices_ndims <= 2, and the innermost dimension of indices may not be # ragged, so `indices` must not be ragged. assert not ragged_tensor.is_ragged(indices) assert ragged_tensor.is_ragged(params) # Handle corner case: An empty index tuple selects the entire `params` # value. So if `index_size` is zero, then tile `params`. if index_size == 0: params_ndims = params.ragged_rank + array_ops.rank(params.flat_values) for dim in range(indices_ndims - 1): params = ragged_array_ops.expand_dims(params, axis=0) multiples = array_ops.concat([ array_ops.shape(indices)[:-1], array_ops.ones([params_ndims], dtypes.int32) ], axis=0) return ragged_array_ops.tile(params, multiples) # When index_size=1, we can just flatten the index tuples and use gather. elif index_size == 1: flattened_index_tuples = array_ops.reshape(indices, [-1]) return gather(params, flattened_index_tuples) # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor. # Flatten both the index tuples and the params, such that the flattened # index tuples point to the correct values in the flattened params; and # then use ragged.gather on the flattened index tuples & params. else: indices = math_ops.cast(indices, params.row_splits.dtype) # Flatten the outermost 2 dimensions of the index tuples & params. flattened_index_tuples = array_ops.gather(params.row_splits, indices[..., 0]) flattened_index_tuples += indices[..., 1] flattened_params = params.values # Flatten any remaining dimensions. for dim in range(2, index_size): if not ragged_tensor.is_ragged(flattened_params): flattened_index_tuples = array_ops.expand_dims( flattened_index_tuples, axis=1) flattened_index_tuples = array_ops.concat( [flattened_index_tuples, indices[..., dim:]], axis=1) return array_ops.gather_nd(flattened_params, flattened_index_tuples) flattened_index_tuples = array_ops.gather( flattened_params.row_starts(), flattened_index_tuples) flattened_index_tuples += indices[..., dim] flattened_params = flattened_params.values # Gather using the flattened index tuples and params. return gather(flattened_params, flattened_index_tuples)
def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None): # pylint: disable=redefined-builtin if dim is not None: axis = dim return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
def batch_gather_with_default(params, indices, default_value='', name=None): """Same as `batch_gather` but inserts `default_value` for invalid indices. This operation is similar to `batch_gather` except that it will substitute the value for invalid indices with `default_value` as the contents. See `batch_gather` for more details. Args: params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`, `M>0`). indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`). default_value: A value to be inserted in places where `indices` are out of bounds. Must be the same dtype as params and either a scalar or rank 1. name: A name for the operation (optional). Returns: A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`. `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`. #### Example: ```python >>> params = tf.ragged.constant([ ['a', 'b', 'c'], ['d'], [], ['e']]) >>> indices = tf.ragged.constant([[1, 2, -1], [], [], [0, 10]]) >>> batch_gather_with_default(params, indices, 'FOO') [['b', 'c', 'FOO'], [], [], ['e', 'FOO']] ``` """ with ops.name_scope(name, 'RaggedBatchGatherWithDefault'): params = ragged_tensor.convert_to_tensor_or_ragged_tensor( params, name='params', ) indices = ragged_tensor.convert_to_tensor_or_ragged_tensor( indices, name='indices', ) default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor( default_value, name='default_value', ) # TODO(hterry): lift this restriction and support default_values of # of rank > 1 if (default_value.shape.ndims is not 0 and default_value.shape.ndims is not 1): raise ValueError('"default_value" must be a scalar or vector') upper_bounds = None if indices.shape.ndims is None: raise ValueError('Indices must have a known rank.') if params.shape.ndims is None: raise ValueError('Params must have a known rank.') num_batch_dimensions = indices.shape.ndims - 1 pad = None # The logic for this works as follows: # - create a padded params, where: # padded_params[b1...bn, 0] = default_value # padded_params[b1...bn, i] = params[b1...bn, i-1] (i>0) # - create an `upper_bounds` Tensor that contains the number of elements # in each innermost rank. Broadcast `upper_bounds` to be the same shape # as `indices`. # - check to see which index in `indices` are out of bounds and substitute # it with the index containing `default_value` (the first). # - call batch_gather with the indices adjusted. with ops.control_dependencies([ check_ops.assert_greater_equal(array_ops.rank(params), array_ops.rank(indices)) ]): if ragged_tensor.is_ragged(params): row_lengths = ragged_array_ops.expand_dims( params.row_lengths(axis=num_batch_dimensions), axis=-1) upper_bounds = math_ops.cast(row_lengths, indices.dtype) pad_shape = _get_pad_shape(params, indices) pad = ragged_tensor_shape.broadcast_to(default_value, pad_shape) else: params_shape = array_ops.shape(params) pad_shape = array_ops.concat([ params_shape[:num_batch_dimensions], [1], params_shape[num_batch_dimensions + 1:params.shape.ndims] ], 0) upper_bounds = params_shape[num_batch_dimensions] pad = array_ops.broadcast_to(default_value, pad_shape) # Add `default_value` as the first value in the innermost (ragged) rank. pad = math_ops.cast(pad, params.dtype) padded_params = array_ops.concat([pad, params], axis=num_batch_dimensions) # Adjust the indices by substituting out-of-bound indices to the # default-value index (which is the first element) shifted_indices = indices + 1 is_out_of_bounds = (indices < 0) | (indices > upper_bounds) adjusted_indices = ragged_where_op.where( is_out_of_bounds, x=array_ops.zeros_like(indices), y=shifted_indices, ) return array_ops.batch_gather(params=padded_params, indices=adjusted_indices, name=name)
def unicode_split_with_offsets(input, input_encoding, errors="replace", replacement_char=0xFFFD, name=None): r"""Splits each string into a sequence of code points with start offsets. This op is similar to `tf.strings.decode(...)`, but it also returns the start offset for each character in its respective string. This information can be used to align the characters with the original byte sequence. Returns a tuple `(chars, start_offsets)` where: * `chars[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its `j`th character, when decoded using `input_encoding`. * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th character in `input[i1...iN]`, when decoded using `input_encoding`. Args: input: An `N` dimensional potentially ragged `string` tensor with shape `[D1...DN]`. `N` must be statically known. input_encoding: String name for the unicode encoding that should be used to decode each string. errors: Specifies the response when an input string can't be converted using the indicated encoding. One of: * `'strict'`: Raise an exception for any illegal substrings. * `'replace'`: Replace illegal substrings with `replacement_char`. * `'ignore'`: Skip illegal substrings. replacement_char: The replacement codepoint to be used in place of invalid substrings in `input` when `errors='replace'`. name: A name for the operation (optional). Returns: A tuple of `N+1` dimensional tensors `(codepoints, start_offsets)`. * `codepoints` is an `int32` tensor with shape `[D1...DN, (num_chars)]`. * `offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`. The returned tensors are `tf.Tensor`s if `input` is a scalar, or `tf.RaggedTensor`s otherwise. #### Example: >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')] >>> result = tf.strings.unicode_split_with_offsets(input, 'UTF-8') >>> result[0].to_list() # character substrings [[b'G', b'\xc3\xb6', b'\xc3\xb6', b'd', b'n', b'i', b'g', b'h', b't'], [b'\xf0\x9f\x98\x8a']] >>> result[1].to_list() # offsets [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]] """ with ops.name_scope(name, "UnicodeSplitWithOffsets", [input]): codepoints, offsets = _unicode_decode(input, input_encoding, errors, replacement_char, False, with_offsets=True) chars = unicode_encode(ragged_array_ops.expand_dims(codepoints, -1), output_encoding=input_encoding, errors=errors, replacement_char=replacement_char) return chars, offsets
def batch_gather_with_default(params, indices, default_value='', name=None): """Same as `batch_gather` but inserts `default_value` for invalid indices. This operation is similar to `batch_gather` except that it will substitute the value for invalid indices with `default_value` as the contents. See `batch_gather` for more details. Args: params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`, `M>0`). indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`). default_value: A value to be inserted in places where `indices` are out of bounds. Must be the same dtype as params and either a scalar or rank 1. name: A name for the operation (optional). Returns: A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`. `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`. #### Example: ```python >>> params = tf.ragged.constant([ ['a', 'b', 'c'], ['d'], [], ['e']]) >>> indices = tf.ragged.constant([[1, 2, -1], [], [], [0, 10]]) >>> batch_gather_with_default(params, indices, 'FOO') [['b', 'c', 'FOO'], [], [], ['e', 'FOO']] ``` """ with ops.name_scope(name, 'RaggedBatchGatherWithDefault'): params = ragged_tensor.convert_to_tensor_or_ragged_tensor( params, name='params', ) indices = ragged_tensor.convert_to_tensor_or_ragged_tensor( indices, name='indices', ) default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor( default_value, name='default_value', ) # TODO(hterry): lift this restriction and support default_values of # of rank > 1 if (default_value.shape.ndims is not 0 and default_value.shape.ndims is not 1): raise ValueError('"default_value" must be a scalar or vector') upper_bounds = None if indices.shape.ndims is None: raise ValueError('Indices must have a known rank.') if params.shape.ndims is None: raise ValueError('Params must have a known rank.') num_batch_dimensions = indices.shape.ndims - 1 pad = None # The logic for this works as follows: # - create a padded params, where: # padded_params[b1...bn, 0] = default_value # padded_params[b1...bn, i] = params[b1...bn, i-1] (i>0) # - create an `upper_bounds` Tensor that contains the number of elements # in each innermost rank. Broadcast `upper_bounds` to be the same shape # as `indices`. # - check to see which index in `indices` are out of bounds and substitute # it with the index containing `default_value` (the first). # - call batch_gather with the indices adjusted. with ops.control_dependencies([ check_ops.assert_greater_equal(array_ops.rank(params), array_ops.rank(indices))]): if ragged_tensor.is_ragged(params): row_lengths = ragged_array_ops.expand_dims( params.row_lengths(axis=num_batch_dimensions), axis=-1) upper_bounds = math_ops.cast(row_lengths, indices.dtype) pad_shape = _get_pad_shape(params, indices) pad = ragged_tensor_shape.broadcast_to( default_value, pad_shape) else: params_shape = array_ops.shape(params) pad_shape = array_ops.concat([ params_shape[:num_batch_dimensions], [1], params_shape[num_batch_dimensions + 1:params.shape.ndims] ], 0) upper_bounds = params_shape[num_batch_dimensions] pad = array_ops.broadcast_to(default_value, pad_shape) # Add `default_value` as the first value in the innermost (ragged) rank. pad = math_ops.cast(pad, params.dtype) padded_params = array_ops.concat( [pad, params], axis=num_batch_dimensions) # Adjust the indices by substituting out-of-bound indices to the # default-value index (which is the first element) shifted_indices = indices + 1 is_out_of_bounds = (indices < 0) | (indices > upper_bounds) adjusted_indices = ragged_where_op.where( is_out_of_bounds, x=array_ops.zeros_like(indices), y=shifted_indices, ) return array_ops.batch_gather( params=padded_params, indices=adjusted_indices, name=name)
def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None): # pylint: disable=redefined-builtin if dim is not None: axis = dim return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
def gather_nd(params, indices, batch_dims=0, name=None): """Gather slices from `params` using `n`-dimensional indices. This operation is similar to `gather`, but it uses the innermost dimension of `indices` to define a slice into `params`. In particular, if: * `indices` has shape `[A1...AN, I]` * `params` has shape `[B1...BM]` Then: * `result` has shape `[A1...AN, B_{I+1}...BM]`. * `result[a1...aN] = params[indices[a1...aN, :]]` Args: params: A potentially ragged tensor with shape `[A1...AN, I]`. indices: A potentially ragged tensor with shape `[B1...BM]`. batch_dims: Must be zero. name: A name for the operation (optional). Returns: A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`. #### Examples: >>> params = tf.ragged.constant( ... [ [ ['000', '001'], ['010' ] ], ... [ ['100' ], ['110', '111', '112'], ['120'] ], ... [ [ ], ['210' ] ] ]) >>> # Gather 2D slices from a 3D tensor >>> tf.gather_nd(params, [[2], [0]]) <tf.RaggedTensor [[[], [b'210']], [[b'000', b'001'], [b'010']]]> >>> # Gather 1D slices from a 3D tensor >>> tf.gather_nd(params, [[2, 1], [0, 0]]) <tf.RaggedTensor [[b'210'], [b'000', b'001']]> >>> # Gather scalars from a 3D tensor >>> tf.gather_nd(params, [[0, 0, 1], [1, 1, 2]]).numpy() array([b'001', b'112'], dtype=object) """ if not isinstance(batch_dims, int) or batch_dims != 0: raise ValueError( 'batch_dims != 0 is not supported for ragged gather yet.') if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)): return array_ops.gather_nd(params, indices, name) with ops.name_scope(name, 'RaggedGatherNd', [params, indices]): params = ragged_tensor.convert_to_tensor_or_ragged_tensor( params, name='params') indices = ragged_tensor.convert_to_tensor_or_ragged_tensor( indices, name='indices') params, indices = ragged_tensor.match_row_splits_dtypes( params, indices) indices_shape = indices.shape indices_ndims = indices_shape.ndims if indices_ndims is None: raise ValueError('indices.rank be statically known.') if indices_ndims == 0: raise ValueError('indices.rank must be at least 1.') if (ragged_tensor.is_ragged(indices) and indices_ndims == indices.ragged_rank + 1): raise ValueError( 'The innermost dimension of indices may not be ragged') # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions # that each index slices into. index_size = tensor_shape.dimension_value(indices_shape[-1]) if index_size is None: raise ValueError('indices.shape[-1] must be statically known.') # If `indices` has more than 2 dimensions, then recurse. If `indices` is # dense, then we convert it to ragged before recursing, and then convert # the result back to `dense` if appropriate. if indices_ndims > 2: indices_is_dense = not ragged_tensor.is_ragged(indices) if indices_is_dense: indices = ragged_tensor.RaggedTensor.from_tensor( indices, ragged_rank=indices_ndims - 2, row_splits_dtype=params.row_splits.dtype) result = indices.with_flat_values( gather_nd(params, indices.flat_values)) if (indices_is_dense and ragged_tensor.is_ragged(result) and result.ragged_rank == indices_ndims - 2): result = ragged_tensor.RaggedTensor.to_tensor(result) return result # indices_ndims <= 2, and the innermost dimension of indices may not be # ragged, so `indices` must not be ragged. assert not ragged_tensor.is_ragged(indices) assert ragged_tensor.is_ragged(params) # Handle corner case: An empty index tuple selects the entire `params` # value. So if `index_size` is zero, then tile `params`. if index_size == 0: params_ndims = params.ragged_rank + array_ops.rank( params.flat_values) for dim in range(indices_ndims - 1): params = ragged_array_ops.expand_dims(params, axis=0) multiples = array_ops.concat([ array_ops.shape(indices)[:-1], array_ops.ones([params_ndims], dtypes.int32) ], axis=0) return ragged_array_ops.tile(params, multiples) # When index_size=1, we can just flatten the index tuples and use gather. elif index_size == 1: flattened_index_tuples = array_ops.reshape(indices, [-1]) return gather(params, flattened_index_tuples) # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor. # Flatten both the index tuples and the params, such that the flattened # index tuples point to the correct values in the flattened params; and # then use ragged.gather on the flattened index tuples & params. else: indices = math_ops.cast(indices, params.row_splits.dtype) # Flatten the outermost 2 dimensions of the index tuples & params. flattened_index_tuples = array_ops.gather(params.row_splits, indices[..., 0]) flattened_index_tuples += indices[..., 1] flattened_params = params.values # Flatten any remaining dimensions. for dim in range(2, index_size): if not ragged_tensor.is_ragged(flattened_params): flattened_index_tuples = array_ops.expand_dims( flattened_index_tuples, axis=1) flattened_index_tuples = array_ops.concat( [flattened_index_tuples, indices[..., dim:]], axis=1) return array_ops.gather_nd(flattened_params, flattened_index_tuples) flattened_index_tuples = array_ops.gather( flattened_params.row_starts(), flattened_index_tuples) flattened_index_tuples += indices[..., dim] flattened_params = flattened_params.values # Gather using the flattened index tuples and params. return gather(flattened_params, flattened_index_tuples)
def unicode_split_with_offsets(input, input_encoding, errors="replace", replacement_char=0xFFFD, name=None): r"""Splits each string into a sequence of code points with start offsets. This op is similar to `tf.strings.decode(...)`, but it also returns the start offset for each character in its respective string. This information can be used to align the characters with the original byte sequence. Returns a tuple `(chars, start_offsets)` where: * `chars[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its `j`th character, when decoded using `input_encoding`. * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th character in `input[i1...iN]`, when decoded using `input_encoding`. Args: input: An `N` dimensional potentially ragged `string` tensor with shape `[D1...DN]`. `N` must be statically known. input_encoding: String name for the unicode encoding that should be used to decode each string. errors: Specifies the response when an input string can't be converted using the indicated encoding. One of: * `'strict'`: Raise an exception for any illegal substrings. * `'replace'`: Replace illegal substrings with `replacement_char`. * `'ignore'`: Skip illegal substrings. replacement_char: The replacement codepoint to be used in place of invalid substrings in `input` when `errors='replace'`. name: A name for the operation (optional). Returns: A tuple of `N+1` dimensional tensors `(codepoints, start_offsets)`. * `codepoints` is an `int32` tensor with shape `[D1...DN, (num_chars)]`. * `offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`. The returned tensors are `tf.Tensor`s if `input` is a scalar, or `tf.RaggedTensor`s otherwise. #### Example: ```python >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')] >>> result = tf.strings.unicode_split_with_offsets(input, 'UTF-8') >>> result[0].tolist() # character substrings [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'], ['\xf0\x9f\x98\x8a']] >>> result[1].tolist() # offsets [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]] ``` """ with ops.name_scope(name, "UnicodeSplitWithOffsets", [input]): codepoints, offsets = _unicode_decode(input, input_encoding, errors, replacement_char, False, with_offsets=True) chars = unicode_encode( ragged_array_ops.expand_dims(codepoints, -1), output_encoding=input_encoding, errors=errors, replacement_char=replacement_char) return chars, offsets
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values): """Helper function to concatenate or stack ragged tensors. Args: rt_inputs: A list of RaggedTensors or Tensors to combine. axis: The axis along which to concatenate or stack. stack_values: A boolean -- if true, then stack values; otherwise, concatenate them. Returns: A RaggedTensor. Raises: ValueError: If rt_inputs is empty, or if axis is out of range. """ # Validate parameters. if not rt_inputs: raise ValueError('rt_inputs may not be empty.') # Convert input tensors. rt_inputs = [ ragged_tensor.convert_to_tensor_or_ragged_tensor( rt_input, name='rt_input') for rt_input in rt_inputs ] row_splits_dtype, rt_inputs = ragged_tensor.match_row_splits_dtypes( *rt_inputs, return_dtype=True) rt_inputs = list(rt_inputs) # Special case: if there's only one input, then return it as-is. if len(rt_inputs) == 1: if stack_values: return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis) else: return rt_inputs[0] # Check the rank (number of dimensions) of the input tensors. ndims = None for rt in rt_inputs: if ndims is None: ndims = rt.shape.ndims else: rt.shape.assert_has_rank(ndims) out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1 axis = ragged_util.get_positive_axis(axis, out_ndims) # If all the inputs are Tensors, and we're combining the final dimension, # then we can delegate to the tf.stack/tf.concat operation, and return a # Tensor. if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs): if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1): if stack_values: return array_ops.stack(rt_inputs, axis) else: return array_ops.concat(rt_inputs, axis) # Convert any Tensor inputs to RaggedTensors. This makes it # possible to concatenate Tensors and RaggedTensors together. for i in range(len(rt_inputs)): if not ragged_tensor.is_ragged(rt_inputs[i]): rt_inputs[i] = ragged_tensor.RaggedTensor.from_tensor( rt_inputs[i], ragged_rank=1, row_splits_dtype=row_splits_dtype) # Convert the input tensors to all have the same ragged_rank. ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1) rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank, row_splits_dtype) for rt in rt_inputs] if axis == 0: return _ragged_stack_concat_axis_0(rt_inputs, stack_values) elif axis == 1: return _ragged_stack_concat_axis_1(rt_inputs, stack_values) else: # axis > 1: recurse. values = [rt.values for rt in rt_inputs] splits = [[rt_input.row_splits] for rt_input in rt_inputs] with ops.control_dependencies(ragged_util.assert_splits_match(splits)): return ragged_tensor.RaggedTensor.from_row_splits( _ragged_stack_concat_helper(values, axis - 1, stack_values), splits[0][0], validate=False)