def _unicode_decode(input, input_encoding, errors, replacement_char,
                    replace_control_characters, with_offsets):
  """Decodes each string into a sequence of codepoints."""
  input = ragged_tensor.convert_to_tensor_or_ragged_tensor(input, name="input")
  input_ndims = input.shape.ndims
  if input_ndims is None:
    raise ValueError("Rank of `input` must be statically known.")

  if input_ndims > 1:
    # Convert to a ragged tensor with ragged_rank = input_ndims - 1.
    if not ragged_tensor.is_ragged(input):
      input = ragged_conversion_ops.from_tensor(
          input, ragged_rank=input_ndims - 1)
    elif input.ragged_rank < input_ndims - 1:
      input = input.with_flat_values(
          ragged_conversion_ops.from_tensor(
              input.flat_values,
          ragged_rank=input_ndims - input.ragged_rank - 1))

  # Reshape the input to a flat vector, and apply the gen_string_ops op.
  if ragged_tensor.is_ragged(input):
    flat_input = array_ops.reshape(input.flat_values, [-1])
  else:
    flat_input = array_ops.reshape(input, [-1])

  if with_offsets:
    decode_op = gen_string_ops.unicode_decode_with_offsets
  else:
    decode_op = gen_string_ops.unicode_decode
  flat_result = decode_op(
      input=flat_input,
      input_encoding=input_encoding,
      errors=errors,
      replacement_char=replacement_char,
      replace_control_characters=replace_control_characters)

  if input_ndims == 0:
    codepoints = flat_result.char_values
    if with_offsets:
      offsets = flat_result.char_to_byte_starts
  else:
    codepoints = ragged_tensor.RaggedTensor.from_row_splits(
        flat_result.char_values, flat_result.row_splits)
    if input_ndims > 1:
      codepoints = input.with_flat_values(codepoints)
    if with_offsets:
      offsets = ragged_tensor.RaggedTensor.from_row_splits(
          flat_result.char_to_byte_starts, flat_result.row_splits)
      if input_ndims > 1:
        offsets = input.with_flat_values(offsets)

  if with_offsets:
    return codepoints, offsets
  else:
    return codepoints
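For reference, a minimal sketch of how the public wrappers around this helper behave, assuming a TensorFlow build that exposes `tf.strings.unicode_decode` and `tf.strings.unicode_decode_with_offsets`:

import tensorflow as tf

# Decode UTF-8 strings into per-string codepoint sequences; the result is
# ragged because the strings have different lengths.
codepoints = tf.strings.unicode_decode(["hello", "héllo"], input_encoding="UTF-8")
print(codepoints)  # <tf.RaggedTensor [[104, 101, 108, 108, 111], [104, 233, 108, 108, 111]]>

# The *_with_offsets variant also returns each codepoint's starting byte offset.
codepoints, offsets = tf.strings.unicode_decode_with_offsets(
    ["héllo"], input_encoding="UTF-8")
print(offsets)  # <tf.RaggedTensor [[0, 1, 3, 4, 5]]> -- 'é' occupies two bytes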
Example #3
    def tokenize_with_offsets(self, input, name=None):  # pylint: disable=redefined-builtin
        """Tokenizes a tensor of UTF-8 strings.

    Args:
      input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.
      name: The name argument that is passed to the op function.

    Returns:
      A `RaggedTensor` of tokenized text. The returned shape is the shape of the
      input tensor with an added ragged dimension for tokens of each string.
    """
        with ops.name_scope(name, "SentenceTokenizer", [input, self]):
            input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                input)
            if input_tensor.shape.ndims is None:
                raise ValueError(
                    "Rank of input_tensor must be statically known.")
            if ragged_tensor.is_ragged(input_tensor):
                # Recursively process the values of the ragged tensor
                (tokens, starts,
                 limits) = self.tokenize_with_offsets(input_tensor.flat_values)
                tokens = input_tensor.with_flat_values(tokens)
                starts = input_tensor.with_flat_values(starts)
                limits = input_tensor.with_flat_values(limits)
                return (tokens, starts, limits)
            else:
                if input_tensor.shape.ndims > 1:
                    # Convert the input tensor to ragged and process it.
                    return self.tokenize_with_offsets(
                        ragged_conversion_ops.from_tensor(input_tensor))
                elif input_tensor.shape.ndims == 0:
                    (tokens, starts, limits) = self.tokenize_with_offsets(
                        array_ops.stack([input_tensor]))
                    tokens = tokens.values
                    starts = starts.values
                    limits = limits.values
                    return (tokens, starts, limits)
                else:
                    # Our rank 1 tensor is the correct shape, so we can process it as
                    # normal.
                    (output_values, output_splits, output_offset_starts,
                     output_offset_limits) = (
                         gen_sentencepiece_tokenizer.
                         sentencepiece_tokenize_with_offsets_op(
                             self._resource_handle, input_tensor,
                             self.nbest_size, self.alpha, self.add_bos,
                             self.add_eos, self.reverse, self.out_type))
                    tokens = RaggedTensor.from_nested_row_splits(
                        flat_values=output_values,
                        nested_row_splits=[output_splits],
                        validate=False)
                    starts = RaggedTensor.from_nested_row_splits(
                        flat_values=output_offset_starts,
                        nested_row_splits=[output_splits],
                        validate=False)
                    limits = RaggedTensor.from_nested_row_splits(
                        flat_values=output_offset_limits,
                        nested_row_splits=[output_splits],
                        validate=False)
                    return (tokens, starts, limits)
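A usage sketch for a tokenizer like the one above, assuming the tensorflow_text package; the model path is hypothetical, since a serialized SentencePiece model proto is required:

import tensorflow_text as tf_text

# Hypothetical path to a serialized SentencePiece model proto.
with open("sp.model", "rb") as f:
  model = f.read()

tokenizer = tf_text.SentencepieceTokenizer(model=model)
tokens, starts, limits = tokenizer.tokenize_with_offsets(["hello world"])
# `tokens` is a RaggedTensor of subword ids (or strings, depending on out_type);
# `starts`/`limits` give each token's byte span within the input string.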
Example #4
    def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
        """Tokenizes a tensor of UTF-8 strings on Unicode script boundaries.

    The strings are split when a change in the Unicode script is detected
    between sequential tokens. The script codes used correspond to International
    Components for Unicode (ICU) UScriptCode values. See:
    http://icu-project.org/apiref/icu4c/uscript_8h.html

    ICU defined whitespace characters are dropped, unless the keep_whitespace
    option was specified at construction time.

    Args:
      input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

    Returns:
      A tuple `(tokens, start_offsets, limit_offsets)` where:

        * `tokens`: A `RaggedTensor` of tokenized text.
        * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset.
        * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offset.
    """
        name = None
        with ops.name_scope(name, "UnicodeScriptTokenize", [input]):
            input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                input)
            if input_tensor.shape.ndims is None:
                raise ValueError(
                    "Rank of input_tensor must be statically known.")
            if ragged_tensor.is_ragged(input_tensor):
                if input_tensor.flat_values.shape.ndims > 1:
                    # If the flat_values of our ragged tensor is multi-dimensional, we can
                    # process it separately and our output will have the same nested
                    # splits as our input.
                    (tokens, starts, limits) = self.tokenize_with_offsets(
                        input_tensor.flat_values)
                    return (input_tensor.with_flat_values(tokens),
                            input_tensor.with_flat_values(starts),
                            input_tensor.with_flat_values(limits))
                else:
                    # Recursively process the values of the ragged tensor.
                    (tokens, starts,
                     limits) = self.tokenize_with_offsets(input_tensor.values)
                    return (input_tensor.with_values(tokens),
                            input_tensor.with_values(starts),
                            input_tensor.with_values(limits))
            else:
                if input_tensor.shape.ndims > 1:
                    # Convert the input tensor to ragged and process it.
                    return self.tokenize_with_offsets(
                        ragged_conversion_ops.from_tensor(input_tensor))
                elif input_tensor.shape.ndims == 0:
                    (tokens, starts, limits) = self.tokenize_with_offsets(
                        array_ops.stack([input_tensor]))
                    return tokens.values, starts.values, limits.values
                else:
                    # Our rank 1 tensor is the correct shape, so we can process it as
                    # normal
                    return self._tokenize_with_offsets_encode_decode_wrapper(
                        input_tensor)
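A brief usage sketch, assuming the tensorflow_text package, where this tokenizer is exposed as `UnicodeScriptTokenizer`:

import tensorflow_text as tf_text

tokenizer = tf_text.UnicodeScriptTokenizer()
tokens, starts, limits = tokenizer.tokenize_with_offsets(
    ["everything not saved will be lost."])
# Tokens are split where the Unicode script changes, so the trailing "." becomes
# its own token; `starts`/`limits` hold each token's byte span in the input.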
def _increase_ragged_rank_to(rt_input, ragged_rank):
  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
  if ragged_rank > 0:
    if not ragged_tensor.is_ragged(rt_input):
      rt_input = ragged_conversion_ops.from_tensor(rt_input)
    if rt_input.ragged_rank < ragged_rank:
      rt_input = rt_input.with_values(
          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
  return rt_input
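To illustrate the effect using only public APIs (a sketch; the helper above relies on TensorFlow-internal modules), the same recursion can be phrased in terms of `tf.RaggedTensor`:

import tensorflow as tf

def increase_ragged_rank_to(rt_input, ragged_rank):
  # Public-API analogue of the helper above: convert dense dimensions into
  # ragged ones until the requested ragged rank is reached.
  if ragged_rank > 0:
    if not isinstance(rt_input, tf.RaggedTensor):
      rt_input = tf.RaggedTensor.from_tensor(rt_input)
    if rt_input.ragged_rank < ragged_rank:
      rt_input = rt_input.with_values(
          increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
  return rt_input

dt = tf.constant([[1, 2, 3], [4, 5, 6]])
rt = increase_ragged_rank_to(dt, 1)
print(rt.ragged_rank)  # 1 -- each dense row became a (uniform-length) ragged row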
    def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
        """Tokenizes a tensor of UTF-8 strings on whitespaces.

    The strings are split on ICU defined whitespace characters. These
    whitespace characters are dropped.

    Args:
      input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

    Returns:
      A tuple `(tokens, start_offsets, end_offsets)` where:

        * `tokens`: A `RaggedTensor` of tokenized text.
        * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset.
        * `end_offsets`: A `RaggedTensor` of the tokens' ending byte offset.
    """
        name = None
        with ops.name_scope(name, "WhitespaceTokenize", [input]):
            input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                input)
            if input_tensor.shape.ndims is None:
                raise ValueError(
                    "Rank of input_tensor must be statically known.")
            if ragged_tensor.is_ragged(input_tensor):
                if input_tensor.flat_values.shape.ndims > 1:
                    # If the flat_values of our ragged tensor is multi-dimensional, we can
                    # process it separately and our output will have the same nested
                    # splits as our input.
                    (tokens, starts, ends) = self.tokenize_with_offsets(
                        input_tensor.flat_values)
                    return (input_tensor.with_flat_values(tokens),
                            input_tensor.with_flat_values(starts),
                            input_tensor.with_flat_values(ends))
                else:
                    # Recursively process the values of the ragged tensor.
                    (tokens, starts,
                     ends) = self.tokenize_with_offsets(input_tensor.values)
                    return (input_tensor.with_values(tokens),
                            input_tensor.with_values(starts),
                            input_tensor.with_values(ends))
            else:
                if input_tensor.shape.ndims > 1:
                    # Convert the input tensor to ragged and process it.
                    return self.tokenize_with_offsets(
                        ragged_conversion_ops.from_tensor(input_tensor))
                elif input_tensor.shape.ndims == 0:
                    (tokens, starts, ends) = self.tokenize_with_offsets(
                        array_ops.stack([input_tensor]))
                    return tokens.values, starts.values, ends.values
                else:
                    # Our rank 1 tensor is the correct shape, so we can process it as
                    # normal.
                    return self._whitespace_tokenize_with_offsets_encode_decode_wrapper(
                        input_tensor)
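A usage sketch, assuming tensorflow_text, where this class is available as `WhitespaceTokenizer`:

import tensorflow_text as tf_text

tokenizer = tf_text.WhitespaceTokenizer()
tokens, starts, ends = tokenizer.tokenize_with_offsets(["what you know you can't explain"])
# For the first token: tokens[0][0] == b'what', starts[0][0] == 0, ends[0][0] == 4.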
    def detokenize(self, input, name=None):  # pylint: disable=redefined-builtin
        """Detokenizes tokens into preprocessed text.

    Args:
      input: A `RaggedTensor` or `Tensor` of UTF-8 string tokens with a rank of
        at least 1.
      name: The name argument that is passed to the op function.

    Returns:
      An `N-1` dimensional string `Tensor` or `RaggedTensor` of the detokenized text.
    """
        with ops.name_scope(name, "SentenceTokenizer", [input, self]):
            input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                input)
            if input_tensor.shape.ndims is None:
                raise ValueError(
                    "Rank of input_tensor must be statically known.")
            if input_tensor.shape.ndims == 0:
                raise ValueError("Rank of input_tensor must be at least 1.")
            if ragged_tensor.is_ragged(input_tensor):
                if input_tensor.flat_values.shape.ndims > 1:
                    # If the flat_values of our ragged tensor is multi-dimensional, we can
                    # process it separately and our output will have the same nested
                    # splits as our input.
                    tokens = self.detokenize(input_tensor.flat_values)
                    return input_tensor.with_flat_values(tokens)
                elif input_tensor.ragged_rank > 1:
                    # Recursively process the values of the ragged tensor.
                    tokens = self.detokenize(input_tensor.values)
                    return input_tensor.with_values(tokens)
                else:
                    return gen_sentencepiece_tokenizer.sentencepiece_detokenize_op(
                        self._model_resource.resource_handle,
                        input_tensor.flat_values, input_tensor.row_splits,
                        self.add_bos, self.add_eos, self.reverse)
            else:
                if input_tensor.shape.ndims > 1:
                    # Convert the input tensor to ragged and process it.
                    return self.detokenize(
                        ragged_conversion_ops.from_tensor(input_tensor))
                else:
                    tokens = self.detokenize(array_ops.stack([input_tensor]))
                    return array_ops.reshape(tokens, [])
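A round-trip sketch pairing `tokenize` with `detokenize`, again assuming tensorflow_text and a hypothetical serialized SentencePiece model:

import tensorflow_text as tf_text

with open("sp.model", "rb") as f:  # hypothetical model path
  model = f.read()

tokenizer = tf_text.SentencepieceTokenizer(model=model)
ids = tokenizer.tokenize(["hello world"])  # rank-2 RaggedTensor of token ids
text = tokenizer.detokenize(ids)           # rank-1 string Tensor (one less dim)
# For a well-formed model, `text` reproduces the (normalized) input strings.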
Example #9
def _ragged_tile_axis(rt_input, axis, repeats):
    """Tile a dimension of a RaggedTensor to match a ragged shape."""
    assert axis > 0  # Outermost dimension may not be ragged.

    if not ragged_tensor.is_ragged(rt_input):
        rt_input = ragged_conversion_ops.from_tensor(rt_input, ragged_rank=1)

    if axis > 1:
        return rt_input.with_values(
            _ragged_tile_axis(rt_input.values, axis - 1, repeats))
    else:
        src_row_splits = rt_input.nested_row_splits
        src_row_lengths = rt_input.nested_row_lengths()
        splits = src_row_splits[0]

        dst_row_lengths = [repeats]
        for i in range(1, len(src_row_lengths)):
            dst_row_lengths.append(
                ragged_util.repeat_ranges(src_row_lengths[i], splits, repeats))
            splits = array_ops.gather(src_row_splits[i], splits)
        dst_values = ragged_util.repeat_ranges(rt_input.flat_values, splits,
                                               repeats)
        return ragged_tensor.RaggedTensor.from_nested_row_lengths(
            dst_values, dst_row_lengths)
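This helper backs ragged tiling; through the public API the same behavior is reachable via `tf.tile`, which dispatches to the ragged implementation for RaggedTensor inputs (a small sketch):

import tensorflow as tf

rt = tf.ragged.constant([[1, 2], [3]])
print(tf.tile(rt, multiples=[2, 3]))
# <tf.RaggedTensor [[1, 2, 1, 2, 1, 2], [3, 3, 3],
#                   [1, 2, 1, 2, 1, 2], [3, 3, 3]]>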
  def test_passing_simple_from_dense(self, input_list, squeeze_ranks=None):
    dt = constant_op.constant(input_list)
    rt = ragged_conversion_ops.from_tensor(dt)
    rt_s = ragged_squeeze_op.squeeze(rt, squeeze_ranks)
    dt_s = array_ops.squeeze(dt, squeeze_ranks)
    self.assertRaggedEqual(ragged_conversion_ops.to_tensor(rt_s), dt_s)
Example #12
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
  """Helper function to concatenate or stack ragged tensors.

  Args:
    rt_inputs: A list of RaggedTensors or Tensors to combine.
    axis: The axis along which to concatenate or stack.
    stack_values: A boolean -- if true, then stack values; otherwise,
      concatenate them.

  Returns:
    A RaggedTensor.
  Raises:
    ValueError: If rt_inputs is empty, or if axis is out of range.
  """
  # Validate parameters.
  if not rt_inputs:
    raise ValueError('rt_inputs may not be empty.')

  # Convert input tensors.
  rt_inputs = [
      ragged_tensor.convert_to_tensor_or_ragged_tensor(
          rt_input, name='rt_input') for rt_input in rt_inputs
  ]

  # Special case: if there's only one input, then return it as-is.
  if len(rt_inputs) == 1:
    if stack_values:
      return expand_dims(rt_inputs[0], axis=0)
    else:
      return rt_inputs[0]

  # Check the rank (number of dimensions) of the input tensors.
  ndims = None
  for rt in rt_inputs:
    if ndims is None:
      ndims = rt.shape.ndims
    else:
      rt.shape.assert_has_rank(ndims)

  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
  axis = ragged_util.get_positive_axis(axis, out_ndims)

  # If all the inputs are Tensors, and we're combining the final dimension,
  # then we can delegate to the tf.stack/tf.concat operation, and return a
  # Tensor.
  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
      if stack_values:
        return array_ops.stack(rt_inputs, axis)
      else:
        return array_ops.concat(rt_inputs, axis)

  # Convert any Tensor inputs to RaggedTensors.  This makes it
  # possible to concatenate Tensors and RaggedTensors together.
  for i in range(len(rt_inputs)):
    if not ragged_tensor.is_ragged(rt_inputs[i]):
      rt_inputs[i] = ragged_conversion_ops.from_tensor(
          rt_inputs[i], ragged_rank=1)

  # Convert the input tensors to all have the same ragged_rank.
  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]

  if axis == 0:
    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
  elif axis == 1:
    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
  else:  # axis > 1: recurse.
    values = [rt.values for rt in rt_inputs]
    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
      return ragged_tensor.RaggedTensor.from_row_splits(
          _ragged_stack_concat_helper(values, axis - 1, stack_values),
          splits[0][0])
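Through the public API this helper is reached via `tf.concat` and `tf.ragged.stack`; a short sketch of both, for axis 0 and axis 1:

import tensorflow as tf

a = tf.ragged.constant([[1, 2], [3]])
b = tf.ragged.constant([[4], [5, 6]])

print(tf.concat([a, b], axis=0))        # [[1, 2], [3], [4], [5, 6]]
print(tf.concat([a, b], axis=1))        # [[1, 2, 4], [3, 5, 6]]
print(tf.ragged.stack([a, b], axis=1))  # [[[1, 2], [4]], [[3], [5, 6]]]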
Example #13
def boolean_mask(data, mask, keepdims=False, name=None):
  """Applies a boolean mask to `data`.

  Returns a potentially ragged tensor that is formed by retaining the elements
  in `data` where the corresponding value in `mask` is `True`.

  If `keepdims` is true then outer dimensions (corresponding to the `mask`
  dimensions) are preserved, and:

  * `output[a1...aA, i, b1...bB] = data[a1...aA, j, b1...bB]`

     Where `j` is the `i`th `True` entry of `mask[a1...aA]`.

  If `keepdims` is false, then the outer dimensions are collapsed (similar to
  the behavior of `tf.boolean_mask`), and:

  * `output[i, b1...bB] = data[a1...aA, b1...bB]`

     Where `(a1...aA)` is the `i`th `True` entry of `mask`
     (in row-major order).

  Args:
    data: A potentially ragged tensor.
    mask: A potentially ragged boolean tensor.  `mask`'s shape must be a prefix
      of `data`'s shape.  `rank(mask)` must be known statically.
    keepdims: Whether to preserve the outer dimensions (`keepdims=True`) or
      flatten them (`keepdims=False`).
    name: A name prefix for the returned tensor (optional).

  Returns:
    A potentially ragged tensor that is formed by retaining the elements in
    `data` where the corresponding value in `mask` is `True`.

    If `keepdims` is false:

    * `rank(output) = rank(data) - rank(mask) + 1`.
    * `output.ragged_rank = max(data.ragged_rank - rank(mask) + 1, 0)`.

    If `keepdims` is true:

    * `rank(output) = rank(data)`.
    * `output.ragged_rank = max(data.ragged_rank, rank(mask) - 1)`.

  Raises:
    ValueError: if `rank(mask)` is not known statically; or if `mask.shape` is
      not a prefix of `data.shape`.

  #### Examples:
    ```python
    >>> # Aliases for True & False so data and mask line up.
    >>> T, F = (True, False)

    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Flatten outer dims.
    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
    ...     keepdims=False).tolist()
    [1, 3, 7]

    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Preserve outer dims.
    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
    ...     keepdims=True).tolist()
    [[1, 3], [], [7]]

    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Flatten outer dims.
    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
    ...     keepdims=False).tolist()
    [3, 5, 6]

    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Preserve outer dims.
    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
    ...     keepdims=True).tolist()
    [[3], [], [5, 6]]

    >>> tf.ragged.boolean_mask(  # Mask rows of a 2D RaggedTensor.
    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
    ...     tf.ragged.constant([True, False, True]),
    ...     keepdims=True).tolist()
    [[1, 2, 3], [5, 6]]
    ```
  """
  with ops.name_scope(name, 'RaggedMask', [data, mask]):
    # Convert inputs to tensors.
    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
    mask = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        mask, dtypes.bool, name='mask')

    # Get static rank of mask.
    if mask.shape.ndims is None:
      raise ValueError('mask.shape.ndims must be known statically.')
    elif mask.shape.ndims == 0:
      raise ValueError('mask cannot be scalar.')

    # If mask is ragged, then recurse with a non-ragged mask.
    if ragged_tensor.is_ragged(mask):
      if not ragged_tensor.is_ragged(data):
        data = ragged_conversion_ops.from_tensor(
            data, ragged_rank=mask.ragged_rank)
      # Check that mask.nested_row_splits is a prefix of
      # data.nested_row_splits.
      splits_list = [
          mask.nested_row_splits, data.nested_row_splits[:mask.ragged_rank]
      ]
      with ops.control_dependencies(
          ragged_util.assert_splits_match(splits_list)):
        # Strip off ragged `splits` until `mask` is non-ragged.  Keep the splits
        # that we strip off in `splits`, so we can add them back on after
        # we recursively mask the non-ragged data.
        splits = []
        while ragged_tensor.is_ragged(mask):
          if mask.shape.ndims > 2:
            splits.append(mask.row_splits)
          else:
            # Count the number of True mask values in each row to find the
            # lengths of the filtered rows; then convert to splits.
            int_mask = ragged_functional_ops.map_flat_values(
                math_ops.cast, mask, dtype=dtypes.int64)
            masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
            splits.append(ragged_util.lengths_to_splits(masked_row_lengths))
          mask = mask.values
          data = data.values

        # Recursively apply the nested non-ragged mask to the nested data.
        masked_values = boolean_mask(data, mask, keepdims)

        # Add the ragged `splits` back to the result.
        if keepdims:
          masked_values = ragged_tensor.RaggedTensor.from_nested_row_splits(
              masked_values, splits)

        return masked_values

    # If mask is non-ragged and has rank 1, and data is ragged, then build a
    # ragged tensor with the indicated rows.
    elif ragged_tensor.is_ragged(data) and mask.shape.ndims == 1:
      # Get the masked splits: first get the length of each row, then filter
      # out the rows that we are deleting, and convert that filtered set of
      # masks back to a splits tensor.
      lengths = data.row_lengths()
      masked_lengths = array_ops.boolean_mask(lengths, mask)
      masked_splits = ragged_util.lengths_to_splits(masked_lengths)

      # Get the masked values: first get row ids corresponding to each
      # value, then use tf.gather to build a boolean mask that's false for
      # values that come from rows that we are deleting, and use that mask to
      # construct the masked values tensor.
      segment_ids = segment_id_ops.row_splits_to_segment_ids(data.row_splits)
      segment_mask = array_ops.gather(mask, segment_ids)
      masked_values = boolean_mask(data.values, segment_mask, keepdims=False)

      return ragged_tensor.RaggedTensor.from_row_splits(masked_values,
                                                        masked_splits)

    # If mask is non-ragged and has rank>1, then convert it to be ragged,
    # with a ragged rank matching data.
    if ragged_tensor.is_ragged(data):
      mask = ragged_conversion_ops.from_tensor(
          mask, ragged_rank=min(data.ragged_rank, mask.shape.ndims - 1))
      return boolean_mask(data, mask, keepdims)

    # Otherwise, data and mask are both `Tensor`s.
    else:
      # Apply `boolean_mask` to get the masked values.
      masked_values = array_ops.boolean_mask(data, mask)

      if mask.shape.ndims >= 2 and keepdims:
        # Add the innermost ragged dimension.  For each innermost cell, get the
        # number of values it contains.  Then flatten that to get a list of
        # cell lengths, and convert it to splits.  Finally, combine the splits
        # and values to get the innermost ragged tensor.
        masked_lengths = math_ops.count_nonzero(mask, axis=-1)
        flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
        masked_values = ragged_tensor.RaggedTensor.from_row_lengths(
            masked_values, flattened_masked_lengths)

        # Wrap remaining ragged dimensions.
        if mask.shape.ndims > 2 and keepdims:
          mask_shape = array_ops.shape(mask, out_type=dtypes.int64)
          split_size = math_ops.cumprod(mask_shape) + 1
          for dim in range(mask.shape.ndims - 3, -1, -1):
            elt_size = mask_shape[dim + 1]
            masked_splits = math_ops.range(split_size[dim]) * elt_size
            masked_values = ragged_tensor.RaggedTensor.from_row_splits(
                masked_values, masked_splits)

      return masked_values
Example #14
def gather_nd(params, indices, name=None):
  """Gather slices from `params` using `n`-dimensional indices.

  This operation is similar to `gather`, but it uses the innermost dimension
  of `indices` to define a slice into `params`.  In particular, if:

  * `indices` has shape `[A1...AN, I]`
  * `params` has shape `[B1...BM]`

  Then:

  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
  * `result[a1...aN] = params[indices[a1...aN, :]]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BM]`.
    indices: A potentially ragged tensor with shape `[A1...AN, I]`.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.

  #### Examples:
    ```python
    >>> params = tf.ragged.constant_value(
    ...     [ [ ['000', '001'], ['010'              ]          ],
    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
    ...       [ [            ], ['210'              ]          ] ])

    >>> # Gather 2D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2], [0]])
    [ [ [            ], ['210'] ]
      [ ['000', '001'], ['010'] ] ]

    >>> # Gather 1D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
    [['210'], ['000', '001']]

    >>> # Gather scalars from a 3D tensor
    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
    ['001', '112']
    ```
  """
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.gather_nd(params, indices, name)

  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):

    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    indices_shape = indices.shape
    indices_ndims = indices_shape.ndims
    if indices_ndims is None:
      raise ValueError('indices.rank must be statically known.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')
    if (ragged_tensor.is_ragged(indices) and
        indices_ndims == indices.ragged_rank + 1):
      raise ValueError('The innermost dimension of indices may not be ragged')

    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
    # that each index slices into.
    index_size = tensor_shape.dimension_value(indices_shape[-1])
    if index_size is None:
      raise ValueError('indices.shape[-1] must be statically known.')

    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
    # dense, then we convert it to ragged before recursing, and then convert
    # the result back to `dense` if appropriate.
    if indices_ndims > 2:
      indices_is_dense = not ragged_tensor.is_ragged(indices)
      if indices_is_dense:
        indices = ragged_conversion_ops.from_tensor(
            indices, ragged_rank=indices_ndims - 2)
      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
      if (indices_is_dense and ragged_tensor.is_ragged(result) and
          result.ragged_rank == indices_ndims - 2):
        result = ragged_conversion_ops.to_tensor(result)
      return result

    # indices_ndims <= 2, and the innermost dimension of indices may not be
    # ragged, so `indices` must not be ragged.
    assert not ragged_tensor.is_ragged(indices)
    assert ragged_tensor.is_ragged(params)

    # Handle corner case: An empty index tuple selects the entire `params`
    # value.  So if `index_size` is zero, then tile `params`.
    if index_size == 0:
      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
      for dim in range(indices_ndims - 1):
        params = expand_dims(params, axis=0)
      multiples = array_ops.concat([
          array_ops.shape(indices)[:-1],
          array_ops.ones([params_ndims], dtypes.int32)
      ],
                                   axis=0)
      return tile(params, multiples)

    # When index_size=1, we can just flatten the index tuples and use gather.
    elif index_size == 1:
      flattened_index_tuples = array_ops.reshape(indices, [-1])
      return gather(params, flattened_index_tuples)

    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
    # Flatten both the index tuples and the params, such that the flattened
    # index tuples point to the correct values in the flattened params; and
    # then use ragged.gather on the flattened index tuples & params.
    else:
      indices = math_ops.to_int64(indices)

      # Flatten the outermost 2 dimensions of the index tuples & params.
      flattened_index_tuples = array_ops.gather(params.row_splits,
                                                indices[..., 0])
      flattened_index_tuples += indices[..., 1]
      flattened_params = params.values

      # Flatten any remaining dimensions.
      for dim in range(2, index_size):
        if not ragged_tensor.is_ragged(flattened_params):
          flattened_index_tuples = array_ops.expand_dims(
              flattened_index_tuples, axis=1)
          flattened_index_tuples = array_ops.concat(
              [flattened_index_tuples, indices[..., dim:]], axis=1)
          return array_ops.gather_nd(flattened_params, flattened_index_tuples)

        flattened_index_tuples = array_ops.gather(
            flattened_params.row_starts(), flattened_index_tuples)
        flattened_index_tuples += indices[..., dim]
        flattened_params = flattened_params.values

      # Gather using the flattened index tuples and params.
      return gather(flattened_params, flattened_index_tuples)
Example #15
def batch_gather(params, indices, name=None):
  """Gathers slices from `params` according to `indices` with batch dims.

  This operation is similar to `gather`, but it assumes that the leading `N`
  dimensions of `indices` and `params` are batch dimensions, and performs a
  gather within each batch.  In particular, when using this operation with `N`
  batch dimensions `B1...BN`:

  * `indices` has shape `[B1...BN, I]`
  * `params` has shape `[B1...BN, P1...PM]`.
  * `result` has shape `[B1...BN, I, P2...PM]`.
  * `result[b1...bN, i, p2...pM] =
    params[b1...bN, indices[b1...bN, i], p2...pM]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
      `M>0`).
    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.

  #### Example:
    ```python
    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
    >>> ragged.batch_gather(params, indices)
    [['b', 'c', 'a'], [], [], ['e', 'e']]
    ```
  """
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.batch_gather(params, indices, name)

  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    indices_ndims = indices.shape.ndims
    if indices_ndims is None:
      raise ValueError(
          'batch_gather does not allow indices with unknown shape.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')

    if ragged_tensor.is_ragged(indices):
      # If the outermost ragged dimension is a batch dimension, recurse.
      if indices_ndims > 2:
        if not ragged_tensor.is_ragged(params):
          raise ValueError('batch shape from indices does '
                           'not match params shape')
        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
        with ops.control_dependencies(checks):
          return ragged_tensor.RaggedTensor.from_row_splits(
              batch_gather(params.values, indices.values), indices.row_splits)

      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
      else:
        # Ensure that `params` is ragged and has at least 2 dimensions.
        if not ragged_tensor.is_ragged(params):
          if params.shape.ndims is not None and params.shape.ndims < 2:
            raise ValueError('batch shape from indices does '
                             'not match params shape')
          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)

        # Adjust indices from within-batch to global (in params.values), and
        # then use ragged.gather to gather them.
        num_indices = indices.row_lengths()
        params_starts = params.row_starts()
        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
        return ragged_tensor.RaggedTensor.from_row_splits(
            gather(params.values, adjusted_index_values), indices.row_splits)

    else:  # params is a RaggedTensor and indices is a Tensor.
      if indices_ndims == 1:
        return gather(params, indices)
      elif indices_ndims == 2:
        # Adjust indices from batch-local to global (in params.values)
        adjustments = array_ops.expand_dims(params.row_starts(), 1)
        adjusted_indices = math_ops.to_int64(indices) + adjustments
        return gather(params.values, adjusted_indices)
      else:
        raise ValueError('batch shape from indices does not match params shape')
def unicode_encode(input, output_encoding, errors="replace",
                   replacement_char=65533, name=None):
  r"""Encodes each sequence of Unicode code points in `input` into a string.

  `result[i1...iN]` is the string formed by concatenating the Unicode
  codepoints `input[i1...iN, :]`, encoded using `output_encoding`.

  Args:
    input: An `N+1` dimensional potentially ragged integer tensor with
        shape `[D1...DN, num_chars]`.
    output_encoding: Unicode encoding that should be used to encode each
      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
    errors: Specifies the response when an invalid codepoint is encountered
      (optional). One of:
            * `'replace'`: Replace invalid codepoint with the
              `replacement_char`. (default)
            * `'ignore'`: Skip invalid codepoints.
            * `'strict'`: Raise an exception for any invalid codepoint.
    replacement_char: The replacement character codepoint to be used in place of
      any invalid input when `errors='replace'`. Any valid unicode codepoint may
      be used. The default value is the default unicode replacement character
      which is U+FFFD (65533).
    name: A name for the operation (optional).

  Returns:
    An `N` dimensional `string` tensor with shape `[D1...DN]`.

  #### Example:
    ```python
      >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
      >>> unicode_encode(input, 'UTF-8')
      ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
    ```
  """
  with ops.name_scope(name, "UnicodeEncode", [input]):
    input_tensor = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      if input_tensor.inner_values.shape.ndims > 1:
        # If the inner_values of our ragged tensor is multi-dimensional, we can
        # process it separately and our output will have the same nested splits
        # as our input.
        return input_tensor.with_inner_values(
            unicode_encode(input_tensor.inner_values, output_encoding, errors,
                           replacement_char))
      elif input_tensor.ragged_rank > 1:
        # Recursively process the values of the ragged tensor.
        return input_tensor.with_values(
            unicode_encode(input_tensor.values, output_encoding, errors,
                           replacement_char))
      else:
        # Our ragged tensor is of the correct shape (rank 1 inner_values tensor
        # with ragged_rank of 1) so we can process it as normal.
        return gen_string_ops.unicode_encode(
            input_values=input_tensor.values,
            input_splits=input_tensor.row_splits,
            output_encoding=output_encoding,
            errors=errors,
            replacement_char=replacement_char)
    else:
      if input_tensor.shape.ndims == 2:
        # The input tensor is of the correct 2-D shape, it's just not ragged.
        return unicode_encode(ragged_conversion_ops.from_tensor(input_tensor),
                              output_encoding, errors, replacement_char)
      elif input_tensor.shape.ndims > 2:
        # We need to initially flatten the input tensor to 2-D, and then can
        # reshape the output of our processed flattened tensor.
        flat_input_tensor = array_ops.reshape(
            input_tensor,
            array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
        flat_output_tensor = unicode_encode(flat_input_tensor, output_encoding,
                                            errors, replacement_char)
        return array_ops.reshape(flat_output_tensor, input_tensor.shape[:-1])
      elif input_tensor.shape.ndims == 0:
        raise ValueError("input_tensor's rank must be at least 1.")
      else:
        # Our input tensor is rank 1, so we create a ragged tensor with an added
        # dimension to create the correct input shape & type, and then remove
        # the additional dimension from the output and return the string scalar.
        ragged_input_tensor = ragged_factory_ops.from_row_splits(
            input_tensor,
            array_ops.stack([0, array_ops.shape(input_tensor,
                                                out_type=dtypes.int64)[0]]))
        output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
                                       errors, replacement_char)
        return array_ops.reshape(output_tensor, [])
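Since encoding inverts decoding, a round trip through the public `tf.strings` endpoints (a sketch) reproduces the original strings:

import tensorflow as tf

strings = tf.constant(["Göödnight", "😊"])
codepoints = tf.strings.unicode_decode(strings, "UTF-8")
roundtrip = tf.strings.unicode_encode(codepoints, "UTF-8")
# For valid UTF-8 input, `roundtrip` equals `strings`.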
def unicode_encode(input,
                   output_encoding,
                   errors="replace",
                   replacement_char=65533,
                   name=None):
    r"""Encodes each sequence of Unicode code points in `input` into a string.

  `result[i1...iN]` is the string formed by concatenating the Unicode
  codepoints `input[1...iN, :]`, encoded using `output_encoding`.

  Args:
    input: An `N+1` dimensional potentially ragged integer tensor with
        shape `[D1...DN, num_chars]`.
    output_encoding: Unicode encoding that should be used to encode each
      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
    errors: Specifies the response when an invalid codepoint is encountered
      (optional). One of:
            * `'replace'`: Replace invalid codepoint with the
              `replacement_char`. (default)
            * `'ignore'`: Skip invalid codepoints.
            * `'strict'`: Raise an exception for any invalid codepoint.
    replacement_char: The replacement character codepoint to be used in place of
      any invalid input when `errors='replace'`. Any valid unicode codepoint may
      be used. The default value is the default unicode replacement character
      which is 0xFFFD (U+65533).
    name: A name for the operation (optional).

  Returns:
    A `N` dimensional `string` tensor with shape `[D1...DN]`.

  #### Example:
    ```python
      >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
      >>> unicode_encode(input, 'UTF8')
      ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
    ```
  """
    with ops.name_scope(name, "UnicodeEncode", [input]):
        input_tensor = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
            input)
        if input_tensor.shape.ndims is None:
            raise ValueError("Rank of input_tensor must be statically known.")
        if ragged_tensor.is_ragged(input_tensor):
            if input_tensor.inner_values.shape.ndims > 1:
                # If the inner_values of our ragged tensor is multi-dimensional, we can
                # process it separately and our output will have the same nested splits
                # as our input.
                return input_tensor.with_inner_values(
                    unicode_encode(input_tensor.inner_values, output_encoding,
                                   errors, replacement_char))
            elif input_tensor.ragged_rank > 1:
                # Recursively process the values of the ragged tensor.
                return input_tensor.with_values(
                    unicode_encode(input_tensor.values, output_encoding,
                                   errors, replacement_char))
            else:
                # Our ragged tensor is of the correct shape (rank 1 inner_values tensor
                # with ragged_rank of 1) so we can process it as normal.
                return gen_string_ops.unicode_encode(
                    input_values=input_tensor.values,
                    input_splits=input_tensor.row_splits,
                    output_encoding=output_encoding,
                    errors=errors,
                    replacement_char=replacement_char)
        else:
            if input_tensor.shape.ndims == 2:
                # The input tensor is of the correct 2-D shape, it's just not ragged.
                return unicode_encode(
                    ragged_conversion_ops.from_tensor(input_tensor),
                    output_encoding, errors, replacement_char)
            elif input_tensor.shape.ndims > 2:
                # We need to initially flatten the input tensor to 2-D, and then can
                # reshape the output of our processed flattened tensor.
                flat_input_tensor = array_ops.reshape(
                    input_tensor,
                    array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
                flat_output_tensor = unicode_encode(flat_input_tensor,
                                                    output_encoding, errors,
                                                    replacement_char)
                return array_ops.reshape(flat_output_tensor,
                                         input_tensor.shape[:-1])
            elif input_tensor.shape.ndims == 0:
                raise ValueError("input_tensor's rank must be at least 1.")
            else:
                # Our input tensor is rank 1, so we create a ragged tensor with an added
                # dimension to create the correct input shape & type, and then remove
                # the additional dimension from the output and return the string scalar.
                ragged_input_tensor = ragged_factory_ops.from_row_splits(
                    input_tensor,
                    array_ops.stack([
                        0,
                        array_ops.shape(input_tensor, out_type=dtypes.int64)[0]
                    ]))
                output_tensor = unicode_encode(ragged_input_tensor,
                                               output_encoding, errors,
                                               replacement_char)
                return array_ops.reshape(output_tensor, [])
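

# A minimal usage sketch for the wrapper above, assuming the public
# `tf.strings.unicode_encode` export and TF 2.x eager execution (an
# illustration of the rank dispatch, not part of the implementation):
import tensorflow as tf

codepoints = tf.ragged.constant([[72, 101, 108, 108, 111],  # "Hello"
                                 [104, 105]])               # "hi"
print(tf.strings.unicode_encode(codepoints, "UTF-8"))
# tf.Tensor([b'Hello' b'hi'], shape=(2,), dtype=string)
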
def batch_gather(params, indices, name=None):
  """Gathers slices from `params` according to `indices` with batch dims.

  This operation is similar to `gather`, but it assumes that the leading `N`
  dimensions of `indices` and `params` are batch dimensions, and performs a
  gather within each batch.  In particular, when using this operation with `N`
  batch dimensions `B1...BN`:

  * `indices` has shape `[B1...BN, I]`
  * `params` has shape `[B1...BN, P1...PM]`.
  * `result` has shape `[B1...BN, I, P2...PM]`.
  * `result[b1...bN, i, p2...pM] =
    params[b1...bN, indices[b1...bN, i], p2...pM]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
      `M>0`).
    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.

  #### Example:
    ```python
    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
    >>> tf.batch_gather(params, indices)
    [['b', 'c', 'a'], [], [], ['e', 'e']]
    ```
  """
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.batch_gather(params, indices, name)

  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    indices_ndims = indices.shape.ndims
    if indices_ndims is None:
      raise ValueError(
          'batch_gather does not allow indices with unknown shape.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')

    if ragged_tensor.is_ragged(indices):
      # If the outermost ragged dimension is a batch dimension, recurse.
      if indices_ndims > 2:
        if not ragged_tensor.is_ragged(params):
          raise ValueError('batch shape from indices does '
                           'not match params shape')
        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
        with ops.control_dependencies(checks):
          return ragged_tensor.RaggedTensor.from_row_splits(
              batch_gather(params.values, indices.values), indices.row_splits)

      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
      else:
        # Ensure that `params` is ragged and has at least 2 dimensions.
        if not ragged_tensor.is_ragged(params):
          if params.shape.ndims is not None and params.shape.ndims < 2:
            raise ValueError('batch shape from indices does '
                             'not match params shape')
          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)

        # Adjust indices from within-batch to global (in params.values), and
        # then use ragged.gather to gather them.
        num_indices = indices.row_lengths()
        params_starts = params.row_starts()
        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
        return ragged_tensor.RaggedTensor.from_row_splits(
            ragged_gather_ops.gather(params.values, adjusted_index_values),
            indices.row_splits)

    else:  # params is a RaggedTensor and indices is a Tensor.
      if indices_ndims == 1:
        return ragged_gather_ops.gather(params, indices)
      elif indices_ndims == 2:
        # Adjust indices from batch-local to global (in params.values)
        adjustments = array_ops.expand_dims(params.row_starts(), 1)
        adjusted_indices = math_ops.to_int64(indices) + adjustments
        return ragged_gather_ops.gather(params.values, adjusted_indices)
      else:
        raise ValueError('batch shape from indices does not match params shape')
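

# The ragged path above works by shifting batch-local indices to global
# positions in `params.values`. A plain-NumPy sketch of that arithmetic,
# mirroring the docstring example (an illustration, not the TF code):
import numpy as np

params_values = np.array(['a', 'b', 'c', 'd', 'e'])
params_splits = np.array([0, 3, 4, 4, 5])        # rows: [a b c] [d] [] [e]
index_values = np.array([1, 2, 0, 0, 0])
index_splits = np.array([0, 3, 3, 3, 5])         # rows: [1 2 0] [] [] [0 0]

row_starts = params_splits[:-1]                  # first flat index of each row
adjustments = np.repeat(row_starts, np.diff(index_splits))
print(params_values[index_values + adjustments])
# ['b' 'c' 'a' 'e' 'e'], re-partitioned by index_splits -> [[b c a] [] [] [e e]]
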
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
  """Helper function to concatenate or stack ragged tensors.

  Args:
    rt_inputs: A list of RaggedTensors or Tensors to combine.
    axis: The axis along which to concatenate or stack.
    stack_values: A boolean -- if true, then stack values; otherwise,
      concatenate them.

  Returns:
    A RaggedTensor.

  Raises:
    ValueError: If rt_inputs is empty, or if axis is out of range.
  """
  # Validate parameters.
  if not rt_inputs:
    raise ValueError('rt_inputs may not be empty.')

  # Convert input tensors.
  rt_inputs = [
      ragged_tensor.convert_to_tensor_or_ragged_tensor(
          rt_input, name='rt_input') for rt_input in rt_inputs
  ]

  # Special case: if there's only one input, return it (adding a new dimension
  # at `axis` first if we're stacking).
  if len(rt_inputs) == 1:
    if stack_values:
      return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
    else:
      return rt_inputs[0]

  # Check the rank (number of dimensions) of the input tensors.
  ndims = None
  for rt in rt_inputs:
    if ndims is None:
      ndims = rt.shape.ndims
    else:
      rt.shape.assert_has_rank(ndims)

  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
  axis = ragged_util.get_positive_axis(axis, out_ndims)

  # If all the inputs are Tensors, and we're combining the final dimension,
  # then we can delegate to the tf.stack/tf.concat operation, and return a
  # Tensor.
  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
      if stack_values:
        return array_ops.stack(rt_inputs, axis)
      else:
        return array_ops.concat(rt_inputs, axis)

  # Convert any Tensor inputs to RaggedTensors.  This makes it
  # possible to concatenate Tensors and RaggedTensors together.
  for i in range(len(rt_inputs)):
    if not ragged_tensor.is_ragged(rt_inputs[i]):
      rt_inputs[i] = ragged_conversion_ops.from_tensor(
          rt_inputs[i], ragged_rank=1)

  # Convert the input tensors to all have the same ragged_rank.
  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]

  if axis == 0:
    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
  elif axis == 1:
    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
  else:  # axis > 1: recurse.
    values = [rt.values for rt in rt_inputs]
    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
      return ragged_tensor.RaggedTensor.from_row_splits(
          _ragged_stack_concat_helper(values, axis - 1, stack_values),
          splits[0][0])
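
# For axis=0, concatenation reduces to splits arithmetic: append the values,
# and shift each subsequent tensor's row_splits by the number of values already
# emitted. A NumPy sketch of that bookkeeping (an illustration of what
# `_ragged_stack_concat_axis_0` must compute, not its actual code):
import numpy as np

def concat_ragged_axis0(values_list, splits_list):
  out_splits, offset = [np.array([0])], 0
  for values, splits in zip(values_list, splits_list):
    out_splits.append(splits[1:] + offset)  # drop the leading 0, then shift
    offset += len(values)
  return np.concatenate(values_list), np.concatenate(out_splits)

# [[1, 2], [3]] ++ [[], [4, 5]]  ->  [[1, 2], [3], [], [4, 5]]
vals, splits = concat_ragged_axis0(
    [np.array([1, 2, 3]), np.array([4, 5])],
    [np.array([0, 2, 3]), np.array([0, 0, 2])])
print(vals, splits)  # [1 2 3 4 5] [0 2 3 3 5]
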
  def test_passing_simple_from_dense(self, input_list, squeeze_ranks=None):
    dt = constant_op.constant(input_list)
    rt = ragged_conversion_ops.from_tensor(dt)
    rt_s = ragged_squeeze_op.squeeze(rt, squeeze_ranks)
    dt_s = array_ops.squeeze(dt, squeeze_ranks)
    self.assertRaggedEqual(ragged_conversion_ops.to_tensor(rt_s), dt_s)
def gather_nd(params, indices, name=None):
  """Gather slices from `params` using `n`-dimensional indices.

  This operation is similar to `gather`, but it uses the innermost dimension
  of `indices` to define a slice into `params`.  In particular, if:

  * `indices` has shape `[A1...AN, I]`
  * `params` has shape `[B1...BM]`

  Then:

  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
  * `result[a1...aN] = params[indices[a1...aN, :]]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BM]`.
    indices: A potentially ragged tensor with shape `[A1...AN, I]`.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.

  #### Examples:
    ```python
    >>> params = tf.ragged.constant_value(
    ...     [ [ ['000', '001'], ['010'              ]          ],
    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
    ...       [ [            ], ['210'              ]          ] ])

    >>> # Gather 2D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2], [0]])
    [ [ [            ], ['210'] ]
      [ ['000', '001'], ['010'] ] ]

    >>> # Gather 1D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
    [['210'], ['000', '001']]

    >>> # Gather scalars from a 3D tensor
    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
    ['001', '112']
    ```
  """
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.gather_nd(params, indices, name)

  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):

    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    indices_shape = indices.shape
    indices_ndims = indices_shape.ndims
    if indices_ndims is None:
      raise ValueError('indices.rank must be statically known.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')
    if (ragged_tensor.is_ragged(indices) and
        indices_ndims == indices.ragged_rank + 1):
      raise ValueError('The innermost dimension of indices may not be ragged')

    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
    # that each index slices into.
    index_size = tensor_shape.dimension_value(indices_shape[-1])
    if index_size is None:
      raise ValueError('indices.shape[-1] must be statically known.')

    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
    # dense, then we convert it to ragged before recursing, and then convert
    # the result back to `dense` if appropriate.
    if indices_ndims > 2:
      indices_is_dense = not ragged_tensor.is_ragged(indices)
      if indices_is_dense:
        indices = ragged_conversion_ops.from_tensor(
            indices, ragged_rank=indices_ndims - 2)
      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
      if (indices_is_dense and ragged_tensor.is_ragged(result) and
          result.ragged_rank == indices_ndims - 2):
        result = ragged_conversion_ops.to_tensor(result)
      return result

    # indices_ndims <= 2, and the innermost dimension of indices may not be
    # ragged, so `indices` must not be ragged.
    assert not ragged_tensor.is_ragged(indices)
    assert ragged_tensor.is_ragged(params)

    # Handle corner case: An empty index tuple selects the entire `params`
    # value.  So if `index_size` is zero, then tile `params`.
    if index_size == 0:
      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
      for dim in range(indices_ndims - 1):
        params = ragged_array_ops.expand_dims(params, axis=0)
      multiples = array_ops.concat([
          array_ops.shape(indices)[:-1],
          array_ops.ones([params_ndims], dtypes.int32)
      ], axis=0)
      return ragged_array_ops.tile(params, multiples)

    # When index_size=1, we can just flatten the index tuples and use gather.
    elif index_size == 1:
      flattened_index_tuples = array_ops.reshape(indices, [-1])
      return gather(params, flattened_index_tuples)

    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
    # Flatten both the index tuples and the params, such that the flattened
    # index tuples point to the correct values in the flattened params; and
    # then use ragged.gather on the flattened index tuples & params.
    else:
      indices = math_ops.to_int64(indices)

      # Flatten the outermost 2 dimensions of the index tuples & params.
      flattened_index_tuples = array_ops.gather(params.row_splits,
                                                indices[..., 0])
      flattened_index_tuples += indices[..., 1]
      flattened_params = params.values

      # Flatten any remaining dimensions.
      for dim in range(2, index_size):
        if not ragged_tensor.is_ragged(flattened_params):
          flattened_index_tuples = array_ops.expand_dims(
              flattened_index_tuples, axis=1)
          flattened_index_tuples = array_ops.concat(
              [flattened_index_tuples, indices[..., dim:]], axis=1)
          return array_ops.gather_nd(flattened_params, flattened_index_tuples)

        flattened_index_tuples = array_ops.gather(
            flattened_params.row_starts(), flattened_index_tuples)
        flattened_index_tuples += indices[..., dim]
        flattened_params = flattened_params.values

      # Gather using the flattened index tuples and params.
      return gather(flattened_params, flattened_index_tuples)
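
# The index-flattening step above turns each 2-D index tuple [row, col] into a
# single position in `params.values` via `row_splits`. A NumPy sketch of that
# step (an illustration, not the TF code):
import numpy as np

values = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
row_splits = np.array([0, 2, 3, 6])   # ragged rows: [a b] [c] [d e f]

indices = np.array([[2, 1], [0, 0]])  # gather params[2][1] and params[0][0]
flat = row_splits[indices[:, 0]] + indices[:, 1]
print(values[flat])  # ['e' 'a']
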
  def tokenize_with_offsets(self, input, name=None):  # pylint: disable=redefined-builtin
    """Tokenizes a tensor of UTF-8 strings.

    Args:
      input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.
      name: The name argument that is passed to the op function.

    Returns:
      A tuple `(tokens, start_offsets, end_offsets)` where:

        * `tokens` is an N+1-dimensional UTF-8 string or integer `Tensor` or
          `RaggedTensor`.
        * `start_offsets` is an N+1-dimensional integer `Tensor` or
          `RaggedTensor` containing the starting indices of each token (byte
          indices for input strings).
        * `end_offsets` is an N+1-dimensional integer `Tensor` or
          `RaggedTensor` containing the exclusive ending indices of each token
          (byte indices for input strings).
    """
    with ops.name_scope(name, "SentenceTokenizer", [input, self]):
      input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
      if input_tensor.shape.ndims is None:
        raise ValueError("Rank of input_tensor must be statically known.")
      if ragged_tensor.is_ragged(input_tensor):
        # Recursively process the values of the ragged tensor.
        (tokens, starts, ends) = self.tokenize_with_offsets(
            input_tensor.flat_values)
        tokens = input_tensor.with_flat_values(tokens)
        starts = input_tensor.with_flat_values(starts)
        ends = input_tensor.with_flat_values(ends)
        return (tokens, starts, ends)
      else:
        if input_tensor.shape.ndims > 1:
          # Convert the input tensor to ragged and process it.
          return self.tokenize_with_offsets(
              ragged_conversion_ops.from_tensor(input_tensor))
        elif input_tensor.shape.ndims == 0:
          (tokens, starts, ends) = self.tokenize_with_offsets(
              array_ops.stack([input_tensor]))
          tokens = tokens.values
          starts = starts.values
          ends = ends.values
          return (tokens, starts, ends)
        else:
          # Our rank 1 tensor is the correct shape, so we can process it as
          # normal.
          (output_values, output_splits, output_offset_starts,
           output_offset_ends) = (
               gen_sentencepiece_tokenizer.
               sentencepiece_tokenize_with_offsets_op(
                   self._model_resource.resource_handle, input_tensor,
                   self.nbest_size, self.alpha, self.add_bos, self.add_eos,
                   self.reverse, self.enable_sampling, self.out_type))
          tokens = RaggedTensor.from_nested_row_splits(
              flat_values=output_values,
              nested_row_splits=[output_splits],
              validate=False)
          starts = RaggedTensor.from_nested_row_splits(
              flat_values=output_offset_starts,
              nested_row_splits=[output_splits],
              validate=False)
          ends = RaggedTensor.from_nested_row_splits(
              flat_values=output_offset_ends,
              nested_row_splits=[output_splits],
              validate=False)
          return (tokens, starts, ends)
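
# A usage sketch for the method above, assuming the `tensorflow_text` package
# and a serialized SentencePiece model proto at a hypothetical path
# "model.model":
import tensorflow_text as tf_text

with open("model.model", "rb") as f:
  sp_model = f.read()

tokenizer = tf_text.SentencepieceTokenizer(model=sp_model)
tokens, starts, ends = tokenizer.tokenize_with_offsets(["hello world"])
# `tokens` is ragged over the batch; `starts`/`ends` are byte offsets into
# each input string.
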
def find_source_offsets(offsets_map, input_offsets, name=None):
  """Maps the input post-normalized string offsets to pre-normalized offsets.

  Returns the source (i.e. pre-normalized) string offsets mapped from the input
  post-normalized string offsets using the input `offsets_map`, which is an
  output from the `normalize_utf8_with_offsets_map` op. `offsets_map` can be
  indexed or sliced along with `input_offsets`.

  Example usage:

  >>> post_normalized_str, offsets_map = normalize_utf8_with_offsets_map(
  ...     ["株式会社", "KADOKAWA"])
  >>> find_source_offsets(offsets_map, [[0, 1, 2], [0, 1, 2]])
  <tf.Tensor: shape=(2, 3), dtype=int64, numpy=array([[0, 1, 2], [0, 3, 6]])>
  >>> find_source_offsets(offsets_map[1], [[0, 1, 2]])  # indexed offsets_map
  <tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[0, 3, 6]])>

  Args:
    offsets_map: A `Tensor` or `RaggedTensor` of type `variant`, used to map the
      post-normalized string offsets to pre-normalized string offsets.
      offsets_map is an output from `normalize_utf8_with_offsets_map` function.
    input_offsets: A `Tensor` or `RaggedTensor` of type int64 representing the
      post-normalized string offsets.
    name: The name for this op (optional).

  Returns:
    results: A `Tensor` or `RaggedTensor` of type int64, with pre-normalized
      string offsets.
  """

  with ops.name_scope(name, "FindSourceOffsets", [offsets_map, input_offsets]):
    offsets_map_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        offsets_map, dtype=dtypes.variant)
    input_offsets_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input_offsets, dtype=dtypes.int64)

    if ragged_tensor.is_ragged(input_offsets_tensor):
      if ragged_tensor.is_ragged(offsets_map_tensor):
        offsets_map_values = offsets_map_tensor.flat_values
      else:
        offsets_map_values = array_ops.reshape(offsets_map_tensor, [-1])

      output_values = gen_normalize_ops.find_source_offsets(
          offsets_map=offsets_map_values,
          input_offsets_values=input_offsets_tensor.flat_values,
          input_offsets_splits=input_offsets_tensor.nested_row_splits[-1])
      return input_offsets_tensor.with_flat_values(output_values)
    else:
      if input_offsets_tensor.shape.ndims > 1:
        output_offsets = find_source_offsets(
            offsets_map,
            ragged_conversion_ops.from_tensor(
                input_offsets_tensor,
                ragged_rank=input_offsets_tensor.shape.ndims - 1))
        return ragged_conversion_ops.to_tensor(output_offsets)
      elif input_offsets_tensor.shape.ndims == 0:
        output_offsets = find_source_offsets(
            offsets_map, array_ops.expand_dims(input_offsets_tensor, 0))
        return output_offsets[0]
      else:
        output_offsets = find_source_offsets(
            offsets_map, array_ops.expand_dims(input_offsets_tensor, 0))
        return array_ops.squeeze(output_offsets, [0])
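
# Why the mapping is needed: normalization can change byte lengths. Under
# NFKC, "㍿" (one codepoint, 3 UTF-8 bytes) expands to "株式会社" (12 bytes),
# so post-normalization offsets no longer line up with the source string. A
# sketch assuming the `tensorflow_text` exports of these two ops:
import tensorflow_text as tf_text

normalized, offsets_map = tf_text.normalize_utf8_with_offsets_map(
    ["㍿x"], "NFKC")
print(tf_text.find_source_offsets(offsets_map, [[0, 12]]))
# Byte 12 in the normalized string (the "x") maps back to byte 3 in the
# source, so the expected result is [[0, 3]].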
def _broadcast_to_ragged_shape(rt_input, dst_shape,
                               broadcast_inner_dimensions):
  """Broadcasts rt_input to the ragged shape `dst_shape`."""
  # dst_shape's rank and ragged_rank must be greater than or equal to rt_input's
  if rt_input.shape.ndims is None or dst_shape.rank is None:
    raise ValueError('Unable to broadcast: unknown rank')
  if rt_input.shape.ndims > dst_shape.rank:
    raise ValueError('Incompatible with shape: rank mismatch')
  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
      rt_input.ragged_rank >= dst_shape.num_partitioned_dimensions):
    raise ValueError('Incompatible with shape: ragged rank mismatch')

  src_shape = RaggedTensorDynamicShape.from_tensor(rt_input)
  src_shape = src_shape.broadcast_to_rank(dst_shape.rank)

  # Add dimensions to rt_input so its rank and ragged_rank matches dst_shape.
  if dst_shape.rank > rt_input.shape.ndims:
    if rt_input.shape.ndims < dst_shape.num_inner_dimensions + 1:
      rt_input = array_ops.reshape(
          rt_input, array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0))
    for _ in range(dst_shape.rank - rt_input.shape.ndims):
      if ragged_tensor.is_ragged(rt_input):
        nrows = rt_input.nrows()
      else:
        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])

  # Add ragged dimensions to match dst_shape.
  if ragged_tensor.is_ragged(rt_input):
    inner_rank_diff = (
        rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
    if inner_rank_diff > 0:
      rt_input = rt_input.with_flat_values(
          ragged_conversion_ops.from_tensor(
              rt_input.flat_values, ragged_rank=inner_rank_diff))
  else:
    rt_input = ragged_conversion_ops.from_tensor(
        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)

  # Do broadcasting for any dimensions that will remain uniform.  We can do
  # these all at once, since they're independent of one another.
  multiples = [1] * dst_shape.rank
  for axis in range(dst_shape.num_partitioned_dimensions):
    if not src_shape.is_ragged(axis) and not dst_shape.is_ragged(axis):
      src_size = src_shape.dimension_size(axis)
      dst_size = dst_shape.dimension_size(axis)
      if ((tensor_util.constant_value(src_size) in (1, None)) and
          (tensor_util.constant_value(dst_size) != 1)):
        multiples[axis] = array_ops.where(
            math_ops.equal(src_size, 1), dst_size, 1)
  if not all(isinstance(v, int) and v == 1 for v in multiples):
    multiples = array_ops.stack(multiples, axis=0)
    rt_input = ragged_array_ops.tile(rt_input, multiples)

  if broadcast_inner_dimensions:
    rt_input = rt_input.with_flat_values(
        array_ops.reshape(
            rt_input.flat_values,
            array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0)))

  # Do broadcasting for dimensions that become ragged.  We must do these from
  # outermost to innermost.
  for axis in range(dst_shape.num_partitioned_dimensions):
    if not src_shape.is_ragged(axis) and dst_shape.is_ragged(axis):
      dst_size = dst_shape.dimension_size(axis)
      rt_input = _ragged_tile_axis(rt_input, axis, dst_size)

  return rt_input
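
# The final loop handles dimensions that become ragged: in effect, a
# length-aware repeat of each row's broadcast value. A NumPy sketch of
# broadcasting a uniform [3, 1] tensor against ragged row lengths [2, 0, 3]
# (an illustration, not the TF code):
import numpy as np

col = np.array([10, 20, 30])                 # one value per row
row_lengths = np.array([2, 0, 3])            # target ragged lengths
flat_values = np.repeat(col, row_lengths)    # [10 10 30 30 30]
row_splits = np.concatenate([[0], np.cumsum(row_lengths)])
print(flat_values, row_splits)               # rows: [10 10] [] [30 30 30]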