def string_split(source, sep=None, skip_empty=True, delimiter=None):  # pylint: disable=invalid-name
    """Split each string in `source` on a delimiter into a `SparseTensor`.

    Let N be the number of elements in `source` (typically the batch size).
    Every element is split independently and the resulting tokens are packed
    into a single `SparseTensor`. Empty tokens are dropped when `skip_empty`
    is `True` (the default).

    If the delimiter is an empty string, each element of `source` is split
    into individual strings of one byte each (this also breaks apart
    multibyte UTF-8 sequences). If the delimiter contains multiple bytes, it
    is treated as a set of delimiters, each being a potential split point.

    For example, with N = 2, source[0] = 'hello world' and
    source[1] = 'a b c', the output will be

    st.indices = [0, 0;
                  0, 1;
                  1, 0;
                  1, 1;
                  1, 2]
    st.shape = [2, 3]
    st.values = ['hello', 'world', 'a', 'b', 'c']

    Args:
      source: `1-D` string `Tensor`, the strings to split.
      sep: `0-D` string `Tensor`, the delimiter. See above for how empty and
        multi-byte delimiters are interpreted. Default is ' '.
      skip_empty: A `bool`. If `True`, skip the empty strings from the result.
      delimiter: deprecated alias for `sep`.

    Raises:
      ValueError: If delimiter is not a string.

    Returns:
      A `SparseTensor` of rank `2`, the strings split according to the
      delimiter. The first column of the indices corresponds to the row in
      `source` and the second column corresponds to the index of the split
      component in this row.
    """
    # Reconcile `sep` with its deprecated alias `delimiter`.
    split_on = deprecation.deprecated_argument_lookup(
        "sep", sep, "delimiter", delimiter)
    if split_on is None:
        # Neither argument was supplied: fall back to a single space.
        split_on = " "

    delimiter_tensor = ops.convert_to_tensor(split_on, dtype=dtypes.string)
    source_tensor = ops.convert_to_tensor(source, dtype=dtypes.string)

    split_indices, split_values, dense_shape = gen_string_ops.string_split(
        source_tensor, delimiter=delimiter_tensor, skip_empty=skip_empty)

    # Record the statically known parts of each output's shape.
    split_indices.set_shape([None, 2])
    split_values.set_shape([None])
    dense_shape.set_shape([2])

    return sparse_tensor.SparseTensor(split_indices, split_values, dense_shape)
# Example #2
# 0
def string_split(source, sep=None, skip_empty=True, delimiter=None):  # pylint: disable=invalid-name
  """Tokenize every string in `source` and return the pieces as a `SparseTensor`.

  With N the size of `source` (typically the batch size), each of the N
  elements is split on the delimiter and all resulting tokens are collected
  into one `SparseTensor`. When `skip_empty` is `True` (the default), empty
  tokens are left out of the result.

  An empty delimiter splits each element into one-byte strings (which also
  breaks multibyte UTF-8 sequences apart). A delimiter containing multiple
  bytes is treated as a set of delimiters, each of which is a potential split
  point.

  For example:
  N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the output
  will be

  st.indices = [0, 0;
                0, 1;
                1, 0;
                1, 1;
                1, 2]
  st.shape = [2, 3]
  st.values = ['hello', 'world', 'a', 'b', 'c']

  Args:
    source: `1-D` string `Tensor`, the strings to split.
    sep: `0-D` string `Tensor`, the delimiter; empty and multi-byte values
      are interpreted as described above. Default is ' '.
    skip_empty: A `bool`. If `True`, skip the empty strings from the result.
    delimiter: deprecated alias for `sep`.

  Raises:
    ValueError: If delimiter is not a string.

  Returns:
    A `SparseTensor` of rank `2`, the strings split according to the delimiter.
    The first column of the indices corresponds to the row in `source` and the
    second column corresponds to the index of the split component in this row.
  """
  # `delimiter` is the legacy spelling of `sep`; pick whichever was given.
  resolved_sep = deprecation.deprecated_argument_lookup(
      "sep", sep, "delimiter", delimiter)
  # Default to splitting on a single space when no delimiter was supplied.
  resolved_sep = " " if resolved_sep is None else resolved_sep

  sep_t = ops.convert_to_tensor(resolved_sep, dtype=dtypes.string)
  source_t = ops.convert_to_tensor(source, dtype=dtypes.string)

  token_indices, tokens, result_shape = gen_string_ops.string_split(
      source_t,
      delimiter=sep_t,
      skip_empty=skip_empty)

  # Attach the static shape information known for each output:
  # indices is [?, 2], values is [?], and the dense shape has 2 entries.
  token_indices.set_shape([None, 2])
  tokens.set_shape([None])
  result_shape.set_shape([2])

  return sparse_tensor.SparseTensor(token_indices, tokens, result_shape)
def string_bytes_split(input, name=None):  # pylint: disable=redefined-builtin
    """Break every string in `input` apart into its individual bytes.

    Examples:

    ```python
    >>> tf.strings.bytes_split('hello')
    ['h', 'e', 'l', 'l', 'o']
    >>> tf.strings.bytes_split(['hello', '123'])
    <RaggedTensor [['h', 'e', 'l', 'l', 'o'], ['1', '2', '3']]>
    ```

    Note that this op splits strings into bytes, not unicode characters.  To
    split strings into unicode characters, use `tf.strings.unicode_split`.

    See also: `tf.io.decode_raw`, `tf.strings.split`,
    `tf.strings.unicode_split`.

    Args:
      input: A string `Tensor` or `RaggedTensor`: the strings to split.  Must
        have a statically known rank (`N`).
      name: A name for the operation (optional).

    Returns:
      A `RaggedTensor` of rank `N+1`: the bytes that make up the source
      strings.

    Raises:
      ValueError: If the rank of `input` is not statically known.
    """
    with ops.name_scope(name, "StringsByteSplit", [input]):
        strings = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            input, name="input")

        # Ragged input: split the flat values and reattach them to the
        # existing row structure.
        if isinstance(strings, ragged_tensor.RaggedTensor):
            split_flat_values = string_bytes_split(strings.flat_values)
            return strings.with_flat_values(split_flat_values)

        ndims = strings.shape.ndims
        if ndims is None:
            raise ValueError("input must have a statically-known rank.")

        if ndims == 0:
            # Scalar input: wrap it in a length-1 vector, split that, then
            # take the single resulting row.
            as_vector = array_ops.stack([strings])
            return string_bytes_split(as_vector)[0]

        if ndims != 1:
            # Higher-rank dense input: convert to a RaggedTensor and reuse
            # the ragged branch above.
            as_ragged = ragged_tensor.RaggedTensor.from_tensor(strings)
            return string_bytes_split(as_ragged)

        # Rank-1 input: split with an empty delimiter; the first column of
        # the returned indices is used as the row id of each value.
        indices, byte_values, dense_shape = gen_string_ops.string_split(
            strings, delimiter="", skip_empty=False)
        row_ids = indices[:, 0]
        row_count = dense_shape[0]
        return ragged_tensor.RaggedTensor.from_value_rowids(
            values=byte_values,
            value_rowids=row_ids,
            nrows=row_count,
            validate=False)
def string_bytes_split(input, name=None):  # pylint: disable=redefined-builtin
  """Split each string element of `input` into a sequence of single bytes.

  Examples:

  ```python
  >>> tf.strings.bytes_split('hello')
  ['h', 'e', 'l', 'l', 'o']
  >>> tf.strings.bytes_split(['hello', '123'])
  <RaggedTensor [['h', 'e', 'l', 'l', 'o'], ['1', '2', '3']]>
  ```

  Note that this op splits strings into bytes, not unicode characters.  To
  split strings into unicode characters, use `tf.strings.unicode_split`.

  See also: `tf.io.decode_raw`, `tf.strings.split`, `tf.strings.unicode_split`.

  Args:
    input: A string `Tensor` or `RaggedTensor`: the strings to split.  Must
      have a statically known rank (`N`).
    name: A name for the operation (optional).

  Returns:
    A `RaggedTensor` of rank `N+1`: the bytes that make up the source strings.

  Raises:
    ValueError: If the rank of `input` is not statically known.
  """
  with ops.name_scope(name, "StringsByteSplit", [input]):
    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input, name="input")

    # A RaggedTensor keeps its row partitioning; only its flat values are
    # split.
    if isinstance(converted, ragged_tensor.RaggedTensor):
      flat_bytes = string_bytes_split(converted.flat_values)
      return converted.with_flat_values(flat_bytes)

    static_rank = converted.shape.ndims
    if static_rank is None:
      raise ValueError("input must have a statically-known rank.")

    if static_rank == 0:
      # Promote the scalar to a one-element vector, split it, and return the
      # only row of the result.
      stacked = array_ops.stack([converted])
      return string_bytes_split(stacked)[0]

    if static_rank != 1:
      # Dense input of rank 2 or more is handled through the ragged path.
      ragged_input = ragged_tensor.RaggedTensor.from_tensor(converted)
      return string_bytes_split(ragged_input)

    # Vector input: an empty delimiter is passed to the split op, and the
    # row component of each output index becomes that value's row id.
    split_indices, split_values, split_shape = gen_string_ops.string_split(
        converted, delimiter="", skip_empty=False)
    value_rowids = split_indices[:, 0]
    nrows = split_shape[0]
    return ragged_tensor.RaggedTensor.from_value_rowids(
        values=split_values,
        value_rowids=value_rowids,
        nrows=nrows,
        validate=False)