def string_split(source, sep=None, skip_empty=True, delimiter=None):  # pylint: disable=invalid-name
  """Split each string in `source` on a delimiter, yielding a `SparseTensor`.

  Let N be the size of `source` (typically the batch size). Every element of
  `source` is split on the delimiter and the resulting tokens are returned in
  a `SparseTensor`. Empty tokens are dropped when `skip_empty` is `True`.

  If `sep` is an empty string, each element of `source` is split into
  individual strings, each containing one byte. (This includes splitting
  multibyte sequences of UTF-8.) If the delimiter contains multiple bytes, it
  is treated as a set of delimiters with each considered a potential split
  point.

  For example:
    N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the
    output will be

    st.indices = [0, 0;
                  0, 1;
                  1, 0;
                  1, 1;
                  1, 2]
    st.shape = [2, 3]
    st.values = ['hello', 'world', 'a', 'b', 'c']

  Args:
    source: `1-D` string `Tensor`, the strings to split.
    sep: `0-D` string `Tensor`, the delimiter; see above for how empty and
      multi-byte delimiters are handled. Default is ' '.
    skip_empty: A `bool`. If `True`, skip the empty strings from the result.
    delimiter: deprecated alias for `sep`.

  Raises:
    ValueError: If delimiter is not a string.

  Returns:
    A `SparseTensor` of rank `2`, the strings split according to the
    delimiter. The first column of the indices corresponds to the row in
    `source` and the second column corresponds to the index of the split
    component in this row.
  """
  # Resolve `sep` against its deprecated alias `delimiter`.
  sep_value = deprecation.deprecated_argument_lookup(
      "sep", sep, "delimiter", delimiter)

  # Fall back to a single space when the lookup yields no delimiter.
  sep_tensor = ops.convert_to_tensor(
      " " if sep_value is None else sep_value, dtype=dtypes.string)
  source_tensor = ops.convert_to_tensor(source, dtype=dtypes.string)

  token_indices, tokens, dense_shape = gen_string_ops.string_split(
      source_tensor, delimiter=sep_tensor, skip_empty=skip_empty)

  # Record the statically known parts of the output shapes.
  token_indices.set_shape([None, 2])
  tokens.set_shape([None])
  dense_shape.set_shape([2])

  return sparse_tensor.SparseTensor(token_indices, tokens, dense_shape)
def string_split(source, sep=None, skip_empty=True, delimiter=None):  # pylint: disable=invalid-name
  """Split each string in `source` on a delimiter, yielding a `SparseTensor`.

  Let N be the size of `source` (typically the batch size). Every element of
  `source` is split on the delimiter and the resulting tokens are returned in
  a `SparseTensor`. Empty tokens are dropped when `skip_empty` is `True`.

  If `sep` is an empty string, each element of `source` is split into
  individual strings, each containing one byte. (This includes splitting
  multibyte sequences of UTF-8.) If the delimiter contains multiple bytes, it
  is treated as a set of delimiters with each considered a potential split
  point.

  For example:
    N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the
    output will be

    st.indices = [0, 0;
                  0, 1;
                  1, 0;
                  1, 1;
                  1, 2]
    st.shape = [2, 3]
    st.values = ['hello', 'world', 'a', 'b', 'c']

  Args:
    source: `1-D` string `Tensor`, the strings to split.
    sep: `0-D` string `Tensor`, the delimiter; see above for how empty and
      multi-byte delimiters are handled. Default is ' '.
    skip_empty: A `bool`. If `True`, skip the empty strings from the result.
    delimiter: deprecated alias for `sep`.

  Raises:
    ValueError: If delimiter is not a string.

  Returns:
    A `SparseTensor` of rank `2`, the strings split according to the
    delimiter. The first column of the indices corresponds to the row in
    `source` and the second column corresponds to the index of the split
    component in this row.
  """
  # Resolve `sep` against its deprecated alias `delimiter`.
  sep_value = deprecation.deprecated_argument_lookup(
      "sep", sep, "delimiter", delimiter)

  # Fall back to a single space when the lookup yields no delimiter.
  sep_tensor = ops.convert_to_tensor(
      " " if sep_value is None else sep_value, dtype=dtypes.string)
  source_tensor = ops.convert_to_tensor(source, dtype=dtypes.string)

  token_indices, tokens, dense_shape = gen_string_ops.string_split(
      source_tensor, delimiter=sep_tensor, skip_empty=skip_empty)

  # Record the statically known parts of the output shapes.
  token_indices.set_shape([None, 2])
  tokens.set_shape([None])
  dense_shape.set_shape([2])

  return sparse_tensor.SparseTensor(token_indices, tokens, dense_shape)
def string_bytes_split(input, name=None):  # pylint: disable=redefined-builtin
  """Break every string in `input` apart into its individual bytes.

  Examples:

  ```python
  >>> tf.strings.bytes_split('hello')
  ['h', 'e', 'l', 'l', 'o']
  >>> tf.strings.bytes_split(['hello', '123'])
  <RaggedTensor [['h', 'e', 'l', 'l', 'o'], ['1', '2', '3']]>
  ```

  Note that this op splits strings into bytes, not unicode characters.  To
  split strings into unicode characters, use `tf.strings.unicode_split`.

  See also: `tf.io.decode_raw`, `tf.strings.split`, `tf.strings.unicode_split`.

  Args:
    input: A string `Tensor` or `RaggedTensor`: the strings to split.  Must
      have a statically known rank (`N`).
    name: A name for the operation (optional).

  Returns:
    A `RaggedTensor` of rank `N+1`: the bytes that make up the source strings.

  Raises:
    ValueError: If the rank of `input` is not statically known.
  """
  with ops.name_scope(name, "StringsByteSplit", [input]):
    strings = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input, name="input")

    # Ragged input: split the flat values and keep the existing row structure.
    if isinstance(strings, ragged_tensor.RaggedTensor):
      flat_bytes = string_bytes_split(strings.flat_values)
      return strings.with_flat_values(flat_bytes)

    ndims = strings.shape.ndims
    if ndims is None:
      raise ValueError("input must have a statically-known rank.")

    # Scalar: wrap into a one-element vector, split, then take the only row.
    if ndims == 0:
      as_vector = array_ops.stack([strings])
      return string_bytes_split(as_vector)[0]

    # Vector: run the split op with an empty delimiter and rebuild the rows
    # from the first column of the returned indices.
    if ndims == 1:
      split_indices, byte_values, dense_shape = gen_string_ops.string_split(
          strings, delimiter="", skip_empty=False)
      row_ids = split_indices[:, 0]
      row_count = dense_shape[0]
      return ragged_tensor.RaggedTensor.from_value_rowids(
          values=byte_values,
          value_rowids=row_ids,
          nrows=row_count,
          validate=False)

    # Any other rank: convert to a RaggedTensor and take the ragged path.
    as_ragged = ragged_tensor.RaggedTensor.from_tensor(strings)
    return string_bytes_split(as_ragged)
def string_bytes_split(input, name=None):  # pylint: disable=redefined-builtin
  """Break every string in `input` apart into its individual bytes.

  Examples:

  ```python
  >>> tf.strings.bytes_split('hello')
  ['h', 'e', 'l', 'l', 'o']
  >>> tf.strings.bytes_split(['hello', '123'])
  <RaggedTensor [['h', 'e', 'l', 'l', 'o'], ['1', '2', '3']]>
  ```

  Note that this op splits strings into bytes, not unicode characters.  To
  split strings into unicode characters, use `tf.strings.unicode_split`.

  See also: `tf.io.decode_raw`, `tf.strings.split`, `tf.strings.unicode_split`.

  Args:
    input: A string `Tensor` or `RaggedTensor`: the strings to split.  Must
      have a statically known rank (`N`).
    name: A name for the operation (optional).

  Returns:
    A `RaggedTensor` of rank `N+1`: the bytes that make up the source strings.

  Raises:
    ValueError: If the rank of `input` is not statically known.
  """
  with ops.name_scope(name, "StringsByteSplit", [input]):
    strings = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input, name="input")

    # Ragged input: split the flat values and keep the existing row structure.
    if isinstance(strings, ragged_tensor.RaggedTensor):
      flat_bytes = string_bytes_split(strings.flat_values)
      return strings.with_flat_values(flat_bytes)

    ndims = strings.shape.ndims
    if ndims is None:
      raise ValueError("input must have a statically-known rank.")

    # Scalar: wrap into a one-element vector, split, then take the only row.
    if ndims == 0:
      as_vector = array_ops.stack([strings])
      return string_bytes_split(as_vector)[0]

    # Vector: run the split op with an empty delimiter and rebuild the rows
    # from the first column of the returned indices.
    if ndims == 1:
      split_indices, byte_values, dense_shape = gen_string_ops.string_split(
          strings, delimiter="", skip_empty=False)
      row_ids = split_indices[:, 0]
      row_count = dense_shape[0]
      return ragged_tensor.RaggedTensor.from_value_rowids(
          values=byte_values,
          value_rowids=row_ids,
          nrows=row_count,
          validate=False)

    # Any other rank: convert to a RaggedTensor and take the ragged path.
    as_ragged = ragged_tensor.RaggedTensor.from_tensor(strings)
    return string_bytes_split(as_ragged)