def string_split(source, delimiter=" "): # pylint: disable=invalid-name """Split elements of `source` based on `delimiter` into a `SparseTensor`. Let N be the size of source (typically N will be the batch size). Split each element of `source` based on `delimiter` and return a `SparseTensor` containing the splitted tokens. Empty tokens are ignored. If `delimiter` is an empty string, each element of the `source` is split into individual strings, each containing one byte. (This includes splitting multibyte sequences of UTF-8.) For example: N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the output will be st.indices = [0, 0; 0, 1; 1, 0; 1, 1; 1, 2] st.shape = [2, 3] st.values = ['hello', 'world', 'a', 'b', 'c'] Args: source: `1-D` string `Output`, the strings to split. delimiter: `0-D` string `Output`, the delimiter character, the string should be length 0 or 1. Returns: A `SparseTensor` of rank `2`, the strings split according to the delimiter. The first column of the indices corresponds to the row in `source` and the second column corresponds to the index of the split component in this row. Raises: ValueError: If delimiter is not a single-byte character. """ if isinstance(delimiter, six.string_types) and len(delimiter) > 1: raise ValueError("delimiter must be a single byte-character, got %s" % delimiter) delimiter = ops.convert_to_tensor(delimiter, dtype=dtypes.string) source = ops.convert_to_tensor(source, dtype=dtypes.string) # pylint: disable=protected-access indices, values, shape = gen_string_ops._string_split(source, delimiter=delimiter) # pylint: enable=protected-access indices.set_shape([None, 2]) values.set_shape([None]) shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape)
def string_split(source, delimiter=" "): # pylint: disable=invalid-name """Split elements of `source` based on `delimiter` into a `SparseTensor`. Let N be the size of source (typically N will be the batch size). Split each element of `source` based on `delimiter` and return a `SparseTensor` containing the splitted tokens. Empty tokens are ignored. If `delimiter` is an empty string, each element of the `source` is split into individual strings, each containing one byte. (This includes splitting multibyte sequences of UTF-8.) For example: N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the output will be st.indices = [0, 0; 0, 1; 1, 0; 1, 1; 1, 2] st.shape = [2, 3] st.values = ['hello', 'world', 'a', 'b', 'c'] Args: source: `1-D` string `Tensor`, the strings to split. delimiter: `0-D` string `Tensor`, the delimiter character, the string should be length 0 or 1. Returns: A `SparseTensor` of rank `2`, the strings split according to the delimiter. The first column of the indices corresponds to the row in `source` and the second column corresponds to the index of the split component in this row. Raises: ValueError: If delimiter is not a single-byte character. """ if isinstance(delimiter, six.string_types) and len(delimiter) > 1: raise ValueError("delimiter must be a single byte-character, got %s" % delimiter) delimiter = ops.convert_to_tensor(delimiter, dtype=dtypes.string) source = ops.convert_to_tensor(source, dtype=dtypes.string) # pylint: disable=protected-access indices, values, shape = gen_string_ops._string_split( source, delimiter=delimiter) # pylint: enable=protected-access indices.set_shape([None, 2]) values.set_shape([None]) shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape)
def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=invalid-name """Split elements of `source` based on `delimiter` into a `SparseTensor`. Let N be the size of source (typically N will be the batch size). Split each element of `source` based on `delimiter` and return a `SparseTensor` containing the split tokens. Empty tokens are ignored. If `delimiter` is an empty string, each element of the `source` is split into individual strings, each containing one byte. (This includes splitting multibyte sequences of UTF-8.) If delimiter contains multiple bytes, it is treated as a set of delimiters with each considered a potential split point. For example: N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the output will be st.indices = [0, 0; 0, 1; 1, 0; 1, 1; 1, 2] st.shape = [2, 3] st.values = ['hello', 'world', 'a', 'b', 'c'] Args: source: `1-D` string `Tensor`, the strings to split. delimiter: `0-D` string `Tensor`, the delimiter character, the string should be length 0 or 1. skip_empty: A `bool`. If `True`, skip the empty strings from the result. Raises: ValueError: If delimiter is not a string. Returns: A `SparseTensor` of rank `2`, the strings split according to the delimiter. The first column of the indices corresponds to the row in `source` and the second column corresponds to the index of the split component in this row. """ delimiter = ops.convert_to_tensor(delimiter, dtype=dtypes.string) source = ops.convert_to_tensor(source, dtype=dtypes.string) # pylint: disable=protected-access indices, values, shape = gen_string_ops._string_split( source, delimiter=delimiter, skip_empty=skip_empty) # pylint: enable=protected-access indices.set_shape([None, 2]) values.set_shape([None]) shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape)
def string_split(source, delimiter=" ", skip_empty=True): # pylint: disable=invalid-name """Split elements of `source` based on `delimiter` into a `SparseTensor`. Let N be the size of source (typically N will be the batch size). Split each element of `source` based on `delimiter` and return a `SparseTensor` containing the split tokens. Empty tokens are ignored. If `delimiter` is an empty string, each element of the `source` is split into individual strings, each containing one byte. (This includes splitting multibyte sequences of UTF-8.) If delimiter contains multiple bytes, it is treated as a set of delimiters with each considered a potential split point. For example: N = 2, source[0] is 'hello world' and source[1] is 'a b c', then the output will be st.indices = [0, 0; 0, 1; 1, 0; 1, 1; 1, 2] st.shape = [2, 3] st.values = ['hello', 'world', 'a', 'b', 'c'] Args: source: `1-D` string `Tensor`, the strings to split. delimiter: `0-D` string `Tensor`, the delimiter character, the string should be length 0 or 1. skip_empty: A `bool`. If `True`, skip the empty strings from the result. Raises: ValueError: If delimiter is not a string. Returns: A `SparseTensor` of rank `2`, the strings split according to the delimiter. The first column of the indices corresponds to the row in `source` and the second column corresponds to the index of the split component in this row. """ delimiter = ops.convert_to_tensor(delimiter, dtype=dtypes.string) source = ops.convert_to_tensor(source, dtype=dtypes.string) # pylint: disable=protected-access indices, values, shape = gen_string_ops._string_split( source, delimiter=delimiter, skip_empty=skip_empty) # pylint: enable=protected-access indices.set_shape([None, 2]) values.set_shape([None]) shape.set_shape([2]) return sparse_tensor.SparseTensor(indices, values, shape)