コード例 #1
0
def strings_split_v1(input=None, sep=None, maxsplit=-1,  # pylint: disable=redefined-builtin
                     result_type="SparseTensor", source=None, name=None):
  """Split elements of `input` based on `sep`.

  Let N be the size of `input` (typically N will be the batch size). Split each
  element of `input` based on `sep` and return a `SparseTensor` or
  `RaggedTensor` containing the split tokens.

  Examples:

  ```python
  >>> tf.strings.split(['hello world', 'a b c'])
  tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]],
                  values=['hello', 'world', 'a', 'b', 'c'],
                  dense_shape=[2, 3])

  >>> tf.strings.split(['hello world', 'a b c'], result_type="RaggedTensor")
  <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
  ```

  If `sep` is given, consecutive delimiters are not grouped together and are
  deemed to delimit empty strings. For example, `input` of `"1<>2<><>3"` and
  `sep` of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
  string, consecutive whitespace are regarded as a single separator, and the
  result will contain no empty strings at the start or end if the string has
  leading or trailing whitespace.

  Note that the above mentioned behavior matches python's str.split.

  Args:
    input: `1-D` string `Tensor`, the strings to split.
    sep: `0-D` string `Tensor`, the delimiter character.
    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
    result_type: The tensor type for the result: one of `"RaggedTensor"` or
      `"SparseTensor"`.
    source: alias for "input" argument.
    name: A name for the operation (optional).

  Raises:
    ValueError: If sep is not a string, or if `result_type` is neither
      `"RaggedTensor"` nor `"SparseTensor"`.

  Returns:
    If `result_type` is `"SparseTensor"`: a `SparseTensor` of rank `2`, the
    strings split according to the delimiter. The first column of the indices
    corresponds to the row in `source` and the second column corresponds to
    the index of the split component in this row.

    If `result_type` is `"RaggedTensor"`: a `RaggedTensor` built from the same
    values, with one row per element of `source`.
  """
  # Reject an unsupported result_type up front, before any op is created.
  if result_type not in ("SparseTensor", "RaggedTensor"):
    raise ValueError("result_type must be 'RaggedTensor' or 'SparseTensor'.")

  # `source` is the legacy spelling of `input`; resolve them to one value.
  source = deprecation.deprecated_argument_lookup(
      "input", input, "source", source)
  with ops.name_scope(name, "StringSplit", [source]):
    sparse_result = string_ops.string_split_v2(
        source, sep=sep, maxsplit=maxsplit)
    if result_type == "SparseTensor":
      return sparse_result
    # result_type == "RaggedTensor": the first index column of the sparse
    # result is the row each token belongs to.
    return ragged_tensor.RaggedTensor.from_value_rowids(
        values=sparse_result.values,
        value_rowids=sparse_result.indices[:, 0],
        nrows=sparse_result.dense_shape[0])
コード例 #2
0
def string_split_v2(input, sep=None, maxsplit=-1, name=None):  # pylint: disable=redefined-builtin
  """Splits every string in `input` on `sep`, producing a `RaggedTensor`.

  The result has one more dimension than `input`: the extra innermost
  dimension holds the tokens obtained from each individual string.

  Example:

  ```python
  >>> tf.strings.split('hello world')
  <Tensor ['hello', 'world']>
  >>> tf.strings.split(['hello world', 'a b c'])
  <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
  ```

  Splitting mirrors Python's `str.split`:

  * When `sep` is given, consecutive delimiters are not merged and are deemed
    to delimit empty strings; e.g. `input` of `"1<>2<><>3"` with `sep` of
    `"<>"` yields `["1", "2", "", "3"]`.
  * When `sep` is None or an empty string, runs of whitespace count as a
    single separator and no empty strings appear at the start or end of the
    result, even if the string has leading or trailing whitespace.

  Args:
    input: A string `Tensor` of rank `N`, the strings to split.  If
      `rank(input)` is not known statically, then it is assumed to be `1`.
    sep: `0-D` string `Tensor`, the delimiter string.
    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
    name: A name for the operation (optional).

  Raises:
    ValueError: If sep is not a string.

  Returns:
    A `RaggedTensor` of rank `N+1`, the strings split according to the
    delimiter.
  """
  with ops.name_scope(name, "StringSplit", [input]):
    strings = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input, dtype=dtypes.string, name="input")

    # Ragged input: split its flat values and keep the existing partitions.
    if isinstance(strings, ragged_tensor.RaggedTensor):
      split_flat_values = string_split_v2(strings.flat_values, sep, maxsplit)
      return strings.with_flat_values(split_flat_values)

    ndims = strings.shape.ndims

    # Scalar input: wrap it in a one-element vector, split, then unwrap.
    if ndims == 0:
      as_vector = array_ops.stack([strings])
      return string_split_v2(as_vector, sep, maxsplit)[0]

    # Rank >= 2: convert to a RaggedTensor and take the ragged path above.
    if ndims is not None and ndims != 1:
      as_ragged = ragged_tensor.RaggedTensor.from_tensor(strings)
      return string_split_v2(as_ragged, sep, maxsplit)

    # Vector input (or statically unknown rank, which is treated as a vector).
    tokens = string_ops.string_split_v2(strings, sep=sep, maxsplit=maxsplit)
    token_row_ids = tokens.indices[:, 0]
    num_rows = tokens.dense_shape[0]
    return ragged_tensor.RaggedTensor.from_value_rowids(
        values=tokens.values,
        value_rowids=token_row_ids,
        nrows=num_rows,
        validate=False)
コード例 #3
0
def string_split_v2(input, sep=None, maxsplit=-1, name=None):  # pylint: disable=redefined-builtin
  """Tokenizes each string of `input` around `sep` into a `RaggedTensor`.

  One ragged dimension is appended to the shape of `input`; it contains the
  tokens that each string was split into.

  Example:

  ```python
  >>> tf.strings.split('hello world')
  <Tensor ['hello', 'world']>
  >>> tf.strings.split(['hello world', 'a b c'])
  <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
  ```

  The behavior matches python's str.split. If `sep` is given, consecutive
  delimiters are not grouped together and are deemed to delimit empty
  strings: `input` of `"1<>2<><>3"` and `sep` of `"<>"` returns
  `["1", "2", "", "3"]`. If `sep` is None or an empty string, consecutive
  whitespace is regarded as a single separator, and the result contains no
  empty strings at the start or end when the string has leading or trailing
  whitespace.

  Args:
    input: A string `Tensor` of rank `N`, the strings to split.  If
      `rank(input)` is not known statically, then it is assumed to be `1`.
    sep: `0-D` string `Tensor`, the delimiter string.
    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
    name: A name for the operation (optional).

  Raises:
    ValueError: If sep is not a string.

  Returns:
    A `RaggedTensor` of rank `N+1`, the strings split according to the
    delimiter.
  """
  with ops.name_scope(name, "StringSplit", [input]):
    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input, dtype=dtypes.string, name="input")

    if isinstance(converted, ragged_tensor.RaggedTensor):
      # Recurse on the dense flat values; the row partitions are reused.
      return converted.with_flat_values(
          string_split_v2(converted.flat_values, sep, maxsplit))

    static_rank = converted.shape.ndims

    if static_rank == 0:
      # A scalar is split as a length-1 batch and the single row is returned.
      singleton_batch = array_ops.stack([converted])
      return string_split_v2(singleton_batch, sep, maxsplit)[0]

    if static_rank is None or static_rank == 1:
      # Base case: the underlying op yields a sparse result whose first index
      # column is the row each token came from.
      sparse_tokens = string_ops.string_split_v2(
          converted, sep=sep, maxsplit=maxsplit)
      return ragged_tensor.RaggedTensor.from_value_rowids(
          values=sparse_tokens.values,
          value_rowids=sparse_tokens.indices[:, 0],
          nrows=sparse_tokens.dense_shape[0],
          validate=False)

    # Higher-rank dense input is handled through the RaggedTensor branch.
    return string_split_v2(
        ragged_tensor.RaggedTensor.from_tensor(converted), sep, maxsplit)
コード例 #4
0
def strings_split_v1(input=None, sep=None, maxsplit=-1,  # pylint: disable=redefined-builtin
                     result_type="SparseTensor", source=None, name=None):
  """Split elements of `input` based on `sep`.

  Let N be the size of `input` (typically N will be the batch size). Split each
  element of `input` based on `sep` and return a `SparseTensor` or
  `RaggedTensor` containing the split tokens.

  Examples:

  >>> print(tf.compat.v1.strings.split(['hello world', 'a b c']))
  SparseTensor(indices=tf.Tensor( [[0 0] [0 1] [1 0] [1 1] [1 2]], ...),
               values=tf.Tensor([b'hello' b'world' b'a' b'b' b'c'], ...),
               dense_shape=tf.Tensor([2 3], shape=(2,), dtype=int64))

  >>> print(tf.compat.v1.strings.split(['hello world', 'a b c'],
  ...     result_type="RaggedTensor"))
  <tf.RaggedTensor [[b'hello', b'world'], [b'a', b'b', b'c']]>

  If `sep` is given, consecutive delimiters are not grouped together and are
  deemed to delimit empty strings. For example, `input` of `"1<>2<><>3"` and
  `sep` of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
  string, consecutive whitespace are regarded as a single separator, and the
  result will contain no empty strings at the start or end if the string has
  leading or trailing whitespace.

  Note that the above mentioned behavior matches python's str.split.

  Args:
    input: A string `Tensor` of rank `N`, the strings to split.  If
      `rank(input)` is not known statically, then it is assumed to be `1`.
    sep: `0-D` string `Tensor`, the delimiter character.
    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
    result_type: The tensor type for the result: one of `"RaggedTensor"` or
      `"SparseTensor"`.
    source: alias for "input" argument.
    name: A name for the operation (optional).

  Raises:
    ValueError: If sep is not a string, or if `result_type` is neither
      `"RaggedTensor"` nor `"SparseTensor"`.

  Returns:
    A `SparseTensor` or `RaggedTensor` of rank `N+1`, the strings split
    according to the delimiter.
  """
  # Reject an unsupported result_type up front, before any op is created.
  if result_type not in ("SparseTensor", "RaggedTensor"):
    raise ValueError("result_type must be 'RaggedTensor' or 'SparseTensor'.")

  # `source` is the legacy spelling of `input`; resolve them to one value.
  input = deprecation.deprecated_argument_lookup(
      "input", input, "source", source)
  with ops.name_scope(name, "StringSplit", [input]):
    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        input, dtype=dtypes.string, name="input")

    # Shortcut: for a vector input the underlying op already returns the
    # requested SparseTensor, so no ragged round trip is needed.
    if result_type == "SparseTensor" and input.shape.rank == 1:
      return string_ops.string_split_v2(input, sep=sep, maxsplit=maxsplit)

    ragged_result = string_split_v2(input, sep=sep, maxsplit=maxsplit)
    if result_type == "SparseTensor":
      return ragged_result.to_sparse()
    return ragged_result
コード例 #5
0
    def testSplitV2(self, input, expected, input_is_ragged=False, **kwargs):  # pylint: disable=redefined-builtin
        # Sanity check on the test case itself: `expected` must be what
        # Python's str.split produces for `input`.
        self.assertEqual(expected, self._py_split(input, **kwargs))

        # Build the input as either a RaggedTensor or a dense Tensor.
        make_constant = (ragged_factory_ops.constant
                         if input_is_ragged else constant_op.constant)
        strings = make_constant(input, dtype=dtypes.string)

        # Public op (returns a RaggedTensor): call it with `input` passed
        # both positionally and by keyword.
        expected_ragged = ragged_factory_ops.constant(
            expected, ragged_rank=strings.shape.ndims)
        ragged_outputs = (
            ragged_string_ops.string_split_v2(strings, **kwargs),
            ragged_string_ops.string_split_v2(input=strings, **kwargs),
        )
        for actual_ragged in ragged_outputs:
            self.assertAllEqual(expected_ragged, actual_ragged)

        # Internal op (returns a SparseTensor).  Note: the internal version
        # only supports vector inputs.
        if strings.shape.ndims != 1:
            return

        expected_sparse = self.evaluate(expected_ragged.to_sparse())
        actual_sparse_v2 = string_ops.string_split_v2(strings, **kwargs)
        for component in ("indices", "values", "dense_shape"):
            self.assertEqual(
                getattr(expected_sparse, component).tolist(),
                self.evaluate(getattr(actual_sparse_v2, component)).tolist())
コード例 #6
0
def strings_split_v1(source,
                     sep=None,
                     maxsplit=-1,
                     result_type="SparseTensor"):
    """Split elements of `source` based on `sep`.

    Let N be the size of source (typically N will be the batch size). Split
    each element of `source` based on `sep` and return a `SparseTensor` or
    `RaggedTensor` containing the split tokens.

    Examples:

    ```python
    >>> tf.strings.split(['hello world', 'a b c'])
    tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]],
                    values=['hello', 'world', 'a', 'b', 'c'],
                    dense_shape=[2, 3])

    >>> tf.strings.split(['hello world', 'a b c'], result_type="RaggedTensor")
    <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
    ```

    If `sep` is given, consecutive delimiters are not grouped together and are
    deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
    sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
    string, consecutive whitespace are regarded as a single separator, and the
    result will contain no empty strings at the start or end if the string has
    leading or trailing whitespace.

    Note that the above mentioned behavior matches python's str.split.

    Args:
      source: `1-D` string `Tensor`, the strings to split.
      sep: `0-D` string `Tensor`, the delimiter character.
      maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
      result_type: The tensor type for the result: one of `"RaggedTensor"` or
        `"SparseTensor"`.

    Raises:
      ValueError: If sep is not a string, or if `result_type` is neither
        `"RaggedTensor"` nor `"SparseTensor"`.

    Returns:
      If `result_type` is `"SparseTensor"`: a `SparseTensor` of rank `2`, the
      strings split according to the delimiter. The first column of the
      indices corresponds to the row in `source` and the second column
      corresponds to the index of the split component in this row.

      If `result_type` is `"RaggedTensor"`: a `RaggedTensor` built from the
      same values, with one row per element of `source`.
    """
    # Reject an unsupported result_type up front, before any op is created.
    if result_type not in ("SparseTensor", "RaggedTensor"):
        raise ValueError(
            "result_type must be 'RaggedTensor' or 'SparseTensor'.")

    sparse_result = string_ops.string_split_v2(source,
                                               sep=sep,
                                               maxsplit=maxsplit)
    if result_type == "SparseTensor":
        return sparse_result
    # result_type == "RaggedTensor": the first index column of the sparse
    # result is the row each token belongs to.
    return ragged_tensor.RaggedTensor.from_value_rowids(
        values=sparse_result.values,
        value_rowids=sparse_result.indices[:, 0],
        nrows=sparse_result.dense_shape[0])
コード例 #7
0
  def testSplitV2(self):
    # Whitespace splitting of a two-element batch with rows of different
    # lengths (4 tokens and 1 token).
    strings = ["pigs on the wing", "animals"]

    # The session handle is not bound: results are fetched through
    # `self.evaluate`, so the handle itself was never used.
    with self.cached_session():
      tokens = string_ops.string_split_v2(strings)
      indices, values, shape = self.evaluate(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
      self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
      self.assertAllEqual(shape, [2, 4])
コード例 #8
0
  def testSplitV2(self):
    # Whitespace splitting of a two-element batch, checked against both the
    # sparse op and the ragged wrapper.
    strings = ["pigs on the wing", "animals"]

    # Sparse op: one (row, column) index pair per token.
    sparse_tokens = string_ops.string_split_v2(strings)
    token_indices, token_values, dense_shape = self.evaluate(sparse_tokens)
    self.assertAllEqual(token_indices,
                        [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
    self.assertAllEqual(token_values,
                        [b"pigs", b"on", b"the", b"wing", b"animals"])
    self.assertAllEqual(dense_shape, [2, 4])

    # Ragged wrapper: same tokens, rows described by row_splits.
    ragged_result = ragged_string_ops.string_split_v2(strings)
    self.assertAllEqual(ragged_result.row_splits, [0, 4, 5])
    self.assertAllEqual(ragged_result.values,
                        [b"pigs", b"on", b"the", b"wing", b"animals"])
コード例 #9
0
    def testSplitV2EmptySeparatorMaxSplit(self):
        # Match Python behavior:
        # >>> '1 2 3'.split(maxsplit=1)
        # ['1', '2 3']
        # >>> "  4  5    6  ".split(maxsplit=1)
        # ['4', '5    6  ']
        strings = ["1 2 3", "  4  5    6  "]

        # Results are fetched through `self.evaluate` instead of `sess.run`,
        # so the test does not depend on the session handle.
        with self.cached_session():
            tokens = string_ops.string_split_v2(strings, maxsplit=1)
            indices, values, shape = self.evaluate(tokens)
            self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
            self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
            self.assertAllEqual(shape, [2, 2])
コード例 #10
0
    def testSplitV2SimpleSeparatorMaxSplit(self):
        # Match Python behavior:
        # >>> '1,2,3'.split(',', maxsplit=1)
        # ['1', '2,3']
        # >>> '4,5,,6,'.split(',', maxsplit=1)
        # ['4', '5,,6,']
        strings = ["1,2,3", "4,5,,6,"]

        # Results are fetched through `self.evaluate` instead of `sess.run`,
        # so the test does not depend on the session handle.
        with self.cached_session():
            tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
            indices, values, shape = self.evaluate(tokens)
            self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
            self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
            self.assertAllEqual(shape, [2, 2])
コード例 #11
0
  def testSplitV2EmptySeparatorMaxSplit(self):
    # Match Python behavior:
    # >>> '1 2 3'.split(maxsplit=1)
    # ['1', '2 3']
    # >>> "  4  5    6  ".split(maxsplit=1)
    # ['4', '5    6  ']
    strings = ["1 2 3", "  4  5    6  "]

    # The session handle is not bound: results are fetched through
    # `self.evaluate`, so the handle itself was never used.
    with self.cached_session():
      tokens = string_ops.string_split_v2(strings, maxsplit=1)
      indices, values, shape = self.evaluate(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1],
                                    [1, 0], [1, 1]])
      self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
      self.assertAllEqual(shape, [2, 2])
コード例 #12
0
  def testSplitV2SimpleSeparatorMaxSplit(self):
    # Match Python behavior:
    # >>> '1,2,3'.split(',', maxsplit=1)
    # ['1', '2,3']
    # >>> '4,5,,6,'.split(',', maxsplit=1)
    # ['4', '5,,6,']
    strings = ["1,2,3", "4,5,,6,"]

    # The session handle is not bound: results are fetched through
    # `self.evaluate`, so the handle itself was never used.
    with self.cached_session():
      tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
      indices, values, shape = self.evaluate(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1],
                                    [1, 0], [1, 1]])
      self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
      self.assertAllEqual(shape, [2, 2])
コード例 #13
0
  def testSplitV2EmptySeparator(self):
    # Match Python behavior:
    # >>> '1 2 3'.split()
    # ['1', '2', '3']
    # >>> '  4  5    6  '.split()
    # ['4', '5', '6']
    strings = ["1 2 3", "  4  5    6  "]

    # The session handle is not bound: results are fetched through
    # `self.evaluate`, so the handle itself was never used.
    with self.cached_session():
      tokens = string_ops.string_split_v2(strings)
      indices, values, shape = self.evaluate(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                    [1, 0], [1, 1], [1, 2]])
      self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
      self.assertAllEqual(shape, [2, 3])
コード例 #14
0
  def testSplitV2SimpleSeparator(self):
    # Match Python behavior:
    # >>> '1,2,3'.split(',')
    # ['1', '2', '3']
    # >>> '4,5,,6,'.split(',')
    # ['4', '5', '', '6', '']
    strings = ["1,2,3", "4,5,,6,"]

    # The session handle is not bound: results are fetched through
    # `self.evaluate`, so the handle itself was never used.
    with self.cached_session():
      tokens = string_ops.string_split_v2(strings, sep=',')
      indices, values, shape = self.evaluate(tokens)
      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
      self.assertAllEqual(values, [b"1", b"2", b"3",
                                   b"4", b"5", b"", b"6", b""])
      self.assertAllEqual(shape, [2, 5])
コード例 #15
0
  def testSplitV2(self,
                  input,
                  expected,
                  input_is_ragged=False,
                  **kwargs):  # pylint: disable=redefined-builtin
    # Sanity check on the test case itself: `expected` must be what Python's
    # str.split produces for `input`.
    self.assertEqual(expected, self._py_split(input, **kwargs))

    # Build the input as either a RaggedTensor or a dense Tensor.
    make_constant = (ragged_factory_ops.constant
                     if input_is_ragged else constant_op.constant)
    strings = make_constant(input, dtype=dtypes.string)

    # Public ops (returning a RaggedTensor): the v1 op is called with the
    # strings passed positionally, as `input=` and as `source=`; the v2 op
    # positionally and as `input=`.
    expected_ragged = ragged_factory_ops.constant(
        expected, ragged_rank=strings.shape.ndims)
    split_v1 = ragged_string_ops.strings_split_v1
    split_v2 = ragged_string_ops.string_split_v2
    ragged_outputs = (
        split_v1(strings, result_type="RaggedTensor", **kwargs),
        split_v1(input=strings, result_type="RaggedTensor", **kwargs),
        split_v1(source=strings, result_type="RaggedTensor", **kwargs),
        split_v2(strings, **kwargs),
        split_v2(input=strings, **kwargs),
    )
    for actual_ragged in ragged_outputs:
      self.assertRaggedEqual(expected_ragged, actual_ragged)

    # Sparse results.  Note: the internal version only supports vector inputs.
    if strings.shape.ndims != 1:
      return

    expected_sparse = self.evaluate(expected_ragged.to_sparse())
    sparse_outputs = (
        split_v1(strings, result_type="SparseTensor", **kwargs),
        string_ops.string_split_v2(strings, **kwargs),
    )
    for actual_sparse in sparse_outputs:
      for component in ("indices", "values", "dense_shape"):
        self.assertEqual(
            getattr(expected_sparse, component).tolist(),
            self.evaluate(getattr(actual_sparse, component)).tolist())
コード例 #16
0
  def testSplitV2MultiCharSeparator(self):
    # Match Python behavior:
    # >>> '1<>2<>3'.split('<>')
    # ['1', '2', '3']
    # >>> "<><>4<>5<><>6<>".split("<>")
    # ['', '', '4', '5', '', '6', '']
    strings = ["1<>2<>3", "<><>4<>5<><>6<>"]

    # The session handle is not bound: results are fetched through
    # `self.evaluate`, so the handle itself was never used.
    with self.cached_session():
      tokens = string_ops.string_split_v2(strings, sep="<>")
      indices, values, shape = self.evaluate(tokens)
      self.assertAllEqual(
          indices, [[0, 0], [0, 1], [0, 2],
                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
      self.assertAllEqual(values, [b"1", b"2", b"3",
                                   b"", b"", b"4", b"5", b"", b"6", b""])
      self.assertAllEqual(shape, [2, 7])
コード例 #17
0
  def testSplitV2EmptySeparatorMaxSplit(self):
    # Match Python behavior:
    # >>> '1 2 3'.split(maxsplit=1)
    # ['1', '2 3']
    # >>> "  4  5    6  ".split(maxsplit=1)
    # ['4', '5    6  ']
    strings = ["1 2 3", "  4  5    6  "]

    # Sparse op: one (row, column) index pair per token.
    sparse_tokens = string_ops.string_split_v2(strings, maxsplit=1)
    token_indices, token_values, dense_shape = self.evaluate(sparse_tokens)
    self.assertAllEqual(token_indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
    self.assertAllEqual(token_values, [b"1", b"2 3", b"4", b"5    6  "])
    self.assertAllEqual(dense_shape, [2, 2])

    # Ragged wrapper: same tokens, rows described by row_splits.
    ragged_result = ragged_string_ops.string_split_v2(strings, maxsplit=1)
    self.assertAllEqual(ragged_result.row_splits, [0, 2, 4])
    self.assertAllEqual(ragged_result.values,
                        [b"1", b"2 3", b"4", b"5    6  "])
コード例 #18
0
  def testSplitV2SimpleSeparatorMaxSplit(self):
    # Match Python behavior:
    # >>> '1,2,3'.split(',', maxsplit=1)
    # ['1', '2,3']
    # >>> '4,5,,6,'.split(',', maxsplit=1)
    # ['4', '5,,6,']
    strings = ["1,2,3", "4,5,,6,"]

    # Sparse op: one (row, column) index pair per token.
    sparse_tokens = string_ops.string_split_v2(strings, sep=",", maxsplit=1)
    token_indices, token_values, dense_shape = self.evaluate(sparse_tokens)
    self.assertAllEqual(token_indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
    self.assertAllEqual(token_values, [b"1", b"2,3", b"4", b"5,,6,"])
    self.assertAllEqual(dense_shape, [2, 2])

    # Ragged wrapper: same tokens, rows described by row_splits.
    ragged_result = ragged_string_ops.string_split_v2(
        strings, sep=",", maxsplit=1)
    self.assertAllEqual(ragged_result.row_splits, [0, 2, 4])
    self.assertAllEqual(ragged_result.values,
                        [b"1", b"2,3", b"4", b"5,,6,"])
コード例 #19
0
def string_split_v2(input, sep=None, maxsplit=-1, name=None):  # pylint: disable=redefined-builtin
    """Splits every string in `input` on `sep`, producing a `RaggedTensor`.

    Let N be the size of `input` (typically N will be the batch size). Row
    `i` of the result holds the tokens obtained from element `i` of `input`.

    Example:

    ```python
    >>> tf.strings.split(['hello world', 'a b c'])
    <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
    ```

    Splitting mirrors Python's `str.split`:

    * When `sep` is given, consecutive delimiters are not merged and are
      deemed to delimit empty strings; e.g. `input` of `"1<>2<><>3"` with
      `sep` of `"<>"` yields `["1", "2", "", "3"]`.
    * When `sep` is None or an empty string, runs of whitespace count as a
      single separator and no empty strings appear at the start or end of
      the result, even if the string has leading or trailing whitespace.

    Args:
      input: `1-D` string `Tensor`, the strings to split.
      sep: `0-D` string `Tensor`, the delimiter string.
      maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
      name: A name for the operation (optional).

    Raises:
      ValueError: If sep is not a string.

    Returns:
      A `RaggedTensor` of rank `2`: the strings split according to the
      delimiter.
    """
    with ops.name_scope(name, "StringSplit", [input]):
        tokens = string_ops.string_split_v2(
            input, sep=sep, maxsplit=maxsplit)
        # The first index column of the sparse result is the row of `input`
        # that each token came from.
        token_row_ids = tokens.indices[:, 0]
        num_rows = tokens.dense_shape[0]
        return ragged_tensor.RaggedTensor.from_value_rowids(
            values=tokens.values,
            value_rowids=token_row_ids,
            nrows=num_rows)
コード例 #20
0
def string_split_v2(input, sep=None, maxsplit=-1, name=None):  # pylint: disable=redefined-builtin
  """Tokenizes each string of `input` around `sep` into a `RaggedTensor`.

  Let N be the size of `input` (typically N will be the batch size). Row `i`
  of the result contains the tokens that element `i` of `input` was split
  into.

  Example:

  ```python
  >>> tf.strings.split(['hello world', 'a b c'])
  <tf.RaggedTensor [['hello', 'world'], ['a', 'b', 'c']]>
  ```

  The behavior matches python's str.split. If `sep` is given, consecutive
  delimiters are not grouped together and are deemed to delimit empty
  strings: `input` of `"1<>2<><>3"` and `sep` of `"<>"` returns
  `["1", "2", "", "3"]`. If `sep` is None or an empty string, consecutive
  whitespace is regarded as a single separator, and the result contains no
  empty strings at the start or end when the string has leading or trailing
  whitespace.

  Args:
    input: `1-D` string `Tensor`, the strings to split.
    sep: `0-D` string `Tensor`, the delimiter string.
    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
    name: A name for the operation (optional).

  Raises:
    ValueError: If sep is not a string.

  Returns:
    A `RaggedTensor` of rank `2`: the strings split according to the delimiter.
  """
  with ops.name_scope(name, "StringSplit", [input]):
    sparse_tokens = string_ops.string_split_v2(
        input, sep=sep, maxsplit=maxsplit)
    # Column 0 of the sparse indices identifies the source row of each token.
    row_ids = sparse_tokens.indices[:, 0]
    row_count = sparse_tokens.dense_shape[0]
    return ragged_tensor.RaggedTensor.from_value_rowids(
        values=sparse_tokens.values,
        value_rowids=row_ids,
        nrows=row_count)