Example 1
    def testSplitV2(self, input, expected, input_is_ragged=False, **kwargs):  # pylint: disable=redefined-builtin
        # Check that we are matching the behavior of Python's str.split:
        self.assertEqual(expected, self._py_split(input, **kwargs))

        # Prepare the input tensor.
        if input_is_ragged:
            input = ragged_factory_ops.constant(input, dtype=dtypes.string)
        else:
            input = constant_op.constant(input, dtype=dtypes.string)

        # Check that the public version (which returns a RaggedTensor) works
        # correctly.
        expected_ragged = ragged_factory_ops.constant(
            expected, ragged_rank=input.shape.ndims)
        actual_ragged_v2 = ragged_string_ops.string_split_v2(input, **kwargs)
        actual_ragged_v2_input_kwarg = ragged_string_ops.string_split_v2(
            input=input, **kwargs)
        self.assertAllEqual(expected_ragged, actual_ragged_v2)
        self.assertAllEqual(expected_ragged, actual_ragged_v2_input_kwarg)

        # Check that the internal version (which returns a SparseTensor) works
        # correctly.  Note: the internal version only supports vector inputs.
        if input.shape.ndims == 1:
            expected_sparse = self.evaluate(expected_ragged.to_sparse())
            actual_sparse_v2 = string_ops.string_split_v2(input, **kwargs)
            self.assertEqual(expected_sparse.indices.tolist(),
                             self.evaluate(actual_sparse_v2.indices).tolist())
            self.assertEqual(expected_sparse.values.tolist(),
                             self.evaluate(actual_sparse_v2.values).tolist())
            self.assertEqual(
                expected_sparse.dense_shape.tolist(),
                self.evaluate(actual_sparse_v2.dense_shape).tolist())
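The helper `_py_split` referenced above is not shown on this page. A minimal sketch of what such a helper could look like (a hypothetical reconstruction, not the actual TensorFlow test code), recursing into nested lists and deferring to Python's str.split:

def _py_split(value, **kwargs):
    # Hypothetical reconstruction of the test's reference implementation.
    if isinstance(value, (str, bytes)):
        # Defer to Python's str.split, including its sep=None semantics.
        return value.split(**kwargs)
    # Recurse into nested lists so rank-N inputs are supported.
    return [_py_split(v, **kwargs) for v in value]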
Example 2
    def _preprocess(self, inputs):
        if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              self._strip_regex, "")
        elif self._standardize is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported standardization.")

        if self._split is SPLIT_ON_WHITESPACE:
            # If split isn't None, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            # This treats multiple whitespaces as one whitespace, and strips leading
            # and trailing whitespace.
            inputs = ragged_string_ops.string_split_v2(inputs)
        elif self._split is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported splitting.")

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
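The pipeline above (lowercase, strip punctuation, split on whitespace, build n-grams) can be reproduced with the public tf.strings API. A standalone sketch; the regex is a stand-in, since the layer's `_strip_regex` is not shown here:

import tensorflow as tf

def preprocess(texts, ngram_width=2):
    # Stand-in pattern; the layer's actual `_strip_regex` is not shown above.
    strip_regex = r"[^\w\s]"
    x = tf.strings.lower(texts)
    x = tf.strings.regex_replace(x, strip_regex, "")
    # Whitespace splitting collapses runs of whitespace and strips the ends.
    x = tf.strings.split(x)
    # Join adjacent tokens into n-grams along the innermost axis.
    return tf.strings.ngrams(x, ngram_width=ngram_width, separator=" ")

print(preprocess(tf.constant(["Hello, World!", "a b c"])))
# <tf.RaggedTensor [[b'hello world'], [b'a b', b'b c']]>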
Example 3
  def testSplitV2(self,
                  input,
                  expected,
                  input_is_ragged=False,
                  **kwargs):  # pylint: disable=redefined-builtin
    # Check that we are matching the behavior of Python's str.split:
    self.assertEqual(expected, self._py_split(input, **kwargs))

    # Prepare the input tensor.
    if input_is_ragged:
      input = ragged_factory_ops.constant(input, dtype=dtypes.string)
    else:
      input = constant_op.constant(input, dtype=dtypes.string)

    # Check that the public version (which returns a RaggedTensor) works
    # correctly.
    expected_ragged = ragged_factory_ops.constant(
        expected, ragged_rank=input.shape.ndims)
    actual_ragged_v1 = ragged_string_ops.strings_split_v1(
        input, result_type="RaggedTensor", **kwargs)
    actual_ragged_v1_input_kwarg = ragged_string_ops.strings_split_v1(
        input=input, result_type="RaggedTensor", **kwargs)
    actual_ragged_v1_source_kwarg = ragged_string_ops.strings_split_v1(
        source=input, result_type="RaggedTensor", **kwargs)
    actual_ragged_v2 = ragged_string_ops.string_split_v2(input, **kwargs)
    actual_ragged_v2_input_kwarg = ragged_string_ops.string_split_v2(
        input=input, **kwargs)
    self.assertRaggedEqual(expected_ragged, actual_ragged_v1)
    self.assertRaggedEqual(expected_ragged, actual_ragged_v1_input_kwarg)
    self.assertRaggedEqual(expected_ragged, actual_ragged_v1_source_kwarg)
    self.assertRaggedEqual(expected_ragged, actual_ragged_v2)
    self.assertRaggedEqual(expected_ragged, actual_ragged_v2_input_kwarg)

    # Check that the internal version (which returns a SparseTensor) works
    # correctly.  Note: the internal version only supports vector inputs.
    if input.shape.ndims == 1:
      expected_sparse = self.evaluate(expected_ragged.to_sparse())
      actual_sparse_v1 = ragged_string_ops.strings_split_v1(
          input, result_type="SparseTensor", **kwargs)
      actual_sparse_v2 = string_ops.string_split_v2(input, **kwargs)
      for actual_sparse in [actual_sparse_v1, actual_sparse_v2]:
        self.assertEqual(expected_sparse.indices.tolist(),
                         self.evaluate(actual_sparse.indices).tolist())
        self.assertEqual(expected_sparse.values.tolist(),
                         self.evaluate(actual_sparse.values).tolist())
        self.assertEqual(expected_sparse.dense_shape.tolist(),
                         self.evaluate(actual_sparse.dense_shape).tolist())
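The v1/v2 pairing in this test mirrors the public API: in TF 2.x, tf.strings.split always returns a RaggedTensor, while the compat v1 endpoint defaults to a SparseTensor and accepts a result_type argument. A quick sketch:

import tensorflow as tf

strings = tf.constant(["a b", "c"])
ragged = tf.strings.split(strings)  # <tf.RaggedTensor [[b'a', b'b'], [b'c']]>
sparse = tf.compat.v1.strings.split(strings, result_type="SparseTensor")
also_ragged = tf.compat.v1.strings.split(strings, result_type="RaggedTensor")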
Example 4
    def _preprocess(self, inputs):
        if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
            if ragged_tensor.is_ragged(inputs):
                lowercase_inputs = ragged_functional_ops.map_flat_values(
                    gen_string_ops.string_lower, inputs)
                # Depending on configuration, we may never touch the non-data tensor
                # in the ragged inputs tensor. If that is the case, and this is the
                # only layer in the keras model, running it will throw an error.
                # To get around this, we wrap the result in an identity.
                lowercase_inputs = array_ops.identity(lowercase_inputs)
            else:
                lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            if inputs.shape.ndims > 1:
                inputs = array_ops.squeeze(inputs, axis=-1)
            if self._split == SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting."
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
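The standardize/split options validated above are the layer's public constructor arguments. A minimal usage sketch, assuming the experimental Keras namespace from the era of this snippet (newer releases expose tf.keras.layers.TextVectorization directly):

import tensorflow as tf

TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization
layer = TextVectorization(
    standardize="lower_and_strip_punctuation",  # the default
    split="whitespace",                         # the default
    ngrams=2)
layer.adapt(["The quick brown fox", "jumped over the dog"])
print(layer(tf.constant([["The quick fox"]])))  # integer n-gram IDs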
Example 5
  def testSplitV2(self):
    strings = ["pigs on the wing", "animals"]

    tokens = string_ops.string_split_v2(strings)
    indices, values, shape = self.evaluate(tokens)
    self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
    self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
    self.assertAllEqual(shape, [2, 4])

    ragged_tokens = ragged_string_ops.string_split_v2(strings)
    self.assertAllEqual(ragged_tokens.row_splits, [0, 4, 5])
    self.assertAllEqual(ragged_tokens.values,
                        [b"pigs", b"on", b"the", b"wing", b"animals"])
Example 6
    def test_custom_string_splitting(self):
        input_array = np.array([["earth>wind>and fire"],
                                ["\tfire>and\nearth>michigan"]])
        expected_output = [[b"earth", b"wind", b"and fire"],
                           [b"\tfire", b"and\nearth", b"michigan"]]

        custom_split = lambda x: ragged_string_ops.string_split_v2(x, sep=">")
        input_data = keras.Input(shape=(1,), dtype=dtypes.string)
        layer = get_layer_class()(max_tokens=None,
                                  standardize=None,
                                  split=custom_split,
                                  ngrams=None,
                                  output_mode=None)
        int_data = layer(input_data)
        model = keras.Model(inputs=input_data, outputs=int_data)
        output_dataset = model.predict(input_array)
        self.assertAllEqual(expected_output, output_dataset)
Example 7
  def testSplitV2EmptySeparatorMaxSplit(self):
    # Match Python behavior:
    # >>> '1 2 3'.split(maxsplit=1)
    # ['1', '2 3']
    # >>> "  4  5    6  ".split(maxsplit=1)
    # ['4', '5    6  ']
    strings = ["1 2 3", "  4  5    6  "]

    tokens = string_ops.string_split_v2(strings, maxsplit=1)
    indices, values, shape = self.evaluate(tokens)
    self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
    self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
    self.assertAllEqual(shape, [2, 2])

    ragged_tokens = ragged_string_ops.string_split_v2(strings, maxsplit=1)
    self.assertAllEqual(ragged_tokens.row_splits, [0, 2, 4])
    self.assertAllEqual(ragged_tokens.values, [b"1", b"2 3", b"4", b"5    6  "])
Example 8
  def testSplitV2SimpleSeparatorMaxSplit(self):
    # Match Python behavior:
    # >>> '1,2,3'.split(',', maxsplit=1)
    # ['1', '2,3']
    # >>> '4,5,,6,'.split(',', maxsplit=1)
    # ['4', '5,,6,']
    strings = ["1,2,3", "4,5,,6,"]

    tokens = string_ops.string_split_v2(strings, sep=",", maxsplit=1)
    indices, values, shape = self.evaluate(tokens)
    self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
    self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
    self.assertAllEqual(shape, [2, 2])

    ragged_tokens = ragged_string_ops.string_split_v2(
        strings, sep=",", maxsplit=1)
    self.assertAllEqual(ragged_tokens.row_splits, [0, 2, 4])
    self.assertAllEqual(ragged_tokens.values, [b"1", b"2,3", b"4", b"5,,6,"])
Example 9
  def testSplitV2EmptySeparator(self):
    # Match Python behavior:
    # >>> '1 2 3'.split()
    # ['1', '2', '3']
    # >>> '   1   2   3   '.split()
    # ['1', '2', '3']
    strings = ["1 2 3", "  4  5    6  "]

    tokens = string_ops.string_split_v2(strings)
    indices, values, shape = self.evaluate(tokens)
    self.assertAllEqual(indices,
                        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]])
    self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
    self.assertAllEqual(shape, [2, 3])

    ragged_tokens = ragged_string_ops.string_split_v2(strings)
    self.assertAllEqual(ragged_tokens.row_splits, [0, 3, 6])
    self.assertAllEqual(ragged_tokens.values,
                        [b"1", b"2", b"3", b"4", b"5", b"6"])
Example 10
    def _preprocess(self, inputs):
        if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            if self._split is SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting."
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
Example 11
  def testSplitV2SimpleSeparator(self):
    # Match Python behavior:
    # >>> '1,2,3'.split(',')
    # ['1', '2', '3']
    # >>> '1,2,,3,'.split(',')
    # ['1', '2', '', '3', '']
    strings = ["1,2,3", "4,5,,6,"]

    tokens = string_ops.string_split_v2(strings, sep=",")
    indices, values, shape = self.evaluate(tokens)
    self.assertAllEqual(
        indices,
        [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
    self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"", b"6", b""])
    self.assertAllEqual(shape, [2, 5])

    ragged_tokens = ragged_string_ops.string_split_v2(strings, sep=",")
    self.assertAllEqual(ragged_tokens.row_splits, [0, 3, 8])
    self.assertAllEqual(ragged_tokens.values,
                        [b"1", b"2", b"3", b"4", b"5", b"", b"6", b""])
Example 12
    def testSplitV2MultiCharSeparator(self):
        # Match Python behavior:
        # >>> '1<>2<>3'.split('<>')
        # ['1', '2', '3']
        # >>> "<><>4<>5<><>6<>".split("<>")
        # ['', '', '4', '5', '', '6', '']
        strings = ["1<>2<>3", "<><>4<>5<><>6<>"]

        tokens = string_ops.string_split_v2(strings, sep="<>")
        indices, values, shape = self.evaluate(tokens)
        self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1],
                                      [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
        self.assertAllEqual(
            values, [b"1", b"2", b"3", b"", b"", b"4", b"5", b"", b"6", b""])
        self.assertAllEqual(shape, [2, 7])

        ragged_tokens = ragged_string_ops.string_split_v2(strings, sep="<>")
        self.assertAllEqual(ragged_tokens.row_splits, [0, 3, 10])
        self.assertAllEqual(
            ragged_tokens.values,
            [b"1", b"2", b"3", b"", b"", b"4", b"5", b"", b"6", b""])
Example 13
    def detokenize(self, token_ids):
        r"""Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.

    >>> import pathlib
    >>> pathlib.Path('/tmp/detok_vocab.txt').write_text(
    ...     'a b c ##a ##b ##c'.replace(' ', '\n'))
    >>> wordpiece = WordpieceTokenizer('/tmp/detok_vocab.txt')
    >>> token_ids = [[0, 4, 5, 2, 5, 5, 5]]
    >>> wordpiece.detokenize(token_ids)
    <tf.RaggedTensor [[b'abc', b'cccc']]>

    The word pieces are joined along the innermost axis to make words. So the
    result has the same rank as the input, but the innermost axis of the result
    indexes words instead of word pieces.

    The shape transformation is: `[..., wordpieces] => [..., words]`

    When the input shape is `[..., words, wordpieces]` (like the output of
    `WordpieceTokenizer.tokenize`) the result's shape is `[..., words, 1]`.
    The additional ragged axis can be removed using `words.merge_dims(-2, -1)`.

    Note: This method assumes wordpiece IDs are dense on the interval
    `[0, vocab_size)`.

    Args:
      token_ids: A `RaggedTensor` or `Tensor` with an int dtype. Must have
        `ndims >= 2`.

    Returns:
      A `RaggedTensor` with dtype `string` and the same rank as the input
      `token_ids`.
    """
        # If there are performance issues with this method or problems with lookup
        # tables using sparse IDs see the notes in b/177610044.
        vocab, ids = self._get_vocab_and_ids()
        token_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(token_ids)

        first_is_zero = math_ops.equal(ids[0], 0)
        steps = ids[1:] - ids[:-1]
        all_one_step = math_ops.reduce_all(math_ops.equal(steps, 1))

        check = control_flow_ops.Assert(
            first_is_zero & all_one_step,
            data=[('`detokenize` only works with vocabulary tables where the '
                   'indices are dense on the interval `[0, vocab_size)`')])
        with ops.control_dependencies([check]):
            token_ids = math_ops.minimum(
                token_ids,
                # Limit the OOV buckets to a single index.
                math_ops.cast(array_ops.size(vocab), token_ids.dtype))

        # Add the unknown token at that index.
        vocab = array_ops.concat([vocab, [self._unknown_token]], axis=0)

        # Lookup the text tokens and join them along the innermost axis.
        txt_tokens = array_ops.gather(vocab, token_ids)

        # Ensure the input is Ragged.
        if not isinstance(txt_tokens, RaggedTensor):
            txt_tokens = RaggedTensor.from_tensor(txt_tokens)

        # Join the tokens along the last axis.
        words = string_ops.reduce_join_v2(txt_tokens, axis=-1, separator=' ')

        # Collapse " ##" in all strings to make words.
        words = string_ops.regex_replace(
            words, ' ' + re.escape(self._suffix_indicator), '')

        # Strip leading and trailing spaces.
        words = string_ops.regex_replace(words, '^ +| +$', '')

        # Split on spaces so the last axis is "words".
        words = ragged_string_ops.string_split_v2(words, sep=' ')
        return words
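The heart of this detokenizer is the join-then-collapse trick: join pieces with spaces, delete " ##" so suffix pieces fuse with the preceding piece, then re-split on spaces. A standalone sketch of just that step, assuming the conventional "##" suffix indicator:

import tensorflow as tf

pieces = tf.ragged.constant([["a", "##b", "##c", "c", "##c", "##c", "##c"]])
joined = tf.strings.reduce_join(pieces, axis=-1, separator=" ")  # [b'a ##b ##c c ##c ##c ##c']
fused = tf.strings.regex_replace(joined, " ##", "")              # [b'abc cccc']
print(tf.strings.split(fused, sep=" "))  # <tf.RaggedTensor [[b'abc', b'cccc']]>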
Example 14
def custom_split_fn(x):
    return ragged_string_ops.string_split_v2(x, sep=">")