コード例 #1
0
 def test_error(self,
                data,
                ngram_width,
                separator=" ",
                pad_values=None,
                padding_width=None,
                preserve_short_sequences=False,
                error=None,
                exception=ValueError):
     """Assert that ngrams() raises `exception` with a message matching `error`."""
     with self.assertRaisesRegex(exception, error):
         ragged_string_ops.ngrams(data,
                                  ngram_width,
                                  separator=separator,
                                  pad_values=pad_values,
                                  padding_width=padding_width,
                                  preserve_short_sequences=preserve_short_sequences)
コード例 #2
0
 def test_input_with_no_values(self):
     """A ragged input of three empty rows yields three empty ngram rows."""
     empty_rows = ragged_factory_ops.constant([[], [], []], dtype=dtypes.string)
     result = self.evaluate(ragged_string_ops.ngrams(empty_rows, (1, 2)))
     # All row splits are zero and the flat values tensor is empty.
     self.assertAllEqual([0, 0, 0, 0], result.row_splits)
     self.assertAllEqual(
         constant_op.constant([], dtype=dtypes.string), result.values)
コード例 #3
0
    def _preprocess(self, inputs):
        """Standardize, split, and ngram-expand `inputs`.

        Args:
            inputs: A string tensor to preprocess.

        Returns:
            The preprocessed tensor (ragged or dense, depending on the
            configured split/ngram options).

        Raises:
            RuntimeError: If `standardize` or `split` is set to an
                unsupported value.
        """
        # Compare with `==`, not `is`: identity checks against string
        # constants only hold when both sides reference the exact same
        # object, which Python does not guarantee for strings.
        if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              self._strip_regex, "")
        elif self._standardize is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported standardization.")

        if self._split == SPLIT_ON_WHITESPACE:
            # If split isn't None, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            # This treats multiple whitespaces as one whitespace, and strips leading
            # and trailing whitespace.
            inputs = ragged_string_ops.string_split_v2(inputs)
        elif self._split is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported splitting.")

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
コード例 #4
0
 def test_unpadded_ngrams(self):
   """Without padding, rows shorter than the ngram width produce no ngrams."""
   source = [[b"aa", b"bb", b"cc", b"dd"], [b"ee", b"ff"]]
   ngrams = ragged_string_ops.ngrams(
       ragged_factory_ops.constant(source), ngram_width=3, separator=b"|")
   self.assertAllEqual([[b"aa|bb|cc", b"bb|cc|dd"], []],
                       self.evaluate(ngrams))
コード例 #5
0
 def test_vector_input(self):
   """A rank-1 input with pad values produces a rank-1 ngram output."""
   tensor = ragged_factory_ops.constant([b"a", b"z"])
   ngrams = ragged_string_ops.ngrams(
       tensor, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
   expected = [b"LP|LP|a", b"LP|a|z", b"a|z|RP", b"z|RP|RP"]
   self.assertAllEqual(expected, self.evaluate(ngrams))
コード例 #6
0
 def test_ragged_inputs_with_multiple_ragged_dimensions_bigrams(self):
   """Bigrams are formed along the innermost axis of a deeply ragged input."""
   nested = [[[[b"aa", b"bb", b"cc", b"dd"]], [[b"ee", b"ff"]]]]
   op = ragged_string_ops.ngrams(
       ragged_factory_ops.constant(nested), ngram_width=2, separator=b"|")
   expected = [[[[b"aa|bb", b"bb|cc", b"cc|dd"]], [[b"ee|ff"]]]]
   self.assertAllEqual(expected, self.evaluate(op))
コード例 #7
0
 def test_tuple_multi_ngrams_inverted_order(self):
   """ngram widths are emitted in the order given, even when descending."""
   tensor = ragged_factory_ops.constant(
       [[b"aa", b"bb", b"cc", b"dd"], [b"ee", b"ff"]])
   op = ragged_string_ops.ngrams(tensor, ngram_width=(3, 2), separator=b"|")
   # Trigrams first (width 3 listed first), then bigrams, per row.
   expected = [[b"aa|bb|cc", b"bb|cc|dd", b"aa|bb", b"bb|cc", b"cc|dd"],
               [b"ee|ff"]]
   self.assertAllEqual(expected, self.evaluate(op))
コード例 #8
0
 def test_dense_input_with_multiple_ngrams(self):
   """Widths (1, 2, 3) concatenate unigrams, bigrams, and trigrams per row."""
   tensor = ragged_factory_ops.constant(
       [[b"a", b"b", b"c", b"d"], [b"e", b"f", b"g", b"h"]])
   op = ragged_string_ops.ngrams(tensor, ngram_width=(1, 2, 3), separator=b"|")
   expected = [
       [b"a", b"b", b"c", b"d", b"a|b", b"b|c", b"c|d", b"a|b|c", b"b|c|d"],
       [b"e", b"f", b"g", b"h", b"e|f", b"f|g", b"g|h", b"e|f|g", b"f|g|h"],
   ]
   self.assertAllEqual(expected, self.evaluate(op))
コード例 #9
0
 def test_explicit_multiply_padded_ngrams(self):
     """padding_width=2 adds exactly two pad tokens on each side of a row."""
     tensor = ragged_factory_ops.constant([[b"a"]])
     op = ragged_string_ops.ngrams(tensor,
                                   ngram_width=5,
                                   separator=b"|",
                                   pad_values=(b"LP", b"RP"),
                                   padding_width=2)
     self.assertAllEqual([[b"LP|LP|a|RP|RP"]], self.evaluate(op))
コード例 #10
0
 def test_single_padding_string(self):
     """A scalar pad value is used for both the left and the right side."""
     tensor = ragged_factory_ops.constant([[b"a"], [b"b", b"c", b"d"],
                                           [b"e", b"f"]])
     op = ragged_string_ops.ngrams(tensor,
                                   ngram_width=5,
                                   separator=b"|",
                                   pad_values=b"[PAD]",
                                   padding_width=1)
     # Rows shorter than width 5 (after one pad per side) produce nothing.
     expected = [[], [b"[PAD]|b|c|d|[PAD]"], []]
     self.assertAllEqual(expected, self.evaluate(op))
コード例 #11
0
 def test_input_list_input(self):
   """ngrams accepts a plain Python list, including empty-string tokens."""
   rows = [[b"a", b"z"], [b"b", b""], [b"e", b"f"]]
   op = ragged_string_ops.ngrams(
       rows, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
   expected = [
       [b"LP|LP|a", b"LP|a|z", b"a|z|RP", b"z|RP|RP"],
       [b"LP|LP|b", b"LP|b|", b"b||RP", b"|RP|RP"],
       [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"],
   ]
   self.assertAllEqual(expected, self.evaluate(op))
コード例 #12
0
 def test_singly_padded_multiple_ngrams(self):
     """Width-1 ngrams ignore padding; width-5 ngrams get one pad per side."""
     tensor = ragged_factory_ops.constant([[b"a"], [b"b", b"c", b"d"],
                                           [b"e", b"f"]])
     op = ragged_string_ops.ngrams(tensor,
                                   ngram_width=(1, 5),
                                   separator=b"|",
                                   pad_values=(b"LP", b"RP"),
                                   padding_width=1)
     expected = [[b"a"], [b"b", b"c", b"d", b"LP|b|c|d|RP"], [b"e", b"f"]]
     self.assertAllEqual(expected, self.evaluate(op))
コード例 #13
0
 def test_singly_padded_ngrams_with_preserve_short(self):
     """preserve_short_sequences keeps rows shorter than the ngram width."""
     tensor = ragged_factory_ops.constant([[b"a"], [b"b", b"c", b"d"],
                                           [b"e", b"f"]])
     op = ragged_string_ops.ngrams(tensor,
                                   ngram_width=5,
                                   separator=b"|",
                                   pad_values=(b"LP", b"RP"),
                                   padding_width=1,
                                   preserve_short_sequences=True)
     # Each short row is emitted as a single joined (padded) sequence.
     expected = [[b"LP|a|RP"], [b"LP|b|c|d|RP"], [b"LP|e|f|RP"]]
     self.assertAllEqual(expected, self.evaluate(op))
コード例 #14
0
 def test_dense_input_rank_3(self):
   """A dense rank-3 input yields a dense (ops.Tensor) ngram result."""
   tensor = constant_op.constant([[[b"a", b"z"], [b"b", b""]],
                                  [[b"b", b""], [b"e", b"f"]]])
   op = ragged_string_ops.ngrams(
       tensor, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
   self.assertIsInstance(op, ops.Tensor)
   expected = [[[b"LP|LP|a", b"LP|a|z", b"a|z|RP", b"z|RP|RP"],
                [b"LP|LP|b", b"LP|b|", b"b||RP", b"|RP|RP"]],
               [[b"LP|LP|b", b"LP|b|", b"b||RP", b"|RP|RP"],
                [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"]]]
   self.assertAllEqual(expected, self.evaluate(op))
コード例 #15
0
 def test_fully_padded_ngrams(self):
   """With pad values and no explicit padding_width, rows are fully padded."""
   tensor = ragged_factory_ops.constant([[b"a"], [b"b", b"c", b"d"],
                                         [b"e", b"f"]])
   op = ragged_string_ops.ngrams(
       tensor, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
   expected = [
       [b"LP|LP|a", b"LP|a|RP", b"a|RP|RP"],  # row 0
       [b"LP|LP|b", b"LP|b|c", b"b|c|d", b"c|d|RP", b"d|RP|RP"],  # row 1
       [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"],  # row 2
   ]
   self.assertAllEqual(expected, self.evaluate(op))
コード例 #16
0
    def _preprocess(self, inputs):
        """Standardize, split, and ngram-expand `inputs`.

        Args:
            inputs: A string tensor (dense or ragged) to preprocess.

        Returns:
            The preprocessed tensor (ragged or dense, depending on the
            configured split/ngram options).

        Raises:
            ValueError: If `standardize` or `split` is set to an unsupported
                value.
        """
        if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
            if ragged_tensor.is_ragged(inputs):
                lowered = ragged_functional_ops.map_flat_values(
                    gen_string_ops.string_lower, inputs)
                # Depending on configuration, we may never touch the non-data
                # tensor in the ragged inputs tensor. If that is the case, and
                # this is the only layer in the keras model, running it will
                # throw an error. To get around this, we wrap the result in an
                # identity.
                lowered = array_ops.identity(lowered)
            else:
                lowered = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowered, DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            message = ("%s is not a supported standardization. "
                       "TextVectorization supports the following options "
                       "for `standardize`: None, "
                       "'lower_and_strip_punctuation', or a "
                       "Callable.")
            raise ValueError(message % self._standardize)

        if self._split is not None:
            # Splitting requires the last axis to be of dimension 1 so it can
            # be squeezed out. Squeezing here, rather than after splitting, is
            # cheaper: squeezing a ragged tensor is more expensive.
            if inputs.shape.ndims > 1:
                inputs = array_ops.squeeze(inputs, axis=-1)
            if self._split == SPLIT_ON_WHITESPACE:
                # Collapses runs of whitespace into one and strips leading and
                # trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                message = ("%s is not a supported splitting."
                           "TextVectorization supports the following options "
                           "for `split`: None, 'whitespace', or a Callable.")
                raise ValueError(message % self._split)

        # 'inputs' may be ragged or dense at this point; strings.ngrams
        # accepts both.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(
                inputs, ngram_width=self._ngrams, separator=" ")

        return inputs
コード例 #17
0
 def test_ngram_padding_size_cap(self):
     """Padding is capped at ngram_width - 1, even for a larger padding_width."""
     tensor = ragged_factory_ops.constant([[b"a"], [b"b", b"c", b"d"],
                                           [b"e", b"f"]])
     op = ragged_string_ops.ngrams(tensor,
                                   ngram_width=3,
                                   separator=b"|",
                                   pad_values=(b"LP", b"RP"),
                                   padding_width=10)
     # Identical to the fully-padded output: only 2 (= 3 - 1) pads are used.
     expected = [
         [b"LP|LP|a", b"LP|a|RP", b"a|RP|RP"],  # row 0
         [b"LP|LP|b", b"LP|b|c", b"b|c|d", b"c|d|RP", b"d|RP|RP"],  # row 1
         [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"],  # row 2
     ]
     self.assertAllEqual(expected, self.evaluate(op))
コード例 #18
0
    def _preprocess(self, inputs):
        """Standardize, split, and ngram-expand `inputs`.

        Args:
            inputs: A string tensor to preprocess.

        Returns:
            The preprocessed tensor (ragged or dense, depending on the
            configured split/ngram options).

        Raises:
            ValueError: If `standardize` or `split` is set to an unsupported
                value.
        """
        # Compare with `==`, not `is`: identity checks against string
        # constants only hold when both sides reference the exact same
        # object, which Python does not guarantee for strings.
        if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            if self._split == SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting."
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
コード例 #19
0
 def f(v):
     """Return width-2 ngrams of `v`."""
     return ragged_string_ops.ngrams(v, ngram_width=2)