def test_error(self,
               data,
               ngram_width,
               separator=" ",
               pad_values=None,
               padding_width=None,
               preserve_short_sequences=False,
               error=None,
               exception=ValueError):
  with self.assertRaisesRegex(exception, error):
    ragged_string_ops.ngrams(data, ngram_width, separator, pad_values,
                             padding_width, preserve_short_sequences)
def test_input_with_no_values(self):
  data = ragged_factory_ops.constant([[], [], []], dtype=dtypes.string)
  ngram_op = ragged_string_ops.ngrams(data, (1, 2))
  result = self.evaluate(ngram_op)
  self.assertAllEqual([0, 0, 0, 0], result.row_splits)
  self.assertAllEqual(
      constant_op.constant([], dtype=dtypes.string), result.values)
def _preprocess(self, inputs):
  if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
    lowercase_inputs = gen_string_ops.string_lower(inputs)
    inputs = string_ops.regex_replace(lowercase_inputs, self._strip_regex, "")
  elif self._standardize is not None:
    # TODO(momernick): Support callables here.
    raise RuntimeError("Not a supported standardization.")

  if self._split is SPLIT_ON_WHITESPACE:
    # If split isn't None, we validate that the 1st axis is of dimension 1 and
    # so can be squeezed out. We do this here instead of after splitting for
    # performance reasons - it's more expensive to squeeze a ragged tensor.
    inputs = array_ops.squeeze(inputs, axis=1)
    # This treats multiple whitespaces as one whitespace, and strips leading
    # and trailing whitespace.
    inputs = ragged_string_ops.string_split_v2(inputs)
  elif self._split is not None:
    # TODO(momernick): Support callables here.
    raise RuntimeError("Not a supported splitting.")

  # Note that 'inputs' here can be either ragged or dense depending on the
  # configuration choices for this Layer. The strings.ngrams op, however, does
  # support both ragged and dense inputs.
  if self._ngrams is not None:
    inputs = ragged_string_ops.ngrams(
        inputs, ngram_width=self._ngrams, separator=" ")

  return inputs
def test_unpadded_ngrams(self):
  data = [[b"aa", b"bb", b"cc", b"dd"], [b"ee", b"ff"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=3, separator=b"|")
  result = self.evaluate(ngram_op)
  expected_ngrams = [[b"aa|bb|cc", b"bb|cc|dd"], []]
  self.assertAllEqual(expected_ngrams, result)
def test_vector_input(self):
  data = [b"a", b"z"]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
  result = self.evaluate(ngram_op)
  expected_ngrams = [b"LP|LP|a", b"LP|a|z", b"a|z|RP", b"z|RP|RP"]
  self.assertAllEqual(expected_ngrams, result)
def test_ragged_inputs_with_multiple_ragged_dimensions_bigrams(self):
  data = [[[[b"aa", b"bb", b"cc", b"dd"]], [[b"ee", b"ff"]]]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=2, separator=b"|")
  result = self.evaluate(ngram_op)
  expected_ngrams = [[[[b"aa|bb", b"bb|cc", b"cc|dd"]], [[b"ee|ff"]]]]
  self.assertAllEqual(expected_ngrams, result)
def test_tuple_multi_ngrams_inverted_order(self):
  data = [[b"aa", b"bb", b"cc", b"dd"], [b"ee", b"ff"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=(3, 2), separator=b"|")
  result = self.evaluate(ngram_op)
  expected_ngrams = [[b"aa|bb|cc", b"bb|cc|dd", b"aa|bb", b"bb|cc", b"cc|dd"],
                     [b"ee|ff"]]
  self.assertAllEqual(expected_ngrams, result)
def test_dense_input_with_multiple_ngrams(self):
  data = [[b"a", b"b", b"c", b"d"], [b"e", b"f", b"g", b"h"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=(1, 2, 3), separator=b"|")
  result = self.evaluate(ngram_op)
  expected_ngrams = [[
      b"a", b"b", b"c", b"d", b"a|b", b"b|c", b"c|d", b"a|b|c", b"b|c|d"
  ], [b"e", b"f", b"g", b"h", b"e|f", b"f|g", b"g|h", b"e|f|g", b"f|g|h"]]
  self.assertAllEqual(expected_ngrams, result)
def test_explicit_multiply_padded_ngrams(self):
  data = [[b"a"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor,
      ngram_width=5,
      separator=b"|",
      pad_values=(b"LP", b"RP"),
      padding_width=2)
  result = self.evaluate(ngram_op)
  expected_ngrams = [[b"LP|LP|a|RP|RP"]]
  self.assertAllEqual(expected_ngrams, result)
def test_single_padding_string(self):
  data = [[b"a"], [b"b", b"c", b"d"], [b"e", b"f"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor,
      ngram_width=5,
      separator=b"|",
      pad_values=b"[PAD]",
      padding_width=1)
  result = self.evaluate(ngram_op)
  expected_ngrams = [[], [b"[PAD]|b|c|d|[PAD]"], []]
  self.assertAllEqual(expected_ngrams, result)
def test_input_list_input(self):
  data = [[b"a", b"z"], [b"b", b""], [b"e", b"f"]]
  ngram_op = ragged_string_ops.ngrams(
      data, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
  result = self.evaluate(ngram_op)
  expected_ngrams = [
      [b"LP|LP|a", b"LP|a|z", b"a|z|RP", b"z|RP|RP"],
      [b"LP|LP|b", b"LP|b|", b"b||RP", b"|RP|RP"],
      [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"],
  ]
  self.assertAllEqual(expected_ngrams, result)
def test_singly_padded_multiple_ngrams(self):
  data = [[b"a"], [b"b", b"c", b"d"], [b"e", b"f"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor,
      ngram_width=(1, 5),
      separator=b"|",
      pad_values=(b"LP", b"RP"),
      padding_width=1)
  result = self.evaluate(ngram_op)
  expected_ngrams = [[b"a"], [b"b", b"c", b"d", b"LP|b|c|d|RP"], [b"e", b"f"]]
  self.assertAllEqual(expected_ngrams, result)
def test_singly_padded_ngrams_with_preserve_short(self):
  data = [[b"a"], [b"b", b"c", b"d"], [b"e", b"f"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor,
      ngram_width=5,
      separator=b"|",
      pad_values=(b"LP", b"RP"),
      padding_width=1,
      preserve_short_sequences=True)
  result = self.evaluate(ngram_op)
  expected_ngrams = [[b"LP|a|RP"], [b"LP|b|c|d|RP"], [b"LP|e|f|RP"]]
  self.assertAllEqual(expected_ngrams, result)
def test_dense_input_rank_3(self):
  data = [[[b"a", b"z"], [b"b", b""]], [[b"b", b""], [b"e", b"f"]]]
  data_tensor = constant_op.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
  result = self.evaluate(ngram_op)
  expected_ngrams = [[[b"LP|LP|a", b"LP|a|z", b"a|z|RP", b"z|RP|RP"],
                      [b"LP|LP|b", b"LP|b|", b"b||RP", b"|RP|RP"]],
                     [[b"LP|LP|b", b"LP|b|", b"b||RP", b"|RP|RP"],
                      [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"]]]
  self.assertIsInstance(ngram_op, ops.Tensor)
  self.assertAllEqual(expected_ngrams, result)
def test_fully_padded_ngrams(self):
  data = [[b"a"], [b"b", b"c", b"d"], [b"e", b"f"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor, ngram_width=3, separator=b"|", pad_values=(b"LP", b"RP"))
  result = self.evaluate(ngram_op)
  expected_ngrams = [
      [b"LP|LP|a", b"LP|a|RP", b"a|RP|RP"],  # 0
      [b"LP|LP|b", b"LP|b|c", b"b|c|d", b"c|d|RP", b"d|RP|RP"],  # 1
      [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"]  # 2
  ]
  self.assertAllEqual(expected_ngrams, result)
def _preprocess(self, inputs):
  if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
    if ragged_tensor.is_ragged(inputs):
      lowercase_inputs = ragged_functional_ops.map_flat_values(
          gen_string_ops.string_lower, inputs)
      # Depending on configuration, we may never touch the non-data tensor
      # in the ragged inputs tensor. If that is the case, and this is the
      # only layer in the keras model, running it will throw an error.
      # To get around this, we wrap the result in an identity.
      lowercase_inputs = array_ops.identity(lowercase_inputs)
    else:
      lowercase_inputs = gen_string_ops.string_lower(inputs)
    inputs = string_ops.regex_replace(lowercase_inputs, DEFAULT_STRIP_REGEX,
                                      "")
  elif callable(self._standardize):
    inputs = self._standardize(inputs)
  elif self._standardize is not None:
    raise ValueError(("%s is not a supported standardization. "
                      "TextVectorization supports the following options "
                      "for `standardize`: None, "
                      "'lower_and_strip_punctuation', or a Callable.") %
                     self._standardize)

  if self._split is not None:
    # If we are splitting, we validate that the 1st axis is of dimension 1 and
    # so can be squeezed out. We do this here instead of after splitting for
    # performance reasons - it's more expensive to squeeze a ragged tensor.
    if inputs.shape.ndims > 1:
      inputs = array_ops.squeeze(inputs, axis=-1)
    if self._split == SPLIT_ON_WHITESPACE:
      # This treats multiple whitespaces as one whitespace, and strips leading
      # and trailing whitespace.
      inputs = ragged_string_ops.string_split_v2(inputs)
    elif callable(self._split):
      inputs = self._split(inputs)
    else:
      raise ValueError(("%s is not a supported splitting. "
                        "TextVectorization supports the following options "
                        "for `split`: None, 'whitespace', or a Callable.") %
                       self._split)

  # Note that 'inputs' here can be either ragged or dense depending on the
  # configuration choices for this Layer. The strings.ngrams op, however, does
  # support both ragged and dense inputs.
  if self._ngrams is not None:
    inputs = ragged_string_ops.ngrams(
        inputs, ngram_width=self._ngrams, separator=" ")

  return inputs
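# A minimal standalone sketch of the same standardize -> split -> ngrams
# pipeline, written with public TF string ops for readers who want to see the
# flow end to end. The r"[^\w\s]" strip pattern is an illustrative stand-in
# for DEFAULT_STRIP_REGEX, and the 1-D input batch sidesteps the [batch, 1]
# squeeze the layer performs; this is not the layer's own code.
import tensorflow as tf

def preprocess(batch):
  # Standardize: lowercase, then strip punctuation-like characters.
  x = tf.strings.lower(batch)
  x = tf.strings.regex_replace(x, r"[^\w\s]", "")
  # Split: whitespace splitting collapses runs of whitespace and trims ends,
  # matching string_split_v2's default behavior.
  x = tf.strings.split(x)
  # Ngrams: the op accepts both ragged and dense string tensors.
  return tf.strings.ngrams(x, ngram_width=2, separator=" ")

print(preprocess(tf.constant(["Hello, World!  Again"])))
# -> <tf.RaggedTensor [[b'hello world', b'world again']]>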
def test_ngram_padding_size_cap(self):
  # Validate that the padding size is never greater than ngram_size - 1.
  data = [[b"a"], [b"b", b"c", b"d"], [b"e", b"f"]]
  data_tensor = ragged_factory_ops.constant(data)
  ngram_op = ragged_string_ops.ngrams(
      data_tensor,
      ngram_width=3,
      separator=b"|",
      pad_values=(b"LP", b"RP"),
      padding_width=10)
  result = self.evaluate(ngram_op)
  expected_ngrams = [
      [b"LP|LP|a", b"LP|a|RP", b"a|RP|RP"],  # 0
      [b"LP|LP|b", b"LP|b|c", b"b|c|d", b"c|d|RP", b"d|RP|RP"],  # 1
      [b"LP|LP|e", b"LP|e|f", b"e|f|RP", b"f|RP|RP"]  # 2
  ]
  self.assertAllEqual(expected_ngrams, result)
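# A minimal sketch (using the public `tf.strings.ngrams` alias for the same
# op) of the cap the test above validates: padding_width is clamped to
# ngram_width - 1, so padding_width=10 and padding_width=2 are equivalent for
# trigrams. Illustrative only; not part of the test file.
import tensorflow as tf

data = tf.ragged.constant([["b", "c", "d"]])
wide = tf.strings.ngrams(
    data, 3, separator="|", pad_values=("LP", "RP"), padding_width=10)
capped = tf.strings.ngrams(
    data, 3, separator="|", pad_values=("LP", "RP"), padding_width=2)
# Both yield [[b"LP|LP|b", b"LP|b|c", b"b|c|d", b"c|d|RP", b"d|RP|RP"]].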
def _preprocess(self, inputs):
  if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
    lowercase_inputs = gen_string_ops.string_lower(inputs)
    inputs = string_ops.regex_replace(lowercase_inputs, DEFAULT_STRIP_REGEX,
                                      "")
  elif callable(self._standardize):
    inputs = self._standardize(inputs)
  elif self._standardize is not None:
    raise ValueError(("%s is not a supported standardization. "
                      "TextVectorization supports the following options "
                      "for `standardize`: None, "
                      "'lower_and_strip_punctuation', or a Callable.") %
                     self._standardize)

  if self._split is not None:
    # If we are splitting, we validate that the 1st axis is of dimension 1 and
    # so can be squeezed out. We do this here instead of after splitting for
    # performance reasons - it's more expensive to squeeze a ragged tensor.
    inputs = array_ops.squeeze(inputs, axis=1)
    if self._split is SPLIT_ON_WHITESPACE:
      # This treats multiple whitespaces as one whitespace, and strips leading
      # and trailing whitespace.
      inputs = ragged_string_ops.string_split_v2(inputs)
    elif callable(self._split):
      inputs = self._split(inputs)
    else:
      raise ValueError(("%s is not a supported splitting. "
                        "TextVectorization supports the following options "
                        "for `split`: None, 'whitespace', or a Callable.") %
                       self._split)

  # Note that 'inputs' here can be either ragged or dense depending on the
  # configuration choices for this Layer. The strings.ngrams op, however, does
  # support both ragged and dense inputs.
  if self._ngrams is not None:
    inputs = ragged_string_ops.ngrams(
        inputs, ngram_width=self._ngrams, separator=" ")

  return inputs
def f(v):
  return ragged_string_ops.ngrams(v, 2)
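# A hedged usage sketch for the one-liner above: the same function traced as a
# tf.function with a ragged input signature, via the public tf.strings.ngrams
# alias for ragged_string_ops.ngrams. Illustrative only.
import tensorflow as tf

@tf.function(input_signature=[
    tf.RaggedTensorSpec(shape=[None, None], dtype=tf.string)
])
def bigrams(v):
  return tf.strings.ngrams(v, 2)

print(bigrams(tf.ragged.constant([["a", "b", "c"], ["d", "e"]])))
# -> <tf.RaggedTensor [[b'a b', b'b c'], [b'd e']]>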