Example #1
    def _preprocess(self, inputs):
        if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              self._strip_regex, "")
        elif self._standardize is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported standardization.")

        if self._split is SPLIT_ON_WHITESPACE:
            # If split isn't None, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            # This treats multiple whitespaces as one whitespace, and strips leading
            # and trailing whitespace.
            inputs = ragged_string_ops.string_split_v2(inputs)
        elif self._split is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported splitting.")

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
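The example above is taken from an older version of the TextVectorization layer and calls TensorFlow-internal modules. As a minimal sketch (not part of the source), the same lowercase/strip/split/ngram pipeline can be reproduced with the public tf.strings ops; the punctuation regex below is only an assumption standing in for self._strip_regex.

import tensorflow as tf

texts = tf.constant(["Hello,   World!", "  many   spaces  "])

# LOWER_AND_STRIP_PUNCTUATION: lowercase, then drop punctuation.
# This regex is an approximation of the layer's strip regex.
lowered = tf.strings.lower(texts)
stripped = tf.strings.regex_replace(lowered, r"[^\w\s]", "")

# SPLIT_ON_WHITESPACE: runs of whitespace collapse into a single separator and
# leading/trailing whitespace is dropped, yielding a RaggedTensor of tokens.
tokens = tf.strings.split(stripped)
# [[b'hello', b'world'], [b'many', b'spaces']]

# Bigrams joined with " ", as ragged_string_ops.ngrams(..., separator=" ") does.
bigrams = tf.strings.ngrams(tokens, ngram_width=2, separator=" ")
# [[b'hello world'], [b'many spaces']]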
Example #2
    def _preprocess(self, inputs):
        if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
            if ragged_tensor.is_ragged(inputs):
                lowercase_inputs = ragged_functional_ops.map_flat_values(
                    gen_string_ops.string_lower, inputs)
                # Depending on configuration, we may never touch the non-data tensor
                # in the ragged inputs tensor. If that is the case, and this is the
                # only layer in the keras model, running it will throw an error.
                # To get around this, we wrap the result in an identity.
                lowercase_inputs = array_ops.identity(lowercase_inputs)
            else:
                lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            if inputs.shape.ndims > 1:
                inputs = array_ops.squeeze(inputs, axis=-1)
            if self._split == SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting."
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
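The ragged branch above can be exercised with the public tf.ragged API. A minimal sketch, not taken from the layer's source:

import tensorflow as tf

ragged = tf.ragged.constant([["Hello", "World!"], ["TensorFlow"]])

# Lowercase only the flat values; the ragged row partitions stay untouched.
lowered = tf.ragged.map_flat_values(tf.strings.lower, ragged)

# Mirror the identity work-around from the example so the row-partition
# tensors are also part of the computation.
lowered = tf.identity(lowered)
# [[b'hello', b'world!'], [b'tensorflow']]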
Example #3
    def _preprocess(self, inputs):
        if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            if self._split is SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting."
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
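The squeeze performed before splitting can be seen in isolation: inputs typically arrive with shape [batch, 1], and squeezing the dense tensor first is cheaper than squeezing the ragged result of the split. A minimal sketch, not taken from the layer's source:

import tensorflow as tf

batch = tf.constant([["first document"], ["second longer document"]])  # shape [2, 1]

squeezed = tf.squeeze(batch, axis=1)  # shape [2]
tokens = tf.strings.split(squeezed)   # one ragged row of tokens per example
# [[b'first', b'document'], [b'second', b'longer', b'document']]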
Example #4
from tensorflow.python.ops import gen_string_ops


def custom_standardize_fn(x):
    # Lowercase the input strings using the raw StringLower op.
    return gen_string_ops.string_lower(x)
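A hedged usage sketch, not from the source: a callable like custom_standardize_fn can be passed as the standardize argument of the Keras TextVectorization layer. The layer is assumed to live at tf.keras.layers.TextVectorization; older releases expose it under tf.keras.layers.experimental.preprocessing instead.

import tensorflow as tf

# custom_standardize_fn is the function defined above.
layer = tf.keras.layers.TextVectorization(standardize=custom_standardize_fn)
layer.adapt(tf.constant(["Hello World", "Hello TensorFlow"]))
print(layer(tf.constant(["hello world"])))  # integer token ids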