Example 1
    def tokenize_with_offsets(self, text_input):
        """Performs basic word tokenization for BERT.

        Args:
          text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.
        Returns:
          A `RaggedTensor` of tokenized strings from text_input.
        """
        # lowercase and strip accents (if option is set)
        if self._lower_case:
            text_input = case_fold_utf8(text_input)
            text_input = normalize_utf8(text_input, "NFD")
            text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
        else:
            # utf8 normalization
            if self._normalization_form is not None:
                text_input = normalize_utf8(text_input,
                                            self._normalization_form)

        # strip out control characters
        text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}",
                                              " ")

        return regex_split_ops.regex_split_with_offsets(
            text_input, _DELIM_REGEX_PATTERN, self._keep_delim_regex_pattern,
            "BertBasicTokenizer")
Example 2
 def testRegexReplaceDelegation(self, pattern_fn, rewrite_fn):
     with self.cached_session():
         input_vector = constant_op.constant("foo", dtypes.string)
         pattern = pattern_fn("[a-z]")
         replace = rewrite_fn(".")
         op = string_ops.regex_replace(input_vector, pattern, replace)
         self.assertTrue(op.name.startswith("RegexReplace"))
Example 3
 def testRegexReplaceDelegation(self, pattern_fn, rewrite_fn):
   with self.cached_session():
     input_vector = constant_op.constant("foo", dtypes.string)
     pattern = pattern_fn("[a-z]")
     replace = rewrite_fn(".")
     op = string_ops.regex_replace(input_vector, pattern, replace)
     self.assertTrue(op.name.startswith("RegexReplace"))
Example 4
    def _preprocess(self, inputs):
        if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              self._strip_regex, "")
        elif self._standardize is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported standardization.")

        if self._split is SPLIT_ON_WHITESPACE:
            # If split isn't None, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            # This treats multiple whitespaces as one whitespace, and strips leading
            # and trailing whitespace.
            inputs = ragged_string_ops.string_split_v2(inputs)
        elif self._split is not None:
            # TODO(momernick): Support callables here.
            raise RuntimeError("Not a supported splitting.")

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
Example 5
 def testGlobal(self):
   values = ["ababababab", "abcabcabc", ""]
   with self.test_session():
     input_vector = constant_op.constant(values, dtypes.string)
     stripped = string_ops.regex_replace(input_vector, "ab", "abc",
                                         True).eval()
     self.assertAllEqual([b"abcabcabcabcabc", b"abccabccabcc", b""], stripped)
Example 6
 def testStaticRegexReplaceDelegation(self):
     with self.cached_session():
         input_vector = constant_op.constant("foo", dtypes.string)
         pattern = "[a-z]"
         replace = "."
         op = string_ops.regex_replace(input_vector, pattern, replace)
         self.assertTrue(op.name.startswith("StaticRegexReplace"))
Example 7
 def testStaticRegexReplaceDelegation(self):
   with self.cached_session():
     input_vector = constant_op.constant("foo", dtypes.string)
     pattern = "[a-z]"
     replace = "."
     op = string_ops.regex_replace(input_vector, pattern, replace)
     self.assertTrue(op.name.startswith("StaticRegexReplace"))
Example 8
 def testRemovePrefix(self):
   values = ["a:foo", "a:bar", "a:foo", "b:baz", "b:qux", "ca:b"]
   with self.test_session():
     input_vector = constant_op.constant(values, dtypes.string)
     stripped = string_ops.regex_replace(
         input_vector, "^(a:|b:)", "", replace_global=False).eval()
     self.assertAllEqual([b"foo", b"bar", b"foo", b"baz", b"qux", b"ca:b"],
                         stripped)
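For reference, the non-global behaviour checked above is also available through the public API. A minimal eager-mode sketch, assuming TensorFlow 2.x:

import tensorflow as tf

values = tf.constant(["a:foo", "b:baz", "ca:b"])
# replace_global=False rewrites only the first match in each element; the
# anchored pattern leaves "ca:b" untouched.
stripped = tf.strings.regex_replace(values, "^(a:|b:)", "", replace_global=False)
# -> [b"foo", b"baz", b"ca:b"]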
Example 9
 def testRegexReplaceDelegation(self, pattern_fn, rewrite_fn):
     with compat.forward_compatibility_horizon(2018, 10, 11):
         with self.test_session():
             input_vector = constant_op.constant("foo", dtypes.string)
             pattern = pattern_fn("[a-z]")
             replace = rewrite_fn(".")
             op = string_ops.regex_replace(input_vector, pattern, replace)
             self.assertTrue(op.name.startswith("RegexReplace"))
Example 10
 def testInvalidPattern(self):
   values = ["abc", "1"]
   with self.test_session():
     input_vector = constant_op.constant(values, dtypes.string)
     invalid_pattern = "A["
     replace = string_ops.regex_replace(input_vector, invalid_pattern, "x")
     with self.assertRaisesOpError("Invalid pattern"):
       replace.eval()
Example 11
 def testStaticRegexReplaceDelegation(self):
   with compat.forward_compatibility_horizon(2018, 10, 11):
     with self.test_session():
       input_vector = constant_op.constant("foo", dtypes.string)
       pattern = "[a-z]"
       replace = "."
       op = string_ops.regex_replace(input_vector, pattern, replace)
       self.assertTrue(op.name.startswith("StaticRegexReplace"))
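The delegation asserted in the tests above comes from how regex_replace inspects its arguments: constant Python strings for pattern and rewrite let it compile the regex once and emit a StaticRegexReplace op, while tensor-valued arguments fall back to the dynamic RegexReplace kernel. A small sketch of both paths (the op names are only observable in graph mode, which is what the tests rely on):

import tensorflow as tf

# Python-string pattern/rewrite -> lowered to StaticRegexReplace.
static_result = tf.strings.regex_replace(tf.constant("foo"), "[a-z]", ".")

# Tensor-valued pattern/rewrite -> dynamic RegexReplace.
dynamic_result = tf.strings.regex_replace(
    tf.constant("foo"), tf.constant("[a-z]"), tf.constant("."))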
Example 12
    def _preprocess(self, inputs):
        if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
            if ragged_tensor.is_ragged(inputs):
                lowercase_inputs = ragged_functional_ops.map_flat_values(
                    gen_string_ops.string_lower, inputs)
                # Depending on configuration, we may never touch the non-data tensor
                # in the ragged inputs tensor. If that is the case, and this is the
                # only layer in the keras model, running it will throw an error.
                # To get around this, we wrap the result in an identity.
                lowercase_inputs = array_ops.identity(lowercase_inputs)
            else:
                lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            if inputs.shape.ndims > 1:
                inputs = array_ops.squeeze(inputs, axis=-1)
            if self._split == SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting. "
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
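The standardize/split/ngrams pipeline above is what the public Keras layer exposes. A rough usage sketch, assuming a recent TensorFlow 2.x release (older versions export the layer under keras.layers.experimental.preprocessing instead of tf.keras.layers):

import tensorflow as tf

layer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=2,
    output_mode="int")

data = tf.data.Dataset.from_tensor_slices(
    ["Hello, world!", "hello TensorFlow"]).batch(2)
layer.adapt(data)

# _preprocess lowercases, strips punctuation, splits on whitespace and adds
# n-grams before the vocabulary lookup happens.
print(layer(tf.constant(["Hello, world!"])))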
Example 13
  def tokenize(self, text_input):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.
    Returns:
      A `RaggedTensor` of tokenized strings from text_input.
    """
    # lowercase and strip accents (if option is set)
    if self._lower_case:
      text_input = case_fold_utf8(text_input)
      text_input = normalize_utf8(text_input, "NFD")
      text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    else:
      # utf8 normalization
      if self._normalization_form is not None:
        text_input = normalize_utf8(text_input, self._normalization_form)

    # strip out control characters
    text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

    # For Chinese and emoji characters, tokenize by Unicode codepoints
    unicode_tokenizer = UnicodeScriptTokenizer(
        keep_whitespace=self._keep_whitespace)
    script_tokenized = unicode_tokenizer.tokenize(text_input)

    split_cond = self._should_split(script_tokenized)

    unicode_char_split = ragged_string_ops.unicode_split(
        script_tokenized, "UTF-8")
    unicode_split_tokens = array_ops.where(
        array_ops.squeeze(split_cond),
        y=array_ops.expand_dims(script_tokenized.values, axis=1),
        x=unicode_char_split.values)
    final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
    return final_tokens.merge_dims(-2, -1)
Example 14
    def test_bert_tokenizer(self,
                            text_inputs,
                            expected,
                            vocab=None,
                            expected_extracted=None,
                            lower_case=True,
                            num_oov=1,
                            preserve_unused_token=False):
        text_inputs = constant_op.constant(text_inputs)
        if not vocab:
            vocab = _VOCAB
        table = _create_table(vocab, num_oov)
        self.evaluate(table.initializer)
        tokenizer = bert_tokenizer.BertTokenizer(
            table,
            token_out_type=dtypes.string,
            lower_case=lower_case,
            preserve_unused_token=preserve_unused_token)
        results = tokenizer.tokenize(text_inputs)
        self.assertAllEqual(results, expected)

        # Verify that the int ids are the same.
        expected_rt = ragged_factory_ops.constant(expected)
        expected_int = table.lookup(expected_rt.flat_values)
        expected_int_rt = ragged_tensor.RaggedTensor.from_nested_row_splits(
            expected_int, expected_rt.nested_row_splits)
        int_tokenizer = bert_tokenizer.BertTokenizer(
            vocab_lookup_table=table,
            token_out_type=dtypes.int64,
            lower_case=lower_case,
            preserve_unused_token=preserve_unused_token)
        results_int = int_tokenizer.tokenize(text_inputs)
        self.assertAllEqual(results_int, expected_int_rt)

        # Verify that the offsets can extract the expected tokens
        _, begin, end = tokenizer.tokenize_with_offsets(text_inputs)

        extracted_wordpieces = _ragged_substr(text_inputs, begin, end)
        if expected_extracted:
            self.assertAllEqual(extracted_wordpieces, expected_extracted)
        else:
            # The extracted substrings won't have any '##' wordpiece prefixes, so
            # strip them from the expected values before comparing.
            stripped_prefix_flat = string_ops.regex_replace(
                expected_rt.flat_values, '##', '')
            stripped_prefix = expected_rt.with_flat_values(
                stripped_prefix_flat)
            self.assertAllEqual(extracted_wordpieces, stripped_prefix)
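Roughly the same behaviour this test exercises is reachable through the public tensorflow_text.BertTokenizer. The sketch below is illustrative only: '/tmp/vocab.txt' is a placeholder, and passing a vocabulary file path (rather than a prebuilt lookup table like the one _create_table returns) is only supported in newer tensorflow_text releases:

import tensorflow as tf
import tensorflow_text as tf_text

tokenizer = tf_text.BertTokenizer(
    "/tmp/vocab.txt",  # placeholder vocabulary file
    token_out_type=tf.string,
    lower_case=True)
tokens = tokenizer.tokenize(["They're the greatest!"])
# tokenize_with_offsets additionally returns begin/end byte offsets into the
# original strings, which is what the offset checks above rely on.
tokens2, begin, end = tokenizer.tokenize_with_offsets(["They're the greatest!"])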
Example 15
def ragged_tensor_to_string(rt, summarize=None):
    """Returns a scalar string tensor with the contents of a RaggedTensor.

    Requires that `rt.shape.rank` is not `None`.

    Note: this converts the entire `RaggedTensor` into a single string scalar.
    If you want to convert individual elements, use `tf.strings.as_string(rt)`.

    >>> rt1 = tf.ragged.constant([[1, 2, 3], [4, 5]])
    >>> ragged_tensor_to_string(rt1).numpy()
    b'[[1, 2, 3], [4, 5]]'

    >>> rt2 = tf.ragged.constant([[['a'], ['b', 'c']], [['d', 'e', 'f'], []]])
    >>> ragged_tensor_to_string(rt2).numpy()
    b"[[['a'], ['b', 'c']], [['d', 'e', 'f'], []]]"

    >>> rt3 = tf.ragged.constant([[1], [2, 3, 4, 5, 6], [], [], [7], [8, 9]])
    >>> ragged_tensor_to_string(rt3, summarize=2).numpy()
    b'[[1], [2, 3, ..., 5, 6], ..., [7], [8, 9]]'

    Args:
      rt: The RaggedTensor that should be converted to a string.
      summarize: If specified, then only the first and last `summarize` elements
        within each dimension are included in the string. If `-1` or `None`, then
        all elements are included.
    """
    if (summarize is not None and summarize != -1
            and not (isinstance(summarize, int) and summarize > 0)):
        raise ValueError(
            "Expected summarize to be -1 or a positive int, got %r" %
            summarize)
    with ops.name_scope(None, "AsString", [rt]):
        rt = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt)
        if rt.shape.rank is None:
            raise ValueError(
                "RaggedTensor to_string requires that rt.shape.rank "
                "is not None.")
        # Convert all elements of `rt` to strings.
        if rt.dtype == dtypes.string:
            escaped = string_ops.regex_replace(rt.flat_values, r"(['\\])",
                                               r"\\\1")
            str_t = rt.with_flat_values("'" + escaped + "'")
        else:
            str_t = rt.with_flat_values(string_ops.as_string(rt.flat_values))

        return _ragged_tensor_to_string(str_t, summarize)
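The quoting step above only touches the string payload of the RaggedTensor. A minimal sketch of the same flat_values / with_flat_values pattern through the public API, assuming TensorFlow 2.x:

import tensorflow as tf

rt = tf.ragged.constant([["it's", "a\\b"], ["ok"]])
# Escape quotes and backslashes in the flat values, wrap each element in
# single quotes, and reattach the original ragged structure.
escaped = tf.strings.regex_replace(rt.flat_values, r"(['\\])", r"\\\1")
quoted = rt.with_flat_values("'" + escaped + "'")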
Example 16
    def _preprocess(self, inputs):
        if self._standardize is LOWER_AND_STRIP_PUNCTUATION:
            lowercase_inputs = gen_string_ops.string_lower(inputs)
            inputs = string_ops.regex_replace(lowercase_inputs,
                                              DEFAULT_STRIP_REGEX, "")
        elif callable(self._standardize):
            inputs = self._standardize(inputs)
        elif self._standardize is not None:
            raise ValueError(
                ("%s is not a supported standardization. "
                 "TextVectorization supports the following options "
                 "for `standardize`: None, "
                 "'lower_and_strip_punctuation', or a "
                 "Callable.") % self._standardize)

        if self._split is not None:
            # If we are splitting, we validate that the 1st axis is of dimension 1 and
            # so can be squeezed out. We do this here instead of after splitting for
            # performance reasons - it's more expensive to squeeze a ragged tensor.
            inputs = array_ops.squeeze(inputs, axis=1)
            if self._split is SPLIT_ON_WHITESPACE:
                # This treats multiple whitespaces as one whitespace, and strips leading
                # and trailing whitespace.
                inputs = ragged_string_ops.string_split_v2(inputs)
            elif callable(self._split):
                inputs = self._split(inputs)
            else:
                raise ValueError(
                    ("%s is not a supported splitting. "
                     "TextVectorization supports the following options "
                     "for `split`: None, 'whitespace', or a Callable.") %
                    self._split)

        # Note that 'inputs' here can be either ragged or dense depending on the
        # configuration choices for this Layer. The strings.ngrams op, however, does
        # support both ragged and dense inputs.
        if self._ngrams is not None:
            inputs = ragged_string_ops.ngrams(inputs,
                                              ngram_width=self._ngrams,
                                              separator=" ")

        return inputs
Example 17
def process_dataset(dataset, tokenizer, batch_size, buffer_size):
    ds = dataset.map(tf.strings.strip)
    ds = ds.map(tf.strings.lower)

    DEFAULT_STRIP_REGEX = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
    ds = ds.map(lambda s: string_ops.regex_replace(s, DEFAULT_STRIP_REGEX, ""))
    ds = ds.map(tf.strings.split)
    ds = ds.filter(lambda s: tf.shape(s)[0] >= buffer_size)

    prev_words = ds.map(lambda x: x[:buffer_size - 1])
    prev_words = prev_words.map(
        lambda s: tf.strings.reduce_join(s, separator=" "))

    next_word = ds.map(lambda x: x[buffer_size - 1])
    next_word = next_word.map(lambda s: tf.expand_dims(s, 0))
    next_word = next_word.map(lambda s: tokenizer(s)[0, 0])

    ds = tf.data.Dataset.zip((prev_words, next_word))
    ds = ds.shuffle(10000).batch(batch_size, drop_remainder=True)

    return ds
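A quick illustration (ours, not part of the snippet above) of what the DEFAULT_STRIP_REGEX pass does to a raw sentence before splitting:

import tensorflow as tf

DEFAULT_STRIP_REGEX = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
s = tf.constant("Hello, world! It's 2021.")
cleaned = tf.strings.regex_replace(tf.strings.lower(s), DEFAULT_STRIP_REGEX, "")
# -> b'hello world its 2021'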
Example 18
 def testRegexReplace(self):
   values = ["aba\naba", "abcdabcde"]
   with self.test_session():
     input_vector = constant_op.constant(values, dtypes.string)
     stripped = string_ops.regex_replace(input_vector, "a.*a", "(\\0)").eval()
     self.assertAllEqual([b"(aba)\n(aba)", b"(abcda)bcde"], stripped)
Example 19
 def testEmptyMatch(self):
   values = ["abc", "1"]
   with self.test_session():
     input_vector = constant_op.constant(values, dtypes.string)
     stripped = string_ops.regex_replace(input_vector, "", "x").eval()
     self.assertAllEqual([b"xaxbxcx", b"x1x"], stripped)
Example 20
 def lower_case(self, text_input):
     """Lower-cases the `text_input`."""
     text_input = case_fold_utf8(text_input)
     text_input = normalize_utf8(text_input, "NFD")
     text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
     return text_input
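The same three-step recipe (case-fold, NFD-normalize, drop combining marks) can be written against the public tf.strings and tensorflow_text APIs; it is also what the BERT basic tokenizer in Examples 1 and 13 does when lower_case is set. A sketch, with the helper name strip_accents_lower being ours rather than part of either library:

import tensorflow as tf
import tensorflow_text as tf_text

def strip_accents_lower(text):
  # Case-fold, decompose to NFD, then remove combining marks (\p{Mn}).
  text = tf_text.case_fold_utf8(text)
  text = tf_text.normalize_utf8(text, "NFD")
  return tf.strings.regex_replace(text, r"\p{Mn}", "")

# strip_accents_lower(tf.constant(["Café", "Äpfel"])) -> [b"cafe", b"apfel"]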
Example 21
    def detokenize(self, token_ids):
        r"""Convert a `Tensor` or `RaggedTensor` of wordpiece IDs to string-words.

        >>> import pathlib
        >>> pathlib.Path('/tmp/detok_vocab.txt').write_text(
        ...     'a b c ##a ##b ##c'.replace(' ', '\n'))
        >>> wordpiece = WordpieceTokenizer('/tmp/detok_vocab.txt')
        >>> token_ids = [[0, 4, 5, 2, 5, 5, 5]]
        >>> wordpiece.detokenize(token_ids)
        <tf.RaggedTensor [[b'abc', b'cccc']]>

        The word pieces are joined along the innermost axis to make words. So the
        result has the same rank as the input, but the innermost axis of the result
        indexes words instead of word pieces.

        The shape transformation is: `[..., wordpieces] => [..., words]`

        When the input shape is `[..., words, wordpieces]` (like the output of
        `WordpieceTokenizer.tokenize`) the result's shape is `[..., words, 1]`.
        The additional ragged axis can be removed using `words.merge_dims(-2, -1)`.

        Note: This method assumes wordpiece IDs are dense on the interval
        `[0, vocab_size)`.

        Args:
          token_ids: A `RaggedTensor` or `Tensor` with an int dtype. Must have
            `ndims >= 2`.

        Returns:
          A `RaggedTensor` with dtype `string` and the same rank as the input
          `token_ids`.
        """
        # If there are performance issues with this method or problems with lookup
        # tables using sparse IDs see the notes in b/177610044.
        vocab, ids = self._get_vocab_and_ids()
        token_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(token_ids)

        first_is_zero = math_ops.equal(ids[0], 0)
        steps = ids[1:] - ids[:-1]
        all_one_step = math_ops.reduce_all(math_ops.equal(steps, 1))

        check = control_flow_ops.Assert(
            first_is_zero & all_one_step,
            data=[('`detokenize` only works with vocabulary tables where the '
                   'indices are dense on the interval `[0, vocab_size)`')])
        with ops.control_dependencies([check]):
            token_ids = math_ops.minimum(
                token_ids,
                # Limit the OOV buckets to a single index.
                math_ops.cast(array_ops.size(vocab), token_ids.dtype))

        # Add the unknown token at that index.
        vocab = array_ops.concat([vocab, [self._unknown_token]], axis=0)

        # Lookup the text tokens and join them along the innermost axis.
        txt_tokens = array_ops.gather(vocab, token_ids)

        # Ensure the input is Ragged.
        if not isinstance(txt_tokens, RaggedTensor):
            txt_tokens = RaggedTensor.from_tensor(txt_tokens)

        # Join the tokens along the last axis.
        words = string_ops.reduce_join_v2(txt_tokens, axis=-1, separator=' ')

        # Collapse " ##" in all strings to make words.
        words = string_ops.regex_replace(
            words, ' ' + re.escape(self._suffix_indicator), '')

        # Strip leading and trailing spaces.
        words = string_ops.regex_replace(words, '^ +| +$', '')

        # Split on spaces so the last axis is "words".
        words = ragged_string_ops.string_split_v2(words, sep=' ')
        return words
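The regex steps at the end of detokenize carry the actual trick: wordpieces are joined with spaces, the " ##" suffix markers are collapsed so suffix pieces glue onto the previous piece, and the result is re-split into words. A standalone sketch of the same idea with public ops, assuming the default "##" suffix indicator:

import re
import tensorflow as tf

suffix_indicator = "##"
pieces = tf.ragged.constant([["play", "##ing", "with", "token", "##ize", "##rs"]])
# Join pieces, collapse " ##", then split on spaces so the last axis indexes
# words instead of wordpieces.
words = tf.strings.reduce_join(pieces, axis=-1, separator=" ")
words = tf.strings.regex_replace(words, " " + re.escape(suffix_indicator), "")
words = tf.strings.split(words, sep=" ")
# -> [[b'playing', b'with', b'tokenizers']]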