Example #1
def test_normalize_nfd(self):
    # NFD decomposes U+1E9B (long s with dot above) into U+017F + U+0307,
    # and canonical ordering places the dot below (U+0323) first.
    txt = [u"\u1e9b\u0323"]
    expected = [
        u"\u017f\u0323\u0307".encode("utf-8"),
    ]
    self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFD"))
    self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfd"))
Example #2
  def tokenize_with_offsets(self, text_input):
    """Performs basic word tokenization for BERT, with byte offsets.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `(tokens, start_offsets, end_offsets)` tuple, where `tokens` is a
      `RaggedTensor` of tokenized strings and the offsets are byte positions
      into the normalized input.
    """
    # lowercase and strip accents (if option is set)
    if self._lower_case:
      text_input = case_fold_utf8(text_input)
      text_input = normalize_utf8(text_input, "NFD")
      text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    else:
      # utf8 normalization
      if self._normalization_form is not None:
        text_input = normalize_utf8(text_input, self._normalization_form)

    # strip out control characters
    text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

    return regex_split_ops.regex_split_with_offsets(
        text_input, _DELIM_REGEX_PATTERN, self._keep_delim_regex_pattern,
        "BertBasicTokenizer")
Example #3
def test_normalize_nfkc_ragged(self):
  # NFKC expands the U+FB01 ligature to "fi"; ragged structure is preserved.
  txt = ragged_factory_ops.constant([[[u"\u1e9b\u0323 \ufb01"], []],
                                     [[u"\u1e9b\u0323", u"\ufb01"]]])
  expected = [[[u"ṩ fi".encode("utf-8")], []],
              [[u"ṩ".encode("utf-8"), b"fi"]]]
  self.assertRaggedEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC"))
  self.assertRaggedEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc"))
Example #4
def test_normalize_nfc(self):
    # NFC recomposes U+017F + U+0307 back into U+1E9B; there is no
    # composite that also absorbs the dot below, so the input is unchanged.
    txt = [
        u"\u1e9b\u0323",
    ]
    expected = [
        u"\u1e9b\u0323".encode("utf-8"),
    ]
    self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFC"))
    self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfc"))
Example #5
def test_normalize_nfkc_batch(self):
  # NFKC composes the first input into U+1E69 (ṩ) and expands the
  # U+FB01 ligature into the two letters "fi".
  txt = [
      u"\u1e9b\u0323",
      u"\ufb01",
  ]
  expected = [
      b"\xe1\xb9\xa9",
      b"fi",
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, u"NFKC"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, u"nfkc"))
Example #6
def test_normalize_nfkc_batch(self):
    txt = [
        u"\u1e9b\u0323",
        u"\ufb01",
    ]
    expected = [
        u"ṩ".encode("utf-8"),
        b"fi",  # bytes literal: the op returns bytes, not str
    ]
    self.assertAllEqual(expected,
                        normalize_ops.normalize_utf8(txt, "NFKC"))
    self.assertAllEqual(expected,
                        normalize_ops.normalize_utf8(txt, "nfkc"))
Example #7
  def tokenize(self, text_input):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `RaggedTensor` of tokenized strings from text_input.
    """
    # lowercase and strip accents (if option is set)
    if self._lower_case:
      text_input = case_fold_utf8(text_input)
      text_input = normalize_utf8(text_input, "NFD")
      text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    else:
      # utf8 normalization
      if self._normalization_form is not None:
        text_input = normalize_utf8(text_input, self._normalization_form)

    # strip out control characters
    text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

    # For Chinese and emoji characters, tokenize by Unicode codepoints
    unicode_tokenizer = UnicodeScriptTokenizer(
        keep_whitespace=self._keep_whitespace)
    script_tokenized = unicode_tokenizer.tokenize(text_input)

    # _should_split flags tokens (e.g. CJK or emoji) that must be broken
    # into individual codepoints.
    split_cond = self._should_split(script_tokenized)

    unicode_char_split = ragged_string_ops.unicode_split(
        script_tokenized, "UTF-8")
    # Where split_cond holds, take the per-character split (x); otherwise
    # keep the whole script token (y).
    unicode_split_tokens = array_ops.where(
        array_ops.squeeze(split_cond),
        y=array_ops.expand_dims(script_tokenized.values, axis=1),
        x=unicode_char_split.values)
    final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
    return final_tokens.merge_dims(-2, -1)
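
A hedged usage sketch; the class name and constructor arguments below are assumptions about the surrounding (unshown) tokenizer class:

# Hypothetical instantiation of the class this method belongs to.
tokenizer = BasicTokenizer(lower_case=True, keep_whitespace=False)
print(tokenizer.tokenize(["H\u00e9llo, \u4e16\u754c"]))
# Expect roughly [[b'hello', b',', b'\xe4\xb8\x96', b'\xe7\x95\x8c']]:
# accents stripped, and each CJK codepoint emitted as its own token.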
Example #8
def test_unknown_normalization_form(self):
    with self.assertRaises(errors.InvalidArgumentError):
        bomb = normalize_ops.normalize_utf8(
            ["cant readme", "wont read me"], "cantfindme")
        # The op only validates the form when executed, so the result
        # must be evaluated inside the assertRaises block.
        self.evaluate(bomb)
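
Under eager execution the same error surfaces immediately; a sketch, assuming the public tensorflow_text API:

import tensorflow as tf
import tensorflow_text as tf_text

try:
    tf_text.normalize_utf8(["abc"], "cantfindme")
except tf.errors.InvalidArgumentError as e:
    print(e.message)  # complains about the unknown normalization form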
Example #9
def lower_case(self, text_input):
    """Lower-cases `text_input` and strips accents."""
    text_input = case_fold_utf8(text_input)
    text_input = normalize_utf8(text_input, "NFD")
    # NFD exposes accents as combining marks (\p{Mn}), which are removed.
    text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    return text_input
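
A standalone version of the same pipeline using the public ops (a minimal sketch, assuming the tensorflow_text package):

import tensorflow as tf
import tensorflow_text as tf_text

def lower_case(text_input):
    text_input = tf_text.case_fold_utf8(text_input)
    text_input = tf_text.normalize_utf8(text_input, "NFD")
    # drop the combining marks exposed by NFD
    return tf.strings.regex_replace(text_input, r"\p{Mn}", "")

print(lower_case(tf.constant(["H\u00e9llo W\u00f6rld"])))
# tf.Tensor([b'hello world'], shape=(1,), dtype=string)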