def test_normalize_nfd(self):
  txt = [u"\u1e9b\u0323"]
  expected = [
      u"\u017f\u0323\u0307".encode("utf-8"),
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFD"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfd"))
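# Supplementary sketch (not part of the original suite): the expected bytes
# above can be cross-checked against Python's own `unicodedata` module. Under
# NFD, U+1E9B decomposes to U+017F U+0307, and canonical ordering then moves
# the dot below (U+0323, combining class 220) before the dot above (U+0307,
# combining class 230).
def test_normalize_nfd_matches_unicodedata(self):
  import unicodedata
  decomposed = unicodedata.normalize("NFD", u"\u1e9b\u0323")
  self.assertEqual(u"\u017f\u0323\u0307", decomposed)
  self.assertAllEqual([decomposed.encode("utf-8")],
                      normalize_ops.normalize_utf8([u"\u1e9b\u0323"], "NFD"))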
def tokenize_with_offsets(self, text_input):
  """Performs basic word tokenization for BERT, returning tokens and offsets.

  Args:
    text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

  Returns:
    A tuple of `RaggedTensor`s: the tokenized strings from `text_input`,
    followed by the begin and end byte offsets of each token within the
    input strings.
  """
  # Lowercase and strip accents (if the option is set).
  if self._lower_case:
    text_input = case_fold_utf8(text_input)
    text_input = normalize_utf8(text_input, "NFD")
    text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
  else:
    # UTF-8 normalization.
    if self._normalization_form is not None:
      text_input = normalize_utf8(text_input, self._normalization_form)

  # Strip out control characters.
  text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")
  return regex_split_ops.regex_split_with_offsets(
      text_input, _DELIM_REGEX_PATTERN, self._keep_delim_regex_pattern,
      "BertBasicTokenizer")
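# Usage sketch (illustrative; assumes `tokenizer` is an instance of the
# enclosing basic-tokenizer class):
#
#   tokens, starts, ends = tokenizer.tokenize_with_offsets(
#       tf.constant(["hello there"]))
#   # tokens -> <tf.RaggedTensor [[b'hello', b'there']]>
#   # starts -> [[0, 6]], ends -> [[5, 11]]  (byte offsets into the input)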
def test_normalize_nfkc_ragged(self):
  txt = ragged_factory_ops.constant([[[u"\u1e9b\u0323 \ufb01"], []],
                                     [[u"\u1e9b\u0323", u"\ufb01"]]])
  expected = [[[u"ṩ fi".encode("utf-8")], []],
              [[u"ṩ".encode("utf-8"), b"fi"]]]
  self.assertRaggedEqual(expected,
                         normalize_ops.normalize_utf8(txt, "NFKC"))
  self.assertRaggedEqual(expected,
                         normalize_ops.normalize_utf8(txt, "nfkc"))
def test_normalize_nfc(self):
  txt = [
      u"\u1e9b\u0323",
  ]
  expected = [
      u"\u1e9b\u0323".encode("utf-8"),
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFC"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfc"))
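# Note (supplementary): NFC leaves this input unchanged because composition
# recombines U+017F + U+0307 back into U+1E9B, while the dot below (U+0323)
# has no precomposed pairing with the long s.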
def test_normalize_nfkc_batch(self):
  txt = [
      u"\u1e9b\u0323",
      u"\ufb01",
  ]
  expected = [
      b"\xe1\xb9\xa9",
      b"fi",
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, u"NFKC"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, u"nfkc"))
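# Supplementary sketch (not part of the original suite): the raw byte
# literals above can be verified with `unicodedata`. NFKC both applies
# compatibility mappings (long s -> s, fi ligature -> "fi") and composes,
# so U+1E9B U+0323 becomes U+1E69 (ṩ), which is b"\xe1\xb9\xa9" in UTF-8.
def test_normalize_nfkc_matches_unicodedata(self):
  import unicodedata
  self.assertEqual(
      b"\xe1\xb9\xa9",
      unicodedata.normalize("NFKC", u"\u1e9b\u0323").encode("utf-8"))
  self.assertEqual(
      b"fi", unicodedata.normalize("NFKC", u"\ufb01").encode("utf-8"))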
def tokenize(self, text_input):
  """Performs basic word tokenization for BERT.

  Args:
    text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

  Returns:
    A `RaggedTensor` of tokenized strings from `text_input`.
  """
  # Lowercase and strip accents (if the option is set).
  if self._lower_case:
    text_input = case_fold_utf8(text_input)
    text_input = normalize_utf8(text_input, "NFD")
    text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
  else:
    # UTF-8 normalization.
    if self._normalization_form is not None:
      text_input = normalize_utf8(text_input, self._normalization_form)

  # Strip out control characters.
  text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

  # For Chinese and emoji characters, tokenize by unicode codepoints.
  unicode_tokenizer = UnicodeScriptTokenizer(
      keep_whitespace=self._keep_whitespace)
  script_tokenized = unicode_tokenizer.tokenize(text_input)
  split_cond = self._should_split(script_tokenized)
  unicode_char_split = ragged_string_ops.unicode_split(
      script_tokenized, "UTF-8")
  unicode_split_tokens = array_ops.where(
      array_ops.squeeze(split_cond),
      y=array_ops.expand_dims(script_tokenized.values, axis=1),
      x=unicode_char_split.values)
  final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
  return final_tokens.merge_dims(-2, -1)
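# Usage sketch (illustrative; assumes `tokenizer` is an instance of the
# enclosing basic-tokenizer class with default options). Scripts that the
# `_should_split` predicate flags (e.g. CJK) are split per codepoint:
#
#   tokenizer.tokenize(tf.constant([u"hello 你好"]))
#   # -> something like <tf.RaggedTensor [[b'hello', b'\xe4\xbd\xa0',
#   #                                      b'\xe5\xa5\xbd']]>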
def test_unknown_normalization_form(self):
  with self.assertRaises(errors.InvalidArgumentError):
    bomb = normalize_ops.normalize_utf8(["cant readme", "wont read me"],
                                        "cantfindme")
    self.evaluate(bomb)
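# Note (supplementary): the accepted normalization forms are the four
# standard Unicode ones, "NFC", "NFD", "NFKC", and "NFKD", matched
# case-insensitively (see the lowercase variants in the tests above);
# anything else raises `InvalidArgumentError`, as exercised here.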
def lower_case(self, text_input):
  """Lower-cases the `text_input`."""
  text_input = case_fold_utf8(text_input)
  text_input = normalize_utf8(text_input, "NFD")
  text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
  return text_input
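# Usage sketch (illustrative): the NFD pass splits off combining marks so
# the `\p{Mn}` replace can strip accents after case folding:
#
#   tokenizer.lower_case(tf.constant([u"Héllo"]))
#   # -> tensor equal to [b"hello"]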