Python tokenize Examples

Programming Language: Python

Namespace/Package Name: tensorflow_onmttok.python.ops.onmttok_ops

Method/Function: tokenize

Examples at hotexamples.com: 15

Python tokenize - 15 examples found. These are the top-rated real-world Python examples of tensorflow_onmttok.python.ops.onmttok_ops.tokenize, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

 def testCaseFeature(self):
     """Tokenizing b"Tom" with case_feature=True is expected to give b"tom"."""
     with self.session():
         options = {"mode": "conservative", "case_feature": True}
         actual = tokenize([b"Tom"], **options)
         expected = np.array([b"tom"])
         self.assertAllEqual(actual, expected)

Example #2

Show file

 def testJoinerAnnotate(self):
     """Tokenizing b"Hello!" with joiner_annotate=True marks the "!" token."""
     with self.session():
         actual = tokenize(
             [b"Hello!"], mode="conservative", joiner_annotate=True
         )
         # NOTE(review): the joiner prefix below is \xc3\xaf\xc2\xbf\xc2\xad,
         # which looks like U+FFED (\xef\xbf\xad) UTF-8-encoded twice --
         # confirm against the op's default joiner.
         expected_tokens = [b"Hello", b"\xc3\xaf\xc2\xbf\xc2\xad!"]
         self.assertAllEqual(actual, np.array(expected_tokens))

Example #3

Show file

 def testCaseMarkup(self):
     """With case_markup=True, b"Tom" yields a case-modifier token then b"tom"."""
     with self.session():
         actual = tokenize([b"Tom"], mode="conservative", case_markup=True)
         modifier_token = b"\xef\xbd\x9fmrk_case_modifier_C\xef\xbd\xa0"
         expected = np.array([modifier_token, b"tom"])
         self.assertAllEqual(actual, expected)

Example #4

Show file

 def testSegmentAlphabetChange(self):
     """segment_alphabet_change=True splits the non-ASCII prefix from b"abc"."""
     with self.session():
         prefix = b"\xe6\xb8\xac\xe8\xa9\xa6"
         actual = tokenize(
             [b"\xe6\xb8\xac\xe8\xa9\xa6abc"],
             mode="conservative",
             segment_alphabet_change=True,
         )
         self.assertAllEqual(actual, np.array([prefix, b"abc"]))

Example #5

Show file

 def testSegmentNumbers(self):
     """segment_numbers=True in aggressive mode splits b"1234" into digits."""
     with self.session():
         options = dict(mode="aggressive", segment_numbers=True)
         actual = tokenize([b"1234"], **options)
         digits = [b"1", b"2", b"3", b"4"]
         self.assertAllEqual(actual, np.array(digits))

Example #6

Show file

 def testSegmentAlphabet(self):
     """segment_alphabet=["Latin"] splits b"abcd" into single letters."""
     with self.session():
         alphabets = ["Latin"]
         actual = tokenize(
             [b"abcd"], mode="conservative", segment_alphabet=alphabets
         )
         letters = [b"a", b"b", b"c", b"d"]
         self.assertAllEqual(actual, np.array(letters))

Example #7

Show file

 def testSegmentCase(self):
     """segment_case=True splits b"WiFi" into b"Wi" and b"Fi"."""
     with self.session():
         options = {"mode": "conservative", "segment_case": True}
         actual = tokenize([b"WiFi"], **options)
         expected = np.array([b"Wi", b"Fi"])
         self.assertAllEqual(actual, expected)

Example #8

Show file

 def testSpacerAnnotate(self):
     """spacer_annotate=True prefixes b"world" with the bytes \\xe2\\x96\\x81."""
     with self.session():
         actual = tokenize(
             [b"Hello world"], mode="conservative", spacer_annotate=True
         )
         expected_tokens = [b"Hello", b"\xe2\x96\x81world"]
         self.assertAllEqual(actual, np.array(expected_tokens))

Example #9

Show file

 def testJoinerCustom(self):
     """A custom joiner "@@" is used as the prefix of the "!" token."""
     with self.session():
         options = {
             "mode": "conservative",
             "joiner_annotate": True,
             "joiner": "@@",
         }
         actual = tokenize([b"Hello!"], **options)
         self.assertAllEqual(actual, np.array([b"Hello", b"@@!"]))

Example #10

Show file

 def testPreservePlaceholders(self):
     """preserve_placeholders=True keeps the bracketed World token intact."""
     with self.session():
         placeholder = b"\xef\xbd\x9fWorld\xef\xbd\xa0"
         actual = tokenize(
             [b"Hello \xef\xbd\x9fWorld\xef\xbd\xa0"],
             mode="conservative",
             joiner_annotate=True,
             preserve_placeholders=True,
         )
         self.assertAllEqual(actual, np.array([b"Hello", placeholder]))

Example #11

Show file

 def testSoftCaseRegions(self):
     """soft_case_regions=True wraps the tokens of b"U.N" in one case region."""
     with self.session():
         actual = tokenize(
             [b"U.N"],
             mode="conservative",
             case_markup=True,
             soft_case_regions=True,
         )
         region_begin = b"\xef\xbd\x9fmrk_begin_case_region_U\xef\xbd\xa0"
         region_end = b"\xef\xbd\x9fmrk_end_case_region_U\xef\xbd\xa0"
         expected = np.array([region_begin, b"u.", b"n", region_end])
         self.assertAllEqual(actual, expected)

Example #12

Show file

    def testModes(self):
        """Check the expected tokens for each of the four tokenization modes."""
        sentence = b"Mary-Ann is here."
        # (input text, mode, expected tokens), checked in this order.
        cases = [
            (sentence, "conservative",
             [b"Mary-Ann", b"is", b"here", b"."]),
            (sentence, "aggressive",
             [b"Mary", b"-", b"Ann", b"is", b"here", b"."]),
            (b"Tom", "char",
             [b"T", b"o", b"m"]),
            (sentence, "space",
             [b"Mary-Ann", b"is", b"here."]),
        ]
        with self.session():
            for text, mode, tokens in cases:
                self.assertAllEqual(
                    tokenize([text], mode=mode),
                    np.array(tokens)
                )

Example #13

Show file

 def testSupportPriorJoiners(self):
     """Check tokenization of input that already contains joiner-like bytes."""
     with self.session():
         # NOTE(review): \xc3\xaf\xc2\xbf\xc2\xad looks like U+FFED
         # (\xef\xbf\xad) UTF-8-encoded twice, and the expected tokens split
         # that sequence apart -- confirm these bytes are intended.
         options = {
             "mode": "aggressive",
             "joiner_annotate": True,
             "support_prior_joiners": True,
         }
         actual = tokenize(
             [b"pre\xc3\xaf\xc2\xbf\xc2\xad tokenization."], **options
         )
         expected_tokens = [
             b"pre\xc3\xaf",
             b"\xc3\xaf\xc2\xbf\xc2\xad\xc2\xbf",
             b"\xc3\xaf\xc2\xbf\xc2\xad\xc2\xad",
             b"tokenization",
             b"\xc3\xaf\xc2\xbf\xc2\xad.",
         ]
         self.assertAllEqual(actual, np.array(expected_tokens))

Example #14

Show file

 def testPreserveSegTokens(self):
     """preserve_segmented_tokens=True emits the joiner as a separate token."""
     with self.session():
         # NOTE(review): this joiner is \xc3\xaf\xc2\xbf\xc2\xad, which looks
         # like U+FFED (\xef\xbf\xad) UTF-8-encoded twice -- confirm against
         # the op's default joiner.
         joiner_token = b"\xc3\xaf\xc2\xbf\xc2\xad"
         actual = tokenize(
             [b"\xe6\xb8\xac\xe8\xa9\xa6abc"],
             mode="aggressive",
             joiner_annotate=True,
             segment_alphabet=["Han"],
             segment_alphabet_change=True,
             preserve_segmented_tokens=True,
         )
         expected = np.array([
             b"\xe6\xb8\xac",
             joiner_token,
             b"\xe8\xa9\xa6",
             joiner_token,
             b"abc",
         ])
         self.assertAllEqual(actual, expected)

Example #15

Show file

 def _tokenize_tensor(self, text):
     """Call tokenize on text, passing self._config as keyword arguments."""
     options = self._config
     return tokenize(text, **options)