Code example #1
 def testCaseFeature(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"Tom"],
                      mode="conservative",
                      case_feature=True),
             np.array([b"tom"])
         )
Code example #2
 def testJoinerAnnotate(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"Hello!"],
                      mode="conservative",
                      joiner_annotate=True),
             np.array([b"Hello", b"\xc3\xaf\xc2\xbf\xc2\xad!"])
         )
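The joiner annotation above is the same behavior exposed by the pyonmttok Python API, where the default joiner is U+FFED ("\xef\xbf\xad"). Below is a minimal sketch of the equivalent call outside TensorFlow; it assumes pyonmttok is installed and that tokenize returns a (tokens, features) pair, as in pyonmttok 1.x.

# Sketch only: assumes the pyonmttok package (OpenNMT Tokenizer bindings).
import pyonmttok

tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)

# Punctuation glued to a word receives the joiner U+FFED ("\xef\xbf\xad").
tokens, _ = tokenizer.tokenize("Hello!")
print(tokens)                        # expected: ['Hello', '￭!']

# The joiner makes detokenization lossless.
print(tokenizer.detokenize(tokens))  # expected: 'Hello!'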
Code example #3
 def testCaseMarkup(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"Tom"],
                      mode="conservative",
                      case_markup=True),
             np.array([b"\xef\xbd\x9fmrk_case_modifier_C\xef\xbd\xa0", b"tom"])
         )
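The case markup tokens (wrapped in the placeholder brackets ｟ and ｠) record the original casing so that it can be restored when detokenizing. A minimal sketch with pyonmttok, under the assumption that its detokenizer interprets the mrk_case_modifier_C token exactly as produced above:

# Sketch only: assumes the pyonmttok package; restoring case at
# detokenization time is the purpose of the case_markup option.
import pyonmttok

tokenizer = pyonmttok.Tokenizer("conservative", case_markup=True)

tokens, _ = tokenizer.tokenize("Tom")
print(tokens)                        # expected: ['｟mrk_case_modifier_C｠', 'tom']

# Detokenization applies the recorded case modifier back to the token.
print(tokenizer.detokenize(tokens))  # expected: 'Tom'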
Code example #4
 def testSegmentAlphabetChange(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"\xe6\xb8\xac\xe8\xa9\xa6abc"],
                      mode="conservative",
                      segment_alphabet_change=True),
             np.array([b"\xe6\xb8\xac\xe8\xa9\xa6", b"abc"])
         )
Code example #5
 def testSegmentNumbers(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"1234"],
                      mode="aggressive",
                      segment_numbers=True),
             np.array([b"1", b"2", b"3", b"4"])
         )
Code example #6
 def testSegmentAlphabet(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"abcd"],
                      mode="conservative",
                      segment_alphabet=["Latin"]),
             np.array([b"a", b"b", b"c", b"d"])
         )
Code example #7
 def testSegmentCase(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"WiFi"],
                      mode="conservative",
                      segment_case=True),
             np.array([b"Wi", b"Fi"])
         )
Code example #8
 def testSpacerAnnotate(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"Hello world"],
                      mode="conservative",
                      spacer_annotate=True),
             np.array([b"Hello", b"\xe2\x96\x81world"])
         )
Code example #9
 def testJoinerCustom(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"Hello!"],
                      mode="conservative",
                      joiner_annotate=True,
                      joiner="@@"),
             np.array([b"Hello", b"@@!"])
         )
Code example #10
 def testPreservePlaceholders(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"Hello \xef\xbd\x9fWorld\xef\xbd\xa0"],
                      mode="conservative",
                      joiner_annotate=True,
                      preserve_placeholders=True),
             np.array([b"Hello", b"\xef\xbd\x9fWorld\xef\xbd\xa0"])
         )
Code example #11
 def testSoftCaseRegions(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"U.N"],
                      mode="conservative",
                      case_markup=True,
                      soft_case_regions=True),
             np.array([b"\xef\xbd\x9fmrk_begin_case_region_U\xef\xbd\xa0",
                       b"u.",
                       b"n",
                       b"\xef\xbd\x9fmrk_end_case_region_U\xef\xbd\xa0"])
         )
Code example #12
    def testModes(self):
        with self.session():
            self.assertAllEqual(
                tokenize([b"Mary-Ann is here."], mode="conservative"),
                np.array([b"Mary-Ann", b"is", b"here", b"."])
            )

            self.assertAllEqual(
                tokenize([b"Mary-Ann is here."], mode="aggressive"),
                np.array([b"Mary", b"-", b"Ann", b"is", b"here", b"."])
            )

            self.assertAllEqual(
                tokenize([b"Tom"], mode="char"),
                np.array([b"T", b"o", b"m"])
            )

            self.assertAllEqual(
                tokenize([b"Mary-Ann is here."], mode="space"),
                np.array([b"Mary-Ann", b"is", b"here."])
            )
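The four modes above only differ in how aggressively the input is split: conservative keeps intra-word hyphens, aggressive splits on any punctuation, char emits one token per character, and space splits on whitespace only. A quick way to compare them side by side is a plain pyonmttok loop (a sketch; assumes pyonmttok is installed and that tokenize returns a (tokens, features) pair):

# Sketch only: compare the four tokenization modes on the same sentence.
import pyonmttok

sentence = "Mary-Ann is here."
for mode in ("conservative", "aggressive", "char", "space"):
    tokenizer = pyonmttok.Tokenizer(mode)
    tokens, _ = tokenizer.tokenize(sentence)
    print(f"{mode:>12}: {tokens}")

# Expected, per the assertions above:
#   conservative: ['Mary-Ann', 'is', 'here', '.']
#     aggressive: ['Mary', '-', 'Ann', 'is', 'here', '.']
#           char: one token per character
#          space: ['Mary-Ann', 'is', 'here.']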
Code example #13
 def testSupportPriorJoiners(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"pre\xc3\xaf\xc2\xbf\xc2\xad tokenization."],
                      mode="aggressive",
                      joiner_annotate=True,
                      support_prior_joiners=True),
             np.array([b"pre\xc3\xaf",
                       b"\xc3\xaf\xc2\xbf\xc2\xad\xc2\xbf",
                       b"\xc3\xaf\xc2\xbf\xc2\xad\xc2\xad",
                       b"tokenization",
                       b"\xc3\xaf\xc2\xbf\xc2\xad."])
         )
Code example #14
 def testPreserveSegTokens(self):
     with self.session():
         self.assertAllEqual(
             tokenize([b"\xe6\xb8\xac\xe8\xa9\xa6abc"],
                      mode="aggressive",
                      joiner_annotate=True,
                      segment_alphabet=["Han"],
                      segment_alphabet_change=True,
                      preserve_segmented_tokens=True),
             np.array([b"\xe6\xb8\xac",
                       b"\xc3\xaf\xc2\xbf\xc2\xad",
                       b"\xe8\xa9\xa6",
                       b"\xc3\xaf\xc2\xbf\xc2\xad",
                       b"abc"])
         )
Code example #15
 def _tokenize_tensor(self, text):
     return tokenize(text, **self._config)
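Code example #15 is a small helper that forwards a shared option dictionary (self._config) to the tokenize op; the original file does not show how _config is filled. The sketch below is a hypothetical, runnable analogue built on pyonmttok: the function name tokenize_text and the option values are illustrative assumptions, not part of the original tests.

# Hypothetical sketch: a config-driven helper analogous to _tokenize_tensor.
# Assumes the pyonmttok package; the option names mirror the keyword
# arguments used by the tokenize op in the examples above.
import pyonmttok

_config = {"mode": "aggressive", "joiner_annotate": True}  # illustrative values


def tokenize_text(text, config=_config):
    # Build a tokenizer from the shared options and tokenize a single string.
    options = {k: v for k, v in config.items() if k != "mode"}
    tokenizer = pyonmttok.Tokenizer(config["mode"], **options)
    tokens, _ = tokenizer.tokenize(text)
    return tokens


print(tokenize_text("Hello!"))  # expected: ['Hello', '￭!']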