Example no. 1
0
 def test_tokenizer(self) -> None:
     """Check ScriptWordTokenizer output with and without lowercasing.

     Tokens are (text, start, end) triples where start/end are character
     offsets into the raw input string.
     """
     # Lowercasing tokenizer: token text is folded, offsets track the raw input.
     lowered = ScriptWordTokenizer(True)
     self.assertEqual(
         lowered.tokenize("Order me a coffee"),
         [("order", 0, 5), ("me", 6, 8), ("a", 9, 10), ("coffee", 11, 17)],
     )
     # Runs of whitespace are skipped; offsets still point at the raw positions.
     self.assertEqual(
         lowered.tokenize("Order    me    a     coffee"),
         [("order", 0, 5), ("me", 9, 11), ("a", 15, 16),
          ("coffee", 21, 27)],
     )
     # Single-token input.
     self.assertEqual(lowered.tokenize("Order"), [("order", 0, 5)])
     # Case-preserving tokenizer: original casing survives.
     preserved = ScriptWordTokenizer(False)
     self.assertEqual(
         preserved.tokenize("Order me a coffee"),
         [("Order", 0, 5), ("me", 6, 8), ("a", 9, 10), ("coffee", 11, 17)],
     )
     # Empty input yields no tokens.
     self.assertEqual(preserved.tokenize(""), [])
Example no. 2
0
 def torchscriptify(self):
     """Return a TorchScript-compatible tokenizer equivalent to this one.

     Only the whitespace-splitting configuration can be exported, since
     ScriptWordTokenizer implements whitespace splitting only.

     Returns:
         A ScriptWordTokenizer mirroring this tokenizer's lowercasing setting.

     Raises:
         NotImplementedError: if ``split_regex`` is anything other than
             whitespace (``r"\s+"``).
     """
     # torchscriptify only supports the space-splitting tokenizer.
     if self.split_regex == r"\s+":
         return ScriptWordTokenizer(self.lowercase)
     # Bug fix: the original evaluated the bare name `NotImplementedError`
     # without raising it, so unsupported configurations silently returned
     # None instead of failing loudly.
     raise NotImplementedError(
         "torchscriptify only supports the whitespace-splitting tokenizer"
     )
Example no. 3
0
 def test_tokenizer(self, raw_token, lowercase, result) -> None:
     """Parametrized check: tokenizing ``raw_token`` yields ``result``.

     ``lowercase`` selects whether the tokenizer folds token text to
     lower case before comparison.
     """
     self.assertEqual(
         ScriptWordTokenizer(lowercase).tokenize(raw_token),
         result,
     )