Code example #1
def reset_space_after_punctuation(self):
    # Drop the "##" continuation marker from any token that directly follows
    # a single-character punctuation token, so that detokenization puts a
    # space back in after the punctuation mark.
    for token_idx in range(1, len(self._error_tokens)):
        prev_token = self._error_tokens[token_idx - 1]
        if (self._error_tokens[token_idx].startswith("##")
                and len(prev_token) == 1 and _is_punctuation(prev_token)):
            self._error_tokens[token_idx] = self._error_tokens[token_idx][2:]
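A quick before/after illustration of the effect; the token list below is a made-up example and `reset_space_after_punctuation_list` is a hypothetical standalone rewrite of the same logic (the original mutates `self._error_tokens` in place):

def reset_space_after_punctuation_list(tokens):
    # Same rule as above, expressed over a plain list of char tokens.
    for i in range(1, len(tokens)):
        if (tokens[i].startswith("##")
                and len(tokens[i - 1]) == 1 and _is_punctuation(tokens[i - 1])):
            tokens[i] = tokens[i][2:]
    return tokens

# The "##" after the "." is removed, so detokenize_char (below) will
# re-insert a space after the full stop.
assert reset_space_after_punctuation_list(["A", "##B", ".", "##C"]) == ["A", "##B", ".", "C"]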
Code example #2
def detokenize_char(tokenizer, char_token):
    # "##" marks a WordPiece continuation character: strip the marker and
    # attach it to the previous character without a space.
    if char_token.startswith("##"):
        return char_token[2:]
    # Special tokens and punctuation are emitted as-is, with no leading space.
    if char_token in {tokenizer.cls_token, "[SEP]", "[MASK]", "[PAD]", "[UNK]"}:
        return char_token
    if _is_punctuation(char_token):
        return char_token
    # Any other word-initial character starts a new word, so prepend a space.
    return " " + char_token
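A minimal usage sketch, joining char-level tokens back into a string. `_StubTokenizer` and the sample token list are illustrative stand-ins (the stub only exposes the one attribute `detokenize_char` reads), not part of the original code:

class _StubTokenizer:
    # Stand-in for a real BERT tokenizer; only cls_token is needed here.
    cls_token = "[CLS]"

char_tokens = ["a", "c", "##a", "##t", "."]
text = "".join(detokenize_char(_StubTokenizer(), t) for t in char_tokens).strip()
print(text)  # -> "a cat."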
Code example #3
    # _is_punctuation should accept ASCII symbols and reject letters and spaces.
    def test_is_punctuation(self):
        self.assertTrue(tokenization._is_punctuation(u"-"))
        self.assertTrue(tokenization._is_punctuation(u"$"))
        self.assertTrue(tokenization._is_punctuation(u"`"))
        self.assertTrue(tokenization._is_punctuation(u"."))

        self.assertFalse(tokenization._is_punctuation(u"A"))
        self.assertFalse(tokenization._is_punctuation(u" "))
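The `_is_punctuation` helper used by all three snippets is not shown above. A minimal re-implementation consistent with the assertions in this test (non-alphanumeric ASCII symbols such as "-", "$", "`", "." are punctuation, while letters and spaces are not) might look like the sketch below; the exact body in `tokenization.py` may differ:

import unicodedata

def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # Count every non-alphanumeric ASCII character as punctuation, so that
    # symbols like "$" and "`" qualify even though Unicode does not place
    # them in a P* category.
    if ((33 <= cp <= 47) or (58 <= cp <= 64) or
            (91 <= cp <= 96) or (123 <= cp <= 126)):
        return True
    # For everything else, defer to the Unicode general category.
    return unicodedata.category(char).startswith("P")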