def reset_space_after_punctuation(self):
    """Re-join tokens that follow single-character punctuation.

    Walks ``self._error_tokens`` and strips the WordPiece ``"##"``
    continuation prefix from any token whose immediate predecessor is a
    one-character punctuation token, so the token no longer renders as a
    word-join after that punctuation. Mutates the list in place.
    """
    tokens = self._error_tokens  # alias: mutations below hit self._error_tokens
    for idx in range(1, len(tokens)):
        if not tokens[idx].startswith("##"):
            continue
        prev = tokens[idx - 1]
        if len(prev) == 1 and _is_punctuation(prev):
            tokens[idx] = tokens[idx][2:]
def detokenize_char(tokenizer, char_token):
    """Map one character-level WordPiece token back to surface text.

    Args:
        tokenizer: tokenizer object providing ``cls_token``; only read for
            the special-token membership check below.
        char_token: a single token string produced by the tokenizer.

    Returns:
        The surface form of the token: ``"##"`` continuation pieces have the
        marker stripped (they join the previous token with no space), special
        tokens and punctuation pass through unchanged, and every other token
        is prefixed with a single space.
    """
    if char_token.startswith("##"):
        # Continuation piece: attaches to the previous token with no space.
        return char_token[2:]
    # Set literal instead of set([...]) — no throwaway list per call.
    # NOTE(review): only cls_token comes from the tokenizer; the rest are
    # hard-coded BERT special-token strings — assumes a standard vocab.
    if char_token in {tokenizer.cls_token, '[SEP]', '[MASK]', '[PAD]', '[UNK]'}:
        return char_token
    if _is_punctuation(char_token):
        return char_token
    return " " + char_token
def test_is_punctuation(self):
    """Spot-check _is_punctuation on punctuation and non-punctuation chars."""
    for ch in (u"-", u"$", u"`", u"."):
        self.assertTrue(tokenization._is_punctuation(ch))
    for ch in (u"A", u" "):
        self.assertFalse(tokenization._is_punctuation(ch))