def test_is_control(self):
        """_is_control accepts a control char and rejects printable/whitespace chars."""
        self.assertTrue(_is_control(u"\u0005"))
        # Letters and whitespace must not be classified as control characters.
        for ch in (u"A", u" ", u"\t", u"\r"):
            self.assertFalse(_is_control(ch))
# Example #2
# 0
 def _clean_text(self, text) -> Tuple[List[str], List[str]]:
     """Split *text* into tokens, mapping control/invalid chars to spaces.

     Compared with transformers.tokenization_bert.BertTokenizer._clean_text,
     control characters are also emitted (as spaces) instead of being dropped,
     so the cleaned output keeps the same total length as the input, and the
     original text is preserved in a parallel list.

     :param text: input string to clean
     :return: (cleaned_tokens, raw_tokens) — two lists whose concatenated
         lengths are equal; control/whitespace/NUL/U+FFFD characters become
         " " on the cleaned side and keep their original character on the
         raw side.
     """
     output = []
     raw_rsv_output = []
     token = ''
     for char in text:
         cp = ord(char)
         if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(
                 char):
             # Flush the token accumulated so far before emitting the
             # separator.  (Bug fix: the original flushed on EVERY loop
             # iteration, duplicating the running token per character.)
             if len(token) > 0:
                 output.append(token)
                 raw_rsv_output.append(token)
                 token = ''
             output.append(" ")
             # Keep the original character on the raw side so both lists
             # stay length-aligned.  (Bug fix: the original appended
             # nothing here, so the length assertion below failed for any
             # input containing whitespace or control characters.)
             raw_rsv_output.append(char)
         else:
             token += char
     # Flush the trailing token once the input is exhausted.
     if len(token) > 0:
         output.append(token)
         raw_rsv_output.append(token)
     assert sum(len(token) for token in output) == sum(
         len(token) for token in raw_rsv_output)
     return output, raw_rsv_output
 def _clean_token(self, token: RawRsvSimpleToken) -> List[RawRsvSimpleToken]:
     """Performs invalid character removal and whitespace cleanup on text.

     Iterates the (char, idx, raw_char) triples of *token*: NUL, U+FFFD,
     control and whitespace characters each become a single-space token
     (keeping their raw character), while runs of ordinary characters are
     merged into one token.

     :param token: token yielding (cleaned char, position id, raw char) triples
     :return: list of cleaned RawRsvSimpleToken pieces
     """
     output = []
     is_start = True
     for char, idx, raw_char in token:
         cp = ord(char)
         if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(char):
             # Bug fix: store raw_char (not the cleaned char) so the raw
             # side actually preserves the original text; previously the
             # unpacked raw_char was never used.
             output.append(RawRsvSimpleToken(text=" ", pos_ids=[idx], raw_text=raw_char))
             is_start = True
         else:
             if is_start:
                 output.append(RawRsvSimpleToken())
                 is_start = False
             output[-1].text += char
             output[-1].pos_ids.append(idx)
             # Bug fix: accumulate raw_char (was `char`), mirroring the
             # whitespace branch above.
             output[-1].raw_text += raw_char
     return output