def _clean_text(self, text) -> Tuple[List[str], List[str]]:
    """Clean *text* while keeping a raw-preserving parallel output.

    Unlike ``transformers.tokenization_bert.BertTokenizer._clean_text``,
    control characters are also emitted as a single space so that the
    cleaned output keeps the same total length as the input; the raw
    (original) characters are returned alongside.

    :param text: the input string to clean
    :return: ``(cleaned_pieces, raw_pieces)`` — two parallel lists whose
        concatenated lengths are equal; cleaned pieces replace each
        control/whitespace/invalid char with ``" "``, raw pieces keep the
        original char.
    """
    output = []
    raw_rsv_output = []
    token = ''
    for char in text:
        cp = ord(char)
        if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(char):
            # Flush any accumulated run of ordinary characters first.
            if len(token) > 0:
                output.append(token)
                raw_rsv_output.append(token)
                token = ''
            output.append(" ")
            # BUGFIX: keep the raw output parallel to the cleaned output —
            # the replaced character must be preserved on the raw side,
            # otherwise the length-equality invariant below is violated.
            raw_rsv_output.append(char)
        else:
            token += char
    if len(token) > 0:
        output.append(token)
        raw_rsv_output.append(token)
    # Invariant: cleaned and raw outputs cover the same number of characters.
    assert sum(len(piece) for piece in output) == sum(
        len(piece) for piece in raw_rsv_output)
    return output, raw_rsv_output
def test_is_whitespace(self):
    """_is_whitespace accepts space/tab/CR/LF and NBSP, rejects visible chars."""
    for ch in (u" ", u"\t", u"\r", u"\n", u"\u00A0"):
        self.assertTrue(_is_whitespace(ch))
    for ch in (u"A", u"-"):
        self.assertFalse(_is_whitespace(ch))
def strip_whitespace(tokens: List[RawRsvSimpleToken]) -> List[RawRsvSimpleToken]:
    """Re-tokenize *tokens* by whitespace, dropping the whitespace characters.

    Consecutive non-whitespace characters (across the incoming token
    boundaries) are merged into a single new token; their positions and
    raw characters are carried over.
    """
    result: List[RawRsvSimpleToken] = []
    open_new = True
    for tok in tokens:
        for ch, idx, raw in tok:
            if _is_whitespace(ch):
                # Whitespace ends the current run; the next visible char
                # starts a fresh token.
                open_new = True
                continue
            if open_new:
                result.append(RawRsvSimpleToken())
                open_new = False
            current = result[-1]
            current.text += ch
            current.pos_ids.append(idx)
            current.raw_text += raw
    return result
def process_inner_text(inner_text: List[str]) -> List[str]:
    """Split the FIRST string of *inner_text* into pieces.

    Each whitespace character becomes its own piece; runs of
    non-whitespace characters are merged into one piece.
    NOTE: only ``inner_text[0]`` is processed — the remaining elements,
    if any, are ignored (matches the original behavior).
    """
    pieces: List[str] = []
    start_new = True
    for ch in inner_text[0]:
        if _is_whitespace(ch):
            pieces.append(ch)
            start_new = True
        elif start_new:
            pieces.append(ch)
            start_new = False
        else:
            pieces[-1] += ch
    return pieces
def _clean_token(self, token: RawRsvSimpleToken) -> List[RawRsvSimpleToken]:
    """Performs invalid character removal and whitespace cleanup on text.

    Invalid (NUL, U+FFFD), control, and whitespace characters each become
    a standalone single-space token that remembers the original character;
    runs of ordinary characters are merged into one token.

    :param token: token yielding ``(char, pos_idx, raw_char)`` triples
    :return: list of cleaned tokens with positions and raw text preserved
    """
    output = []
    is_start = True
    for char, idx, raw_char in token:
        cp = ord(char)
        if cp == 0 or cp == 0xFFFD or _is_control(char) or _is_whitespace(char):
            # BUGFIX: preserve the ORIGINAL character (raw_char) — it was
            # unpacked but previously never used, so the raw text mapping
            # was silently dropped (cf. strip_whitespace, which threads
            # raw_char through correctly).
            output.append(RawRsvSimpleToken(text=" ", pos_ids=[idx], raw_text=raw_char))
            is_start = True
        else:
            if is_start:
                output.append(RawRsvSimpleToken())
                is_start = False
            output[-1].text += char
            output[-1].pos_ids.append(idx)
            output[-1].raw_text += raw_char
    return output