import tokenization  # BERT's tokenization.py


def customize_tokenizer(text, do_lower_case=False):
  # BasicTokenizer is instantiated only for its _is_chinese_char helper.
  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    # Surround every Chinese character, punctuation mark, whitespace, and
    # control character with spaces so that split() isolates it as a token.
    if (tokenizer._is_chinese_char(ord(c)) or tokenization._is_punctuation(c)
        or tokenization._is_whitespace(c) or tokenization._is_control(c)):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()  # so we end up with a list of tokens here
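
# Hedged usage sketch (assumes BERT's tokenization.py is importable; the
# _demo name is hypothetical): every CJK character and punctuation mark
# becomes its own token, while contiguous ASCII letters stay together.
def _demo_customize_tokenizer():
  tokens = customize_tokenizer(u"早上好, world!")
  assert tokens == [u"早", u"上", u"好", u",", u"world", u"!"]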
def _joinTokens_orig(self, example):
  tokens = []
  for t0i, token0 in enumerate(example.tokens0):
    if token0.startswith("##"):
      # Glue a "##" subword piece back onto the preceding word: pop the
      # separator space token(s) before it, unless the last real token ends
      # in punctuation.
      while len(tokens) > 0 and tokens[-1] == " " and not (
          len(tokens) > 1 and tokenizationOrig._is_punctuation(tokens[-2][-1])):
        tokens.pop()
      token0 = token0[2:]
    tokens.append(token0)
  text = "".join(tokens)
  return text
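
# Hedged sketch of what _joinTokens_orig does. The `example` container and
# the explicit u" " separator tokens in tokens0 are assumptions here, implied
# by the tokens[-1] == " " check above:
#
#   example.tokens0 = [u"play", u" ", u"##ing", u" ", u"chess"]
#   self._joinTokens_orig(example)  # -> u"playing chess"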
def test_is_punctuation(self):
  self.assertTrue(tokenization._is_punctuation(u"-"))
  self.assertTrue(tokenization._is_punctuation(u"$"))
  self.assertTrue(tokenization._is_punctuation(u"`"))
  self.assertTrue(tokenization._is_punctuation(u"."))
  self.assertFalse(tokenization._is_punctuation(u"A"))
  self.assertFalse(tokenization._is_punctuation(u" "))
def customize_tokenizer(text, do_lower_case=True):
  # Variant of customize_tokenizer above: uses a module-level
  # _is_chinese_char helper instead of instantiating a BasicTokenizer,
  # and lowercases by default.
  temp_x = ""
  text = tokenization.convert_to_unicode(text)
  for c in text:
    if (_is_chinese_char(ord(c)) or tokenization._is_punctuation(c)
        or tokenization._is_whitespace(c) or tokenization._is_control(c)):
      temp_x += " " + c + " "
    else:
      temp_x += c
  if do_lower_case:
    temp_x = temp_x.lower()
  return temp_x.split()
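
# Hedged usage sketch for this variant (the module-level _is_chinese_char is
# assumed to match BasicTokenizer._is_chinese_char; the _demo name is
# hypothetical): with do_lower_case=True the ASCII word is lowercased after
# the spacing pass.
def _demo_customize_tokenizer_lower():
  tokens = customize_tokenizer(u"Hello, 世界")
  assert tokens == [u"hello", u",", u"世", u"界"]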
def _is_chinese_or_punctuation(ch):
  # _is_chinese_char expects a Unicode code point (note the ord() at the
  # call sites above), while _is_punctuation expects the character itself,
  # so the two checks need different arguments.
  return _is_chinese_char(ord(ch)) or _is_punctuation(ch)
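
# Quick sanity checks for the fixed helper (illustrative only; assumes the
# BERT definitions of _is_chinese_char and _is_punctuation; the _demo name
# is hypothetical):
def _demo_is_chinese_or_punctuation():
  assert _is_chinese_or_punctuation(u"中")     # CJK ideograph
  assert _is_chinese_or_punctuation(u",")     # ASCII punctuation
  assert not _is_chinese_or_punctuation(u"a")  # plain Latin letter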