def test_keep_with_cache():
    """Call ``keep`` twice with identical arguments and expect the same text.

    The input mixes Chinese text, a URL, digits and ASCII words; with the
    ``CHINESE`` and ``URL`` patterns only the Chinese runs and the URL are
    expected to remain.
    """
    source = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    wanted = u'点击 http://t.cn/RtU0mZ1 查看'

    # NOTE(review): the second, identical call presumably hits an internal
    # cache inside ``keep`` (per the test name) -- confirm against keep().
    for _ in range(2):
        assert wanted == keep(source, [CHINESE, URL])
def chinese_cleaners(text):
    """Run the Chinese text-cleaning pipeline on ``text`` and return the result.

    Steps, in order:
      1. round-trip the string through UTF-8,
      2. ``keep`` only the listed pattern classes,
      3. apply Unicode NFKC normalization,
      4. ``remove`` the ``CHINESE_SYMBOLS_AND_PUNCTUATION`` class,
      5. pass the text through ``collapse_whitespace``.

    The input and the output are both printed to stdout.
    """
    print("before chinese_cleaners= ", text)

    # Encoding to UTF-8 raises on text that cannot be encoded (e.g. lone
    # surrogates); otherwise the round trip returns an equal string.
    round_tripped = text.encode('UTF-8').decode('UTF-8')

    allowed_classes = [
        ASCII,
        GENERAL_PUNCTUATION,
        SYMBOLS_AND_PUNCTUATION,
        CHINESE,
        CHINESE_SYMBOLS_AND_PUNCTUATION,
    ]
    filtered = keep(round_tripped, allowed_classes)

    # Normalize first, then drop the Chinese symbol/punctuation class.
    normalized = unicodedata.normalize('NFKC', filtered)
    without_cn_punct = remove(normalized, [CHINESE_SYMBOLS_AND_PUNCTUATION])

    cleaned = collapse_whitespace(without_cn_punct)
    print("after chinese_cleaners= ", cleaned)
    return cleaned
def test_keep():
    """``keep`` with only the ``CHINESE`` pattern should leave the Chinese runs."""
    source = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    wanted = u'点击 查看'

    actual = keep(source, [CHINESE])

    assert wanted == actual
def CleanText(text):
    """Strip tags, mentions, symbols and digits from ``text``.

    Processing order:
      1. ``keep(text, [ASCII])`` -- presumably retains only ASCII content
         (confirm against ``keep``).
      2. ``URL.remove(...)`` -- presumably strips URLs (confirm against ``URL``).
      3. Replace each of the following with a single space:
         ``#word`` hashtags, ``@word`` mentions, ``$word`` tokens, any
         remaining ``#``, ``@``, ``$``, ``"`` or ``|`` character, and runs
         of digits.
      4. Collapse all whitespace runs to single spaces and trim both ends.

    Returns:
        The cleaned string.
    """
    result = keep(text, [ASCII])
    result = URL.remove(result)

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (``\#``, ``\@``, ``\$``, ``\w``), which emit a warning on
    # modern Python. The regex itself is unchanged.
    expression = r'(\#[a-zA-Z0-9]+)|(\@[A-Za-z0-9]+)|\$(\w+)|([#@$"|])|([0-9]+)'

    # split()/join normalizes the whitespace left behind by the substitutions.
    return ' '.join(re.sub(expression, " ", result).split())
def clean(self, text):
    """Remove unwanted patterns from ``text``, then keep only allowed classes.

    First ``remove`` is applied with ``RESTRICT_URL``, ``ESCAPED_WHITESPACE``
    and ``WECHAT_EMOJI``; then ``keep`` is applied with ``CHINESE_CHARACTER``,
    ``ALPHA`` and ``DIGIT``. The result is returned with leading and trailing
    whitespace stripped.
    """
    # NOTE: an earlier variant removed URL instead of RESTRICT_URL.
    patterns_to_remove = [RESTRICT_URL, ESCAPED_WHITESPACE, WECHAT_EMOJI]
    patterns_to_keep = [CHINESE_CHARACTER, ALPHA, DIGIT]

    stripped_of_noise = remove(text, patterns_to_remove)
    return keep(stripped_of_noise, patterns_to_keep).strip()
def test_keep_with_cache():
    """Two identical ``keep`` calls must both return the expected text.

    With the ``CHINESE`` and ``URL`` patterns, the Chinese runs and the URL
    of the sample input are expected to remain.
    """
    sample = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    target = u'点击 http://t.cn/RtU0mZ1 查看'

    first = keep(sample, [CHINESE, URL])
    assert target == first

    # NOTE(review): repeated call presumably served from a cache inside
    # ``keep`` (suggested by the test name) -- confirm.
    second = keep(sample, [CHINESE, URL])
    assert target == second
def test_keep():
    """Only the Chinese runs should remain when keeping ``CHINESE`` alone."""
    sample = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    target = u'点击 查看'

    assert target == keep(sample, [CHINESE])