def test_keep_with_cache():
    """Check that two identical ``keep`` calls both give the expected text.

    The input mixes Chinese text, a URL, digits and ASCII letters.  With
    ``[CHINESE, URL]`` the expected result holds only the Chinese runs and
    the URL, separated by single spaces.
    """
    source_text = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    wanted = u'点击 http://t.cn/RtU0mZ1 查看'

    # Same arguments on both passes; judging by the test name, the second
    # pass is presumably served from a cache inside ``keep``.
    for _ in range(2):
        assert wanted == keep(source_text, [CHINESE, URL])
Beispiel #2
0
def chinese_cleaners(text):
    """Filter, normalize and whitespace-collapse *text*, printing both ends.

    Steps, in order:

    1. UTF-8 encode/decode round trip.
    2. ``keep`` with the ASCII, general punctuation, symbols-and-punctuation,
       Chinese, and Chinese symbols-and-punctuation categories.
    3. NFKC normalization via ``unicodedata.normalize``.
    4. ``remove`` with the Chinese symbols-and-punctuation category.
    5. ``collapse_whitespace``.

    The input and the final result are printed to stdout.  Returns the
    cleaned text.
    """
    print("before chinese_cleaners= ", text)

    # On a Python 3 ``str`` this round trip returns an equal string; it
    # raises if the text cannot be encoded as UTF-8 (e.g. lone surrogates).
    round_tripped = text.encode('UTF-8').decode('UTF-8')

    filtered = keep(
        round_tripped,
        [
            ASCII,
            GENERAL_PUNCTUATION,
            SYMBOLS_AND_PUNCTUATION,
            CHINESE,
            CHINESE_SYMBOLS_AND_PUNCTUATION,
        ],
    )

    # Normalization runs between the keep and remove passes, so the remove
    # pass sees the NFKC form of whatever the keep pass let through.
    normalized = unicodedata.normalize('NFKC', filtered)
    without_cjk_punct = remove(normalized, [CHINESE_SYMBOLS_AND_PUNCTUATION])
    cleaned = collapse_whitespace(without_cjk_punct)

    print("after chinese_cleaners= ", cleaned)
    return cleaned
def test_keep():
    """``keep`` with only ``CHINESE`` should yield just the Chinese runs.

    The URL, the digits and the ASCII word in the input are all expected
    to be absent from the result.
    """
    source_text = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    wanted = u'点击 查看'

    outcome = keep(source_text, [CHINESE])
    assert wanted == outcome
def CleanText(text):
    """Reduce *text* to space-separated tokens with tags and digits removed.

    Processing order:

    1. ``keep`` with ``[ASCII]``.
    2. ``URL.remove`` on the result.
    3. Replace with a space every match of: ``#`` followed by
       alphanumerics, ``@`` followed by alphanumerics, ``$`` followed by
       word characters, any single ``# @ $ " |`` character, or a run of
       digits.
    4. Split on whitespace and re-join with single spaces, which also
       trims leading and trailing whitespace.

    Returns the cleaned string.
    """
    result = keep(text, [ASCII])
    result = URL.remove(result)

    # Raw string on purpose: as a normal literal, ``\#``, ``\@``, ``\$`` and
    # ``\w`` are invalid escape sequences (a SyntaxWarning on Python 3.12+).
    # The pattern value itself is unchanged.
    noise = r'(\#[a-zA-Z0-9]+)|(\@[A-Za-z0-9]+)|\$(\w+)|([#@$"|])|([0-9]+)'

    return ' '.join(re.sub(noise, " ", result).split())
Beispiel #5
0
    def clean(self, text):
        """Remove unwanted patterns from *text*, then keep only allowed classes.

        ``remove`` runs first with RESTRICT_URL, ESCAPED_WHITESPACE and
        WECHAT_EMOJI.  ``keep`` then runs on that result with
        CHINESE_CHARACTER, ALPHA and DIGIT.  The final string is returned
        with leading and trailing whitespace stripped.
        """
        # NOTE(review): a commented-out variant used URL in place of
        # RESTRICT_URL here; presumably the restricted pattern is intended.
        unwanted = [RESTRICT_URL, ESCAPED_WHITESPACE, WECHAT_EMOJI]
        allowed = [CHINESE_CHARACTER, ALPHA, DIGIT]

        without_noise = remove(text, unwanted)
        return keep(without_noise, allowed).strip()
Beispiel #6
0
def test_keep_with_cache():
    """Call ``keep`` twice with equal arguments and expect equal output.

    With ``[CHINESE, URL]`` the expected result holds the Chinese runs and
    the URL from the input, joined by single spaces.
    """
    source_text = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    wanted = u'点击 http://t.cn/RtU0mZ1 查看'

    first_pass = keep(source_text, [CHINESE, URL])
    assert wanted == first_pass

    # Repeat with a fresh, equal argument list; judging by the test name
    # this call is presumably answered from a cache.
    second_pass = keep(source_text, [CHINESE, URL])
    assert wanted == second_pass
Beispiel #7
0
def test_keep():
    """Expect only the Chinese runs to remain when keeping ``CHINESE``."""
    source_text = '点击http://t.cn/RtU0mZ1 查看,123456,test'
    wanted = u'点击 查看'

    outcome = keep(source_text, [CHINESE])

    assert wanted == outcome