Ejemplo n.º 1
0
 def clean_text(self, origin):
     """Clean tweets: demojize emoji into [name] markers, then run
     HarvestText's cleaner (traditional->simplified, keep weibo @s).

     Args:
         origin: iterable of tweet strings, or of lists of tweet strings;
             nesting is preserved in the result.

     Returns:
         A list mirroring ``origin``'s structure with every string cleaned.
     """
     ht = HarvestText()

     def _clean_one(tweet):
         # Convert emoji to "[name]" tokens so HarvestText can handle them
         # uniformly; t2s normalizes traditional Chinese to simplified.
         return ht.clean_text(
             emojiswitch.demojize(tweet, delimiters=("[", "]")),
             t2s=True,
             weibo_at=False)

     cltweets = []
     for twcl in origin:
         # isinstance (not type(...) ==) so list subclasses are handled too.
         if isinstance(twcl, list):
             cltweets.append([_clean_one(etwcl) for etwcl in twcl])
         else:
             cltweets.append(_clean_one(twcl))
     return cltweets
Ejemplo n.º 2
0
def test_hard_text_cleaning():
    """Regression tests for HarvestText.clean_text on hard weibo samples:
    invisible characters, emoticon markers with content between them,
    emoji, and embedded URLs."""
    ht = HarvestText()
    # Invisible characters — first as literal backslash escapes ("\\u200b"),
    # then below as real U+200B zero-width spaces; both must be stripped.
    text1 = "捧杀!干得漂亮![doge] \\u200b\\u200b\\u200b"
    text2 = ht.clean_text(text1)
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    assert text2 == "捧杀!干得漂亮!"
    text1 = "捧杀!干得漂亮![doge] \u200b\u200b\u200b"
    text2 = ht.clean_text(text1)
    assert text2 == "捧杀!干得漂亮!"
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    # Content between two emoticon markers ([浪] ... [浪]) plus URLs.
    text1 = "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。  http://t.cn/A643boyi \n新浪[浪]用户专享福利,[浪]新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还! http://t.cn/A643bojv  http://t.cn/A643bKHS \u200b\u200b\u200b"
    text2 = ht.clean_text(text1)
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    assert text2 == "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。\n新浪用户专享福利,新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还!"
    # Sample containing real emoji (🙏) alongside [bracket] emoticons.
    text1 = "各位大神们🙏求教一下这是什么动物呀![疑问]\n\n为什么它同时长得有点吓人又有点可爱[允悲]\n\n#thosetiktoks# http://t.cn/A6bXIC44 \u200b\u200b\u200b"
    text2 = ht.clean_text(text1)
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    assert text2 == "各位大神们求教一下这是什么动物呀!\n为什么它同时长得有点吓人又有点可爱\n#thosetiktoks#"
    text1 = "JJ棋牌数据4.3万。数据链接http://www.jj.cn/,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确"
    text2 = ht.clean_text(text1)
    assert text2 == "JJ棋牌数据4.3万。数据链接,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确"
Ejemplo n.º 3
0
def clean_text(file, save_dir):
    """Clean the 'content' field of every record in a JSON dataset and
    save the result under ``save_dir`` with the same file name.

    Args:
        file: path to the input JSON file; the substrings 'train',
            'eval' and 'test' in the path select per-split behavior.
        save_dir: directory the cleaned JSON is written to.
    """
    ht = HarvestText()
    CharTable = pyhanlp.JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    data = read_json(file)
    num_null = 0
    cleaned_data = []
    for i in trange(len(data)):
        # Normalize character variants (full-width etc.) before cleaning.
        content = CharTable.convert(data[i]['content'])
        cleaned_content = remove_url(ht.clean_text(content,
                                                   emoji=False))  # filters up to 6 chars after each @
        num_null += 1 if cleaned_content == '' else 0
        # Only the training split drops originally-empty records and
        # records that became empty after cleaning.
        if 'train' in file and (not content or not cleaned_content):
            continue
        record = {'id': data[i]['id'], 'content': cleaned_content}
        # eval/test records have no label to carry over.
        if not ('eval' in file or 'test' in file):
            record['label'] = data[i]['label']
        cleaned_data.append(record)
    # basename is portable, unlike splitting the path on '/'.
    filename = os.path.basename(file)
    save_json(cleaned_data, os.path.join(save_dir, filename))
    # num_null counts records whose content cleaned down to '' — the old
    # label 'num data: ' was misleading.
    print('num empty: ', num_null)
Ejemplo n.º 4
0
 def clean_cn_text_by_third_party(self, sentence):
     """Clean Chinese text with the third-party HarvestText library.

     Removes weibo @-mentions and emoticons, URLs, emails and other
     special characters found in HTML, after first stripping line
     separators, newlines, tabs and HTML tags.

     Args:
         sentence: raw input string.

     Returns:
         The cleaned string.
     """
     from harvesttext import HarvestText
     ht_obj = HarvestText()
     # Strip the Unicode line separator (U+2028), newlines and tabs first.
     _text = sentence.replace('\u2028', '').replace('\n', '').replace('\t', '')
     # Raw string: '\w' / '\?' in a plain literal are invalid escape
     # sequences (SyntaxWarning on modern Python).
     re_h = re.compile(r'<(/?\w+|!--|!DOCTYPE|\?xml)[^>]*>')
     _text = re_h.sub('', _text)  # drop HTML-like tags
     cleaned = ht_obj.clean_text(_text)  # renamed: avoid shadowing 'clean_text'
     return cleaned
Ejemplo n.º 5
0
def clean_text_whole(original_text):
    """Strip 【...】 tags, dates and times from the text, then run
    HarvestText's general-purpose cleaner over what remains."""
    ht = HarvestText()

    # Removed before the HarvestText pass, in order: bracketed 【...】
    # tags, YYYY-MM-DD dates, hh:mm:ss times, then bare hh:mm times.
    removal_patterns = (
        r'【.*?】',
        r'(\d{4}-\d{2}-\d{2})',
        r'(\d{2}:\d{2}:\d{2})',
        r'(\d{2}:\d{2})',
    )
    for pattern in removal_patterns:
        original_text = re.sub(pattern, '', original_text)

    return ht.clean_text(original_text)
Ejemplo n.º 6
0
def clean_text():
    """Demonstrate HarvestText.clean_text on weibo markup, URLs, emails,
    percent-encoded URLs and HTML entities."""
    print("各种清洗文本")
    ht0 = HarvestText()
    # Each demo: (header to print, sample text, clean_text keyword args).
    demos = [
        # Default settings suit weibo text (@-mentions, emoticons, ...).
        ("清洗微博【@和表情符等】",
         "回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good]",
         {}),
        # URL removal.
        ("清洗网址URL",
         "【#赵薇#:正筹备下一部电影 但不是青春片....http://t.cn/8FLopdQ",
         {"remove_url": True}),
        # Email removal.
        ("清洗邮箱",
         "我的邮箱是[email protected],欢迎联系",
         {"email": True}),
        # Percent-encoded URL back to readable characters.
        ("URL转正常字符",
         "www.%E4%B8%AD%E6%96%87%20and%20space.com",
         {"norm_url": True, "remove_url": False}),
        # Readable characters to a percent-encoded URL.
        ("正常字符转URL[含有中文和空格的request需要注意]",
         "www.中文 and space.com",
         {"to_url": True, "remove_url": False}),
        # HTML entities back to normal characters.
        ("HTML转正常字符",
         "&lt;a c&gt;&nbsp;&#x27;&#x27;",
         {"norm_html": True}),
    ]
    for header, sample, kwargs in demos:
        print(header)
        print("原:", sample)
        print("清洗后:", ht0.clean_text(sample, **kwargs))