Example 1
def clean_text(file, save_dir):
    ht = HarvestText()
    CharTable = pyhanlp.JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    data = read_json(file)
    num_null = 0
    cleaned_data = []
    for i in trange(len(data)):
        content = CharTable.convert(data[i]['content'])
        cleaned_content = remove_url(ht.clean_text(content,
                                                   emoji=False))  # strips at most 6 characters after an @
        num_null += 1 if cleaned_content == '' else 0
        if 'train' in file and (not content or not cleaned_content):
            # drop train samples that are empty originally or become empty after cleaning
            continue
        if 'eval' in file or 'test' in file:
            cleaned_data.append({
                'id': data[i]['id'],
                'content': cleaned_content
            })
        else:
            cleaned_data.append({
                'id': data[i]['id'],
                'content': cleaned_content,
                'label': data[i]['label']
            })
    filename = file.split('/')[-1]
    save_json(cleaned_data, os.path.join(save_dir, filename))
    print('num empty after cleaning: ', num_null)
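Example 1 relies on three helpers that are not shown (read_json, save_json, remove_url). A minimal sketch of what they might look like, assuming the data files are JSON arrays of records and that URLs follow the usual http/https pattern:

import json
import os
import re

def read_json(path):
    # assumed helper: load a JSON array of records
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def save_json(data, path):
    # assumed helper: write records back as JSON, keeping Chinese characters readable
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def remove_url(text):
    # assumed helper: strip any plain http(s) URLs left over after clean_text
    return re.sub(r'https?://\S+', '', text)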
Example 2
def filter_el_with_rule():
    # When the candidate entity set is large, entity linking may return many noisy mentions; a few rules can be used to filter them:
    # 1. POS: mentions made up entirely of verbs (v), adjectives (a), adverbs (d), conjunctions (c), prepositions (p), etc. are usually not entities of interest in the traditional sense
    # 2. Length: mentions of length 1 usually carry too little information
    # Since such rules are highly use-case specific, they are not built into the library but defined externally. This code gives an example:
    def el_filtering(entities_info, ch_pos):
        return [([l, r], (entity0, type0))
                for [l, r], (entity0, type0) in entities_info if not all(
                    bool(re.search("^(v|a|d|c|p|y|z)", pos))
                    for pos in ch_pos[l:r]) and (r - l) > 1]

    ht0 = HarvestText()
    text = "《记得》:谁还记得 是谁先说 永远的爱我"
    entity_mention_dict = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》']
    }
    entity_type_dict = {'记得(歌曲)': '歌名', '我(张国荣演唱歌曲)': '歌名'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    entities_info, ch_pos = ht0.entity_linking(
        text, with_ch_pos=True)  # ch_pos is only returned when with_ch_pos=True is set explicitly
    print("filter_el_with_rule")
    print("Sentence:", text)
    print("Original Entities:", entities_info)
    filtered_entities = el_filtering(entities_info, ch_pos)
    # '我' is filtered out for its length of 1, '记得' for being a pure verb, but '《记得》' contains punctuation and is kept
    print("filtered_entities:", filtered_entities)
Example 3
    def clean_text(self, origin):
        cltweets = []
        ht = HarvestText()
        for twcl in origin:
            if type(twcl) == list:
                cltwcl = []
                for etwcl in twcl:
                    cltwcl.append(
                        ht.clean_text(emojiswitch.demojize(etwcl,
                                                           delimiters=("[", "]")),
                                      t2s=True,
                                      weibo_at=False))
                    # cltweets.append(
                    #     ht.clean_text(emojiswitch.demojize(twcl, delimiters=("[", "]")), t2s=True))
                cltweets.append(cltwcl)
            else:
                cltweets.append(
                    ht.clean_text(emojiswitch.demojize(twcl,
                                                       delimiters=("[", "]")),
                                  t2s=True,
                                  weibo_at=False))
                # cltweets.append(
                #     ht.clean_text(emojiswitch.demojize(twcl, delimiters=("[", "]")), t2s=True))
        # print(cltweets)
        return cltweets
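The method above applies the same two-step pipeline to every tweet: emojiswitch.demojize turns emoji into bracketed tags, then HarvestText's clean_text does the actual cleaning with t2s=True and weibo_at=False. A small standalone sketch of that pipeline on a single made-up string, reusing exactly the calls from the method:

import emojiswitch
from harvesttext import HarvestText

ht = HarvestText()
tweet = "轉發微博 開心😊 //@某人: 哈哈"  # made-up example input
# same pipeline as clean_text above: emoji -> [tags], then HarvestText cleaning
demojized = emojiswitch.demojize(tweet, delimiters=("[", "]"))
print(ht.clean_text(demojized, t2s=True, weibo_at=False))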
Example 4
def test_named_entity_recognition():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    ht0 = HarvestText()
    sent = "上海上港足球队的武磊是中国最好的前锋。"
    print(ht0.named_entity_recognition(sent))

    sys.stdout.close()
    assert open(get_current_function_name()+"_current").read() == expected
Example 5
    def auto_pre_one(self, start, text):
        tt = tkitText.Text()
        # sents=tt.sentence_segmentation_v1(text)
        ht0 = HarvestText()
        sents = ht0.cut_paragraphs(text, 50)

        text_a = start
        li = self.pre(text_a[-200:], sents)

        return li
Example 6
def using_typed_words():
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
    ht0 = HarvestText()
    typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
    ht0.add_typed_words(typed_words)
    print("加载清华领域词典,并使用停用词")
    print("全部类型", typed_words.keys())
    sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sentence)
    print(ht0.posseg(sentence, stopwords=stopwords))
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
Example 7
    def clean_cn_text_by_third_party(self, sentence):
        """
        Clean Chinese text with a third-party library (HarvestText)
        """
        import re
        from harvesttext import HarvestText
        ht_obj = HarvestText()
        # remove Weibo @mentions and emoticons, URLs, emails, HTML special characters, etc.
        _text = sentence.replace('\u2028', '').replace('\n', '').replace('\t', '')
        re_h = re.compile(r'<(/?\w+|!--|!DOCTYPE|\?xml)[^>]*>')
        _text = re_h.sub('', _text)  # strip HTML tags
        clean_text = ht_obj.clean_text(_text)
        return clean_text
Example 8
def clean_text_whole(original_text):
    ht = HarvestText()

    original_text = re.compile(r'【.*?】').sub('', original_text)  # remove 【...】 bracketed tags
    original_text = re.compile(r'(\d{4}-\d{2}-\d{2})').sub(
        '', original_text)  # remove dates
    original_text = re.compile(r'(\d{2}:\d{2}:\d{2})').sub(
        '', original_text)  # remove times
    original_text = re.compile(r'(\d{2}:\d{2})').sub('', original_text)  # remove times
    cleaned_text = ht.clean_text(original_text)

    return cleaned_text
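A brief usage sketch for clean_text_whole on a made-up string, assuming the function above (with re and HarvestText imported) is in scope:

sample = "【快讯】2021-03-24 18:30 记者报道,详情见 http://t.cn/example"
print(clean_text_whole(sample))
# the 【...】 tag, date and time are removed by the regexes above;
# HarvestText's clean_text then handles URLs and other noise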
Example 9
def cut_paragraph():
    print("文本自动分段")
    ht0 = HarvestText()
    text = """备受社会关注的湖南常德滴滴司机遇害案,将于1月3日9时许,在汉寿县人民法院开庭审理。此前,犯罪嫌疑人、19岁大学生杨某淇被鉴定为作案时患有抑郁症,为“有限定刑事责任能力”。
新京报此前报道,2019年3月24日凌晨,滴滴司机陈师傅,搭载19岁大学生杨某淇到常南汽车总站附近。坐在后排的杨某淇趁陈某不备,朝陈某连捅数刀致其死亡。事发监控显示,杨某淇杀人后下车离开。随后,杨某淇到公安机关自首,并供述称“因悲观厌世,精神崩溃,无故将司机杀害”。据杨某淇就读学校的工作人员称,他家有四口人,姐姐是聋哑人。
今日上午,田女士告诉新京报记者,明日开庭时间不变,此前已提出刑事附带民事赔偿,但通过与法院的沟通后获知,对方父母已经没有赔偿的意愿。当时按照人身死亡赔偿金计算共计80多万元,那时也想考虑对方家庭的经济状况。
田女士说,她相信法律,对最后的结果也做好心理准备。对方一家从未道歉,此前庭前会议中,对方提出了嫌疑人杨某淇作案时患有抑郁症的辩护意见。另具警方出具的鉴定书显示,嫌疑人作案时有限定刑事责任能力。
新京报记者从陈师傅的家属处获知,陈师傅有两个儿子,大儿子今年18岁,小儿子还不到5岁。“这对我来说是一起悲剧,对我们生活的影响,肯定是很大的”,田女士告诉新京报记者,丈夫遇害后,他们一家的主劳动力没有了,她自己带着两个孩子和两个老人一起过,“生活很艰辛”,她说,“还好有妹妹的陪伴,现在已经好些了。”"""
    print("原始文本[5段]")
    print(text + "\n")
    print("预测文本[手动设置分3段]")
    predicted_paras = ht0.cut_paragraphs(text, num_paras=3)
    print("\n".join(predicted_paras) + "\n")
Example 10
def clean_text():
    print("各种清洗文本")
    ht0 = HarvestText()
    # the default settings are suitable for cleaning Weibo text
    text1 = "回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good]"
    print("清洗微博【@和表情符等】")
    print("原:", text1)
    print("清洗后:", ht0.clean_text(text1))
    # URL removal
    text1 = "【#赵薇#:正筹备下一部电影 但不是青春片....http://t.cn/8FLopdQ"
    print("清洗网址URL")
    print("原:", text1)
    print("清洗后:", ht0.clean_text(text1, remove_url=True))
    # email removal
    text1 = "我的邮箱是[email protected],欢迎联系"
    print("清洗邮箱")
    print("原:", text1)
    print("清洗后:", ht0.clean_text(text1, email=True))
    # decode URL-escaped characters
    text1 = "www.%E4%B8%AD%E6%96%87%20and%20space.com"
    print("URL转正常字符")
    print("原:", text1)
    print("清洗后:", ht0.clean_text(text1, norm_url=True, remove_url=False))
    text1 = "www.中文 and space.com"
    print("正常字符转URL[含有中文和空格的request需要注意]")
    print("原:", text1)
    print("清洗后:", ht0.clean_text(text1, to_url=True, remove_url=False))
    # decode HTML escape characters
    text1 = "&lt;a c&gt;&nbsp;&#x27;&#x27;"
    print("HTML转正常字符")
    print("原:", text1)
    print("清洗后:", ht0.clean_text(text1, norm_html=True))
Example 11
def test_using_typed_words():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    from harvesttext.resources import get_qh_typed_words,get_baidu_stopwords
    ht0 = HarvestText()
    typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
    ht0.add_typed_words(typed_words)
    print("加载清华领域词典,并使用停用词")
    print("全部类型",typed_words.keys())
    sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sentence)
    print(ht0.posseg(sentence,stopwords=stopwords))
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example 12
def linking_strategy():
    ht0 = HarvestText()
    def test_case(text0,entity_mention_dict,strategy,entity_type_dict=None,**kwargs):
        ht0.add_entities(entity_mention_dict,entity_type_dict)
        ht0.set_linking_strategy(strategy,**kwargs)
        print(ht0.entity_linking(text0))
        ht0.clear()
    # 'latest' strategy example
    test_case('X老师您好。请问老师这题怎么做?',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest"
              )

    test_case('谢谢老师',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest",
              lastest_mention={"老师": "X老师"})

    # 'freq' strategy: single surface form
    test_case('市长',
              entity_mention_dict={"A市长": ["市长"], "B市长": ["长江"]},
              strategy="freq",
              entity_freq={"A市长": 5, "B市长": 3})

    # 'freq' strategy: overlapping surface forms
    test_case('xx市长江yy',
              entity_mention_dict={"xx市长":["xx市长"],"长江yy":["长江yy"]},
              strategy="freq",
              entity_freq={"xx市长":3,"长江yy":5})

    test_case('我叫小沈阳',
              entity_mention_dict={"沈阳": ["沈阳"], "小沈阳": ["小沈阳"]},
              strategy="freq",
              entity_type_dict={"沈阳": "地名", "小沈阳": "人名"},
              type_freq={"地名": -1})
Example 13
def build_word_ego_graph():
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: use a font that can display Chinese
    plt.rcParams['axes.unicode_minus'] = False  # step 2: fix rendering of minus signs on the axes
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords

    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    G = ht0.build_word_ego_graph(docs,
                                 "刘备",
                                 min_freq=3,
                                 other_min_freq=2,
                                 stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
Example 14
def test_build_word_ego_graph():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: use a font that can display Chinese
    plt.rcParams['axes.unicode_minus'] = False  # step 2: fix rendering of minus signs on the axes
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords

    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    G = ht0.build_word_ego_graph(docs,"刘备",min_freq=3,other_min_freq=2,stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G,pos)
    nx.draw_networkx_labels(G,pos)

    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)


    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example 15
def test_hard_text_cleaning():
    ht = HarvestText()
    # invisible characters
    text1 = "捧杀!干得漂亮![doge] \\u200b\\u200b\\u200b"
    text2 = ht.clean_text(text1)
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    assert text2 == "捧杀!干得漂亮!"
    text1 = "捧杀!干得漂亮![doge] \u200b\u200b\u200b"
    text2 = ht.clean_text(text1)
    assert text2 == "捧杀!干得漂亮!"
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    # content between two emoticons
    text1 = "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。  http://t.cn/A643boyi \n新浪[浪]用户专享福利,[浪]新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还! http://t.cn/A643bojv  http://t.cn/A643bKHS \u200b\u200b\u200b"
    text2 = ht.clean_text(text1)
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    assert text2 == "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。\n新浪用户专享福利,新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还!"
    # contains emoji
    text1 = "各位大神们🙏求教一下这是什么动物呀![疑问]\n\n为什么它同时长得有点吓人又有点可爱[允悲]\n\n#thosetiktoks# http://t.cn/A6bXIC44 \u200b\u200b\u200b"
    text2 = ht.clean_text(text1)
    print("清洗前:", [text1])
    print("清洗后:", [text2])
    assert text2 == "各位大神们求教一下这是什么动物呀!\n为什么它同时长得有点吓人又有点可爱\n#thosetiktoks#"
    text1 = "JJ棋牌数据4.3万。数据链接http://www.jj.cn/,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确"
    text2 = ht.clean_text(text1)
    assert text2 == "JJ棋牌数据4.3万。数据链接,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确"
Example 16
def get_seq(text):
    """
    Extract the key content via triple extraction
    """
    ht = HarvestText()
    s = []
    text = tkitText.Text().clear(text)
    for item in ht.triple_extraction(sent=text, standard_name=False, stopwords=None, expand="all"):
        if item == '':
            continue
        # print(' '.join(item))
        # s.append(str(item))
        s.append(''.join(item))
    # s = "。".join(s)
    return s
Example 17
def f():
    from harvesttext import HarvestText
    ht = HarvestText()
    entity_mention_dict = {
        '武磊': ['武磊', '武球王'],
        '郜林': ['郜林', '郜飞机'],
        '前锋': ['前锋'],
        '上海上港': ['上港'],
        '广州恒大': ['恒大'],
        '单刀球': ['单刀']
    }
    entity_type_dict = {
        '武磊': '球员',
        '郜林': '球员',
        '前锋': '位置',
        '上海上港': '球队',
        '广州恒大': '球队',
        '单刀球': '术语'
    }
    ht.add_entities(entity_mention_dict, entity_type_dict)
Example 18
def entity_error_check():
    ht0 = HarvestText()
    typed_words = {"人名":["武磊"]}
    ht0.add_typed_words(typed_words)
    sent0 = "武磊和吴磊拼音相同"
    print(sent0)
    print(ht0.entity_linking(sent0, pinyin_tolerance=0))
    sent1 = "武磊和吴力只差一个拼音"
    print(sent1)
    print(ht0.entity_linking(sent1, pinyin_tolerance=1))
    sent2 = "武磊和吴磊只差一个字"
    print(sent2)
    print(ht0.entity_linking(sent2, char_tolerance=1))
    sent3 = "吴磊和吴力都可能是武磊的代称"
    print(sent3)
    print(ht0.get_linking_mention_candidates(sent3, pinyin_tolerance=1, char_tolerance=1))
Example 19
def depend_parse():
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))
Example 20
def test_depend_parse():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港":["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港":"球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example 21
def test_english():
    # ♪ "Until the Day" by JJ Lin
    test_text = """
    In the middle of the night. 
    Lonely souls travel in time.
    Familiar hearts start to entwine.
    We imagine what we'll find, in another life.  
    """.lower()
    ht_eng = HarvestText(language="en")
    sentences = ht_eng.cut_sentences(test_text)
    print("\n".join(sentences))
    print(ht_eng.seg(sentences[-1]))
    print(ht_eng.posseg(sentences[0], stopwords={"in"}))
    sent_dict = ht_eng.build_sent_dict(sentences, pos_seeds=["familiar"], neg_seeds=["lonely"],
                                       min_times=1, stopwords={'in', 'to'})
    print("Sentiment analysis")
    for sent0 in sentences:
        print(sent0, "%.3f" % ht_eng.analyse_sent(sent0))
    print("Segmentation")
    print("\n".join(ht_eng.cut_paragraphs(test_text, num_paras=2)))
Example 22
def test_entity_error_check():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    ht0 = HarvestText()
    typed_words = {"人名":["武磊"]}
    ht0.add_typed_words(typed_words)
    sent1 = "武磊和吴力只差一个拼音"
    print(sent1)
    print(ht0.entity_linking(sent1, pinyin_recheck=True))
    sent2 = "武磊和吴磊只差一个字"
    print(sent2)
    print(ht0.entity_linking(sent2, char_recheck=True))
    sent3 = "吴磊和吴力都可能是武磊的代称"
    print(sent3)
    print(ht0.get_linking_mention_candidates(sent3, pinyin_recheck=True, char_recheck=True))

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example 23
def test_linking_strategy():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    ht0 = HarvestText()
    def test_case(text0,entity_mention_dict,strategy,entity_type_dict=None,**kwargs):
        ht0.add_entities(entity_mention_dict,entity_type_dict)
        ht0.set_linking_strategy(strategy,**kwargs)
        print(ht0.entity_linking(text0))
        ht0.clear()
    # 'latest' strategy example
    test_case('X老师您好。请问老师这题怎么做?',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest"
              )

    test_case('谢谢老师',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest",
              lastest_mention={"老师": "X老师"})

    # 'freq' strategy: single surface form
    test_case('市长',
              entity_mention_dict={"A市长": ["市长"], "B市长": ["长江"]},
              strategy="freq",
              entity_freq={"A市长": 5, "B市长": 3})

    # 'freq' strategy: overlapping surface forms
    test_case('xx市长江yy',
              entity_mention_dict={"xx市长":["xx市长"],"长江yy":["长江yy"]},
              strategy="freq",
              entity_freq={"xx市长":3,"长江yy":5})

    test_case('我叫小沈阳',
              entity_mention_dict={"沈阳": ["沈阳"], "小沈阳": ["小沈阳"]},
              strategy="freq",
              entity_type_dict={"沈阳": "地名", "小沈阳": "人名"},
              type_freq={"地名": -1})

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example 24
def test_find_with_rules():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
    # some more patterns are provided
    text0 = "我喜欢Python,因为requests库很适合爬虫"
    ht0 = HarvestText()

    found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish()], type0="英文名")
    print(found_entities)
    print(ht0.posseg(text0))
    print(ht0.mention2entity("Python"))


    # Satisfying one of the rules
    ht0.clear()
    found_entities = ht0.find_entity_with_rule(text0,rulesets=[AllEnglish(),Contains("爬")],type0="技术")
    print(found_entities)
    print(ht0.posseg(text0))

    # Satisfying a couple of rules [using tuple]
    ht0.clear()
    found_entities = ht0.find_entity_with_rule(text0, rulesets=[(AllEnglish(),UpperFirst())], type0="专有英文词")
    print(found_entities)
    print(ht0.posseg(text0))

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example 25
#coding=utf-8
import os
import sys
import inspect

def get_current_function_name():
    return os.path.dirname(os.path.abspath(__file__))+"/"+inspect.stack()[1][3]
import _locale
_locale._getdefaultlocale = (lambda *args: ['zh_CN', 'utf8'])

from harvesttext import HarvestText
ht = HarvestText()

def test_new_word_discover():
    sys.stdout = open(get_current_function_name()+"_current","w")
    expected = open(get_current_function_name()+"_expected").read()
    para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
    # returns a set of statistics on new-word quality (as a pd.DataFrame), allowing manual refinement of the selection
    new_words_info = ht.word_discover(para)
    # new_words_info = ht.word_discover(para, threshold_seeds=["武磊"])
    new_words = new_words_info.index.tolist()
    print(new_words)
    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected

def test_new_word_register():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    new_words = ["落叶球","666"]
    ht.add_new_words(new_words)        # register as "new words" in the broad sense
    ht.add_new_entity("落叶球", mention0="落叶球", type0="术语") # register as an entity with a specific type
    print(ht.seg("这个落叶球踢得真是666", return_sent=True))
Example 26
def find_with_rules():
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith
    # some more patterns are provided
    text0 = "我喜欢Python,因为requests库很适合爬虫"
    ht0 = HarvestText()

    found_entities = ht0.find_entity_with_rule(text0,
                                               rulesets=[AllEnglish()],
                                               type0="英文名")
    print(found_entities)
    print(ht0.posseg(text0))
    print(ht0.mention2entity("Python"))

    # Satisfying one of the rules
    ht0.clear()
    found_entities = ht0.find_entity_with_rule(
        text0, rulesets=[AllEnglish(), Contains("爬")], type0="技术")
    print(found_entities)
    print(ht0.posseg(text0))

    # Satisfying a couple of rules [using tuple]
    ht0.clear()
    found_entities = ht0.find_entity_with_rule(text0,
                                               rulesets=[(AllEnglish(),
                                                          UpperFirst())],
                                               type0="专有英文词")
    print(found_entities)
    print(ht0.posseg(text0))
Example 27
class process_news_data:
    def __init__(self, bertTokenizer):
        self.bertTokenizer = bertTokenizer
        self._load_raw_data()
        self.ht = HarvestText()

    def is_chinese(self, uchar):
        # lexicographic check against the basic CJK range (called on whole name strings here)
        return u'\u4e00' <= uchar <= u'\u9fa5'

    def _load_raw_data(self):

        with open("../data/news_data/allNewsFilter.json", 'r') as files:
            self.news_data_0 = json.load(files)

        with open("../data/news_data/430News.json", 'r') as files:
            self.news_data_1 = json.load(files)

        self.news_data = {**self.news_data_0, **self.news_data_1}
        print("Train news_data: {} + {} = {}".format(len(self.news_data_0),
                                                     len(self.news_data_1),
                                                     len(self.news_data)))

        # data_dir = "/home/chenbo/entity_linking/data/news_data/"
        data_dir = "/data1/chenbo/"
        with open(data_dir + "news_name2aid2pid_mongo_2_9_whole.json",
                  'r') as files:
            self.name2aid2pid = json.load(files)

        with open(data_dir + "news_pub_dict_mongo_2_9_whole.json",
                  'r') as files:
            self.pub_dict = json.load(files)

        with open(data_dir + "news2aminer_whole_2_9.json", 'r') as files:
            self.process_news = json.load(files)

        # print("Test news_num: {}".format(len(self.process_news)))

        self.mention2res = []

        for news_id, attr in self.process_news.items():
            mentions = attr["mention2result"]
            for author_name, attr in mentions.items():
                try:
                    name_alias = attr["alias"]
                except:
                    name_alias = ""
                # a_id = attr["ids"]
                a_id = "---".join(attr["ids"])
                # print(author_name)
                tag = self.is_chinese(author_name)
                if (tag == False):
                    # print(author_name.isalpha())
                    self.mention2res.append(
                        (author_name, author_name, a_id, news_id))
                else:
                    # name_en = attr["alias"]
                    # a_id = attr["ids"]
                    # print(attr)
                    # print(name_alias)
                    self.mention2res.append(
                        (author_name, name_alias, a_id, news_id))

        print("Test news_num: {} Test_data: {}".format(len(self.process_news),
                                                       len(self.mention2res)))

    def generate_test_news(self, ins_num):
        # ins_dict = {}
        infos = []
        instance = []
        test_list = self.mention2res
        # random.seed(configs["seed"])
        random.shuffle(test_list)
        count = 0
        err_c = 0
        for news_info in test_list:
            ori_name = news_info[0]
            can_name = news_info[1]
            pos_aid = news_info[2]
            news_id = news_info[-1]
            # print(news_info)
            # if(ori_name != "万小军"):
            # continue
            if (self.name2aid2pid.get(can_name) == None):
                continue
            candidate_authors = set(self.name2aid2pid[can_name].keys())
            # if(pos_aid not in candidate_authors):
            # print("error! --- ", ori_name, can_name)
            # exit()s
            # continue
            if ((len(candidate_authors) - 1) < 1):
                # print("filter_can:", len(candidate_authors))
                continue
            # print("can:", len(candidate_authors))
            candidate_authors = list(candidate_authors)
            pos_aid_set = pos_aid.split("---")
            flag = False
            for ins_pos in pos_aid_set:
                if (ins_pos in candidate_authors):
                    flag = True
            if (flag == False):
                err_c += 1
                # print(err_c, news_info)
                continue
            # try:
            #     candidate_authors.remove(pos_aid)
            # except:
            #     print(news_info)
            #     # exit()
            #     continue

            # neg_author_lists = random.sample(candidate_authors, min(len(candidate_authors), 19))
            each_ins = (ori_name, can_name, pos_aid, candidate_authors,
                        news_id)
            # print(each_ins)
            # exit()
            tag, tokenizer_ins, ins_info = self.test_tokenizer_padding(
                each_ins)
            # ins_dict[ins_info[0]] = ins_info[1]
            if (tag == False):
                continue
            count += 1
            instance.append(tokenizer_ins)
            infos.append(ins_info)
            if (count == ins_num):
                break
        # with open("test_news_info.json", 'w') as files:
        # json.dump(ins_dict, files, indent=4, ensure_ascii=False)
        return instance, infos

    def test_tokenizer_padding(self, each):
        ori_name, can_name, pos_aid, neg_author_lists, news_id = each
        key = ori_name + '-' + can_name + '-' + pos_aid + '-' + news_id

        flag, filter_context, total_filter_context = self.preprocess_text(
            ori_name, self.process_news[news_id]["content"])
        if (flag == False):
            # print("ffffff")
            # print(news_id, ori_name)
            return False, [], []
        news_input_ids_list = []
        news_attention_masks_list = []
        for para in filter_context:
            context_token = self.bertTokenizer.encode_plus(
                para, max_length=configs["test_max_news_each_para_length"])

            input_ids = context_token["input_ids"]
            attention_masks = [1] * len(input_ids)
            # type_ids = [0] *

            # news_input_ids = []
            # news_attention_masks = []

            padding_length = configs["test_max_news_each_para_length"] - len(
                input_ids)
            padding_input_ids = input_ids + [0] * padding_length
            # qa_padding_token_type_ids = qa_token_type_ids + [1] * padding_length
            padding_attention_masks = attention_masks + [0] * padding_length
            news_input_ids_list.append(padding_input_ids)
            news_attention_masks_list.append(padding_attention_masks)
        # context_token = self.bertTokenizer.encode_plus(total_filter_context, max_length = configs["max_news_length"])

        # input_ids = context_token["input_ids"]
        # attention_masks = [1] * len(input_ids)
        # padding_length = configs["max_news_length"] - len(input_ids)
        # padding_input_ids = input_ids + [0] * padding_length
        # # qa_padding_token_type_ids = qa_token_type_ids + [1] * padding_length
        # padding_attention_masks = attention_masks + [0] * padding_length

        # news_input_ids = []
        # news_attention_masks = []
        # news_input_ids.append(padding_input_ids)
        # news_attention_masks.append(padding_attention_masks)

        # pos_per_paper_input_ids, pos_per_paper_attention_masks = self.get_author_encoder(pos_aid, can_name)

        # neg_author
        per_neg_per_paper_input_ids = []
        per_neg_per_paper_attention_masks = []

        for neg_author_id in neg_author_lists:
            neg_per_paper_input_ids, neg_per_paper_attention_masks = self.get_author_encoder(
                neg_author_id, can_name)
            per_neg_per_paper_input_ids.append(neg_per_paper_input_ids)
            per_neg_per_paper_attention_masks.append(
                neg_per_paper_attention_masks)

        print_ids = neg_author_lists
        print_ids.append(pos_aid)
        return True, (news_input_ids_list, news_attention_masks_list,
                      per_neg_per_paper_input_ids,
                      per_neg_per_paper_attention_masks), (key, filter_context,
                                                           print_ids)
        # return True, (news_input_ids, news_attention_masks, pos_per_paper_input_ids, pos_per_paper_attention_masks, per_neg_per_paper_input_ids, per_neg_per_paper_attention_masks)

    def preprocess_text(self, anchor_mention, news_content):
        sentence_list = self.ht.cut_sentences(news_content)
        merge_set = set()
        ne_set = set()
        filter_list = []
        for i, sent in enumerate(sentence_list):
            if (sent.find(anchor_mention) != -1):
                # merge_list.append()
                merge_set.add(i)
                # filter_list.append(sent)
                # merge_set.add()
        if (len(merge_set) == 0):
            return False, filter_list, []
        else:
            for i in merge_set:
                # filter_list.append(sentence_list[])
                filter_list.append((sentence_list[i], i))
                ne_set.add(i)
                for sent_s in range(i - 6, i):
                    if (sent_s >= 0) and (sent_s not in ne_set):
                        filter_list.append((sentence_list[sent_s], sent_s))
                        ne_set.add(sent_s)

                # filter_list.append((sentence_list[i], i))
                # ne_set.add(i)

                # if (i + 3) < len(sentence_list):
                for sent_s in range(i, i + 6):
                    if (sent_s < len(sentence_list)) and (sent_s
                                                          not in ne_set):
                        filter_list.append((sentence_list[sent_s], sent_s))
                        ne_set.add(sent_s)

            sort_filter_sentence = sorted(filter_list, key=itemgetter(1))
            # print("mention: ", anchor_mention)
            # print("merge: ", merge_set)
            # print("sort: ", sort_filter_sentence)
            context_sentence = []
            for context in sort_filter_sentence:
                para = context[0]
                context_sentence.append(para)
            # print("context: ", context_sentence)
            # exit()
            total_context = " ".join(context_sentence)
            # seg_list = jieba.cut(total_context, cut_all=False)
            # seg_list = seg_list[:configs["max_news_length"]]
            # return True, total_context
            return True, context_sentence, total_context

    def get_author_encoder(self, author_id, author_name):
        input_ids_list = []
        attention_masks_list = []

        author_papers = self.name2aid2pid[author_name][author_id]
        # random.seed(configs["seed"])
        random.shuffle(author_papers)
        # sample_papers = random.sample(author_papers, configs["max_papers_each_author"])
        paper_count = 0
        for paper_id in author_papers:
            tag, input_ids, attention_masks = self.paper_encoder(paper_id)
            if (tag == False):
                continue
            input_ids_list.append(input_ids)
            attention_masks_list.append(attention_masks)
            paper_count += 1
            if (paper_count == configs["test_news_max_papers_each_author"]):

                break
        return input_ids_list, attention_masks_list

    def paper_encoder(self, paper_id):
        pid = paper_id.split('-')[0]
        papers_attr = self.pub_dict[pid]
        tag, paper_str = self.get_res_abs(papers_attr)
        if (tag == False):
            return False, [], []
        # print("paper:", paper_str)
        outputs = self.bertTokenizer.encode_plus(
            paper_str, max_length=configs["test_max_paper_length"])
        input_ids = outputs["input_ids"]
        # print(len(input_ids))
        attention_masks = [1] * len(input_ids)
        # type_ids = [0] *

        padding_length = configs["test_max_paper_length"] - len(input_ids)
        padding_input_ids = input_ids + [0] * padding_length
        # qa_padding_token_type_ids = qa_token_type_ids + [1] * padding_length
        padding_attention_masks = attention_masks + [0] * padding_length
        # qa_padding_positions_id = qa_position_ids + [0] * padding_length

        return True, padding_input_ids, padding_attention_masks

    def get_res_abs(self, papers_attr):
        # print(papers_attr)
        name_info = set()
        org_info = set()
        keywords_info = set()
        try:
            title = papers_attr["title"].strip().lower()
        except:
            title = ""

        try:
            venue = papers_attr["venue"].strip().lower()
        except:
            venue = ""

        try:
            keywords = papers_attr["keywords"]
        except:
            keywords = []
        for ins in keywords:
            keywords_info.add(ins.strip().lower())

        for ins_author in papers_attr["authors"]:
            try:
                name = ins_author["name"].strip().lower()
            except:
                name = ""
            if (name != ""):
                name_info.add(name)

            try:
                orgnizations = ins_author["org"].strip().lower()
            except:
                orgnizations = ""
            if (orgnizations.strip().lower() != ""):
                org_info.add(orgnizations)

        name_str = " ".join(name_info).strip()
        org_str = " ".join(org_info).strip()
        keywords_str = " ".join(keywords_info).strip()

        # whole_info = keywords_info
        # whole_info_str = title + ' ' + keywords_str + ' ' + name_str + " " + org_str + ' ' + venue
        whole_info_str = title + ' ' + keywords_str + " " + org_str + ' ' + venue
        # print(whole_info_str)
        if (len(whole_info_str.strip().lower()) == 0):
            return False, ""
        else:
            return True, whole_info_str

    def generate_train_news(self, ins_num, mode):
        # instances_input_ids = []
        # instances_attention_masks = []
        instances = []
        count = 0
        break_tag = False
        # data_num = 0
        if (mode == "TRAIN"):
            news_data = self.news_data
        elif (mode == "TEST"):
            exit()
        else:
            print("ERROR! NO SUCH MODE!")
        news_id_list = list(news_data.keys())
        # random.seed(1)
        random.shuffle(news_id_list)
        for news_id in news_id_list:
            attr = news_data[news_id]
            entity_attr = attr["entity"]
            news_abs = attr["abstract"]
            news_content = attr["content"]
            sentence_list = self.ht.cut_sentences(news_content)
            for i, sent in enumerate(sentence_list):
                # print(sent)
                each_ins = (sent, news_id)
                tag, each_input_ids, each_attention_masks = self.tokenizer_padding(
                    each_ins)
                # skip sentences that failed tokenization before collecting them
                if (tag == False):
                    continue
                instances.append((each_input_ids, each_attention_masks))
                # instances_input_ids.append(each_input_ids)
                # instances_attention_masks.append(each_attention_masks)

                count += 1
                # instances.append(tokenizer_ins)
                if (count == ins_num):
                    break_tag = True
                    break
            if (break_tag == True):
                break

        return instances

    def tokenizer_padding(self, each):

        # text_infos = str(each[-1]) + "-" + str(each[0])
        tag, input_ids, attention_masks = self.tokenizer(each)
        if (tag == False):
            # callers unpack three values, so also return three on failure
            return False, [], []
        # total_data.append((input_ids, attention_masks, text_infos, 0))
        # exit()
        return True, input_ids, attention_masks

    def tokenizer(self, each):

        sent, news_id = each

        # news_input_ids_list = []
        # news_attention_masks_list = []

        context_token = self.bertTokenizer.encode_plus(
            sent, max_length=configs["train_max_news_each_para_length"])

        input_ids = context_token["input_ids"]
        attention_masks = [1] * len(input_ids)
        # type_ids = [0] *
        # print(sent)
        # print(len(input_ids))

        news_input_ids = []
        news_attention_masks = []

        padding_length = configs["train_max_news_each_para_length"] - len(
            input_ids)
        padding_input_ids = input_ids + [0] * padding_length
        # qa_padding_token_type_ids = qa_token_type_ids + [1] * padding_length
        padding_attention_masks = attention_masks + [0] * padding_length
        # news_input_ids_list.append(padding_input_ids)
        # news_attention_masks_list.append(padding_attention_masks)

        # context_token = self.bertTokenizer.encode_plus(filter_context, max_length = configs["max_news_length"])

        # input_ids = context_token["input_ids"]
        # attention_masks = [1] * len(input_ids)
        # # type_ids = [0] *

        # padding_length = configs["max_news_length"] - len(input_ids)
        # padding_input_ids = input_ids + [0] * padding_length
        # # qa_padding_token_type_ids = qa_token_type_ids + [1] * padding_length
        # padding_attention_masks = attention_masks + [0] * padding_length
        # # qa_padding_positions_id = qa_position_ids + [0] * padding_length

        return True, padding_input_ids, padding_attention_masks
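A hedged driver sketch for process_news_data. It assumes the sketch lives in the same module as the class (which reads a module-level configs dict), that the hard-coded JSON data files exist, and that a BERT tokenizer from the transformers package is acceptable; the config values below are placeholders, not values from the original project:

from transformers import BertTokenizer

# placeholder configuration covering the keys the class reads
configs = {
    "test_max_news_each_para_length": 128,
    "train_max_news_each_para_length": 128,
    "test_max_paper_length": 256,
    "test_news_max_papers_each_author": 10,
}

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
processor = process_news_data(tokenizer)
train_instances = processor.generate_train_news(ins_num=1000, mode="TRAIN")
test_instances, test_infos = processor.generate_test_news(ins_num=200)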
Example 28
def el_keep_all():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人", keep_all=True))
Example 29
    def __init__(self, bertTokenizer):
        self.bertTokenizer = bertTokenizer
        self._load_raw_data()
        self.ht = HarvestText()
Example 30
def named_entity_recognition():
    ht0 = HarvestText()
    sent = "上海上港足球队的武磊是中国最好的前锋。"
    print(ht0.named_entity_recognition(sent))