Example #1
def test_build_word_ego_graph():
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Step 1: use a font that can render Chinese (replace the sans-serif default)
    plt.rcParams['axes.unicode_minus'] = False  # Step 2: fix the rendering of minus signs on axes with this font
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords

    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)

    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)


    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
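# The test functions on this page rely on a module-level helper named
# get_current_function_name(), which is not shown in any example. A minimal
# sketch of one possible implementation using the standard inspect module
# (an assumption for illustration, not the project's actual helper):
import inspect

def get_current_function_name():
    # Return the name of the calling function, e.g. "test_build_word_ego_graph"
    return inspect.stack()[1].function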
Example #2
def build_word_ego_graph():
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Step 1: use a font that can render Chinese (replace the sans-serif default)
    plt.rcParams['axes.unicode_minus'] = False  # Step 2: fix the rendering of minus signs on axes with this font
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords

    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    G = ht0.build_word_ego_graph(docs,
                                 "刘备",
                                 min_freq=3,
                                 other_min_freq=2,
                                 stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
Example #3
def filter_el_with_rule():
    # When the candidate entity set is large, entity linking can return many noisy mentions,
    # and simple rules can be used to filter them out:
    # 1. POS: mentions made up entirely of verbs (v), adjectives (a), adverbs (d), conjunctions (c), prepositions (p), etc. are usually not entities of interest
    # 2. Length: mentions of length 1 usually carry too little information
    # Since such rules are highly use-case specific, they are not built into the library but defined externally. This code gives one example:
    import re  # needed for the POS-prefix check in el_filtering below
    def el_filtering(entities_info, ch_pos):
        return [([l, r], (entity0, type0))
                for [l, r], (entity0, type0) in entities_info if not all(
                    bool(re.search("^(v|a|d|c|p|y|z)", pos))
                    for pos in ch_pos[l:r]) and (r - l) > 1]

    ht0 = HarvestText()
    text = "《记得》:谁还记得 是谁先说 永远的爱我"
    entity_mention_dict = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》']
    }
    entity_type_dict = {'记得(歌曲)': '歌名', '我(张国荣演唱歌曲)': '歌名'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    entities_info, ch_pos = ht0.entity_linking(
        text, with_ch_pos=True)  # ch_pos is only returned when with_ch_pos=True is set explicitly
    print("filter_el_with_rule")
    print("Sentence:", text)
    print("Original Entities:", entities_info)
    filtered_entities = el_filtering(entities_info, ch_pos)
    # "我" is filtered out for its length, and "记得" for being a pure verb, but "《记得》" includes the title marks and is kept
    print("filtered_entities:", filtered_entities)
Example #4
def depend_parse():
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))
Example #5
def test_depend_parse():
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港":["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港":"球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Example #6
def f():
    from harvesttext import HarvestText
    ht = HarvestText()
    entity_mention_dict = {
        '武磊': ['武磊', '武球王'],
        '郜林': ['郜林', '郜飞机'],
        '前锋': ['前锋'],
        '上海上港': ['上港'],
        '广州恒大': ['恒大'],
        '单刀球': ['单刀']
    }
    entity_type_dict = {
        '武磊': '球员',
        '郜林': '球员',
        '前锋': '位置',
        '上海上港': '球队',
        '广州恒大': '球队',
        '单刀球': '术语'
    }
    ht.add_entities(entity_mention_dict, entity_type_dict)
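# Example #6 above only configures the entity dictionaries and stops there. A
# minimal sketch of how the configured instance might then be used, re-using
# the sample sentence from Examples #4/#5 (the calls below mirror other
# examples on this page and are an illustration, not part of the original snippet):
def f_usage_sketch():
    from harvesttext import HarvestText
    ht = HarvestText()
    ht.add_entities({'武磊': ['武磊', '武球王'], '上海上港': ['上港']},
                    {'武磊': '球员', '上海上港': '球队'})
    para = "上港的武磊武球王是中国最好的前锋。"
    # Entity-aware segmentation, as in Example #9
    print(ht.seg(para, return_sent=True))
    # Link mentions in the text back to canonical entities
    print(ht.entity_linking(para))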
Example #7
def el_keep_all():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人", keep_all=True))
Example #8
if __name__ == '__main__':
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False 
    warnings.filterwarnings("ignore")

    # add_mention_dict = False  # controls whether aliases are analyzed (alias detection does not work well yet and has not been wired into the model; setting this to True will raise an error)

    ht = HarvestText()
    # Load the Tsinghua open-domain lexicon and add it to the named-entity dictionary
    entity_type_dict = establish_qh_dict()
    # Load the crawled data
    text_data = get_data()
    # Enrich the dictionary
    ht, entity_type_dict = enrich_type_dict(text_data, ht, entity_type_dict)
    # Build the alias (mention) dictionary
    # if add_mention_dict:
    #     entity_mention_dict = get_mention_dict(text_data, ht)
    #     # Add the named-entity dictionary to the model
    #     ht.add_entities(entity_type_dict = entity_type_dict, entity_mention_dict = entity_mention_dict)
    # else:
    ht.add_entities(entity_type_dict = entity_type_dict)
    # Iterate over all crawled pages and extract triples
    SVOs = extract_triple(text_data, ht, entity_type_dict)
    # Plot the results
    show_picture(SVOs)
    # Print all extracted triples
    print(SVOs)
    # Save the data
    save_data()
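# The pipeline above calls several helpers (establish_qh_dict, get_data,
# enrich_type_dict, extract_triple, show_picture, save_data) that are defined
# elsewhere and not shown here. A hypothetical sketch of what extract_triple
# might look like, built on ht.cut_sentences (Examples #1/#2) and
# ht.triple_extraction (Examples #4/#5); the input format and return value are assumptions:
def extract_triple_sketch(text_data, ht, entity_type_dict):
    SVOs = []
    for doc in text_data:
        # Split each crawled page into sentences, then collect (subject, verb, object) triples
        for sent in ht.cut_sentences(doc):
            SVOs.extend(ht.triple_extraction(sent))
    return SVOs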
Example #9
'''
@Software:   PyCharm
@File    :   test.py
@Time    :   2020/3/30 8:46 PM
@Desc    :
'''

from harvesttext import HarvestText
ht = HarvestText()


para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
entity_mention_dict = {'武磊':['武磊','武球王'],'郜林':['郜林','郜飞机'],'前锋':['前锋'],'上海上港':['上港'],'广州恒大':['恒大'],'单刀球':['单刀']}
entity_type_dict = {'武磊':'球员','郜林':'球员','前锋':'位置','上海上港':'球队','广州恒大':'球队','单刀球':'术语'}
ht.add_entities(entity_mention_dict,entity_type_dict)
print("\nSentence segmentation")
print(ht.seg(para, return_sent=True))    # with return_sent=False, a list of words is returned instead



# New entities can be added to the existing entity store at any time, e.g. stragglers picked up by new-word discovery
ht.add_new_entity("颜骏凌", "颜骏凌", "球员")
docs = ["武磊和颜骏凌是队友",
		"武磊和郜林都是国内顶尖前锋"]
G = ht.build_entity_graph(docs)
print(dict(G.edges.items()))
G = ht.build_entity_graph(docs, used_types=["球员"])
print(dict(G.edges.items()))
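# build_entity_graph returns a networkx graph (its edges are inspected via
# G.edges.items() above), so it can be drawn just like the ego graphs in
# Example #2. A small sketch; the layout and font settings are illustrative choices:
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # render the Chinese node labels
plt.rcParams['axes.unicode_minus'] = False
pos = nx.spring_layout(G)
nx.draw(G, pos)
nx.draw_networkx_labels(G, pos)
plt.show()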