def test_build_word_ego_graph():
    sys.stdout, expected = open(get_current_function_name() + "_current", "w"), open(get_current_function_name() + "_expected").read()
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: switch the sans-serif font so Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False    # step 2: fix the minus-sign rendering on axes with negative values
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords
    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    # Ego graph of the words co-occurring with "刘备"
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    # Ego graph of the entities co-occurring with "刘备"
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
def build_word_ego_graph():
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: switch the sans-serif font so Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False    # step 2: fix the minus-sign rendering on axes with negative values
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords
    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    # Ego graph of the words co-occurring with "刘备"
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
    # Ego graph of the entities co-occurring with "刘备"
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
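# A minimal sketch for headless runs (e.g. CI), where plt.show() has no display to
# open: save the current figure to disk instead. Only standard matplotlib is used;
# the function name and filename are illustrative.
def save_ego_graph_figure(filename="liubei_ego_graph.png"):
    import matplotlib.pyplot as plt
    # Write whatever has been drawn onto the current figure, then release it
    plt.savefig(filename, dpi=150, bbox_inches="tight")
    plt.close()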
def filter_el_with_rule():
    # When the candidate entity set is large, entity linking may return many noisy
    # mentions. Rules can be used to filter them out:
    # 1. POS: spans made up entirely of verbs (v), adjectives (a), adverbs (d),
    #    conjunctions (c), prepositions (p), etc. are usually not entities of interest.
    # 2. Length: mentions of length 1 usually carry too little information.
    # Since such rules are highly application-specific, they are not built into the
    # library but defined externally. This code gives an example:
    def el_filtering(entities_info, ch_pos):
        return [([l, r], (entity0, type0)) for [l, r], (entity0, type0) in entities_info
                if not all(bool(re.search("^(v|a|d|c|p|y|z)", pos)) for pos in ch_pos[l:r])
                and (r - l) > 1]
    ht0 = HarvestText()
    text = "《记得》:谁还记得 是谁先说 永远的爱我"
    entity_mention_dict = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》']
    }
    entity_type_dict = {'记得(歌曲)': '歌名', '我(张国荣演唱歌曲)': '歌名'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    entities_info, ch_pos = ht0.entity_linking(text, with_ch_pos=True)  # ch_pos is only returned when with_ch_pos=True is set explicitly
    print("filter_el_with_rule")
    print("Sentence:", text)
    print("Original Entities:", entities_info)
    filtered_entities = el_filtering(entities_info, ch_pos)
    # "我" is filtered out for being too short, and "记得" for being a pure verb,
    # but "《记得》" includes the punctuation marks and is kept.
    print("filtered_entities:", filtered_entities)
def depend_parse():
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # Print the dependency arcs, then the extracted (subject, verb, object) triples
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))
def test_depend_parse():
    sys.stdout, expected = open(get_current_function_name() + "_current", "w"), open(get_current_function_name() + "_expected").read()
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))
    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
def f():
    from harvesttext import HarvestText
    ht = HarvestText()
    entity_mention_dict = {
        '武磊': ['武磊', '武球王'],
        '郜林': ['郜林', '郜飞机'],
        '前锋': ['前锋'],
        '上海上港': ['上港'],
        '广州恒大': ['恒大'],
        '单刀球': ['单刀']
    }
    entity_type_dict = {
        '武磊': '球员',
        '郜林': '球员',
        '前锋': '位置',
        '上海上港': '球队',
        '广州恒大': '球队',
        '单刀球': '术语'
    }
    ht.add_entities(entity_mention_dict, entity_type_dict)
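# A minimal, self-contained sketch of how a dictionary like the one built in f()
# above can be put to use. The sample sentence and function name are assumptions;
# seg(..., return_sent=True) is used the same way as later in this section.
def f_usage_sketch():
    from harvesttext import HarvestText
    ht = HarvestText()
    ht.add_entities({'武磊': ['武磊', '武球王']}, {'武磊': '球员'})
    # The registered mention "武球王" is recognized and segmented as one token
    print(ht.seg("武球王是中国最好的前锋", return_sent=True))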
def el_keep_all():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # keep_all=True returns every candidate for an ambiguous mention instead of one best match
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人", keep_all=True))
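# For contrast with keep_all=True above, a sketch of the default behaviour, where
# entity_linking resolves an ambiguous mention to a single candidate. The function
# name is illustrative; entity_linking itself is used as elsewhere in this section.
def el_keep_best():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # Without keep_all, each mention is linked to at most one entity
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人"))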
if __name__ == '__main__':
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    warnings.filterwarnings("ignore")
    # add_mention_dict = False  # controls whether to analyze aliases (alias analysis works poorly and
    #                           # has not been wired into the model; setting this to True raises an error)
    ht = HarvestText()
    # Load the Tsinghua open-domain lexicon and add it to the named-entity dictionary
    entity_type_dict = establish_qh_dict()
    # Load the crawled data
    text_data = get_data()
    # Enrich the dictionary
    ht, entity_type_dict = enrich_type_dict(text_data, ht, entity_type_dict)
    # Build the alias (mention) dictionary
    # if add_mention_dict:
    #     entity_mention_dict = get_mention_dict(text_data, ht)
    #     # Add the named-entity dictionary to the model
    #     ht.add_entities(entity_type_dict=entity_type_dict, entity_mention_dict=entity_mention_dict)
    # else:
    ht.add_entities(entity_type_dict=entity_type_dict)
    # Traverse all the pages and extract triples
    SVOs = extract_triple(text_data, ht, entity_type_dict)
    # Show the plot
    show_picture(SVOs)
    # Print all the triples
    print(SVOs)
    # Save the data
    save_data()
@Software: PyCharm
@File    : test.py
@Time    : 2020/3/30 8:46 PM
@Desc    :
'''
from harvesttext import HarvestText

ht = HarvestText()
para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
entity_mention_dict = {'武磊': ['武磊', '武球王'], '郜林': ['郜林', '郜飞机'], '前锋': ['前锋'],
                       '上海上港': ['上港'], '广州恒大': ['恒大'], '单刀球': ['单刀']}
entity_type_dict = {'武磊': '球员', '郜林': '球员', '前锋': '位置',
                    '上海上港': '球队', '广州恒大': '球队', '单刀球': '术语'}
ht.add_entities(entity_mention_dict, entity_type_dict)
print("\nSentence segmentation")
print(ht.seg(para, return_sent=True))  # with return_sent=False, a list of words is returned instead

# New entities can be added on top of the existing entity library at any time,
# e.g. stragglers picked up by new-word discovery
ht.add_new_entity("颜骏凌", "颜骏凌", "球员")
docs = ["武磊和颜骏凌是队友", "武磊和郜林都是国内顶尖前锋"]
G = ht.build_entity_graph(docs)
print(dict(G.edges.items()))
G = ht.build_entity_graph(docs, used_types=["球员"])
print(dict(G.edges.items()))
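# A small sketch for inspecting the entity graph beyond printing raw edges: rank
# the entity pairs by their co-occurrence weight. This assumes edges carry a
# 'weight' attribute (falling back to 1 if absent); the helper name is illustrative.
def print_top_edges(G, k=5):
    edges = sorted(G.edges(data=True), key=lambda e: e[2].get("weight", 1), reverse=True)
    for u, v, data in edges[:k]:
        print(u, v, data.get("weight", 1))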