def test_build_word_ego_graph():
    sys.stdout, expected = open(get_current_function_name() + "_current", "w"), open(get_current_function_name() + "_expected").read()
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: switch the sans-serif font so Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False    # step 2: fix the minus-sign rendering on axes with negative values
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords
    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    # Ego graph of the words co-occurring with "刘备"
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    # Ego graph of the entities co-occurring with "刘备"
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
def build_word_ego_graph():
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: switch the sans-serif font so Chinese labels render
    plt.rcParams['axes.unicode_minus'] = False    # step 2: fix the minus-sign rendering on axes with negative values
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords
    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    sanguo1 = get_sanguo()[0]
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(sanguo1)
    # Ego graph of the words co-occurring with "刘备"
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
    # Ego graph of the entities co-occurring with "刘备"
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
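# A minimal sketch for headless runs (e.g. CI), where plt.show() has no display to
# open: save the current figure to disk instead. Only standard matplotlib is used;
# the function name and filename are illustrative.
def save_ego_graph_figure(filename="liubei_ego_graph.png"):
    import matplotlib.pyplot as plt
    # Write whatever has been drawn onto the current figure, then release it
    plt.savefig(filename, dpi=150, bbox_inches="tight")
    plt.close()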
def filter_el_with_rule():
    # When the candidate entity set is large, entity linking may return many noisy
    # mentions. Rules can be used to filter them out:
    # 1. POS: spans made up entirely of verbs (v), adjectives (a), adverbs (d),
    #    conjunctions (c), prepositions (p), etc. are usually not entities of interest.
    # 2. Length: mentions of length 1 usually carry too little information.
    # Since such rules are highly application-specific, they are not built into the
    # library but defined externally. This code gives an example:
    def el_filtering(entities_info, ch_pos):
        return [([l, r], (entity0, type0)) for [l, r], (entity0, type0) in entities_info
                if not all(bool(re.search("^(v|a|d|c|p|y|z)", pos)) for pos in ch_pos[l:r])
                and (r - l) > 1]
    ht0 = HarvestText()
    text = "《记得》:谁还记得 是谁先说 永远的爱我"
    entity_mention_dict = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》']
    }
    entity_type_dict = {'记得(歌曲)': '歌名', '我(张国荣演唱歌曲)': '歌名'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    entities_info, ch_pos = ht0.entity_linking(text, with_ch_pos=True)  # ch_pos is only returned when with_ch_pos=True is set explicitly
    print("filter_el_with_rule")
    print("Sentence:", text)
    print("Original Entities:", entities_info)
    filtered_entities = el_filtering(entities_info, ch_pos)
    # "我" is filtered out for being too short, and "记得" for being a pure verb,
    # but "《记得》" includes the punctuation marks and is kept.
    print("filtered_entities:", filtered_entities)
def depend_parse():
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # Print the dependency arcs, then the extracted (subject, verb, object) triples
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))
def test_depend_parse():
    sys.stdout, expected = open(get_current_function_name() + "_current", "w"), open(get_current_function_name() + "_expected").read()
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    for arc in ht0.dependency_parse(para):
        print(arc)
    print(ht0.triple_extraction(para))
    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
def f():
    from harvesttext import HarvestText
    ht = HarvestText()
    entity_mention_dict = {
        '武磊': ['武磊', '武球王'],
        '郜林': ['郜林', '郜飞机'],
        '前锋': ['前锋'],
        '上海上港': ['上港'],
        '广州恒大': ['恒大'],
        '单刀球': ['单刀']
    }
    entity_type_dict = {
        '武磊': '球员',
        '郜林': '球员',
        '前锋': '位置',
        '上海上港': '球队',
        '广州恒大': '球队',
        '单刀球': '术语'
    }
    ht.add_entities(entity_mention_dict, entity_type_dict)
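# A minimal, self-contained sketch of how a dictionary like the one built in f()
# above can be put to use. The sample sentence and function name are assumptions;
# seg(..., return_sent=True) is used the same way as later in this section.
def f_usage_sketch():
    from harvesttext import HarvestText
    ht = HarvestText()
    ht.add_entities({'武磊': ['武磊', '武球王']}, {'武磊': '球员'})
    # The registered mention "武球王" is recognized and segmented as one token
    print(ht.seg("武球王是中国最好的前锋", return_sent=True))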
def el_keep_all():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # keep_all=True returns every candidate for an ambiguous mention instead of one best match
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人", keep_all=True))
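# For contrast with keep_all=True above, a sketch of the default behaviour, where
# entity_linking resolves an ambiguous mention to a single candidate. The function
# name is illustrative; entity_linking itself is used as elsewhere in this section.
def el_keep_best():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # Without keep_all, each mention is linked to at most one entity
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人"))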
if __name__ == '__main__':
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    warnings.filterwarnings("ignore")
    # add_mention_dict = False  # controls whether to analyze aliases (alias analysis works poorly and
    #                           # has not been wired into the model; setting this to True raises an error)
    ht = HarvestText()
    # Load the Tsinghua open-domain lexicon and add it to the named-entity dictionary
    entity_type_dict = establish_qh_dict()
    # Load the crawled data
    text_data = get_data()
    # Enrich the dictionary
    ht, entity_type_dict = enrich_type_dict(text_data, ht, entity_type_dict)
    # Build the alias (mention) dictionary
    # if add_mention_dict:
    #     entity_mention_dict = get_mention_dict(text_data, ht)
    #     # Add the named-entity dictionary to the model
    #     ht.add_entities(entity_type_dict=entity_type_dict, entity_mention_dict=entity_mention_dict)
    # else:
    ht.add_entities(entity_type_dict=entity_type_dict)
    # Traverse all the pages and extract triples
    SVOs = extract_triple(text_data, ht, entity_type_dict)
    # Show the plot
    show_picture(SVOs)
    # Print all the triples
    print(SVOs)
    # Save the data
    save_data()
@Software: PyCharm
@File    : test.py
@Time    : 2020/3/30 8:46 PM
@Desc    :
'''
from harvesttext import HarvestText

ht = HarvestText()
para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
entity_mention_dict = {'武磊': ['武磊', '武球王'], '郜林': ['郜林', '郜飞机'], '前锋': ['前锋'],
                       '上海上港': ['上港'], '广州恒大': ['恒大'], '单刀球': ['单刀']}
entity_type_dict = {'武磊': '球员', '郜林': '球员', '前锋': '位置',
                    '上海上港': '球队', '广州恒大': '球队', '单刀球': '术语'}
ht.add_entities(entity_mention_dict, entity_type_dict)
print("\nSentence segmentation")
print(ht.seg(para, return_sent=True))  # with return_sent=False, a list of words is returned instead

# New entities can be added on top of the existing entity library at any time,
# e.g. stragglers picked up by new-word discovery
ht.add_new_entity("颜骏凌", "颜骏凌", "球员")
docs = ["武磊和颜骏凌是队友", "武磊和郜林都是国内顶尖前锋"]
G = ht.build_entity_graph(docs)
print(dict(G.edges.items()))
G = ht.build_entity_graph(docs, used_types=["球员"])
print(dict(G.edges.items()))
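# A small sketch for inspecting the entity graph beyond printing raw edges: rank
# the entity pairs by their co-occurrence weight. This assumes edges carry a
# 'weight' attribute (falling back to 1 if absent); the helper name is illustrative.
def print_top_edges(G, k=5):
    edges = sorted(G.edges(data=True), key=lambda e: e[2].get("weight", 1), reverse=True)
    for u, v, data in edges[:k]:
        print(u, v, data.get("weight", 1))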