def entity_error_check():
    """Print entity-linking results for near-miss spellings of one name.

    Registers "武磊" under the type "人名" and then prints, for three sample
    sentences, each sentence followed by what the linker returns:

    1. ``entity_linking`` with ``pinyin_recheck=True``,
    2. ``entity_linking`` with ``char_recheck=True``,
    3. ``get_linking_mention_candidates`` with both flags enabled.
    """
    linker = HarvestText()
    linker.add_typed_words({"人名": ["武磊"]})

    # Sample text says the two names differ only in pinyin.
    pinyin_sentence = "武磊和吴力只差一个拼音"
    print(pinyin_sentence)
    pinyin_result = linker.entity_linking(pinyin_sentence, pinyin_recheck=True)
    print(pinyin_result)

    # Sample text says the two names differ by a single character.
    char_sentence = "武磊和吴磊只差一个字"
    print(char_sentence)
    char_result = linker.entity_linking(char_sentence, char_recheck=True)
    print(char_result)

    # Both rechecks enabled; this call lists mention candidates.
    alias_sentence = "吴磊和吴力都可能是武磊的代称"
    print(alias_sentence)
    candidates = linker.get_linking_mention_candidates(
        alias_sentence,
        pinyin_recheck=True,
        char_recheck=True,
    )
    print(candidates)
def filter_el_with_rule():
    """Show how to post-filter entity-linking output with custom rules.

    When the candidate entity set is large, the mentions returned by entity
    linking may contain a lot of noise, and rules can be used to prune them:

    1. Part of speech: mentions made up entirely of verbs (v), adjectives
       (a), adverbs (d), conjunctions (c), prepositions (p) and the like are
       usually not entities one cares about.
    2. Length: a mention of length 1 usually carries too little information.

    Because such rules are highly customisable they are not built into the
    library; this function gives an example of defining them externally.
    """

    def el_filtering(entities_info, ch_pos):
        """Keep mentions longer than one position that are not all v/a/d/c/p/y/z."""
        kept = []
        for (start, end), (entity, entity_type) in entities_info:
            span_tags = ch_pos[start:end]
            # Rule 1: every tag in the span starts with a filtered POS prefix.
            if all(bool(re.search("^(v|a|d|c|p|y|z)", tag)) for tag in span_tags):
                continue
            # Rule 2: the mention must span more than one position.
            if not (end - start) > 1:
                continue
            kept.append(([start, end], (entity, entity_type)))
        return kept

    lyric = "《记得》:谁还记得 是谁先说 永远的爱我"
    mentions_by_entity = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》'],
    }
    types_by_entity = {
        '记得(歌曲)': '歌名',
        '我(张国荣演唱歌曲)': '歌名',
    }

    linker = HarvestText()
    linker.add_entities(mentions_by_entity, types_by_entity)

    # ch_pos is only returned when with_ch_pos=True is set explicitly.
    linked, pos_tags = linker.entity_linking(lyric, with_ch_pos=True)

    print("filter_el_with_rule")
    print("Sentence:", lyric)
    print("Original Entities:", linked)

    # "我" is dropped for its length and "记得" for being a pure verb, but
    # "《记得》" includes punctuation and therefore is not filtered out.
    survivors = el_filtering(linked, pos_tags)
    print("filtered_entities:", survivors)
def test_entity_error_check():
    """Golden-file test for the pinyin/char recheck entity-linking demo.

    Everything printed by the scenario is written to ``<name>_current`` and
    compared with the contents of ``<name>_expected``, where ``<name>`` is
    whatever ``get_current_function_name()`` returns (presumably this
    function's own name; the helper is defined elsewhere).

    Unlike a bare ``sys.stdout = open(...)``, the previous ``sys.stdout`` is
    always restored, even when the scenario raises, and every file handle is
    closed, so later tests and the test runner keep a usable stdout.

    Raises:
        AssertionError: if the captured output differs from the expected file.
    """
    current_path = get_current_function_name() + "_current"
    expected_path = get_current_function_name() + "_expected"

    # Files are opened with the platform default encoding, as before, so
    # existing fixture files keep being read the same way.
    with open(expected_path) as expected_file:
        expected = expected_file.read()

    saved_stdout = sys.stdout
    with open(current_path, "w") as captured:
        sys.stdout = captured
        try:
            ht0 = HarvestText()
            typed_words = {"人名": ["武磊"]}
            ht0.add_typed_words(typed_words)

            sent1 = "武磊和吴力只差一个拼音"
            print(sent1)
            print(ht0.entity_linking(sent1, pinyin_recheck=True))

            sent2 = "武磊和吴磊只差一个字"
            print(sent2)
            print(ht0.entity_linking(sent2, char_recheck=True))

            sent3 = "吴磊和吴力都可能是武磊的代称"
            print(sent3)
            print(ht0.get_linking_mention_candidates(
                sent3, pinyin_recheck=True, char_recheck=True))
        finally:
            # Restore stdout before the capture file is closed so nothing
            # can ever print to a closed handle.
            sys.stdout = saved_stdout

    with open(current_path) as current_file:
        assert current_file.read() == expected
def el_keep_all():
    """Print the result of linking one mention shared by two entities.

    Two entities, '李娜1' (type '运动员') and '李娜2' (type '歌手'), are
    registered with the same mention '李娜'; a sentence that uses the mention
    twice is then linked with ``keep_all=True`` and the result is printed.
    """
    mentions_by_entity = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    types_by_entity = {'李娜1': '运动员', '李娜2': '歌手'}

    linker = HarvestText()
    linker.add_entities(mentions_by_entity, types_by_entity)

    sentence = "打球的李娜和唱歌的李娜不是一个人"
    linked = linker.entity_linking(sentence, keep_all=True)
    print(linked)