def build_word_ego_graph():
    """Demo: draw word- and entity-level ego graphs centered on "刘备" (Liu Bei)
    built from the first volume of the Sanguo (Three Kingdoms) corpus."""
    import networkx as nx
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']   # step 1: use a CJK-capable font
    plt.rcParams['axes.unicode_minus'] = False     # step 2: fix minus-sign rendering on axes
    from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords

    ht0 = HarvestText()
    entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    stopwords = get_baidu_stopwords()
    docs = ht0.cut_sentences(get_sanguo()[0])

    # Word-level ego network around the center word.
    G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
    pos = nx.kamada_kawai_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()

    # Entity-level ego network (uses the registered entity dictionary).
    G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
    pos = nx.spring_layout(G)
    nx.draw(G, pos)
    nx.draw_networkx_labels(G, pos)
    plt.show()
def test_find_with_rules():
    """Golden-file test for rule-based entity finding.

    Redirects stdout to "<name>_current", runs the demos, then compares the
    captured output against "<name>_expected".
    """
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith  # some more patterns is provided
        text0 = "我喜欢Python,因为requests库很适合爬虫"
        ht0 = HarvestText()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish()], type0="英文名")
        print(found_entities)
        print(ht0.posseg(text0))
        print(ht0.mention2entity("Python"))
        # Satisfying one of the rules
        ht0.clear()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish(), Contains("爬")], type0="技术")
        print(found_entities)
        print(ht0.posseg(text0))
        # Satisfying a couple of rules [using tuple]
        ht0.clear()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[(AllEnglish(), UpperFirst())], type0="专有英文词")
        print(found_entities)
        print(ht0.posseg(text0))
    finally:
        # Bug fix: restore stdout; previously the closed capture file stayed
        # installed as sys.stdout, breaking any later print().
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def test_build_word_ego_graph():
    """Golden-file test for word/entity ego-graph construction (no plt.show)."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        import networkx as nx
        import matplotlib.pyplot as plt
        plt.rcParams['font.sans-serif'] = ['SimHei']   # step 1: CJK-capable font
        plt.rcParams['axes.unicode_minus'] = False     # step 2: fix minus-sign rendering
        from harvesttext import get_sanguo, get_sanguo_entity_dict, get_baidu_stopwords
        ht0 = HarvestText()
        entity_mention_dict, entity_type_dict = get_sanguo_entity_dict()
        ht0.add_entities(entity_mention_dict, entity_type_dict)
        sanguo1 = get_sanguo()[0]
        stopwords = get_baidu_stopwords()
        docs = ht0.cut_sentences(sanguo1)
        G = ht0.build_word_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2, stopwords=stopwords)
        pos = nx.kamada_kawai_layout(G)
        nx.draw(G, pos)
        nx.draw_networkx_labels(G, pos)
        G = ht0.build_entity_ego_graph(docs, "刘备", min_freq=3, other_min_freq=2)
        pos = nx.spring_layout(G)
        nx.draw(G, pos)
        nx.draw_networkx_labels(G, pos)
    finally:
        # Bug fix: restore stdout so later tests/prints still work.
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def clean_text(file, save_dir):
    """Clean the 'content' field of every record in a JSON dataset file.

    Normalizes characters via HanLP's CharTable, strips weibo noise and URLs,
    drops records from *train* files that end up empty, and writes the cleaned
    records (with labels, except for eval/test files) under save_dir.
    """
    ht = HarvestText()
    CharTable = pyhanlp.JClass('com.hankcs.hanlp.dictionary.other.CharTable')
    data = read_json(file)
    num_null = 0          # count of records whose content becomes empty after cleaning
    cleaned_data = []
    for i in trange(len(data)):
        content = CharTable.convert(data[i]['content'])
        cleaned_content = remove_url(ht.clean_text(content, emoji=False))  # keeps at most 6 chars after '@'
        if cleaned_content == '':
            num_null += 1
        if 'train' in file and (not content or not cleaned_content):
            # Drop train records that were empty originally or became empty after cleaning.
            continue
        record = {'id': data[i]['id'], 'content': cleaned_content}
        if not ('eval' in file or 'test' in file):
            # Only non-eval/test splits carry labels.
            record['label'] = data[i]['label']
        cleaned_data.append(record)
    filename = file.split('/')[-1]
    save_json(cleaned_data, os.path.join(save_dir, filename))
    print('num data: ', num_null)
def clean_text(self, origin):
    """Clean tweets in `origin` (a list whose items are strings or lists of strings).

    Each tweet is demojized into "[name]" tokens and then cleaned with
    HarvestText (traditional->simplified conversion on, @-handling off).
    Returns a structure parallel to `origin`.
    """
    ht = HarvestText()

    def _clean_one(tweet):
        # Single place for the shared demojize + clean pipeline (was duplicated).
        return ht.clean_text(emojiswitch.demojize(tweet, delimiters=("[", "]")),
                             t2s=True, weibo_at=False)

    cltweets = []
    for twcl in origin:
        # Fix: isinstance() instead of `type(twcl) == list` (idiomatic, accepts subclasses).
        if isinstance(twcl, list):
            cltweets.append([_clean_one(etwcl) for etwcl in twcl])
        else:
            cltweets.append(_clean_one(twcl))
    return cltweets
def filter_el_with_rule():
    # When the candidate entity set is large, entity linking may return noisy
    # mentions. We can filter them with custom rules, for example:
    # 1. POS: spans consisting only of verbs (v), adjectives (a), adverbs (d),
    #    conjunctions (c), prepositions (p), etc. are usually not entities we care about.
    # 2. Length: mentions of length 1 usually carry too little information.
    # These rules are highly customizable, so they are defined outside the
    # library rather than built in. This code is one example:
    def el_filtering(entities_info, ch_pos):
        kept = []
        for [l, r], (entity0, type0) in entities_info:
            only_function_words = all(
                bool(re.search("^(v|a|d|c|p|y|z)", pos)) for pos in ch_pos[l:r]
            )
            if not only_function_words and (r - l) > 1:
                kept.append(([l, r], (entity0, type0)))
        return kept

    ht0 = HarvestText()
    text = "《记得》:谁还记得 是谁先说 永远的爱我"
    entity_mention_dict = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》']
    }
    entity_type_dict = {'记得(歌曲)': '歌名', '我(张国荣演唱歌曲)': '歌名'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    # ch_pos is only returned when with_ch_pos=True is set explicitly.
    entities_info, ch_pos = ht0.entity_linking(text, with_ch_pos=True)
    print("filter_el_with_rule")
    print("Sentence:", text)
    print("Original Entities:", entities_info)
    # "我" is dropped by the length rule, "记得" by the pure-verb rule;
    # "《记得》" includes punctuation, so it survives both filters.
    filtered_entities = el_filtering(entities_info, ch_pos)
    print("filtered_entities:", filtered_entities)
def test_hard_text_cleaning():
    """Hard cases for clean_text: invisible characters, emoji, inline URLs."""
    ht = HarvestText()

    def check(raw, want, verbose=True):
        # Clean, optionally echo before/after, and pin the expected result.
        got = ht.clean_text(raw)
        if verbose:
            print("清洗前:", [raw])
            print("清洗后:", [got])
        assert got == want

    # Invisible characters written as a literal backslash-escape sequence.
    check("捧杀!干得漂亮![doge] \\u200b\\u200b\\u200b", "捧杀!干得漂亮!")
    # Invisible characters as actual zero-width spaces.
    check("捧杀!干得漂亮![doge] \u200b\u200b\u200b", "捧杀!干得漂亮!")
    # Content sandwiched between two emoticon tags.
    check("#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。 http://t.cn/A643boyi \n新浪[浪]用户专享福利,[浪]新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还! http://t.cn/A643bojv http://t.cn/A643bKHS \u200b\u200b\u200b",
          "#缺钱找新浪# 瞎找不良网贷不如用新浪官方借款,不查负债不填联系人。\n新浪用户专享福利,新浪产品用的越久额度越高,借万元日利率最低至0.03%,最长可分12期慢慢还!")
    # Text containing real emoji.
    check("各位大神们🙏求教一下这是什么动物呀![疑问]\n\n为什么它同时长得有点吓人又有点可爱[允悲]\n\n#thosetiktoks# http://t.cn/A6bXIC44 \u200b\u200b\u200b",
          "各位大神们求教一下这是什么动物呀!\n为什么它同时长得有点吓人又有点可爱\n#thosetiktoks#")
    # URL embedded in the middle of a sentence.
    check("JJ棋牌数据4.3万。数据链接http://www.jj.cn/,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确",
          "JJ棋牌数据4.3万。数据链接,数据第一个账号,第二个密码,95%可登录,可以登录官网查看数据是否准确",
          verbose=False)
def clean_text():
    """Walk through the main clean_text options: weibo noise, URLs, emails,
    URL-escape normalization, and HTML-escape normalization."""
    print("各种清洗文本")
    ht0 = HarvestText()

    def show(title, raw, **kwargs):
        # Print the case title, the raw text, and the cleaned result.
        print(title)
        print("原:", raw)
        print("清洗后:", ht0.clean_text(raw, **kwargs))

    # The default settings suit weibo text (@-mentions, emoticon tags, ...).
    show("清洗微博【@和表情符等】",
         "回复@钱旭明QXM:[嘻嘻][嘻嘻] //@钱旭明QXM:杨大哥[good][good]")
    # URL removal.
    show("清洗网址URL",
         "【#赵薇#:正筹备下一部电影 但不是青春片....http://t.cn/8FLopdQ",
         remove_url=True)
    # Email removal.
    show("清洗邮箱",
         "我的邮箱是[email protected],欢迎联系",
         email=True)
    # Percent-escaped URL back to normal characters.
    show("URL转正常字符",
         "www.%E4%B8%AD%E6%96%87%20and%20space.com",
         norm_url=True, remove_url=False)
    # Normal characters to URL form (Chinese and spaces need escaping in requests).
    show("正常字符转URL[含有中文和空格的request需要注意]",
         "www.中文 and space.com",
         to_url=True, remove_url=False)
    # HTML escapes back to normal characters.
    show("HTML转正常字符",
         "<a c> ''",
         norm_html=True)
def find_with_rules():
    """Demo: locate entities with character-level rules (English-only,
    substring containment, and tuple-AND combinations)."""
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith  # some more patterns is provided
    text0 = "我喜欢Python,因为requests库很适合爬虫"
    ht0 = HarvestText()

    def run(rulesets, type0):
        # Find entities matching the ruleset and echo the tagging result.
        found = ht0.find_entity_with_rule(text0, rulesets=rulesets, type0=type0)
        print(found)
        print(ht0.posseg(text0))

    run([AllEnglish()], "英文名")
    print(ht0.mention2entity("Python"))

    # Satisfying one of the rules
    ht0.clear()
    run([AllEnglish(), Contains("爬")], "技术")

    # Satisfying a couple of rules [using tuple]
    ht0.clear()
    run([(AllEnglish(), UpperFirst())], "专有英文词")
def linking_strategy():
    """Demo of the entity-linking disambiguation strategies ("latest", "freq")."""
    ht0 = HarvestText()

    def test_case(text0, entity_mention_dict, strategy, entity_type_dict=None, **kwargs):
        # Register entities, apply the strategy, link, then reset for the next case.
        ht0.add_entities(entity_mention_dict, entity_type_dict)
        ht0.set_linking_strategy(strategy, **kwargs)
        print(ht0.entity_linking(text0))
        ht0.clear()

    # "latest": an ambiguous mention resolves to the most recently seen entity.
    test_case('X老师您好。请问老师这题怎么做?',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest")
    test_case('谢谢老师',
              entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
              strategy="latest",
              lastest_mention={"老师": "X老师"})  # NOTE: 'lastest_mention' is the library's kwarg spelling
    # "freq" with a single surface form: pick the more frequent entity.
    test_case('市长',
              entity_mention_dict={"A市长": ["市长"], "B市长": ["长江"]},
              strategy="freq",
              entity_freq={"A市长": 5, "B市长": 3})
    # "freq" with overlapping surface forms.
    test_case('xx市长江yy',
              entity_mention_dict={"xx市长": ["xx市长"], "长江yy": ["长江yy"]},
              strategy="freq",
              entity_freq={"xx市长": 3, "长江yy": 5})
    # Type-level frequency priors can also steer disambiguation.
    test_case('我叫小沈阳',
              entity_mention_dict={"沈阳": ["沈阳"], "小沈阳": ["小沈阳"]},
              strategy="freq",
              entity_type_dict={"沈阳": "地名", "小沈阳": "人名"},
              type_freq={"地名": -1})
def test_named_entity_recognition():
    """Golden-file test for the built-in NER."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        ht0 = HarvestText()
        sent = "上海上港足球队的武磊是中国最好的前锋。"
        print(ht0.named_entity_recognition(sent))
    finally:
        # Bug fix: restore stdout; previously the closed capture file stayed
        # installed as sys.stdout, breaking any later print().
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def depend_parse():
    """Demo: dependency parsing and triple extraction with custom entities registered."""
    ht0 = HarvestText()
    para = "上港的武磊武球王是中国最好的前锋。"
    mentions = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
    types_ = {'武磊': '球员', "上海上港": "球队"}
    ht0.add_entities(mentions, types_)
    # One dependency arc per token.
    for arc in ht0.dependency_parse(para):
        print(arc)
    # Subject-predicate-object triples derived from the parse.
    print(ht0.triple_extraction(para))
def auto_pre_one(self, start, text):
    """Segment `text` into paragraphs with HarvestText, then predict with
    the tail of `start` as context."""
    tt = tkitText.Text()  # kept: instantiation may have side effects in tkit
    # sents = tt.sentence_segmentation_v1(text)  # previous sentence-level approach
    ht0 = HarvestText()
    sents = ht0.cut_paragraphs(text, 50)
    # Only the last 200 characters of the seed text are used as context.
    return self.pre(start[-200:], sents)
def using_typed_words():
    """Demo: load the Tsinghua (THUOCL) typed-word dictionary plus Baidu stopwords."""
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
    ht0 = HarvestText()
    typed_words = get_qh_typed_words()
    stopwords = get_baidu_stopwords()
    ht0.add_typed_words(typed_words)
    print("加载清华领域词典,并使用停用词")
    print("全部类型", typed_words.keys())
    sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sentence)
    print(ht0.posseg(sentence, stopwords=stopwords))
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
def clean_text_whole(original_text):
    """Strip 【...】 tags, dates, and times from `original_text`, then run
    HarvestText's general cleaning and return the result.

    Idiom fix: use re.sub directly instead of rebuilding pattern objects via
    re.compile(...).sub on every call; the unnecessary capture groups are
    dropped (the whole match is removed either way).
    """
    original_text = re.sub(r'【.*?】', '', original_text)              # bracketed tags
    original_text = re.sub(r'\d{4}-\d{2}-\d{2}', '', original_text)   # dates YYYY-MM-DD
    original_text = re.sub(r'\d{2}:\d{2}:\d{2}', '', original_text)   # times HH:MM:SS
    original_text = re.sub(r'\d{2}:\d{2}', '', original_text)         # times HH:MM
    ht = HarvestText()
    return ht.clean_text(original_text)
def clean_cn_text_by_third_party(self, sentence):
    """Clean Chinese text with the third-party HarvestText library.

    Strips line separators/tabs, removes HTML tags, then lets HarvestText
    remove weibo @-mentions, emoticons, URLs, emails, and similar noise.
    """
    from harvesttext import HarvestText
    ht_obj = HarvestText()
    _text = sentence.replace('\u2028', '').replace('\n', '').replace('\t', '')
    # Fix: the pattern must be a raw string — '\w' and '\?' in a normal string
    # are invalid escape sequences (DeprecationWarning/SyntaxWarning on modern Python).
    re_h = re.compile(r'<(/?\w+|!--|!DOCTYPE|\?xml)[^>]*>')
    _text = re_h.sub('', _text)  # strip HTML tags
    clean_text = ht_obj.clean_text(_text)
    return clean_text
def cut_paragraph():
    """Demo: automatic paragraph segmentation — a 5-paragraph news article
    re-segmented into 3 paragraphs."""
    print("文本自动分段")
    ht0 = HarvestText()
    text = """备受社会关注的湖南常德滴滴司机遇害案,将于1月3日9时许,在汉寿县人民法院开庭审理。此前,犯罪嫌疑人、19岁大学生杨某淇被鉴定为作案时患有抑郁症,为“有限定刑事责任能力”。
新京报此前报道,2019年3月24日凌晨,滴滴司机陈师傅,搭载19岁大学生杨某淇到常南汽车总站附近。坐在后排的杨某淇趁陈某不备,朝陈某连捅数刀致其死亡。事发监控显示,杨某淇杀人后下车离开。随后,杨某淇到公安机关自首,并供述称“因悲观厌世,精神崩溃,无故将司机杀害”。据杨某淇就读学校的工作人员称,他家有四口人,姐姐是聋哑人。
今日上午,田女士告诉新京报记者,明日开庭时间不变,此前已提出刑事附带民事赔偿,但通过与法院的沟通后获知,对方父母已经没有赔偿的意愿。当时按照人身死亡赔偿金计算共计80多万元,那时也想考虑对方家庭的经济状况。
田女士说,她相信法律,对最后的结果也做好心理准备。对方一家从未道歉,此前庭前会议中,对方提出了嫌疑人杨某淇作案时患有抑郁症的辩护意见。另具警方出具的鉴定书显示,嫌疑人作案时有限定刑事责任能力。
新京报记者从陈师傅的家属处获知,陈师傅有两个儿子,大儿子今年18岁,小儿子还不到5岁。“这对我来说是一起悲剧,对我们生活的影响,肯定是很大的”,田女士告诉新京报记者,丈夫遇害后,他们一家的主劳动力没有了,她自己带着两个孩子和两个老人一起过,“生活很艰辛”,她说,“还好有妹妹的陪伴,现在已经好些了。”"""
    print("原始文本[5段]")
    print(text + "\n")
    print("预测文本[手动设置分3段]")
    predicted_paras = ht0.cut_paragraphs(text, num_paras=3)
    print("\n".join(predicted_paras) + "\n")
def test_depend_parse():
    """Golden-file test for dependency parsing and triple extraction."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        ht0 = HarvestText()
        para = "上港的武磊武球王是中国最好的前锋。"
        entity_mention_dict = {'武磊': ['武磊', '武球王'], "上海上港": ["上港"]}
        entity_type_dict = {'武磊': '球员', "上海上港": "球队"}
        ht0.add_entities(entity_mention_dict, entity_type_dict)
        for arc in ht0.dependency_parse(para):
            print(arc)
        print(ht0.triple_extraction(para))
    finally:
        # Bug fix: restore stdout so later tests/prints still work.
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def entity_error_check():
    """Demo: pinyin- and character-level rechecks to catch near-miss mentions."""
    ht0 = HarvestText()
    ht0.add_typed_words({"人名": ["武磊"]})
    # Same pinyin, different characters.
    sent1 = "武磊和吴力只差一个拼音"
    print(sent1)
    print(ht0.entity_linking(sent1, pinyin_recheck=True))
    # One character apart.
    sent2 = "武磊和吴磊只差一个字"
    print(sent2)
    print(ht0.entity_linking(sent2, char_recheck=True))
    # Inspect raw candidates with both rechecks enabled.
    sent3 = "吴磊和吴力都可能是武磊的代称"
    print(sent3)
    print(ht0.get_linking_mention_candidates(sent3, pinyin_recheck=True, char_recheck=True))
def test_using_typed_words():
    """Golden-file test for the THUOCL typed-word dictionary + stopwords."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords
        ht0 = HarvestText()
        typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
        ht0.add_typed_words(typed_words)
        print("加载清华领域词典,并使用停用词")
        print("全部类型", typed_words.keys())
        sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
        print(sentence)
        print(ht0.posseg(sentence, stopwords=stopwords))
        print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
    finally:
        # Bug fix: restore stdout so later tests/prints still work.
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def test_entity_error_check():
    """Golden-file test for pinyin/char rechecks in entity linking."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        ht0 = HarvestText()
        typed_words = {"人名": ["武磊"]}
        ht0.add_typed_words(typed_words)
        sent1 = "武磊和吴力只差一个拼音"
        print(sent1)
        print(ht0.entity_linking(sent1, pinyin_recheck=True))
        sent2 = "武磊和吴磊只差一个字"
        print(sent2)
        print(ht0.entity_linking(sent2, char_recheck=True))
        sent3 = "吴磊和吴力都可能是武磊的代称"
        print(sent3)
        print(ht0.get_linking_mention_candidates(sent3, pinyin_recheck=True, char_recheck=True))
    finally:
        # Bug fix: restore stdout so later tests/prints still work.
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def get_seq(text):
    """Extract key content from `text` as subject-predicate-object triples.

    The text is pre-cleaned with tkit, then HarvestText's triple extraction
    runs over it; each triple is joined into a single string.

    Returns:
        list[str]: one joined string per extracted triple.
    """
    ht = HarvestText()
    text = tkitText.Text().clear(text)
    triples = []
    for item in ht.triple_extraction(sent=text, standard_name=False, stopwords=None, expand="all"):
        # Fix: the original `if item == '': pass / else:` only skipped the exact
        # empty string; a truthiness check also skips empty tuples/lists.
        if item:
            triples.append(''.join(item))
    return triples
def f():
    """Register football-domain entities (players, teams, positions, terms)
    with their mention aliases on a fresh HarvestText instance."""
    from harvesttext import HarvestText
    ht = HarvestText()
    mentions = {
        '武磊': ['武磊', '武球王'],
        '郜林': ['郜林', '郜飞机'],
        '前锋': ['前锋'],
        '上海上港': ['上港'],
        '广州恒大': ['恒大'],
        '单刀球': ['单刀'],
    }
    types_ = {
        '武磊': '球员',
        '郜林': '球员',
        '前锋': '位,置',
        '上海上港': '球队',
        '广州恒大': '球队',
        '单刀球': '术语',
    }
    ht.add_entities(mentions, types_)
def test_english():
    """Demo of HarvestText's English mode: sentence cutting, segmentation,
    POS tagging, seed-based sentiment, and paragraph segmentation."""
    # ♪ "Until the Day" by JJ Lin
    test_text = """
    In the middle of the night.
    Lonely souls travel in time.
    Familiar hearts start to entwine.
    We imagine what we'll find, in another life.  
    """.lower()
    ht_eng = HarvestText(language="en")
    sentences = ht_eng.cut_sentences(test_text)
    print("\n".join(sentences))
    print(ht_eng.seg(sentences[-1]))
    print(ht_eng.posseg(sentences[0], stopwords={"in"}))
    # Build a sentiment dictionary from seed words, then score each sentence.
    sent_dict = ht_eng.build_sent_dict(sentences, pos_seeds=["familiar"], neg_seeds=["lonely"],
                                       min_times=1, stopwords={'in', 'to'})
    print("Sentiment analysis")
    for sent0 in sentences:
        print(sent0, "%.3f" % ht_eng.analyse_sent(sent0))
    print("Segmentation")
    print("\n".join(ht_eng.cut_paragraphs(test_text, num_paras=2)))
def test_linking_strategy():
    """Golden-file test for the entity-linking strategies ("latest", "freq")."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        ht0 = HarvestText()

        def test_case(text0, entity_mention_dict, strategy, entity_type_dict=None, **kwargs):
            # Register entities, apply the strategy, link, then reset.
            ht0.add_entities(entity_mention_dict, entity_type_dict)
            ht0.set_linking_strategy(strategy, **kwargs)
            print(ht0.entity_linking(text0))
            ht0.clear()

        # "latest": resolve an ambiguous mention to the most recently seen entity.
        test_case('X老师您好。请问老师这题怎么做?',
                  entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
                  strategy="latest")
        test_case('谢谢老师',
                  entity_mention_dict={"X老师": ["X老师", "老师"], "Y老师": ["Y老师", "老师"]},
                  strategy="latest",
                  lastest_mention={"老师": "X老师"})  # NOTE: 'lastest_mention' is the library's kwarg spelling
        # "freq" with a single surface form.
        test_case('市长',
                  entity_mention_dict={"A市长": ["市长"], "B市长": ["长江"]},
                  strategy="freq",
                  entity_freq={"A市长": 5, "B市长": 3})
        # "freq" with overlapping surface forms.
        test_case('xx市长江yy',
                  entity_mention_dict={"xx市长": ["xx市长"], "长江yy": ["长江yy"]},
                  strategy="freq",
                  entity_freq={"xx市长": 3, "长江yy": 5})
        test_case('我叫小沈阳',
                  entity_mention_dict={"沈阳": ["沈阳"], "小沈阳": ["小沈阳"]},
                  strategy="freq",
                  entity_type_dict={"沈阳": "地名", "小沈阳": "人名"},
                  type_freq={"地名": -1})
    finally:
        # Bug fix: restore stdout so later tests/prints still work.
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected
def __init__(self, bertTokenizer):
    """Store the BERT tokenizer, load the raw dataset, and prepare a
    HarvestText instance for text cleaning."""
    self.bertTokenizer = bertTokenizer
    self._load_raw_data()
    self.ht = HarvestText()
def cut_paragraphs(text, num_paras=5):
    """Split `text` into sentences with tkit, then segment the joined
    sentences into `num_paras` paragraphs with HarvestText."""
    sentences = tkitText.Text().sentence_segmentation_v1(text)
    ht0 = HarvestText()
    return ht0.cut_paragraphs("\n".join(sentences), num_paras)
def named_entity_recognition():
    """Demo: run the built-in NER on a football sentence and print the result."""
    ht0 = HarvestText()
    sentence = "上海上港足球队的武磊是中国最好的前锋。"
    print(ht0.named_entity_recognition(sentence))
def el_keep_all():
    """Demo: keep_all=True returns every candidate entity for an ambiguous mention."""
    ht0 = HarvestText()
    mentions = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    types_ = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(mentions, types_)
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人", keep_all=True))
#coding=utf-8
import os
import sys
import inspect


def get_current_function_name():
    """Return "<this file's directory>/<caller's function name>".

    Used to derive the per-test "<name>_current" / "<name>_expected"
    golden-file paths.
    """
    # inspect.stack()[1][3] is the name of the calling function.
    return os.path.dirname(os.path.abspath(__file__)) + "/" + inspect.stack()[1][3]


import _locale
# Force the default locale so golden files read/write consistently as UTF-8
# regardless of platform locale settings.
_locale._getdefaultlocale = (lambda *args: ['zh_CN', 'utf8'])

from harvesttext import HarvestText
ht = HarvestText()


def test_new_word_discover():
    """Golden-file test for new-word discovery."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        para = "上港的武磊和恒大的郜林,谁是中国最好的前锋?那当然是武磊武球王了,他是射手榜第一,原来是弱点的单刀也有了进步"
        # Returns a pd.DataFrame of quality metrics for candidate new words,
        # allowing manual inspection and filtering.
        new_words_info = ht.word_discover(para)
        # new_words_info = ht.word_discover(para, threshold_seeds=["武磊"])
        new_words = new_words_info.index.tolist()
        print(new_words)
    finally:
        # Bug fix: restore stdout; previously the closed capture file stayed
        # installed as sys.stdout, breaking any later print().
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected


def test_new_word_register():
    """Golden-file test for registering new words and typed entities."""
    sys.stdout = open(get_current_function_name() + "_current", "w")
    expected = open(get_current_function_name() + "_expected").read()
    try:
        new_words = ["落叶球", "666"]
        ht.add_new_words(new_words)  # register as generic "new words"
        ht.add_new_entity("落叶球", mention0="落叶球", type0="术语")  # register with a specific type
        print(ht.seg("这个落叶球踢得真是666", return_sent=True))
    finally:
        # Bug fix: restore stdout so later tests/prints still work.
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    assert open(get_current_function_name() + "_current").read() == expected