def test_find_with_rules():
    """Regression test: entity extraction via rule patterns.

    Captures everything printed into "<name>_current" and compares it
    against the pre-recorded golden file "<name>_expected".
    """
    from contextlib import redirect_stdout
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith  # some more patterns is provided

    fname = get_current_function_name()
    # Read the golden output up front and close the handle (the original
    # leaked it via a bare open(...).read()).
    with open(fname + "_expected") as f:
        expected = f.read()
    # redirect_stdout guarantees sys.stdout is restored even if an
    # assertion/exception fires — the original reassigned sys.stdout and
    # never put it back, breaking printing for every subsequent test.
    with open(fname + "_current", "w") as out, redirect_stdout(out):
        text0 = "我喜欢Python,因为requests库很适合爬虫"
        ht0 = HarvestText()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish()], type0="英文名")
        print(found_entities)
        print(ht0.posseg(text0))
        print(ht0.mention2entity("Python"))
        # Satisfying one of the rules
        ht0.clear()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[AllEnglish(), Contains("爬")], type0="技术")
        print(found_entities)
        print(ht0.posseg(text0))
        # Satisfying a couple of rules [using tuple]
        ht0.clear()
        found_entities = ht0.find_entity_with_rule(text0, rulesets=[(AllEnglish(), UpperFirst())], type0="专有英文词")
        print(found_entities)
        print(ht0.posseg(text0))
    # Compare captured output with the golden file (handle closed via with).
    with open(fname + "_current") as f:
        assert f.read() == expected
def find_with_rules():
    """Demo: extract entities that match rule patterns and register them by type."""
    from harvesttext.match_patterns import UpperFirst, AllEnglish, Contains, StartsWith, EndsWith  # some more patterns is provided

    sample = "我喜欢Python,因为requests库很适合爬虫"
    ht = HarvestText()

    # Single rule: any all-English token is tagged with the given type.
    hits = ht.find_entity_with_rule(sample, rulesets=[AllEnglish()], type0="英文名")
    print(hits)
    print(ht.posseg(sample))
    print(ht.mention2entity("Python"))

    # Satisfying one of the rules
    ht.clear()
    hits = ht.find_entity_with_rule(
        sample, rulesets=[AllEnglish(), Contains("爬")], type0="技术")
    print(hits)
    print(ht.posseg(sample))

    # Satisfying a couple of rules [using tuple]
    ht.clear()
    hits = ht.find_entity_with_rule(
        sample, rulesets=[(AllEnglish(), UpperFirst())], type0="专有英文词")
    print(hits)
    print(ht.posseg(sample))
def using_typed_words():
    """Demo: load the Tsinghua (QH) typed-word lexicon and Baidu stopwords, then POS-tag."""
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords

    ht = HarvestText()
    typed_words = get_qh_typed_words()
    stopwords = get_baidu_stopwords()
    ht.add_typed_words(typed_words)

    print("加载清华领域词典,并使用停用词")
    print("全部类型", typed_words.keys())
    sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
    print(sentence)
    # Stopwords are filtered out during POS tagging.
    print(ht.posseg(sentence, stopwords=stopwords))
    print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
def test_using_typed_words():
    """Regression test: typed-word lexicon + stopword filtering.

    Captures everything printed into "<name>_current" and compares it
    against the pre-recorded golden file "<name>_expected".
    """
    from contextlib import redirect_stdout
    from harvesttext.resources import get_qh_typed_words, get_baidu_stopwords

    fname = get_current_function_name()
    # Read the golden output up front and close the handle (the original
    # leaked it via a bare open(...).read()).
    with open(fname + "_expected") as f:
        expected = f.read()
    # redirect_stdout guarantees sys.stdout is restored even if an
    # assertion/exception fires — the original reassigned sys.stdout and
    # never put it back, breaking printing for every subsequent test.
    with open(fname + "_current", "w") as out, redirect_stdout(out):
        ht0 = HarvestText()
        typed_words, stopwords = get_qh_typed_words(), get_baidu_stopwords()
        ht0.add_typed_words(typed_words)
        print("加载清华领域词典,并使用停用词")
        print("全部类型", typed_words.keys())
        sentence = "THUOCL是自然语言处理的一套中文词库,词表来自主流网站的社会标签、搜索热词、输入法词库等。"
        print(sentence)
        print(ht0.posseg(sentence, stopwords=stopwords))
        print("一些词语被赋予特殊类型IT,而“是”等词语被筛出。")
    # Compare captured output with the golden file (handle closed via with).
    with open(fname + "_current") as f:
        assert f.read() == expected
def test_english():
    """Smoke-test the English pipeline: sentence cut, seg/posseg, sentiment, paragraph cut."""
    # ♪ "Until the Day" by JJ Lin
    lyrics = """
    In the middle of the night.
    Lonely souls travel in time.
    Familiar hearts start to entwine.
    We imagine what we'll find, in another life.
    """.lower()

    ht_eng = HarvestText(language="en")
    sents = ht_eng.cut_sentences(lyrics)
    print("\n".join(sents))
    print(ht_eng.seg(sents[-1]))
    print(ht_eng.posseg(sents[0], stopwords={"in"}))

    # Build a sentiment dictionary from seed words, then score each sentence.
    sent_dict = ht_eng.build_sent_dict(sents, pos_seeds=["familiar"], neg_seeds=["lonely"],
                                       min_times=1, stopwords={'in', 'to'})
    print("Sentiment analysis")
    for line in sents:
        print(line, "%.3f" % ht_eng.analyse_sent(line))

    print("Segmentation")
    print("\n".join(ht_eng.cut_paragraphs(lyrics, num_paras=2)))