def run_text(path="/mnt/data/dev/github/数据处理工具/tool_data_processing/data/text"):
    """Walk the text files under *path*, split them into sentences and
    harvest knowledge triples.

    For each sentence of at most 50 characters, the candidate entity list
    is the union of get_w_v() and ner() output, and candidate triples come
    from both the tt.ht extractor and jiagu.knowledge().  A triple is kept
    (and handed to one() for storage) only when its subject k[0] is a
    recognised entity and its predicate k[1] appears in vs (presumably the
    sentence's verbs — confirm against get_w_v's contract).
    """
    tt = tkitText.Text()
    for f in tfile.file_List(path):
        t = tfile.open_file(f)
        for s in tt.sentence_segmentation_v1(t):
            print("#################标记数据######")
            # Long sentences are noisy for triple extraction — skip them.
            if len(s) > 50:
                continue
            ner_list, vs = get_w_v(s)
            ner_list = ner(s) + ner_list
            print("实体", ner_list)
            # Merge triples from both extractors.
            ht_kg = tt.ht.triple_extraction(sent=s)
            jiagu_kg = jiagu.knowledge(s)
            all_kg = ht_kg + jiagu_kg
            end_kg = []
            for k in all_kg:
                # De-duplicate: store only the first occurrence of a triple.
                if k in end_kg:
                    continue
                if k[0] in ner_list and k[1] in vs:
                    new = {'sentence': s, 'kg': k}
                    one(new)
                    end_kg.append(k)
def auto_mark_wiki(self, text): """ 自动标记wiki句子 """ # print(ner(text)) kg = Kg() data = [] self.kg_tmp = [] self.tdb.load("mark") word = self.keyword # print(word) # print(kg.get(word)) tt = tkit.Text() knowledge = jiagu.knowledge(text) # print(knowledge) # for one in knowledge: if len(knowledge) > 0: key = tt.md5(text + "".join("".join(knowledge))) one_item = {"text": text, "data": knowledge} print("jiagu成功匹配知识", one_item) self.tdb.put(key, one_item) self.i = self.i + 1 kg_one = kg.get(word) # print(kg_one) if kg_one == None or text.find(word) < 0: return data print("句子:", text) print("发现word", word) for b in kg_one.keys(): one_p = [word] # print(key) # print(text.find(key)) if text.find(b) > 0 or b in ["是"]: one_p.append(b) # print(one_p) if type(kg_one.get(b)) == dict: for c in kg_one.get(b).keys(): # print(c) # print(text.find(c)) if text.find(c) > 0: # print("222") one_p.append(c) # print(c) else: one_p = [] # print(one_p) if len(one_p) == 3: # 发现一条可标记数据 # print("zui",one_p) data.append(one_p) else: pass self.kg_tmp.append((word, kg_one)) return data
def run_mark():
    """Iterate over unmarked KG items, re-run entity/triple extraction on
    each sentence, and pass matching items to one() for storage."""
    tt = tkitText.Text()
    tt.load_ht()
    i = 0
    for key, item in kg.get_unmarked():
        print("#################标记数据######")
        print(item)
        s = item['sentence']
        print(key)
        # Skip long sentences — noisy for triple extraction.
        if len(s) > 50:
            continue
        ner_list = ner(s)
        ner_s = tt.named_entity_recognition(s)
        print("提取实体")
        # NOTE(review): this loop rebinds `key`, shadowing the KG key
        # from the outer loop — rename if the outer key is ever needed
        # after this point.
        for key in ner_s:
            ner_list.append(key)
        vs = []
        # NOTE(review): `i` is never incremented, so this always prints 0.
        print('预测成功', i)
        print("句子", s)
        print("实体", ner_list)
        ht_kg = tt.ht.triple_extraction(sent=s)
        jiagu_kg = jiagu.knowledge(s)
        all_kg = ht_kg + jiagu_kg
        end_kg = []
        for k in all_kg:
            # De-duplicate triples.
            if k in end_kg:
                continue
            # NOTE(review): `vs` is always empty here (the get_w_v call
            # is commented out upstream), so `k[1] in vs` can never be
            # true and this branch is dead — confirm and either restore
            # get_w_v or drop the vs condition.
            if k[0] in ner_list and k[1] in vs:
                new = {'sentence': s, 'kg': k}
                # Redundant: k[0] in ner_list was already checked above.
                if k[0] in ner_list:
                    one(new)
                end_kg.append(k)
        # Store the item itself when its triple's subject is a
        # recognised entity.
        if item['kg'][0] in ner_list:
            one(item)
def read_txt(file_path):
    """Read a UTF-8 text file and collect jiagu NER tags line by line.

    Args:
        file_path: path of the text file to read.

    Returns:
        list[list]: for each line that produced at least one non-'O'
        tag, the list of those tags.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        all_data = [line.strip('\n') for line in f]
    result = []
    for data in all_data:
        # Bug fix: the original used `ner is not 'O'` — an identity
        # comparison against a literal (SyntaxWarning, interning-
        # dependent); equality is the intended check.
        one = [tag for tag in jiagu.ner(data) if tag != 'O']
        if one:
            result.append(one)
    # Bug fix: the original built `result` but never returned it.
    return result


text = '''
4. 香农的信息定义
假定事物状态可以用一个以经典集合论为基础的概率模型来描述,则信息就是用来消除不确定性的东西,或信息是事物运动状态或存在方式的不确定性描述。
但在实际中要寻找一个合适的概率模型往往是非常困难的,有时是否存在这样一种模型还值得探讨。此外,信息有很强的主观性和实用性,但该定义没有考虑信息接收者的主观特性和主观意义,不顾信息的具体含义、具体用途、重要程度和可能引起的后果等因素,这就与实际情况不完全一致。
'''
# Knowledge extraction demo (variable name `keywords` is misleading:
# jiagu.knowledge returns relation triples, not keywords).
keywords = jiagu.knowledge(text)
print(keywords)
def nlp_jiagu(self, btn):
    """Dispatch a jiagu NLP action based on a two-character command
    suffix at the end of the label text.

    Suffixes: "qg" → sentiment, "cq" → top-5 keywords, "jl" → text
    clustering; anything else → knowledge extraction.  The result
    replaces the label text (pretty-printed).
    """
    text = str(self.lbl.text.strip()).replace("\n", "")
    if text[-2:] == "qg":
        # Sentiment analysis on the text with the suffix stripped.
        sentiment = jiagu.sentiment(text[:-2])
        self.lbl.text = pprint.pformat(sentiment)
    elif text[-2:] == "cq":
        # NOTE(review): unlike the "qg" branch, the "cq" suffix is NOT
        # stripped before calling keywords — confirm this is intended.
        keywords = jiagu.keywords(text, 5)
        self.lbl.text = pprint.pformat(keywords)
    elif text[-2:] == "jl":
        # Accept either a full-width (Chinese) or an ASCII comma as the
        # document separator.
        # NOTE(review): this splits the raw label text, so the trailing
        # "jl" suffix stays inside the last document — confirm.
        if "，" in self.lbl.text:
            docs = self.lbl.text.split("，")
        else:
            docs = self.lbl.text.split(",")
        cluster = jiagu.text_cluster(docs)
        self.lbl.text = pprint.pformat(cluster)
    else:
        # Default action: knowledge (relation-triple) extraction.
        knowledge = jiagu.knowledge(text)
        self.lbl.text = pprint.pformat(knowledge)
“MODIS的数据让我们能在非常小的尺度上理解这一现象,我们发现人类活动也作出了贡献。” NASA文章介绍,在中国为全球绿化进程做出的贡献中,有42%来源于植树造林工程,对于减少土壤侵蚀、空气污染与气候变化发挥了作用。
据观察者网过往报道,2017年我国全国共完成造林736.2万公顷、森林抚育830.2万公顷。其中,天然林资源保护工程完成造林26万公顷,退耕还林工程完成造林91.2万公顷。京津风沙源治理工程完成造林18.5万公顷。三北及长江流域等重点防护林体系工程完成造林99.1万公顷。完成国家储备林建设任务68万公顷。
'''

# Keyword extraction (top 5).
keywords = jiagu.keywords(text, 5)
print(keywords)

# Text summarization (top 3 sentences).
summarize = jiagu.summarize(text, 3)
print(summarize)

# jiagu.findword('input.txt', 'output.txt')  # New-word discovery from a large corpus using information entropy.

# Knowledge-graph relation extraction.
text = '姚明1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
knowledge = jiagu.knowledge(text)
print(knowledge)

# Sentiment analysis.
text = '很讨厌还是个懒鬼'
sentiment = jiagu.sentiment(text)
print(sentiment)

# Text clustering (needs parameter tuning).
docs = [
    "百度深度学习中文情感分析工具Senta试用及在线测试",
    "情感分析是自然语言处理里面一个热门话题",
    "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
    "深度学习实践:从零开始做电影评论文本情感分析",
    "BERT相关论文、文章和代码资源汇总",
    "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
    "自然语言处理工具包spaCy介绍",
    "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
]
cluster = jiagu.text_cluster(docs)
# Export jiagu knowledge triples from every news file under `dir` to
# relationship.csv, and the de-duplicated subject entities to entities.csv.
# NOTE(review): `dir` (the directory scanned) is defined earlier in the
# file and shadows the builtin — consider renaming at its definition.
filelist = os.listdir(dir)
tot = 0
entities = []
tpcsv = open("news_tp/relationship.csv", "a+", newline='', encoding='utf-8')
encsv = open('news_tp/entities.csv', "a+", newline='', encoding='utf-8')
tot_news = 0
# Fix: the writer was re-created for every input line inside the inner
# loop; one writer per output file is enough.
tp_writer = csv.writer(tpcsv)
for file in filelist:
    path = os.path.join(dir, file)
    with open(path, 'r', encoding='utf-8') as f:
        # Stream lines instead of materialising them with readlines().
        for l in f:
            tot_news += 1
            knowledge = jiagu.knowledge(l)
            tot += len(knowledge)
            for tp in knowledge:
                tp_writer.writerow(tp)
                # tp[0] is the triple's subject entity.
                entities.append(tp[0])
entities = list(set(entities))
en_writer = csv.writer(encsv)
for en in entities:
    en_writer.writerow([en])
print(len(entities))
print(tot)
print(tot_news)
tpcsv.close()
encsv.close()