def get_predict_title(text, plen, n):
    ttext = tkitText.Text()
    # Cache key: md5 of the prefix text plus the generation parameters
    tid = ttext.md5(str(text) + str(plen) + str(n))
    data_path = "tmp/run_task" + tid + ".json"
    print('load', data_path)
    if not os.path.exists(data_path):
        # No cached result yet, run a fresh prediction
        cmd = ("python3 ./generate.py --prefix '''" + text + "''' --length "
               + str(plen) + " --nsamples " + str(n) + " --tid " + str(tid))
        print("Start processing: " + cmd)
        if subprocess.call(cmd, shell=True) == 0:
            try:
                tjson = tkitFile.Json(file_path=data_path)
                return tjson.load()[0]['data']
            except Exception:
                print('Failed to load file', data_path)
                return {}
        else:
            return {}
    else:
        # Load the cached prediction
        try:
            tjson = tkitFile.Json(file_path=data_path)
            return tjson.load()[0]['data']
        except Exception:
            print('Failed to load file', data_path)
            return {}
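# Usage sketch (illustrative arguments; assumes ./generate.py and a tmp/
# directory exist as in the function above). Identical inputs hash to the
# same tid, so repeated calls reuse tmp/run_task<tid>.json:
# data = get_predict_title("宠物狗饲养技巧", plen=64, n=3)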
def print_urls(self):
    '''Print and persist the URLs currently collected in self.urls.'''
    tt = tkitText.Text()
    for i, url in enumerate(self.urls):
        # Skip relative links
        if url.startswith("/"):
            continue
        d = url_domain(url)
        try:
            ranks = domain_rank([d['domain']])
        except Exception:
            pass
        d['url'] = url
        d['title'] = str(self.titles[i])
        d['page'] = self.page
        d['i'] = i
        d['keyword'] = self.keyword
        md5 = tt.md5(url)
        d['_id'] = md5
        d['time'] = self.time
        print("d", d)
        db_add_search_rank(md5, d)
def run_text(path="/mnt/data/dev/github/数据处理工具/tool_data_processing/data/text"):
    tt = tkitText.Text()
    for f in tfile.file_List(path):
        t = tfile.open_file(f)
        for s in tt.sentence_segmentation_v1(t):
            print("################# Labeling data ######")
            if len(s) > 50:
                continue
            ner_list, vs = get_w_v(s)
            ner_list = ner(s) + ner_list
            print("Entities", ner_list)
            # Merge triples from HarvestText and jiagu
            ht_kg = tt.ht.triple_extraction(sent=s)
            jiagu_kg = jiagu.knowledge(s)
            all_kg = ht_kg + jiagu_kg
            end_kg = []
            for k in all_kg:
                if k in end_kg:
                    continue
                # Keep a triple only if its subject is a recognized entity
                # and its predicate is a known verb
                if k[0] in ner_list and k[1] in vs:
                    new = {'sentence': s, 'kg': k}
                    one(new)
                    end_kg.append(k)
def mark_word_label(self, text, label_b, word, tp="实体"):
    """Mark `word` inside `text` with BMES-style labels in `label_b`."""
    # Fuzzy-search the best match of the word in the text; fall back to
    # the word itself if the match score is too low
    tt = tkitText.Text()
    c, r = tt.find_match(text, word)
    if r > 50:
        p = c
    else:
        p = word
    start_p = text.find(p)
    end_p = start_p + len(p) - 1
    if start_p >= 0:
        if len(p) > 3:
            label_b[start_p] = self.auto_label(label_b[start_p], 'B-' + tp)
            label_b[end_p] = self.auto_label(label_b[end_p], 'E-' + tp)
            for n in range(start_p + 1, end_p):
                label_b[n] = self.auto_label(label_b[n], 'M-' + tp)
        elif len(p) == 3:
            label_b[start_p] = self.auto_label(label_b[start_p], 'B-' + tp)
            label_b[end_p] = self.auto_label(label_b[end_p], 'E-' + tp)
            label_b[start_p + 1] = self.auto_label(label_b[start_p + 1], 'M-' + tp)
        elif len(p) == 2:
            label_b[start_p] = self.auto_label(label_b[start_p], 'B-' + tp)
            label_b[end_p] = self.auto_label(label_b[end_p], 'E-' + tp)
        elif len(p) == 1:
            label_b[start_p] = self.auto_label(label_b[start_p], 'S-' + tp)
    return label_b, start_p
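# A minimal, self-contained sketch of the BMES tagging scheme that
# mark_word_label produces (auto_label and the owning class are omitted;
# this only illustrates the expected label layout for a matched span):
def bmes_labels(text, word, tp="实体"):
    labels = ["O"] * len(text)
    start = text.find(word)
    if start < 0:
        return labels
    end = start + len(word) - 1
    if len(word) == 1:
        labels[start] = "S-" + tp          # single-character span
    else:
        labels[start] = "B-" + tp          # begin
        labels[end] = "E-" + tp            # end
        for n in range(start + 1, end):
            labels[n] = "M-" + tp          # middle
    return labels

# bmes_labels("嘉朵帮助丹恩", "丹恩") -> ['O', 'O', 'O', 'O', 'B-实体', 'E-实体']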
def json_pre_kg():
    """
    Automatically predict knowledge triples for articles matching a keyword.
    """
    keyword = request.args.get('keyword')
    # e.g. http://0.0.0.0:6801/json/keyword?keyword=%E9%AC%A3%E7%8B%97&limit=1000
    response = requests.get(
        'http://0.0.0.0:6801/json/keyword',
        params={'keyword': keyword, 'limit': 20},
    )
    tt = tkitText.Text()
    if response.status_code == 200:
        items = response.json()
        for item in items:
            for s in tt.sentence_segmentation_v1(item['content']):
                print(s)
                get_kg(s)
        return jsonify(response.json())
    else:
        return ''
def json_remove_duplicates(self, json_file):
    print("Trying to remove duplicate records")
    origin_json = tkitFile.Json(json_file)
    temp = tkitFile.Json(json_file + ".tmp.json")
    data = []
    for item in origin_json.auto_load():
        # Serialize each record so duplicates can be dropped via a set
        data.append(json.dumps(item))
    new = list(set(data))
    print("Original length", len(data))
    new_json = [json.loads(item) for item in new]
    print("New length", len(new_json))
    temp.save(new_json)
    print("Removed duplicates", len(data) - len(new_json))
    # Replace the original file with the deduplicated version
    shutil.move(json_file + ".tmp.json", json_file)
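# Note: list(set(...)) does not preserve record order. An order-preserving
# sketch using a dict keyed on the canonical JSON form (an alternative, not
# the method used above):
def remove_duplicates_ordered(items):
    seen = {}
    for item in items:
        # sort_keys makes logically equal dicts serialize identically
        seen.setdefault(json.dumps(item, sort_keys=True), item)
    return list(seen.values())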
def get_predict(text, plen, n, start, end, key=None, model_path=None):
    if model_path is not None:
        tokenizer_path = " --tokenizer_path " + str(model_path) + "vocab.txt"
        model_config = " --model_config " + str(model_path) + "config.json"
        model_path = " --model_path " + str(model_path)
    else:
        model_path = ''
        tokenizer_path = ''
        model_config = ''
    if key is None:
        # Derive the task id from the inputs when no explicit key is given
        ttext = tkitText.Text()
        tid = ttext.md5(str(text) + str(plen) + str(n))
    else:
        tid = key
    if start is not None:
        start_clip = " --start " + str(start)
    else:
        start_clip = ''
    cmd = ("python3 ./generate.py --prefix '''" + text + "''' --length "
           + str(plen) + " --nsamples " + str(n) + " --tid '''" + str(tid)
           + "''' --end " + str(end) + start_clip + " " + model_path + " "
           + model_config + " " + tokenizer_path)
    print("Start processing: " + cmd)
    if subprocess.call(cmd, shell=True) == 0:
        return get_temp(tid)['value'].get("text")
    else:
        return []
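# A safer variant sketch: passing subprocess an argument list avoids the shell
# quoting pitfalls of the string command above (flag names taken from that
# command; this is an alternative, not the code's current behavior):
def get_predict_argv(text, plen, n, start, end, tid):
    argv = ["python3", "./generate.py", "--prefix", text,
            "--length", str(plen), "--nsamples", str(n),
            "--tid", str(tid), "--end", str(end)]
    if start is not None:
        argv += ["--start", str(start)]
    return subprocess.call(argv) == 0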
def page_list():
    """
    List up to 100 marked records, filtering on labels 0, 1, 2.
    """
    tt = tkitText.Text()
    start = request.args.get('start')
    kg.tdb.load("kg_mark")
    items = []
    i = 0
    for k, v in kg.tdb.get_all(start=start):
        try:
            item = kg.tdb.str_dict(v)
        except Exception:
            continue
        if i >= 100:
            break
        if item.get('kg') is not None:
            items.append((k, item))
            i = i + 1
    if len(items) > 0:
        return render_template("page_list.html", **locals())
    else:
        return "No data"
def get_keys(data_path=""):
    tjson = tkitFile.Json(file_path=data_path)
    tt = tkitText.Text()
    keys = [tt.md5(it['sentence']) for it in tjson.auto_load()]
    return list(set(keys))
def add_article():
    """
    Add an article, segment it, and keep triples scoring above 0.8.
    """
    items = []
    print(request.form)
    article = request.form.get('article')
    if article is None:
        return render_template("add_article.html", **locals())
    tt = tkitText.Text()
    sents = tt.sentence_segmentation_v1(article)
    for item in sents:
        if len(item) > 10:
            ner_list = ner_plus(item)
            onlykgs = []
            goodkgs = []
            if len(ner_list) > 0:
                onlykgs = pre_kg(item)
            for kg in onlykgs:
                p, s = pre({'sentence': item, 'kg': kg})
                print(s)
                if s[1][1] > 0.8:
                    goodkgs.append(kg + [str(s[1][1])])
            items.append((item, goodkgs, ner_list))
    return render_template("add_article.html", **locals())
def get_entity_one_content(message):
    cid = message.get('cid')
    one = get_entity_kg_content(cid)
    tt = tkitText.Text()
    # Attach the sentence-segmented content before emitting
    one['sents'] = tt.sentence_segmentation_v1(one['content'])
    emit('返回单篇文章', {'state': 'success', 'step': 'search_sent', 'data': one})
def auto_pre_one(self, start, text):
    tt = tkitText.Text()
    sents = tt.sentence_segmentation_v1(text)
    text_a = start
    # Score candidate sentences against the last 200 characters of the prefix
    li = self.pre(text_a[-200:], sents)
    return li
def json_get_keywords():
    """
    Extract keywords from the posted text (used to build training data).
    """
    data = get_post_data()
    ttext = tkitText.Text()
    keywords = ttext.get_keywords(data['text'], num=40)
    return jsonify(keywords)
def data_pre_train_mongo_kwseq(data_path='data/data.json',
                               train_path='data/train_db_kwseq.txt'):
    """
    Build keyword-sequence training samples from MongoDB articles.

    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)

    Marker conventions:
    [unused5] marks keywords
    [unused6] marks the title
    [unused7] marks the preceding title
    [unused8] marks the body
    """
    tt = tkitText.Text()
    f1 = open(train_path, 'w')
    # Keyword extraction: jieba TextRank with a custom dictionary and stopwords
    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')
    textrank = jieba.analyse.textrank
    # MongoDB connection
    client = pymongo.MongoClient("localhost", 27017)
    DB_kg_scrapy = client.kg_scrapy
    print(DB_kg_scrapy.name)
    q = {}
    i = 0
    for item in DB_kg_scrapy.kg_content.find(q):
        i = i + 1
        if i == 100:
            break
        try:
            keywords = textrank(item['content'], topK=10, withWeight=False,
                                allowPOS=('ns', 'n', 'vn', 'v'))
            print('keywords2', keywords)
            content = ("[KW] " + " ".join(keywords) + " [/KW] [TT] "
                       + item['title'] + " [/TT] " + item['content']
                       + " [PT] " + item['title'] + " [/PT] [END]")
            content = content.replace("\n\n\n", "\n\n")
            content = content.replace("\n", " [SEP] ")
            f1.write(content)
            f1.write("\n")
        except Exception:
            pass
    f1.close()
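# Shape of one emitted training line (illustrative title/body; markers and
# the [SEP] newline replacement exactly as built above):
# [KW] 宠物 狗 训练 [/KW] [TT] 如何训练小狗 [/TT] 正文第一段 [SEP] 正文第二段 [PT] 如何训练小狗 [/PT] [END]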
def json_lv(self):
    """
    Save the JSON version of the knowledge data into the tdb store.
    """
    self.tdb.load("kg")
    tt = tkitText.Text()
    for item in tqdm(self.read_kg()):
        # Key on the sentence plus its triple so re-runs overwrite in place
        key = tt.md5(item["句子"] + ','.join(item['知识']))
        data = {"sentence": item["句子"], "kg": item['知识']}
        self.tdb.put_data([(key, data)])
def auto_pre_one(self, start, text):
    # Segment with HarvestText's paragraph cutter instead of
    # sentence segmentation
    ht0 = HarvestText()
    sents = ht0.cut_paragraphs(text, 50)
    text_a = start
    li = self.pre(text_a[-200:], sents)
    return li
def unique_data(self):
    """
    Keep only records marked yes, merging their triples into
    kg_mark_unique_data.
    """
    tt = tkitText.Text()
    i = -1
    n = 0
    q = {'check': True, 'state': '2'}
    for it in DB.kg_mark.find(q):
        n = n + 1
        try:
            # Only label 2 ("yes"), state '2', checked, well-formed triples
            if (it['label'] - 1 != 1 or it['state'] != '2'
                    or it.get('check') is None or len(it['kg']) != 3):
                continue
        except Exception:
            continue
        kgs = []
        key = tt.md5(it['sentence'] + str(it['kg'][0]) + str(it['kg'][1]))
        kg = DB.kg_mark_unique_data.find_one({"_id": key})
        if kg is None:
            # Create a new merged record
            kgs.append(it['kg'])
            one = {"_id": key, 'sentence': it['sentence'], 'kgs': kgs}
            DB.kg_mark_unique_data.insert_one(one)
        else:
            # Merge the triple into the existing record
            kgs = kg['kgs']
            if it['kg'] not in kgs:
                kgs.append(it['kg'])
            one = {"_id": key, 'sentence': it['sentence'], 'kgs': kgs}
            DB.kg_mark_unique_data.update_one({"_id": key}, {"$set": one})
        i = i + 1
    print("Total merged", i)
    print("Total scanned", n)
def __init__(self, model=None):
    self.tt = tkitText.Text()
    if model is None:
        self.model = "tkitfiles/ht.model"
    else:
        self.model = model
    if os.path.exists(self.model):
        # Load the previously saved HarvestText model
        self.tt.load_ht(ht_model=self.model)
    else:
        # Build the model from scratch and save it for next time
        self.tt.load_ht()
        self.tt.typed_words(ht_model=self.model)
        self.tt.save_ht()
def get_title():
    """
    Generate candidate titles for the given keywords (used to build
    training data).
    """
    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')
    textrank = jieba.analyse.textrank
    keyword = request.args.get('keyword')
    pre_title = request.args.get('pre_title')
    allow_pos = ('ns', 'n', 'vn', 'v', 'nr', 'nz', 'a', 'm', 'PER', 'f', 'q',
                 'LOC', 's', 'nt', 'an', 'ORG', 't', 'nw', 'TIME')
    if pre_title is not None and keyword is None:
        # Mine keywords from the previous title
        kw = textrank(pre_title, topK=10, withWeight=False, allowPOS=allow_pos)
        keyword = ",".join(kw)
    elif pre_title is not None and keyword is not None:
        # Append mined keywords to the supplied ones
        kw = textrank(pre_title, topK=10, withWeight=False, allowPOS=allow_pos)
        keyword = keyword + "," + ",".join(kw)
    if keyword is None:
        return render_template("get_title.html", **locals())
    tt = tkitText.Text()
    keywords = " [KW] " + keyword + " [/KW] "
    start = '[TT]'
    end = '[/TT]'
    nsamples = 20
    model_path = "model/title/"
    key = tt.md5(keywords)
    titles_p = get_predict(keywords, 100, nsamples, start, end, key, model_path)
    titles = []
    for item in titles_p:
        # Rank each generated title
        irank, grade, softmax = get_rank(item['tt'])
        item['rank'] = irank
        item['softmax'] = softmax
        item['grade'] = grade
        titles.append(item)
    return render_template("get_title.html", **locals())
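# Request sketch (hypothetical host and route; query parameters as read above):
# GET /get_title?keyword=宠物,狗
# GET /get_title?pre_title=如何给狗狗洗澡   # keywords then mined via TextRank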
def add_miaoshu(word, vaule, content):
    """
    Save an entity description. `vaule` is the description value.
    """
    tt = tkitText.Text()
    c_id = tt.md5(content)
    try:
        DB.entity_kg.insert_one({
            "_id": word + "##" + vaule + "##" + str(c_id),
            "entity": word,
            'value': vaule,
            'md5': c_id
        })
    except Exception:
        print("Already exists")
        return
    # Save the source document
    try:
        DB.entity_kg_content.insert_one({"_id": c_id, 'content': content})
    except Exception:
        print("Already exists")
    # Maintain a counter of how often this (entity, value) pair was seen
    rank_data = DB.entity_kg_rank.find_one({"_id": word + "##" + vaule})
    try:
        if rank_data is not None:
            rank = rank_data.get('rank')
            DB.entity_kg_rank.update_one(
                {'_id': word + "##" + vaule},
                {"$set": {"_id": word + "##" + vaule, "entity": word,
                          'value': vaule, 'rank': rank + 1}})
        else:
            DB.entity_kg_rank.insert_one({"_id": word + "##" + vaule,
                                          "entity": word, 'value': vaule,
                                          "rank": 1})
    except Exception:
        print("Failed to add record")
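# A more compact variant sketch: MongoDB's upsert with $inc collapses the
# find/insert/update rank bookkeeping above into one call (same collection
# and key layout as add_miaoshu; an alternative, not the current behavior):
def bump_rank(word, vaule):
    DB.entity_kg_rank.update_one(
        {"_id": word + "##" + vaule},
        {"$set": {"entity": word, "value": vaule}, "$inc": {"rank": 1}},
        upsert=True)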
def load():
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 5
    tt = tkitText.Text()
    # sumy summarizer setup
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # jieba keyword extraction setup
    jieba.load_userdict('dict.txt')
    jieba.analyse.set_stop_words('stopwords.txt')
    textrank = jieba.analyse.textrank
    # word2vec model
    w2v = tkitW2vec.Word2vec()
    w2v.load(model_file=Word2vec_model)
def unique_data(self):
    """
    Keep only records marked yes, merging their triples into
    kg_mark_unique_data.
    """
    self.tdb.load("kg_mark")
    tt = tkitText.Text()
    i = -1
    for k, v in self.tdb.get_all():
        try:
            it = self.tdb.str_dict(v)
            # Only label 2 ("yes"), state '2', checked, well-formed triples
            if (it['label'] - 1 != 1 or it['state'] != '2'
                    or it.get('check') is None or len(it['kg']) != 3):
                continue
        except Exception:
            continue
        kgs = []
        key = tt.md5(it['sentence'] + str(it['kg'][0]) + str(it['kg'][1]))
        self.tdb.load("kg_mark_unique_data")
        kg = self.tdb.get(key)
        if kg is None:
            kgs.append(it['kg'])
        else:
            try:
                kg = self.tdb.str_dict(kg)
                kgs = kg['kgs']
                if it['kg'] not in kgs:
                    kgs.append(it['kg'])
            except Exception:
                pass
        one = {'sentence': it['sentence'], 'kgs': kgs}
        self.tdb.put(key, one)
        self.tdb.load("kg_mark")
        i = i + 1
    print("Total", i)
def run_mark():
    tt = tkitText.Text()
    tt.load_ht()
    i = 0
    for key, item in kg.get_unmarked():
        print("################# Labeling data ######")
        print(item)
        s = item['sentence']
        print(key)
        if len(s) > 50:
            continue
        ner_list = ner(s)
        print("Extracting entities")
        # Use `ent`, not `key`, to avoid shadowing the record key above
        for ent in tt.named_entity_recognition(s):
            ner_list.append(ent)
        # vs stays empty in this version (get_w_v is commented out upstream),
        # so the predicate check below never matches
        vs = []
        print('Prediction succeeded', i)
        print("Sentence", s)
        print("Entities", ner_list)
        ht_kg = tt.ht.triple_extraction(sent=s)
        jiagu_kg = jiagu.knowledge(s)
        all_kg = ht_kg + jiagu_kg
        end_kg = []
        for k in all_kg:
            if k in end_kg:
                continue
            if k[0] in ner_list and k[1] in vs:
                one({'sentence': s, 'kg': k})
                end_kg.append(k)
        if item['kg'][0] in ner_list:
            one(item)
def read():
    """
    Build a next-sentence corpus.
    from=0   # starting article id
    limit=10 # number of articles to return
    >>> data_pre_train(from=0, limit=10)
    """
    i = 0
    n = 0
    data = []
    tt = tkitText.Text()
    data_json = tkitFile.Json(file_path='data/train.json')
    for it in data_json.auto_load():
        print(it)
def save_to_json(self):
    """
    Export marked data so knowledge plausibility can be tested.
    """
    kgjson_t = tkitFile.Json("../tdata/kg_check/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg_check/dev.json")
    kgjson_l = tkitFile.Json("../tdata/kg_check/labels.json")
    data = []
    n = 0
    self.tdb.load("kg_mark")
    tt = tkitText.Text()
    i = -1
    for k, v in self.tdb.get_all():
        i = i + 1
        if v is None:
            n += 1
            continue
        try:
            it = self.tdb.str_dict(v)
            one = {}
            one['sentence'] = (" [kg] " + ",".join(it['kg']) + " [/kg] "
                               + it['sentence'])
            one['label'] = it['label'] - 1
            if (int(one['label']) in [0, 1] and len(it['kg']) == 3
                    and it.get('check') is not None
                    and it.get('state') == '2'):
                data.append(one)
        except Exception:
            continue
    # 85/15 train/dev split
    c = int(len(data) * 0.85)
    print("Total records", len(data), i, n)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    # Automatically drop duplicate marks from the exported files
    self.json_remove_duplicates("../tdata/kg_check/train.json")
    self.json_remove_duplicates("../tdata/kg_check/dev.json")
    print("Data exported to ../tdata/kg_check")
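# Shape of one exported record (illustrative sentence and triple; label is
# it['label'] - 1, so 0 or 1):
# {"sentence": " [kg] 狗,属于,犬科 [/kg] 狗属于犬科动物。", "label": 1}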
def csv_data(file_path=''):
    d = tkitFile.Csv().csv_data(file_path=file_path)
    ttext = tkitText.Text()
    for item in tqdm(d):
        # Skip rows missing a title or body
        if item['title'] == '' or item['content'] == '':
            continue
        kwords = ttext.get_keywords(item['title'] + ' ' + item['content'],
                                    num=40)
        keywords = [it['word'] for it in kwords]
        yield {'keywords': ','.join(keywords),
               'title': item['title'],
               'content': item['content']}
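# Usage sketch (hypothetical CSV path; the file needs `title` and `content`
# columns as accessed above):
# for row in csv_data("data/articles.csv"):
#     print(row['keywords'], row['title'])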
def json_lv(self):
    """
    Save the JSON version of the knowledge data into MongoDB.
    """
    tt = tkitText.Text()
    for item in tqdm(self.read_kg()):
        key = tt.md5(item["句子"] + ','.join(item['知识']))
        data = {"sentence": item["句子"], "kg": item['知识']}
        data['_id'] = key
        try:
            DB.kg.insert_one(data)
        except Exception:
            # Duplicate _id: the record is already stored
            pass
def get_seq(text):
    """
    Extract key content via triple extraction.
    """
    ht = HarvestText()
    s = []
    text = tkitText.Text().clear(text)
    for item in ht.triple_extraction(sent=text, standard_name=False,
                                     stopwords=None, expand="all"):
        if item == '':
            continue
        # Join each triple into a single string
        s.append(''.join(item))
    return s
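# Usage sketch (illustrative sentence; the actual output depends on
# HarvestText's triple extractor):
# get_seq("刘备是蜀汉的开国皇帝。")
# -> e.g. ["刘备是开国皇帝"], each entry one triple joined into a string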
def auto_pre(self, start, text, limit=0.5):
    tt = tkitText.Text()
    sents = tt.sentence_segmentation_v1(text)
    text_a = start
    li = None
    for i in range(len(sents)):
        if i == 0:
            text_a = start
        else:
            # Append the best-scoring sentence from the previous round
            text_a = text_a + "\n" + li[0][0]
        li = self.pre(text_a[-200:], sents)
        sents = self.bulid_text(li)
        if li[0][1] <= limit:
            print(li[0][1])
            break
        yield text_a
def auto_pre(self, start, text, limit=0.7):
    tt = tkitText.Text()
    sents = tt.sentence_segmentation_v1(text)
    text_a = start
    li = None
    for i in range(len(sents)):
        if i == 0:
            text_a = start
            print(start)
        else:
            text_a = text_a + li[0][0]
            print(i, li[0][1], li[0][0])
        li = self.pre(text_a[-200:], sents)
        sents = self.bulid_text(li)
        if li[0][1] <= limit:
            break
    return text_a
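# Usage sketch contrasting the two auto_pre variants above (hypothetical
# `gen` instance of the owning class):
# for draft in gen.auto_pre("开头一句。", article_text):        # generator variant
#     print(draft)
# final = gen.auto_pre("开头一句。", article_text, limit=0.7)  # returning variant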