def data_pre_train_mongo_text(train_path='data/train/'):
    """Dump each article of the pet-content Mongo collection to its own txt file.

    构建文本数据将单篇文章一个txt文件

    Articles are bucketed into sub-directories of *train_path*; a new
    timestamped bucket is started every 10000 documents. Each file holds the
    title on the first line and the content on the following line(s).
    """
    i = 0
    time_path = '0'
    ttf = tkitFile.File()
    # BUG FIX: the initial '0' bucket was never created, and on rotation the
    # directory was created for the OLD time_path before the new one was
    # chosen — so no target directory ever existed and every write failed
    # silently inside the bare except. Create the bucket up front, and create
    # the NEW bucket after rotating.
    ttf.mkdir(train_path + time_path)
    for item in tqdm(DB.content_pet.find({})):
        i = i + 1
        if i % 10000 == 0:
            # rotate to a fresh timestamped bucket
            time_path = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            ttf.mkdir(train_path + time_path)
        name = str(int(time.time())) + item['title'][:10] + ".txt"
        file_path = train_path + time_path + "/" + name
        try:
            with open(file_path, 'w', encoding='utf-8') as f1:
                f1.write(item['title'] + "\n")
                f1.write(item['content'] + "\n")
        except (OSError, TypeError):
            # best-effort: skip titles that yield invalid filenames or
            # documents with non-string content (was a bare except)
            pass
def download_model(self):
    """Download the pretrained model files for ``self.model_version``.

    自动下载模型 — fetches weights, config, vocab and tag files from the CDN
    into ``tkitfiles/<model_version>/``.
    """
    tfile = tkitFile.File()
    tfile.mkdir('tkitfiles/')
    model_dir = 'tkitfiles/' + self.model_version
    tfile.mkdir(model_dir)
    th = tkitWeb.Http()
    base_url = ("http://cdn.terrychan.org/model/tkit/tkitMarker/"
                + self.model_version + "/")
    # Previously four copy-pasted download stanzas; same logic for every file.
    for idx, name in enumerate(["pytorch_model.bin", "config.json",
                                "vocab.txt", "tag.txt"]):
        data = th.download(base_url + name, name, dirname=model_dir)
        if idx == 0:
            # keep the original progress print for the (large) weights file
            print(data)
def save_data(data, path='data/', name="train.json"):
    """Persist *data* as JSON at ``path + name``, creating *path* if missing.

    保存数据
    """
    tkitFile.File().mkdir(path)
    target = tkitFile.Json(file_path=path + name)
    target.save(data)
def csv_list(path="data/csv/"):
    """Import every csv file found under *path* into the dataset.

    Files that fail to parse are reported and skipped (best-effort import).
    """
    f = tkitFile.File()
    # BUG FIX: the local list used to be named `csv_list`, shadowing this
    # function's own name within the scope.
    files = f.file_List(path, type='csv')
    for line in files:
        print('add:', line)
        try:
            data = csv_data(file_path=line)
            add_data(data=data)
        except Exception as e:
            # keep the skip-on-error semantics, but surface why it failed
            # (was a bare except that discarded the reason)
            print('csv文件有误跳过', line, e)
def add_data(data, path='data/', name="data.json"):
    """Append sample records to the json store and return the full dataset.

    添加数据样本, e.g.::

        data=[{"keywords": "哈士奇,主人,嚎叫,便是,说明,思考,没有,犬种,原因,新手,",
               "content": "新手养狗,哈是无忧无的经验和耐心。"}]
    """
    tkitFile.File().mkdir(path)
    store = tkitFile.Json(file_path=path + name)
    store.save(data)
    return store.auto_load()
def save_to_json_kg(self):
    """Export the knowledge-extraction training set as train/dev json.

    保存知识提取训练集
    https://www.kaggle.com/terrychanorg/albert-bilstm-crf-pytorch/data
    """
    tkitFile.File().mkdir("../tdata/kg")
    kgjson_t = tkitFile.Json("../tdata/kg/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg/dev.json")
    # kgjson_l=tkitFile.Json("../tdata/labels.json")
    self.tdb.load("kg_mark_unique_data")
    data = []
    all_data_id = []
    for k, v in self.tdb.get_all():
        try:
            it = self.tdb.str_dict(v)
            # one label per character of the sentence
            label = ['O'] * len(it['sentence'])
            s = 0
            for one in it['kgs']:
                # mark the object span (one[2]) with the "描述" tag;
                # s counts how many spans were actually found
                label, s1 = self.mark_word_label(it['sentence'], label, one[2], "描述")
                if s1 >= 0:
                    s = s + 1
                # label,s1=self.mark_word_label(it['sentence'],label,one[1],"关系")
                # sample text: subject#predicate#sentence with aligned labels
                # (K = subject chars, P = predicate chars, X = separators)
                d = {
                    'text': list(one[0] + '#' + one[1] + '#' + it['sentence']),
                    'label': ['K'] * len(one[0]) + ['X'] + ['P'] * len(one[1]) + ['X'] + label
                }
                if s > 0:
                    data.append(d)
        except:
            # skip malformed records
            # NOTE(review): bare except also hides real bugs — consider narrowing
            continue
    # 85/15 train/dev split
    c = int(len(data) * 0.85)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # 自动处理重复标记问题 — de-duplicate the exported records
    self.json_remove_duplicates("../tdata/kg/train.json")
    self.json_remove_duplicates("../tdata/kg/dev.json")
    print("已经将数据导出到 ../tdata/kg")
def save_collection(collection_name, DB):
    """Back up one Mongo collection to ``data/<db>/<collection>.json``.

    保存一张表 — documents are appended one at a time; documents that cannot
    be serialized are reported and skipped.
    """
    path = os.path.join("data", DB.name)
    tkitFile.File().mkdir(path)
    json_save = os.path.join("data", DB.name, collection_name + ".json")
    json_backup = tkitFile.Json(json_save)
    for it in DB[collection_name].find():
        try:
            json_backup.save([it])
        except Exception as e:
            # BUG FIX: the exception was discarded, leaving only a bare
            # "error" marker; print the reason so bad documents can be
            # diagnosed. Skip-and-continue semantics are unchanged.
            print("error", e)
            print(it)
def data_pre_train_file(path='./data/'):
    """Generate the training sample file under *path* (生成训练样本)."""
    tkitFile.File().mkdir(path)
    train_path = path + 'train.txt'
    task_path = path + 'task.json'
    data_path = path + 'data.json'
    # kept for its side effect of initialising the task store
    tjson = tkitFile.Json(file_path=task_path)
    data_pre_train(data_path=data_path, train_path=train_path)
def data_pre_train_mongo_text(keyword, train_path='../data/'):
    """Dump search results for *keyword* into one txt file per article.

    构建文本数据将单篇文章一个txt文件

    Articles shorter than 50 chars (title+content) or rejected by the
    ``tclass`` classifier (pre() == 0) are skipped; content is truncated to
    500 chars on write.
    """
    i = 0
    ttf = tkitFile.File()
    ttf.mkdir(train_path + keyword)
    for item in tqdm(search_content(keyword)):
        i = i + 1
        # skip very short articles
        if len(item.title + item.content) < 50:
            continue
        # classifier gate: 0 means "reject"
        p = tclass.pre(item.content)
        if p == 0:
            continue
        name = str(int(time.time())) + item.title[:10] + ".txt"
        file_path = train_path + keyword + "/" + name
        try:
            with open(file_path, 'w', encoding='utf-8') as f1:
                f1.write(item.title + "\n")
                f1.write(item.content[:500] + "\n")
        except OSError:
            # BUG FIX: was a bare except that swallowed everything
            # (including KeyboardInterrupt); only ignore filesystem errors
            # caused by titles that make invalid filenames.
            pass
def save_to_json_SQuAD(self):
    """Export the marked KG data in SQuAD v1 reading-comprehension format.

    Each sentence becomes one context; each entity becomes one question
    whose answers are its relation spans (marked via mark_word_label).
    Writes train.json / dev.json under ../tdata/SQuAD.
    """
    tkitFile.File().mkdir("../tdata/SQuAD")
    kgjson_t = tkitFile.Json("../tdata/SQuAD/train.json")
    kgjson_d = tkitFile.Json("../tdata/SQuAD/dev.json")
    data = []
    all_data_id = []
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        # entity -> list of relation strings found in this sentence
        ner = {}
        one_q = {
            "id": k + "_s",
            "context": it['sentence'],
            "qas": []
        }
        for one in it['kgs']:
            try:
                if one[1] not in ner[one[0]]:
                    ner[one[0]].append(one[1])
            except:
                # first relation seen for this entity
                ner[one[0]] = [one[1]]
        # NOTE(review): `answers` is initialised once per sentence, so each
        # later question also carries the answers accumulated for earlier
        # entities — confirm this accumulation is intended and not a bug.
        answers = []
        for nr in ner:
            s = 0
            label = ['O'] * len(it['sentence'])
            for n in ner[nr]:
                try:
                    label, s1 = self.mark_word_label(it['sentence'], label, n, "关系")
                    if s1 >= 0:
                        # s1 is the answer start offset in the sentence
                        answers_one = {
                            "answer_start": s1,
                            "text": n
                        }
                        answers.append(answers_one)
                except:
                    pass
            if len(answers) > 0:
                # one question per entity
                one_q['qas'].append({
                    "question": nr,
                    'id': k + "_ner_rel_" + nr,
                    'answers': answers
                })
        if len(one_q['qas']) > 0:
            one_kg = {
                'paragraphs': [one_q],
                'id': k + "_kg",
                'title': it['sentence'][:10]
            }
            data.append(one_kg)
    # 85/15 train/dev split, wrapped in the SQuAD envelope
    c = int(len(data) * 0.85)
    t = data[:c]
    d = data[c:]
    t_data = {
        "version": "v1.0",
        "data": t
    }
    d_data = {
        "version": "v1.0",
        "data": d
    }
    kgjson_t.save([t_data])
    kgjson_d.save([d_data])
    print("总共生成数据", len(data))
    print("已经将数据导出到 ../tdata/SQuAD")
for w,m in it: labels[m]=1 # print(m,w) keys=[] for key in labels.keys(): keys.append(key) f1.write("\n".join(keys)) # data_path='../data' data_path=input("Data Path:") if data_path: pass else: data_path="/home/t/dev/auto-translation-plan/clear-content-marker/data/" ttf=tkitFile.File() tt=tkitText.Text() data=[] anns=[] bad=0 good=0 bad_files=[] for f_path in ttf.all_path(data_path): # print(f_path) if f_path.endswith(".anns"): # print(f_path) anns.append(f_path) # print(_read_data(f_path)) one_data=_read_data(f_path) # print(one_data) if len(one_data)==0:
import tkitFile

# 将标记好的数据分割保存为训练数据集
# Split the marked dataset into train / dev sets (80/20).
tkitFile.File().mkdir("data/train")
marked = tkitFile.Json("data/marked.json")
train_json = tkitFile.Json("data/train/train.json")
dev_json = tkitFile.Json("data/train/dev.json")

records = []
for item in marked.auto_load():
    print(item)
    records.append(item)

split = int(len(records) * 0.8)
train_json.save(records[:split])
dev_json.save(records[split:])
def save_to_json_kg_tmark(self):
    """Export the knowledge-extraction training set in tmark (token, label) format.

    保存知识提取训练集
    https://github.com/napoler/tmark_Description
    https://www.kaggle.com/terrychanorg/tmark-description
    """
    tkitFile.File().mkdir("../tdata/kg_tmark")
    # NOTE(review): these json stores are created but the save calls below
    # are commented out — output actually goes to the .txt files at the end.
    kgjson_t = tkitFile.Json("../tdata/kg_tmark/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg_tmark/dev.json")
    data = []
    all_data_id = []
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        try:
            # one label per character of the sentence
            label = ['O'] * len(it['sentence'])
            s = 0
            for one in it['kgs']:
                # mark the predicate+object span (one[1]+one[2]) as "描述"
                label, s1 = self.mark_word_label(it['sentence'], label, one[1] + one[2], "描述")
                if s1 >= 0:
                    s = s + 1
                # sample: subject [SEP] sentence, labels aligned per token
                d = {'text': list(one[0]) + ['[SEP]'] + list(it['sentence']),
                     'label': ['实体'] * len(one[0]) + ['X'] + label}
                # only keep samples whose text/label lengths line up
                if len(d['text']) == len(d['label']):
                    one_kg_tmk = []
                    for t, tmk_l in zip(d['text'], d['label']):
                        one_kg_tmk.append((t, tmk_l))
                    if s > 0:
                        data.append(one_kg_tmk)
        except:
            # skip malformed records
            continue
    # c=int(len(data)*0.85)
    # kgjson_t.save(data[:c])
    # kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # 70/15/15 train/dev/test split
    c = int(len(data) * 0.7)
    b = int(len(data) * 0.85)
    print(data[:10])
    print(len(data))
    train_data = data[:c]
    dev_data = data[c:b]
    test_data = data[b:]
    self.save_data(train_data, file="../tdata/kg_tmark/train.txt")
    self.save_data(dev_data, file="../tdata/kg_tmark/dev.txt")
    self.save_data(test_data, file="../tdata/kg_tmark/test.txt")
    self.save_labels(data, "../tdata/kg_tmark/labels.txt")
    print("已经将数据导出到 ../tdata/kg_tmark")
def save_to_json_ner_bio(self): tkitFile.File().mkdir("../tdata/onlyner_bio") # self.tdb.load("kg_mark_unique_data") data=[] all_data_id=[] nlp_plus=tkitNlp.Plus() nlp_plus.load_tlp() flags={} for it in DB.kg_mark_unique_data.find(): # print("k",k) k=it['_id'] try: # it=self.tdb.str_dict(v) text=it['sentence'] # print("it",it) label= ["O"]*len(text) ner={} for one in it['kgs']: # print(one) # label,s1=self.mark_word_label(it['sentence'],label,one[0],"实体") # label,s1=self.mark_word_label(it['sentence'],label,one[1],"关系") try: if one[1] not in ner[one[0]]: ner[one[0]].append(one[1]) except: ner[one[0]]=[one[1]] # print("++++++"*10) # print('text',text) ner_list =[tmp for tmp in ner.keys() ] # print('ner_list',ner_list) # print(ner_list) # fner =[word for word,flag in nlp_plus.ner(text)] fner=[] for word,flag in nlp_plus.ner(text): flags[flag]=0 fner.append(word) ner_list=list(set(ner_list+fner)) ner_list = sorted(ner_list,key = lambda i:len(i),reverse=False) # print('ner_list',ner_list) s=0 for nr in ner_list: # print(nr) label,s1=nlp_plus.mark_word_label(text,label,nr,"实体") if s1>=0: s=s+1 if s>0: # one={'text':list(text),'label':label} one=[] for it_w,it_l in zip(list(text),label): one.append((it_w,it_l)) data.append(one) # print(flags) except: pass nlp_plus.release() c=int(len(data)*0.7) b=int(len(data)*0.85) print(data[:10]) print(len(data)) train_data=data[:c] dev_data=data[c:b] test_data=data[b:] self.save_data(train_data,file="../tdata/onlyner_bio/train.txt") self.save_data(dev_data,file="../tdata/onlyner_bio/dev.txt") self.save_data(test_data,file="../tdata/onlyner_bio/test.txt") self.save_labels(data,"../tdata/onlyner_bio/labels.txt")
def save_to_json_ner(self):
    """Export char-level NER data as {'text','label'} json to ../tdata/onlyner.

    Entities come from both the marked KG triples and the tlp NER model;
    output is split 85/15 into train/dev json and de-duplicated.
    """
    tkitFile.File().mkdir("../tdata/onlyner")
    kgjson_t = tkitFile.Json("../tdata/onlyner/train.json")
    kgjson_d = tkitFile.Json("../tdata/onlyner/dev.json")
    self.tdb.load("kg_mark_unique_data")
    data = []
    all_data_id = []
    nlp_plus = tkitNlp.Plus()
    nlp_plus.load_tlp()
    # collects every NER flag seen (values unused; acts as a set)
    flags = {}
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        try:
            text = it['sentence']
            label = ["O"] * len(text)
            # entity -> list of relations, built from the marked triples
            ner = {}
            for one in it['kgs']:
                try:
                    if one[1] not in ner[one[0]]:
                        ner[one[0]].append(one[1])
                except:
                    ner[one[0]] = [one[1]]
            ner_list = [tmp for tmp in ner.keys()]
            # add entities found by the tlp NER model
            fner = []
            for word, flag in nlp_plus.ner(text):
                flags[flag] = 0
                fner.append(word)
            ner_list = list(set(ner_list + fner))
            # sorted shortest-first; presumably so longer entities marked
            # later can override shorter overlapping marks — confirm
            ner_list = sorted(ner_list, key=lambda i: len(i), reverse=False)
            s = 0
            for nr in ner_list:
                label, s1 = nlp_plus.mark_word_label(text, label, nr, "实体")
                if s1 >= 0:
                    s = s + 1
            # only keep sentences where at least one entity was marked
            if s > 0:
                one = {'text': list(text), 'label': label}
                data.append(one)
        except:
            # skip malformed records
            pass
    nlp_plus.release()
    # 85/15 train/dev split
    c = int(len(data) * 0.85)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # 自动处理重复标记问题 — de-duplicate exported records
    self.json_remove_duplicates("../tdata/onlyner/train.json")
    self.json_remove_duplicates("../tdata/onlyner/dev.json")
    # NOTE(review): ".../tdata" in the message below looks like a typo for "../tdata"
    print("已经将数据导出到 .../tdata/onlyner/")
def __init__(self):
    """Prepare the working directory, local LevelDB store and search helper."""
    tkitFile.File().mkdir("../tdata")
    self.tdb = tkitDb.LDB(path="../tdata/lv.db")
    self.ss = tkitSearch.Search()
import warnings
# suppress gensim's UserWarning noise (忽略警告)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence, PathLineSentences
import gensim
import tkitFile

ff = tkitFile.File()
# program = os.path.basename(sys.argv[0])
# logger = logging.getLogger(program)
# logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
# logger.info("running %s" % ' '.join(sys.argv))

# output/model directories, created up front
fdir = 'model/'
ddir = 'data/train/'
ff.mkdir("model/")
ff.mkdir("data")
ff.mkdir("data/train")
inp = ddir + 'data_seg.txt'       # pre-segmented input corpus
outp1 = fdir + 'word2vec.model'   # trained word2vec model
outp2 = fdir + 'word2vec.vector'  # exported word vectors
from relation_extraction import getRelation from utils import readFile import tkitFile import gc # from albert_pytorch import classify tfile = tkitFile.File() # Tclass = classify(model_name_or_path='tkitfiles/checkkg') # def pre(data): # """ # 获取预测结果 # """ # # tkg = "[kg] "+",".join(data['kg'])+" [/kg] "+data['sentence'] # data['sentence_b']=",".join(data['kg']) # p = Tclass.pre(data['sentence'], data['sentence_b']) # softmax = Tclass.softmax() # Tclass.release # print("分类", "|", '概率') # pre = [] # for ck, rank in zip([1, 2], softmax): # print(ck, "|", rank) # pre.append([ck, round(rank, 4)]) # # del Tclass # gc.collect() # return p+1, pre path = "/mnt/data/dev/tdata/wiki_zh" relations_all = [] for f in tfile.all_path(path): # para = readFile('./wiki_00') print(f) para = readFile(f)
def save_to_json_ner_rel(self):
    """Export entity→relation span data to ../tdata/ner_rel as train/dev json.

    For each marked sentence, builds one sample per entity:
    text = entity + '#' + sentence, with relation spans labelled "关系"
    (K = entity chars, X = separator).
    """
    # BUG FIX: the directory was created as "../tdata/ner" while the json
    # files are written under "../tdata/ner_rel/", so the export failed on a
    # fresh checkout. Create the directory actually used.
    tkitFile.File().mkdir("../tdata/ner_rel")
    kgjson_t = tkitFile.Json("../tdata/ner_rel/train.json")
    kgjson_d = tkitFile.Json("../tdata/ner_rel/dev.json")
    data = []
    all_data_id = []
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        try:
            # entity -> list of relation strings
            ner = {}
            for one in it['kgs']:
                try:
                    if one[1] not in ner[one[0]]:
                        ner[one[0]].append(one[1])
                except KeyError:
                    # first relation seen for this entity
                    ner[one[0]] = [one[1]]
            for nr in ner:
                s = 0
                label = ['O'] * len(it['sentence'])
                for n in ner[nr]:
                    label, s1 = self.mark_word_label(it['sentence'], label, n, "关系")
                    if s1 >= 0:
                        s = s + 1
                # only keep entities with at least one located relation span
                if s > 0:
                    one_ner = {
                        'text': list(nr + '#' + it['sentence']),
                        'label': ['K'] * len(nr) + ['X'] + label
                    }
                    data.append(one_ner)
        except Exception:
            # skip malformed records (was a bare except)
            continue
    # 85/15 train/dev split
    c = int(len(data) * 0.85)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # 自动处理重复标记问题 — de-duplicate exported records
    self.json_remove_duplicates("../tdata/ner_rel/train.json")
    self.json_remove_duplicates("../tdata/ner_rel/dev.json")
    print("已经将数据导出到 ../tdata/ner_rel")