def get_predict_title(text, plen, n):
    """Generate title predictions via ./generate.py, caching results on disk.

    The cache key is the md5 of (text, plen, n); results live in
    tmp/run_task<tid>.json.

    Args:
        text: prefix text for generation.
        plen: generation length (passed as --length).
        n: number of samples (passed as --nsamples).

    Returns:
        The 'data' payload from the generator's JSON output, or {} on failure.
    """
    ttext = tkitText.Text()
    tid = ttext.md5(str(text) + str(plen) + str(n))
    data_path = "tmp/run_task" + tid + ".json"
    print('load', data_path)

    def _load_result():
        # Best-effort read of the generator's JSON output file.
        try:
            tjson = tkitFile.Json(file_path=data_path)
            return tjson.load()[0]['data']
        except Exception:
            print('load文件失败', data_path)
            return {}

    if os.path.exists(data_path):
        # 加载缓存预测 — cache hit, reuse the previous prediction.
        return _load_result()
    # 不存在缓存,重新预测
    # SECURITY FIX: pass an argv list with shell=False so quotes/metacharacters
    # in `text` cannot inject shell commands (the old f"... '''{text}'''" could).
    cmd = ["python3", "./generate.py", "--prefix", text,
           "--length", str(plen), "--nsamples", str(n), "--tid", str(tid)]
    print("开始处理: " + " ".join(cmd))
    if subprocess.call(cmd) == 0:
        return _load_result()
    return {}
def json_remove_duplicates(self, json_file):
    """Remove duplicate records from a JSON file in place.

    Records are compared by their json.dumps serialization, so "duplicate"
    means byte-identical serialized form. NOTE: set() does not preserve the
    original record order (same as the previous implementation).

    Args:
        json_file: path to the file to deduplicate; it is overwritten.
    """
    print("尝试移除重复数据")
    origin_json = tkitFile.Json(json_file)
    temp = tkitFile.Json(json_file + ".tmp.json")
    data = [json.dumps(item) for item in origin_json.auto_load()]
    new = list(set(data))
    print("原始长度", len(data))
    new_json = [json.loads(item) for item in new]
    print("新长度", len(new_json))
    temp.save(new_json)
    # BUG FIX: num_duplicates was never incremented and always printed 0;
    # report the real number of removed records.
    num_duplicates = len(data) - len(new_json)
    print("移除重复内容", num_duplicates)
    # 覆盖之前文件 — replace the original file with the deduplicated one.
    shutil.move(json_file + ".tmp.json", json_file)
def save_to_json_kg(self):
    """Export the knowledge-extraction training set to ../tdata/kg.

    Builds per-sentence character labels by marking each triple's object
    (one[2]) as "描述", splits 85/15 into train.json/dev.json, then
    deduplicates both files.
    Dataset format follows
    https://www.kaggle.com/terrychanorg/albert-bilstm-crf-pytorch/data
    """
    tkitFile.File().mkdir("../tdata/kg")
    kgjson_t = tkitFile.Json("../tdata/kg/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg/dev.json")
    # kgjson_l=tkitFile.Json("../tdata/labels.json")
    self.tdb.load("kg_mark_unique_data")
    data = []
    all_data_id = []
    for k, v in self.tdb.get_all():
        try:
            it = self.tdb.str_dict(v)
            # One 'O' label per character of the sentence.
            label = ['O'] * len(it['sentence'])
            s = 0  # count of triples successfully marked in the sentence
            for one in it['kgs']:
                label, s1 = self.mark_word_label(it['sentence'], label, one[2], "描述")
                if s1 >= 0:
                    s = s + 1
                # label,s1=self.mark_word_label(it['sentence'],label,one[1],"关系")
                # NOTE(review): original indentation was lost; d/append are
                # assumed to be per-triple (inside this loop) — confirm.
                # Example text: subject#predicate#sentence, labels K/X/P/X + marks.
                d = {
                    'text': list(one[0] + '#' + one[1] + '#' + it['sentence']),
                    'label': ['K'] * len(one[0]) + ['X'] + ['P'] * len(one[1]) + ['X'] + label
                }
                if s > 0:
                    data.append(d)
        except:
            # self.tdb.load("kg")
            continue
    c = int(len(data) * 0.85)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # Automatically remove duplicate records from the exported files.
    self.json_remove_duplicates("../tdata/kg/train.json")
    self.json_remove_duplicates("../tdata/kg/dev.json")
    print("已经将数据导出到 ../tdata/kg")
def get_data(path, tokenizer):
    """Yield one dict per record with '[PAD]'-padded token id sequences.

    Each yielded dict has 'keywords' and 'text' keys, both encoded with
    special tokens and right-padded to tokenizer.max_len.
    """
    max_len = tokenizer.max_len
    pad_id = tokenizer.convert_tokens_to_ids('[PAD]')

    def _encode_padded(raw):
        # Encode then pad on the right up to max_len.
        enc = tokenizer.encode_plus(raw, max_length=max_len, add_special_tokens=True)
        ids = enc['input_ids']
        return ids + [pad_id] * (max_len - len(ids))

    for record in tkitFile.Json(path).auto_load():
        yield {
            'keywords': _encode_padded(record['keywords']),
            'text': _encode_padded(record['text']),
        }
def make_labels(self):
    """Load <data_dir>/labels.json and store each record's label (as str) on self.labels."""
    records = tkitFile.Json(file_path=self.data_dir + "/labels.json").auto_load()
    self.labels = [str(record['label']) for record in records]
def get_dev_examples(self, data_dir):
    """See base class. Load dev.json from data_dir and build 'dev' examples."""
    self.data_dir = data_dir
    dev_path = os.path.join(self.data_dir, "dev.json")
    records = tkitFile.Json(file_path=dev_path).auto_load()
    return self._create_examples(records, 'dev')
def get_keys(data_path=""):
    """Return the unique md5 hashes of every record's 'sentence' field.

    Args:
        data_path: path of the JSON file to scan.

    Returns:
        list of unique md5 hex strings (order not preserved by set()).
    """
    tjson = tkitFile.Json(file_path=data_path)
    # PERF FIX: build the Text helper once instead of once per record.
    tt = tkitText.Text()
    keys = [tt.md5(it['sentence']) for it in tjson.auto_load()]
    return list(set(keys))
def save_loss(loss, name="default"):
    """Append a timestamped loss record to dataset/<name>.json for later plotting."""
    sink = tkitFile.Json(file_path="dataset/" + name + ".json")
    record = {"time": time.time(), 'loss': loss}
    sink.save([record])
def save_data(data, path='data/', name="train.json"):
    """Persist `data` to <path><name>, creating the directory first."""
    tkitFile.File().mkdir(path)
    sink = tkitFile.Json(file_path=path + name)
    sink.save(data)
def save_to_json(self):
    """Export knowledge-check training data (kg plausibility classifier).

    Reads records from the 'kg_mark' table, keeps only checked records with
    state '2', a 3-element kg triple, and a label in {0, 1} after the -1
    shift, then splits 85/15 into ../tdata/kg_check/{train,dev}.json and
    deduplicates both.
    """
    kgjson_t = tkitFile.Json("../tdata/kg_check/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg_check/dev.json")
    kgjson_l = tkitFile.Json("../tdata/kg_check/labels.json")
    # self.tdb.load("kg_mark")
    data = []
    i = 0
    n = 0  # count of records with a None value
    self.tdb.load("kg_mark")
    tt = tkitText.Text()
    i = -1
    for k, v in self.tdb.get_all():
        i = i + 1
        if v == None:
            n += 1
        else:
            try:
                it = self.tdb.str_dict(v)
                one = {}
                # Sentence is wrapped with the kg triple: " [kg] a,b,c [/kg] sentence".
                one['sentence'] = " [kg] " + ",".join(
                    it['kg']) + " [/kg] " + it['sentence']
                # Stored labels are 1-based; shift to 0-based.
                one['label'] = it['label'] - 1
                if int(one['label']) in [0, 1] and len(
                        it['kg']) == 3 and it.get(
                            'check') != None and it.get('state') == '2':
                    data.append(one)
                else:
                    # Record filtered out (unchecked / wrong state / bad label).
                    pass
            except:
                # Best-effort: skip malformed records.
                continue
    c = int(len(data) * 0.85)
    print("总数据", len(data), i, n)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    # Automatically remove duplicate records from the exported files.
    self.json_remove_duplicates("../tdata/kg_check/train.json")
    self.json_remove_duplicates("../tdata/kg_check/dev.json")
    print("已经将数据导出到 ../tdata/kg_check")
def add_data(data, path='data/', name="data.json"):
    """Append sample records to <path><name> and return the full file contents.

    Example record:
        {"keywords": "哈士奇,主人,...", "content": "新手养狗,..."}
    """
    tkitFile.File().mkdir(path)
    store = tkitFile.Json(file_path=path + name)
    store.save(data)
    return store.auto_load()
def save_collection(collection_name, DB):
    """Dump one MongoDB collection to data/<db-name>/<collection>.json.

    Documents are appended one at a time; documents that fail to serialize
    are reported and skipped (best-effort backup).

    Args:
        collection_name: name of the collection to export.
        DB: a pymongo-style database object with .name and item access.
    """
    path = os.path.join("data", DB.name)
    tkitFile.File().mkdir(path)
    json_save = os.path.join("data", DB.name, collection_name + ".json")
    json_backup = tkitFile.Json(json_save)
    for it in DB[collection_name].find():
        try:
            json_backup.save([it])
        except Exception:
            # BUG FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; keep the best-effort skip but
            # only for real errors.
            print("error")
            print(it)
def read():
    """Print every record in data/train.json (next-sentence corpus inspection).

    >>>data_pre_train(from=0, limit=10)
    """
    # CLEANUP: removed dead locals (i, n, data, tt) that were never used.
    data_json = tkitFile.Json(file_path='data/train.json')
    for it in data_json.auto_load():
        print(it)
def data_pre_train_file(path='./data/'):
    """Generate training samples under `path` (train.txt built from data.json)."""
    tkitFile.File().mkdir(path)
    task_file = path + 'task.json'
    # Kept for parity with the original code path; the task file itself is
    # not consumed here.
    tjson = tkitFile.Json(file_path=task_file)
    data_pre_train(data_path=path + 'data.json', train_path=path + 'train.txt')
def bulid_labels(self):
    """See base class. Build the label dict from all_50_schemas.json predicates.

    Returns {"NULL": 0, predicate1: "1", predicate2: "2", ...}. Note the
    non-NULL ids are strings while NULL maps to int 0 — kept as-is for
    compatibility with existing callers.
    """
    schema_path = os.path.join(self.data_dir, "all_50_schemas.json")
    records = tkitFile.Json(file_path=schema_path).auto_load()
    predicates = [record['predicate'] for record in records]
    # dict.fromkeys deduplicates while preserving first-seen order.
    unique_predicates = {}.fromkeys(predicates).keys()
    labels_dict = {"NULL": 0}
    for idx, name in enumerate(unique_predicates, 1):
        labels_dict[name] = str(idx)
    return labels_dict
def data_pre_train_mongo_summary(
        data_path='data/data.json', train_path='data/train_db_Summary.txt'):
    """Stream records from data_path and write summary training lines.

    Each record is handed to add_one together with the open output handle;
    progress is printed every 10000 records.

    Args:
        data_path: source JSON file.
        train_path: output text file (truncated).
    """
    # BUG FIX: the output file was opened but never closed; `with` guarantees
    # the handle is flushed and closed even if add_one raises.
    with open(train_path, 'w') as f1:
        tjson = tkitFile.Json(file_path=data_path)
        i = 0
        for item in tqdm(tjson.auto_load()):
            i = i + 1
            if i % 10000 == 0:
                print(i)
            args = {'item': item, 'f1': f1}
            add_one(args)
def check_model():
    """Re-screen previous training data interactively.

    Runs the classifier over data/classifypet/train.json; where the
    prediction disagrees with the stored label, shows the sentence and asks
    the operator (via input()) for the correct label. Writes the reviewed
    set to train_b.json and prints a running accuracy.
    """
    tjson = tkitFile.Json(file_path="data/classifypet/train.json")
    # tjson_b=tkitFile.Json(file_path="data/classifypet/train_b.json")
    a = 0  # records seen
    b = 0  # records where the model agreed with the stored label
    data = []
    for it in tjson.auto_load():
        a = a + 1
        p = petclass.pre(it['sentence'])
        if p == it['label']:
            b = b + 1
        else:
            # Disagreement: show context and let the operator relabel.
            print(it['sentence'][:500])
            print(it['label'])
            mp = input("不一致:")
            it['label'] = int(mp)
        data.append(it)
        # NOTE(review): running accuracy printed per record (original
        # indentation was lost) — confirm placement.
        print("one", b, a, b / a)
    print(b, a, b / a)
    add_data(data, path='data/classifypet/', name="train_b.json")
def save_to_json_SQuAD(self):
    """Export relation data in SQuAD v1-style format to ../tdata/SQuAD.

    For each sentence, relation words grouped per subject entity become
    QA pairs: question = entity, answers = relation spans located by
    mark_word_label. Output is split 85/15 into train.json/dev.json, each
    wrapped as {"version": "v1.0", "data": [...]}.
    """
    tkitFile.File().mkdir("../tdata/SQuAD")
    kgjson_t = tkitFile.Json("../tdata/SQuAD/train.json")
    kgjson_d = tkitFile.Json("../tdata/SQuAD/dev.json")
    data = []
    all_data_id = []
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        # Group relation words by subject entity: {entity: [relation, ...]}.
        ner = {}
        one_q = {
            "id": k + "_s",
            "context": it['sentence'],
            "qas": []
        }
        for one in it['kgs']:
            try:
                if one[1] not in ner[one[0]]:
                    ner[one[0]].append(one[1])
            except:
                # First relation for this entity.
                ner[one[0]] = [one[1]]
        # NOTE(review): `answers` is reset once before the entity loop, so
        # answers accumulate across entities (original indentation lost) —
        # confirm this is intended.
        answers = []
        for nr in ner:
            s = 0
            label = ['O'] * len(it['sentence'])
            for n in ner[nr]:
                try:
                    label, s1 = self.mark_word_label(it['sentence'], label, n, "关系")
                    if s1 >= 0:
                        # s1 is the answer's character offset in the sentence.
                        answers_one = {
                            "answer_start": s1,
                            "text": n
                        }
                        answers.append(answers_one)
                except:
                    pass
            if len(answers) > 0:
                # Build one NER-relation QA entry for this entity.
                one_q['qas'].append({
                    "question": nr,
                    'id': k + "_ner_rel_" + nr,
                    'answers': answers
                })
        if len(one_q['qas']) > 0:
            one_kg = {
                'paragraphs': [one_q],
                'id': k + "_kg",
                'title': it['sentence'][:10]
            }
            data.append(one_kg)
    # data=data[0:1000]
    c = int(len(data) * 0.85)
    t = data[:c]
    d = data[c:]
    t_data = {
        "version": "v1.0",
        "data": t
    }
    d_data = {
        "version": "v1.0",
        "data": d
    }
    kgjson_t.save([t_data])
    kgjson_d.save([d_data])
    print("总共生成数据", len(data))
    # Deduplication intentionally disabled for this exporter.
    # self.json_remove_duplicates("../tdata/SQuAD/train.json")
    # self.json_remove_duplicates("../tdata/SQuAD/dev.json")
    print("已经将数据导出到 ../tdata/SQuAD")
# all=0 # # ner_list=ner_plus(text) # for item in ner_reljson.auto_load(): # The checkpoint albert-base-v2 is not fine-tuned for question answering. Please see the # examples/run_squad.py example to see how to fine-tune a model to a question answering task. from transformers import AlbertTokenizer, AlbertForQuestionAnswering, BertTokenizer, AlbertConfig import torch # tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') tokenizer = BertTokenizer.from_pretrained('tkitfiles/qa/model/') # config=AlbertConfig.from_pretrained('tkitfiles/qa/model/config.json') model = AlbertForQuestionAnswering.from_pretrained('tkitfiles/qa/model/') data = tkitFile.Json("../tdata/SQuAD/dev.json") i = 0 all = 0 f = 0 for item in data.auto_load(): for one in item['data']: all = all + 1 # print(one['paragraphs'][0]) # print(one['paragraphs'][0]['context']) question, text = one['paragraphs'][0]['qas'][0]['question'], one[ 'paragraphs'][0]['context'] # question, text = "利比里亚共和国", "利比里亚共和国(英语:') 通称赖比瑞亚,是位于西非,北接几内亚,西北界塞拉利昂,东邻象牙海岸,西南濒大西洋的总统制共和国家" input_dict = tokenizer.encode_plus(question, text, return_tensors='pt') start_scores, end_scores = model(**input_dict) # print(start_scores, end_scores)
import tkitFile

# Split the marked dataset into train/dev sets (80/20) under data/train/.
tkitFile.File().mkdir("data/train")
source = tkitFile.Json("data/marked.json")
train_sink = tkitFile.Json("data/train/train.json")
dev_sink = tkitFile.Json("data/train/dev.json")
records = []
for record in source.auto_load():
    print(record)
    records.append(record)
split_at = int(len(records) * 0.8)
train_sink.save(records[:split_at])
dev_sink.save(records[split_at:])
# from search import * from config import * import tkitText, tkitFile import time from tqdm import tqdm data_path = "/mnt/data/dev/github/scrapy/scrapy_baidu/scrapy_baidu/scrapy_baidu/data/all.json" tjson = tkitFile.Json(file_path=data_path) def save(): its = [] i = 0 for it in DB.kg_content.find({}): # print(it) i = i + 1 its.append(it) if i % 10000 == 0: tjson.save(its) print(i) its = [] tjson.save(its) # save() # # s=Search(name='Terry') # # # s.init_search() # # s.load() # tt= tkitText.Text() # with open(data_path, 'r') as f: # for i,line in tqdm(enumerate(f)):
# word.append(item['text'][lnum]) # if l=="P": # word.append(item['text'][lnum]) # print(''.join(word),words_list) # if item['label']==labels: # # print('准确') # i=i+1 # all=all+1 # if all%1==0: # print('准确率',i/all) # if all==limit: # break # print('统计',i,all) ner_reljson = tkitFile.Json("../tdata/kg/dev.json") i = 0 all = 0 r = 0 # ner_list=ner_plus(text) limit = 1000 q = {'check': True, "state": '2', 'label': int(2)} print('q', q) for item in DB.kg_mark.find(q): item['text'] = item['kg'][0] + '#' + item['kg'][1] + '#' + item['sentence'] words_list, labels = get_Relationship_test(item) all = all + 1 if len(words_list) > 0: if item['kg'][2] in words_list: i = i + 1
def save_to_json_kg_tmark(self):
    """Export the knowledge-extraction set in tmark (token, tag) pair format.

    Marks predicate+object (one[1]+one[2]) spans as "描述" in each sentence,
    prepends the subject entity tokens tagged "实体" plus a '[SEP]'/'X'
    separator, and writes 70/15/15 train/dev/test .txt splits plus labels.
    See https://github.com/napoler/tmark_Description and
    https://www.kaggle.com/terrychanorg/tmark-description
    """
    tkitFile.File().mkdir("../tdata/kg_tmark")
    kgjson_t = tkitFile.Json("../tdata/kg_tmark/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg_tmark/dev.json")
    # kgjson_l=tkitFile.Json("../tdata/labels.json")
    # self.tdb.load("kg_mark_unique_data")
    data = []
    all_data_id = []
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        try:
            # it=self.tdb.str_dict(v)
            label = ['O'] * len(it['sentence'])
            s = 0  # number of triples successfully marked
            for one in it['kgs']:
                label, s1 = self.mark_word_label(it['sentence'], label, one[1] + one[2], "描述")
                if s1 >= 0:
                    s = s + 1
                # label,s1=self.mark_word_label(it['sentence'],label,one[1],"关系")
                # NOTE(review): original indentation was lost; d and the
                # length-check/append are assumed per-triple — confirm.
                d = {'text': list(one[0]) + ['[SEP]'] + list(it['sentence']),
                     'label': ['实体'] * len(one[0]) + ['X'] + label}
                # Only keep aligned token/tag sequences.
                if len(d['text']) == len(d['label']):
                    one_kg_tmk = []
                    for t, tmk_l in zip(d['text'], d['label']):
                        one_kg_tmk.append((t, tmk_l))
                    if s > 0:
                        data.append(one_kg_tmk)
        except:
            # Best-effort: skip malformed records.
            continue
    # c=int(len(data)*0.85)
    # kgjson_t.save(data[:c])
    # kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # self.json_remove_duplicates("../tdata/kg_tmark/train.json")
    # self.json_remove_duplicates("../tdata/kg_tmark/dev.json")
    # 70/15/15 split into train/dev/test.
    c = int(len(data) * 0.7)
    b = int(len(data) * 0.85)
    print(data[:10])
    print(len(data))
    train_data = data[:c]
    dev_data = data[c:b]
    test_data = data[b:]
    self.save_data(train_data, file="../tdata/kg_tmark/train.txt")
    self.save_data(dev_data, file="../tdata/kg_tmark/dev.txt")
    self.save_data(test_data, file="../tdata/kg_tmark/test.txt")
    self.save_labels(data, "../tdata/kg_tmark/labels.txt")
    print("已经将数据导出到 ../tdata/kg_tmark")
def save_to_json_ner(self):
    """Export pure NER training data to ../tdata/onlyner.

    Entity mentions come from both the stored kg triples (subjects) and the
    tkitNlp NER tagger; each is marked "实体" in the per-character label
    sequence. Splits 85/15 into train.json/dev.json and deduplicates.
    """
    tkitFile.File().mkdir("../tdata/onlyner")
    kgjson_t = tkitFile.Json("../tdata/onlyner/train.json")
    kgjson_d = tkitFile.Json("../tdata/onlyner/dev.json")
    # kgjson_l=tkitFile.Json("../tdata/labels.json")
    self.tdb.load("kg_mark_unique_data")
    data = []
    all_data_id = []
    nlp_plus = tkitNlp.Plus()
    nlp_plus.load_tlp()
    flags = {}  # set of NER flag types seen (used as a dedup dict)
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        try:
            text = it['sentence']
            label = ["O"] * len(text)
            # Group relation words by subject entity: {entity: [relation, ...]}.
            ner = {}
            for one in it['kgs']:
                try:
                    if one[1] not in ner[one[0]]:
                        ner[one[0]].append(one[1])
                except:
                    ner[one[0]] = [one[1]]
            # Candidate entities: kg subjects plus tagger-found mentions.
            ner_list = [tmp for tmp in ner.keys()]
            fner = []
            for word, flag in nlp_plus.ner(text):
                flags[flag] = 0
                fner.append(word)
            ner_list = list(set(ner_list + fner))
            # Shortest first so longer mentions can overwrite shorter marks.
            ner_list = sorted(ner_list, key=lambda i: len(i), reverse=False)
            s = 0
            for nr in ner_list:
                label, s1 = nlp_plus.mark_word_label(text, label, nr, "实体")
                if s1 >= 0:
                    s = s + 1
            if s > 0:
                one = {'text': list(text), 'label': label}
                data.append(one)
        except:
            pass
    nlp_plus.release()
    c = int(len(data) * 0.85)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # Automatically remove duplicate records from the exported files.
    self.json_remove_duplicates("../tdata/onlyner/train.json")
    self.json_remove_duplicates("../tdata/onlyner/dev.json")
    print("已经将数据导出到 .../tdata/onlyner/")
def read_kg(self):
    """Yield each knowledge-triple record from ../data/knowledge_triple.json."""
    source = tkitFile.Json("../data/knowledge_triple.json")
    yield from source.auto_load()
def build_dataset_ner(train_file, type="all"):
    """Convert the Baidu training set into an entity/relation NER dataset.

    Entity candidates are the spo subjects plus tagger-found mentions; each
    is marked "实体" in the per-character label sequence. Output is split
    85/15 into ../tdata/onlyner/{train,dev}.json.

    Args:
        train_file: path to the source Baidu-format JSON file.
        type: "all" keeps everything; "mini" truncates to 200 records.
    """
    tjson = tkitFile.Json(file_path=train_file)
    # all_save=Tjson(file_path="data/train_all.json")
    # tjson_save=Tjson(file_path="data/ner_train.json")
    # dev_json_save=Tjson(file_path="data/ner_dev.json")
    tjson_save = tkitFile.Json(file_path="../tdata/onlyner/train.json")
    dev_json_save = tkitFile.Json(file_path="../tdata/onlyner/dev.json")
    data = []
    nlp_plus = tkitNlp.Plus()
    nlp_plus.load_tlp()
    flags = {}  # set of NER flag types seen (used as a dedup dict)
    for item in tqdm(tjson.load()):
        text = item['text']
        label = ["O"] * len(text)
        # Group predicates by subject: {subject: [predicate, ...]}.
        ner = {}
        for n in item['spo_list']:
            try:
                ner[n['subject']].append(n['predicate'])
            except:
                ner[n['subject']] = [n['predicate']]
        ner_list = [tmp for tmp in ner.keys()]
        # Add tagger-found mentions to the candidate set.
        fner = []
        for word, flag in nlp_plus.ner(text):
            flags[flag] = 0
            fner.append(word)
        ner_list = list(set(ner_list + fner))
        # Shortest first so longer mentions can overwrite shorter marks.
        ner_list = sorted(ner_list, key=lambda i: len(i), reverse=False)
        s = 0
        for nr in ner_list:
            label, s1 = nlp_plus.mark_word_label(text, label, nr, "实体")
            if s1 >= 0:
                s = s + 1
        if s > 0:
            one = {'text': list(text), 'label': label}
            data.append(one)
    nlp_plus.release()
    if type == "all":
        pass
    elif type == "mini":
        # Small sample for quick experiments.
        data = data[:200]
    # all_save.save(data)
    print("总共数据", len(data))
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
def data_pre_train_mongo_next_sentence():
    """Build a next-sentence-prediction corpus from data.json.

    For each article, pairs (title or trailing-200-char context, next
    sentence) are labeled 1, and random mismatched sentences labeled 0.
    Output is split 80/20 into data/train.json and data/dev.json.

    >>>data_pre_train(from=0, limit=10)
    """
    i = 0
    n = 0  # articles processed (used only for the %10000 checkpoint)
    data = []
    # NOTE(review): parsing CLI args inside a library function couples it to
    # the process argv — consider passing limit as a parameter.
    parser = argparse.ArgumentParser()
    parser.add_argument('--limit',
                        default=50000,
                        type=int,
                        required=False,
                        help='长度限制')
    args = parser.parse_args()
    tt = tkitText.Text()
    data_json = tkitFile.Json(file_path='data.json')
    for it in data_json.auto_load():
        sents = tt.sentence_segmentation_v1(it['content'])
        pre_sents = []  # running context of previously-seen sentences
        for i, sent in enumerate(sents):
            if i == 0:
                # First sentence pairs with the article title.
                one = {'sentence': it['title'], 'sentence_b': sent, 'label': 1}
                data.append(one)
                # Negative sample: random sentence that is not the true next one.
                rand_sent = choice(sents)
                if rand_sent != sent:
                    one = {
                        'sentence': it['title'],
                        'sentence_b': rand_sent,
                        'label': 0
                    }
                    data.append(one)
                pre_sents.append(it['title'])
                pre_sents.append(sent)
            else:
                # Later sentences pair with the last 200 chars of context.
                pre_text = "".join(pre_sents)
                one = {
                    'sentence': pre_text[-200:],
                    'sentence_b': sent,
                    'label': 1
                }
                data.append(one)
                rand_sent = choice(sents)
                if rand_sent != sent:
                    one = {
                        'sentence': pre_text[-200:],
                        'sentence_b': rand_sent,
                        'label': 0
                    }
                    data.append(one)
                pre_sents.append(sent)
        if len(data) > args.limit:
            break
        if n % 10000 == 0:
            # Checkpoint hook (currently disabled).
            pass
        n = n + 1
    cut = int(len(data) * 0.8)
    save_data(data[:cut], path='data/', name="train.json")
    save_data(data[cut:], path='data/', name="dev.json")
    data = []
def save_to_json_ner_rel(self):
    """Export entity→relation training data to ../tdata/ner_rel.

    For each sentence, relation words are grouped per subject entity and
    marked "关系" in a per-character label sequence; the model input is
    entity + '#' + sentence with labels ['K']*len(entity) + ['X'] + marks.
    Splits 85/15 into train.json/dev.json and deduplicates both.
    """
    # BUG FIX: output files live under ../tdata/ner_rel, but the old code
    # created ../tdata/ner — create the directory that is actually written to.
    tkitFile.File().mkdir("../tdata/ner_rel")
    kgjson_t = tkitFile.Json("../tdata/ner_rel/train.json")
    kgjson_d = tkitFile.Json("../tdata/ner_rel/dev.json")
    data = []
    for it in DB.kg_mark_unique_data.find():
        k = it['_id']
        try:
            # Group relation words by subject entity: {entity: [relation, ...]}.
            ner = {}
            for one in it['kgs']:
                try:
                    if one[1] not in ner[one[0]]:
                        ner[one[0]].append(one[1])
                except KeyError:  # first relation seen for this entity
                    ner[one[0]] = [one[1]]
            for nr in ner:
                s = 0
                label = ['O'] * len(it['sentence'])
                for n in ner[nr]:
                    label, s1 = self.mark_word_label(it['sentence'], label, n, "关系")
                    if s1 >= 0:
                        s = s + 1
                if s > 0:
                    one_ner = {
                        'text': list(nr + '#' + it['sentence']),
                        'label': ['K'] * len(nr) + ['X'] + label
                    }
                    data.append(one_ner)
        except Exception:
            # Best-effort export: skip malformed records.
            continue
    c = int(len(data) * 0.85)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    print("总共生成数据", len(data))
    # Automatically remove duplicate records from the exported files.
    self.json_remove_duplicates("../tdata/ner_rel/train.json")
    self.json_remove_duplicates("../tdata/ner_rel/dev.json")
    print("已经将数据导出到 ../tdata/ner_rel")
        # Tail of a BIES-tag word-assembly loop (function head is outside this
        # chunk): I- continues the current word, E- closes it, S- is a
        # single-token word.
        elif flag.startswith("I-"):
            one.append(word)
        elif flag.startswith("E-"):
            one.append(word)
            words_list.append("".join(one))
        elif flag.startswith("S-"):
            words_list.append(word)
    # return words_list,words, postags,netags
    return words_list


# Script: compare marker-derived entities with model predictions on dev.json.
import tkitFile

tfile = tkitFile.File()
tj = tkitFile.Json(file_path='data/ner/dev.json')
i = 0
n = 0
f = 0
good = 0  # records where prediction matched the gold entities exactly
all = 0   # records evaluated
for item in tj.auto_load():
    print("###" * 20)
    # Gold entities extracted from the marked labels.
    o_ners = Ner_Marker.get_mark_data(item).get("实体")
    text = ''.join(item['text'])
    # result=TNer.pre([text])
    result = Ner_Marker.pre_ner(text)
    all = all + 1
    if o_ners == result:
        good = good + 1
        pass
def save_to_json(self):
    """Export kg plausibility-check training data (sentence-pair variant).

    Reads checked (state '2') records from the kg_mark collection; each
    positive example pairs the sentence with its comma-joined kg triple.
    Negative examples are synthesized by replacing the triple's object
    (index 2) with a random text clip. Splits 85/15 into
    ../tdata/kg_check/{train,dev}.json and deduplicates.
    """
    kgjson_t = tkitFile.Json("../tdata/kg_check/train.json")
    kgjson_d = tkitFile.Json("../tdata/kg_check/dev.json")
    # kgjson_l=tkitFile.Json("../tdata/kg_check/labels.json")
    # self.tdb.load("kg_mark")
    data = []
    i = 0
    n = 0  # records scanned
    # self.tdb.load("kg_mark")
    tt = tkitText.Text()
    i = -1
    q = {'check': True, 'state': '2'}
    for it in DB.kg_mark.find(q):
        # for k,v in self.tdb.get_all():
        k = it["_id"]
        n = n + 1
        try:
            one = {}
            # one['sentence']=" [kg] "+",".join(it['kg'])+" [/kg] "+it['sentence']
            one['sentence'] = it['sentence']
            one['sentence_b'] = ",".join(it['kg'])
            # Stored labels are 1-based; shift to 0-based.
            one['label'] = it['label'] - 1
            if int(one['label']) in [0, 1] and len(it['kg']) == 3 and it.get('check') != None and it.get('state') == '2':
                data.append(one)
                # Synthesize negatives: corrupt only the object (i == 2).
                for i, sentence in enumerate(it['kg']):
                    if i != 2:
                        continue
                    new = self.random_text_clip(sentence)
                    if new not in it['kg']:
                        # NOTE(review): it.copy() is shallow, so mutating
                        # new_one['kg'][i] also mutates it['kg'] — confirm
                        # whether that is intended.
                        new_one = it.copy()
                        new_one['kg'][i] = new
                        one = {}
                        # one['sentence']=" [kg] "+",".join(new_one['kg'])+" [/kg] "+new_one['sentence']
                        one['sentence'] = it['sentence']
                        one['sentence_b'] = ",".join(new_one['kg'])
                        one['label'] = 0
                        data.append(one)
                # (An earlier variant corrupted all three positions; kept
                # commented out in the original and removed here.)
            else:
                print(it)
                pass
        except:
            # Best-effort: skip malformed records.
            continue
    c = int(len(data) * 0.85)
    print("总数据", len(data), i, n)
    kgjson_t.save(data[:c])
    kgjson_d.save(data[c:])
    # Automatically remove duplicate records from the exported files.
    self.json_remove_duplicates("../tdata/kg_check/train.json")
    self.json_remove_duplicates("../tdata/kg_check/dev.json")
    print("已经将数据导出到 ../tdata/kg_check")