def build_dataset_kg_check(train_file, type="all"):
    """Convert the Baidu training set into a binary-classification set that
    judges whether an extracted knowledge triple is plausible.

    For every (subject, predicate, object) triple a positive sample
    ``[KG] subject,predicate,object [/KG] text`` (label 1) is produced, and a
    negative sample (label 0) made by character-shuffling the triple string —
    skipped when the shuffle happens to be a no-op.

    train_file: path to the source json-lines file.
    type: "all" keeps every sample; "mini" keeps only the first 200.

    Writes an 85/15 split to data/kg_check/train.json and data/kg_check/dev.json.
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/kg_check/train.json")
    dev_json_save = Tjson(file_path="data/kg_check/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        for n in item['spo_list']:
            kg_one = [n['subject'], n['predicate'], n['object']]
            # positive sample: the triple as-is, wrapped in [KG] markers
            kg = ' [KG] ' + ",".join(kg_one) + " [/KG] " + item['text']
            data.append({'sentence': kg, 'label': 1})
            # negative sample: shuffle the characters of the joined triple
            kg_one_list = list(",".join(kg_one))
            shuffle(kg_one_list)
            if kg_one_list != list(",".join(kg_one)):
                kg = ' [KG] ' + "".join(kg_one_list) + " [/KG] " + item['text']
                data.append({'sentence': kg, 'label': 0})
    if type == "mini":
        data = data[:200]
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
def build_dataset_ner(train_file, type="all"):
    """Convert the Baidu training set into an NER-style tagging dataset for
    entity + relation-word extraction.

    Sample layout: text = list(subject + '#' + sentence) and
    label = ['K'] * len(subject) + ['X'] + per-character tags of the sentence,
    where predicate occurrences are marked via mark_word_label(..., "关系").

    train_file: path to the source json-lines file.
    type: "all" keeps every sample; "mini" keeps only the first 200.

    Writes an 85/15 split to data/ner_train.json and data/ner_dev.json.
    """
    tjson = Tjson(file_path=train_file)
    all_save = Tjson(file_path="data/train_all.json")  # kept for parity; never saved here
    tjson_save = Tjson(file_path="data/ner_train.json")
    dev_json_save = Tjson(file_path="data/ner_dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        label = ["O"] * len(text)
        # group every predicate under its subject entity
        ner = {}
        for n in item['spo_list']:
            ner.setdefault(n['subject'], []).append(n['predicate'])
        # NOTE(review): `label` is shared across subjects of the same item, so
        # relation marks accumulate between entities — confirm this is intended.
        for nr in ner:
            s = 0
            for n in ner[nr]:
                label, s1 = mark_word_label(text, label, n, "关系")
                if s1 >= 0:
                    s = s + 1
            if s > 0:
                one = {
                    'text': list(nr + '#' + text),
                    'label': ['K'] * len(nr) + ['X'] + label
                }
                data.append(one)
    if type == "mini":
        data = data[:200]
    print("总共数据", len(data))
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
def build_ner(input_file, path='./', tags=None, type='all'):
    """Build train/dev NER files from a BMES-tagged input file.

    input_file: raw tagged file, read via _read_data; each item is a pair
        where item[0] holds the labels and item[1] the characters.
    path: output directory prefix for train.json / dev.json.
    tags: optional tag-count dict to extend; a fresh one is created when None.
    type: "all" keeps every sample; "mini" keeps only the first 200.

    Returns the tags dict, with every label seen in the input mapped to 1.
    """
    d = _read_data(input_file)
    tjson_save = Tjson(file_path=path + "train.json")
    dev_json_save = Tjson(file_path=path + "dev.json")
    data = []
    if tags is None:  # identity check instead of `== None`
        tags = {"<pad>": 1, "O": 1, "<start>": 1, "<eos>": 1}
    for item in tqdm(d):
        for label in item[0]:
            tags[label] = 1
        # keep only length-aligned samples that contain no "M-描述" tag
        if len(list(item[0])) == len(list(item[1])) and "M-描述" not in item[0]:
            lb = []
            for l in item[0]:
                if l.endswith("关系") or l.endswith("实体") or l.endswith("O"):
                    lb.append(l)
                elif l.endswith("属性"):
                    # unify attribute tags into relation tags
                    lb.append(l.replace("属性", '关系'))
                else:
                    lb.append("O")
            data.append({"text": item[1], "label": lb})
    if type == "mini":
        data = data[:200]
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
    return tags
def ner_rebulid():
    """Convert legacy BIO-tagged ORG/PER/LOC data into BMES-style tags.

    Reads data/o/train.json and data/o/dev.json, maps ORG/PER spans to the
    "实体" (entity) type and LOC spans to "地点" (location), rewrites each
    span as B-/M-/E- (or S- for single characters), keeps only lines that
    contain at least one non-O tag, and saves an 85/15 split to
    data/train.json and data/dev.json.
    """
    new_train = Tjson(file_path="data/train.json")
    new_dev = Tjson(file_path="data/dev.json")
    files = ["data/o/train.json", "data/o/dev.json"]

    def _flush_span(new_label, span):
        # Rewrite the accumulated span indices as B-/M-/E- (>=2 chars)
        # or S- (exactly 1 char) tags of the span's type.
        nums = span['num']
        if len(nums) >= 2:
            for key, idx in enumerate(nums):
                if key == 0:
                    new_label[idx] = "B-" + span['type']
                elif key == len(nums) - 1:
                    new_label[idx] = "E-" + span['type']
                else:
                    new_label[idx] = "M-" + span['type']
        elif len(nums) == 1:
            new_label[nums[0]] = "S-" + span['type']

    data = []
    for file in files:
        for line in Tjson(file_path=file).load():
            new_label = {}
            for i, label in enumerate(line['label']):
                new_label[i] = label
                if i == 0:
                    a = {'type': "实体", 'num': []}
                if label == "B-ORG":
                    _flush_span(new_label, a)  # close the previous span first
                    a = {'type': "实体", 'num': [i]}
                elif label == "I-ORG":
                    a['num'].append(i)
                elif label == "B-PER":
                    _flush_span(new_label, a)
                    a = {'type': "实体", 'num': [i]}
                elif label == "I-PER":
                    a['num'].append(i)
                elif label == "B-LOC":
                    _flush_span(new_label, a)
                    a = {'type': "地点", 'num': [i]}
                elif label == "I-LOC":
                    a['num'].append(i)
                else:
                    # Any other tag (e.g. "O") also closes the current span.
                    # NOTE(review): the span is deliberately NOT reset here
                    # (matches the original), and a span that runs to the very
                    # end of the sequence is never flushed — confirm intended.
                    _flush_span(new_label, a)
            labels = []
            tags = {}
            for l in new_label:  # dicts preserve insertion order (0..n-1)
                labels.append(new_label[l])
                tags[new_label[l]] = 0
            # keep the line only if it has more than one distinct tag
            if len(tags) > 1:
                data.append({"text": line["text"], "label": labels})
    f = int(len(data) * 0.85)
    new_train.save(data[:f])
    new_dev.save(data[f:])
def build_dataset(train_file, type="all"):
    """Convert the Baidu training set into a per-character tagging dataset
    that marks predicate mentions in the text with 'M_P'.

    train_file: path to the source json-lines file.
    type: "all" keeps every sample; "mini" keeps only the first 200.

    Writes an 85/15 split to data/train.json and data/dev.json.
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/train.json")
    dev_json_save = Tjson(file_path="data/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # collect subject/object pairs per predicate
        # NOTE(review): the pairs are never read afterwards; only the
        # predicate keys matter for the labelling below.
        predicate = {}
        for n in item['spo_list']:
            predicate[n['predicate']] = []
        for n in item['spo_list']:
            predicate[n['predicate']].append({
                "subject": n['subject'],
                "object": n['object'],
            })
        label = ["O"] * len(text)
        for p in predicate:
            # mark the first occurrence of the predicate string in the text
            start_p = text.find(p)
            if start_p >= 0:
                for n in range(start_p, start_p + len(p)):
                    label[n] = 'M_P'
        if len(list(text)) == len(list(label)):
            data.append({"text": list(text), "label": label})
    if type == "mini":
        data = data[:200]
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
def build_dataset_kg(train_file, type="all"):
    """Convert the Baidu training set into (subject, predicate, text)
    tagging samples.

    Sample layout: text = list(subject + '#' + predicate + '#' + sentence),
    label = ['K'] * len(subject) + ['X'] + ['P'] * len(predicate) + ['X'] +
    per-character object tags produced by mark_word_label(..., "描述").

    train_file: path to the source json-lines file.
    type: "all" keeps every sample; "mini" keeps only the first 200.

    Writes all samples to data/train_all.json, then an 85/15 split to
    data/train.json and data/dev.json.
    """
    tjson = Tjson(file_path=train_file)
    all_save = Tjson(file_path="data/train_all.json")
    tjson_save = Tjson(file_path="data/train.json")
    dev_json_save = Tjson(file_path="data/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # kgs[subject][predicate] -> {"objects": [...], "label": [...]}
        kgs = {}
        for n in item['spo_list']:
            subj, pred, obj = n['subject'], n['predicate'], n['object']
            if subj not in kgs:
                kgs[subj] = {}
            if pred not in kgs[subj]:
                # first object for this (subject, predicate): fresh label row
                label = ["O"] * len(text)
                label, s = mark_word_label(text, label, obj, "描述")
                kgs[subj][pred] = {"objects": [obj], 'label': label}
            else:
                # further objects: mark them on the existing label row
                label = kgs[subj][pred]['label']
                label, s = mark_word_label(text, label, obj, "描述")
                # NOTE(review): the returned label is not stored back (matches
                # the original); this only works if mark_word_label mutates
                # its label argument in place — confirm.
                kgs[subj][pred]['objects'].append(obj)
        for ner in kgs:
            for p in kgs[ner]:
                one = {
                    "text": list(ner + '#' + p + '#' + text),
                    'label': len(ner) * ['K'] + ['X'] + len(p) * ['P'] +
                             ['X'] + kgs[ner][p]['label']
                }
                # drop samples where markers shifted the lengths out of sync
                if len(one['text']) == len(one['label']):
                    data.append(one)
    if type == "mini":
        data = data[:200]
    all_save.save(data)
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])