def build_dataset_gpt2(train_file, type="all"):
    """
    Convert the Baidu training set into GPT-2 text lines.

    train_file: path to the source file
    type: "all" or "mini" (currently unused here)
    """
    tjson = Tjson(file_path=train_file)
    with open('data/gpt2kg.txt', 'a') as f:
        for item in tqdm(tjson.load()):
            text = item['text']
            # Serialize every SPO triple of the sentence between [KG] ... [/KG] markers.
            kg = " [KGS] "
            for n in item['spo_list']:
                kg = kg + ' [KG] ' + n['subject'] + "," + n['predicate'] + "," + n['object'] + " [/KG] "
            data = text + kg + " [KGE] "
            print("***" * 10)
            print(data)
            f.write(data + '\n\n')
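
# A minimal sketch of the line format written by build_dataset_gpt2 above.
# The sentence and triple are made up for illustration; they do not come from
# the Baidu dataset, and this helper is not used elsewhere in this file.
def example_gpt2_line():
    text = "《三体》是刘慈欣创作的长篇科幻小说"
    spo_list = [{"subject": "三体", "predicate": "作者", "object": "刘慈欣"}]
    kg = " [KGS] "
    for n in spo_list:
        kg += ' [KG] ' + n['subject'] + "," + n['predicate'] + "," + n['object'] + " [/KG] "
    # -> "《三体》是刘慈欣创作的长篇科幻小说 [KGS]  [KG] 三体,作者,刘慈欣 [/KG]  [KGE] "
    return text + kg + " [KGE] "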
def saoke():
    """Preview the first few records of the SAOKE dataset."""
    saoke_data = Tjson(file_path="data/SAOKE_DATA.json")
    i = 0
    for line in saoke_data.load():
        print("###" * 20)
        print(line)
        print(line['natural'])
        for logic in line['logic']:
            print(logic)
            print(logic['predicate'])
            print(logic['qualifier'])
            for obj in logic['object']:
                print(obj)
            print(logic['place'])
            print(logic['time'])
            print(logic['subject'])
        i = i + 1
        if i > 10:
            # Only inspect the first few records.
            return
def build_ner(input_file, path='./', tags=None, type='all'):
    """
    Build NER train/dev files from an already tagged file.

    Labels ending in "属性" are rewritten to "关系"; labels ending in
    "关系", "实体" or "O" are kept; anything else becomes "O".
    Returns the collected tag vocabulary.
    """
    d = _read_data(input_file)
    tjson_save = Tjson(file_path=path + "train.json")
    dev_json_save = Tjson(file_path=path + "dev.json")
    data = []
    if tags is None:
        tags = {"<pad>": 1, "O": 1, "<start>": 1, "<eos>": 1}
    for item in tqdm(d):
        # item[0] holds the label sequence, item[1] the character sequence.
        for label in item[0]:
            tags[label] = 1
        if len(list(item[0])) == len(list(item[1])) and "M-描述" not in item[0]:
            lb = []
            for l in item[0]:
                if l.endswith("关系") or l.endswith("实体") or l.endswith("O"):
                    lb.append(l)
                elif l.endswith("属性"):
                    lb.append(l.replace("属性", '关系'))
                else:
                    lb.append("O")
            data.append({"text": item[1], "label": lb})
    if type == "mini":
        data = data[:200]
    # 85% / 15% train / dev split.
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
    return tags
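
# Illustration of the label remapping in build_ner (invented labels): tags ending
# in "属性" are rewritten to "关系", known suffixes are kept, anything else becomes "O".
def example_build_ner_mapping():
    original = ["B-实体", "E-实体", "B-属性", "E-属性", "B-时间"]
    remapped = ["B-实体", "E-实体", "B-关系", "E-关系", "O"]
    return original, remapped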
def load_data():
    """Preview SAOKE records and where each logic element occurs in the sentence."""
    file = "data/SAOKE_DATA.json"
    for line in Tjson(file_path=file).load():
        print("##" * 30)
        print(line['natural'])
        for it in line['logic']:
            print(it)
            print(find_srt(line['natural'], it['subject']))
            print(find_srt(line['natural'], it['predicate']))
            print(find_srt(line['natural'], it['object'][0]))
            print(it['subject'], it['predicate'], it['object'])
def ner_rebulid():
    """
    Convert the original BIO-tagged data (ORG/PER/LOC) into the B/M/E/S
    span labels used by this project ("实体" for ORG/PER, "地点" for LOC).
    """
    new_train = Tjson(file_path="data/train.json")
    new_dev = Tjson(file_path="data/dev.json")
    files = ["data/o/train.json", "data/o/dev.json"]
    data = []

    def flush(span, new_label):
        # Rewrite the accumulated span indices with B/M/E labels, or S for a single token.
        if len(span['num']) >= 2:
            for key, i_n in enumerate(span['num']):
                if key == 0:
                    new_label[i_n] = "B-" + span['type']
                elif key == len(span['num']) - 1:
                    new_label[i_n] = "E-" + span['type']
                else:
                    new_label[i_n] = "M-" + span['type']
        elif len(span['num']) == 1:
            new_label[span['num'][0]] = "S-" + span['type']

    for file in files:
        for line in Tjson(file_path=file).load():
            new_label = {}
            a = {'type': "实体", 'num': []}
            for i, label in enumerate(line['label']):
                new_label[i] = label
                if label == "B-ORG":
                    flush(a, new_label)
                    a = {'type': "实体", 'num': [i]}
                elif label == "I-ORG":
                    a['num'].append(i)
                elif label == "B-PER":
                    flush(a, new_label)
                    a = {'type': "实体", 'num': [i]}
                elif label == "I-PER":
                    a['num'].append(i)
                elif label == "B-LOC":
                    flush(a, new_label)
                    a = {'type': "地点", 'num': [i]}
                elif label == "I-LOC":
                    a['num'].append(i)
                else:
                    # "O" (or any other tag) closes the current span.
                    flush(a, new_label)
                    a['num'] = []
            # Flush a span that runs to the end of the sentence.
            flush(a, new_label)
            labels = []
            tags = {}
            for l in new_label:
                labels.append(new_label[l])
                tags[new_label[l]] = 0
            # Keep only sentences that contain at least one entity span.
            if len(tags) > 1:
                data.append({"text": line["text"], "label": labels})
    # 85% / 15% train / dev split.
    f = int(len(data) * 0.85)
    new_train.save(data[:f])
    new_dev.save(data[f:])
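
# Illustration of the relabeling done by ner_rebulid (invented labels, assuming
# the trailing-span flush above): a contiguous BIO span becomes B/M/E-实体,
# a single-token B-LOC becomes S-地点, and "O" stays "O".
def example_ner_rebuild_mapping():
    original = ["B-PER", "I-PER", "I-PER", "O", "B-LOC"]
    expected = ["B-实体", "M-实体", "E-实体", "O", "S-地点"]
    return original, expected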
def build_dataset_ner(train_file, type="all"):
    """
    Convert the Baidu training set into a tagged dataset for entity tagging
    and relation-word extraction.

    train_file: path to the source file
    type: "all" or "mini"

    Construction idea:
    - merge the multiple descriptions of one entity into a single sample
    - an NER model extracts the entities of the sentence
    - text:  entity + '#' + sentence
    - label: ['K'] * len(entity) + ['X'] + normal tags
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/ner_train.json")
    dev_json_save = Tjson(file_path="data/ner_dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # Collect the relation words (predicates) attached to each subject.
        ner = {}
        for n in item['spo_list']:
            ner.setdefault(n['subject'], []).append(n['predicate'])
        for nr in ner:
            # Fresh label sequence per subject so marks do not leak between samples.
            label = ["O"] * len(text)
            s = 0
            for n in ner[nr]:
                label, s1 = mark_word_label(text, label, n, "关系")
                if s1 >= 0:
                    s = s + 1
            if s > 0:
                data.append({
                    'text': list(nr + '#' + text),
                    'label': ['K'] * len(nr) + ['X'] + label
                })
    if type == "mini":
        data = data[:200]
    print("Total samples:", len(data))
    # 85% / 15% train / dev split.
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
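
# Sketch of a single sample produced by build_dataset_ner, with made-up values.
# mark_word_label is assumed here to mark the characters of the relation word
# with span tags ending in "关系" (e.g. B-关系/E-关系); the real tag shape may differ.
def example_ner_sample():
    subject = "三体"
    text = "三体的作者是刘慈欣"
    label = ["O", "O", "O", "B-关系", "E-关系", "O", "O", "O", "O"]  # marks "作者"
    return {
        "text": list(subject + "#" + text),
        "label": ["K"] * len(subject) + ["X"] + label,
    }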
def build_dataset_kg_check(train_file, type="all"):
    """
    Convert the Baidu training set into samples for judging whether an
    extracted triple is plausible: the original triple is a positive sample,
    a character-shuffled triple is a negative one.
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/kg_check/train.json")
    dev_json_save = Tjson(file_path="data/kg_check/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        for n in item['spo_list']:
            kg_one = [n['subject'], n['predicate'], n['object']]
            # Positive sample: the original triple in front of the sentence.
            kg = ' [KG] ' + ",".join(kg_one) + " [/KG] " + item['text']
            data.append({'sentence': kg, 'label': 1})
            # Negative sample: shuffle the characters of the triple.
            kg_one_list = list(",".join(kg_one))
            shuffle(kg_one_list)
            if kg_one_list != list(",".join(kg_one)):
                kg = ' [KG] ' + "".join(kg_one_list) + " [/KG] " + item['text']
                data.append({'sentence': kg, 'label': 0})
    if type == "mini":
        data = data[:200]
    # 85% / 15% train / dev split.
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
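
# Illustration of the positive / negative pair built by build_dataset_kg_check.
# Values are invented; the negative sentence shows one possible character shuffle
# of the triple (the actual order produced by shuffle() is random).
def example_kg_check_samples():
    text = "三体的作者是刘慈欣"
    positive = {"sentence": " [KG] 三体,作者,刘慈欣 [/KG] " + text, "label": 1}
    negative = {"sentence": " [KG] 刘,作体慈者三,欣 [/KG] " + text, "label": 0}
    return positive, negative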
def build_dataset(train_file, type="all"):
    """
    Convert the Baidu training set into a predicate-span tagging dataset:
    every character of a predicate that literally occurs in the sentence
    is tagged 'M_P', all other characters 'O'.

    train_file: path to the source file
    type: "all" or "mini"
    """
    tjson = Tjson(file_path=train_file)
    tjson_save = Tjson(file_path="data/train.json")
    dev_json_save = Tjson(file_path="data/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # Group subject/object pairs by predicate.
        predicate = {}
        for n in item['spo_list']:
            predicate[n['predicate']] = []
        for n in item['spo_list']:
            predicate[n['predicate']].append({
                "subject": n['subject'],
                "object": n['object'],
            })
        label = ["O"] * len(text)
        for p in predicate:
            start_p = text.find(p)
            end_p = start_p + len(p)
            if start_p >= 0:
                for n in range(start_p, end_p):
                    label[n] = 'M_P'
        if len(list(text)) == len(list(label)):
            data.append({"text": list(text), "label": label})
    if type == "mini":
        data = data[:200]
    # 85% / 15% train / dev split.
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
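
# Sketch of the predicate-span tagging produced by build_dataset, with an invented
# sentence: every character of a predicate found in the text is tagged 'M_P'.
def example_predicate_sample():
    text = "三体的作者是刘慈欣"
    label = ["O"] * len(text)
    for p in ["作者"]:  # predicates collected from spo_list
        start = text.find(p)
        if start >= 0:
            for n in range(start, start + len(p)):
                label[n] = "M_P"
    return {"text": list(text), "label": label}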
def build_dataset_kg(train_file, type="all"):
    """
    Convert the Baidu training set into a tagged dataset for object extraction.

    train_file: path to the source file
    type: "all" or "mini"

    Construction idea:
    - merge the multiple descriptions of one (subject, predicate) pair into one sample
    - an NER model extracts the entities of the sentence
    - text:  entity + '#' + predicate + '#' + sentence
    - label: ['K'] * len(entity) + ['X'] + ['P'] * len(predicate) + ['X'] + object-span tags ("描述")
    """
    tjson = Tjson(file_path=train_file)
    all_save = Tjson(file_path="data/train_all.json")
    tjson_save = Tjson(file_path="data/train.json")
    dev_json_save = Tjson(file_path="data/dev.json")
    data = []
    for item in tqdm(tjson.load()):
        text = item['text']
        # kgs[subject][predicate] -> {"objects": [...], "label": [...]}
        kgs = {}
        for n in item['spo_list']:
            if kgs.get(n['subject']) is None:
                kgs[n['subject']] = {}
            if kgs[n['subject']].get(n['predicate']) is None:
                label = ["O"] * len(text)
                label, s = mark_word_label(text, label, n['object'], "描述")
                kgs[n['subject']][n['predicate']] = {
                    "objects": [n['object']],
                    'label': label
                }
            else:
                label = kgs[n['subject']][n['predicate']]['label']
                label, s = mark_word_label(text, label, n['object'], "描述")
                # Write the updated label sequence back so the extra object mark is kept.
                kgs[n['subject']][n['predicate']]['label'] = label
                kgs[n['subject']][n['predicate']]['objects'].append(n['object'])
        for ner in kgs.keys():
            for p in kgs[ner]:
                one = {
                    "text": list(ner + '#' + p + '#' + text),
                    'label': len(ner) * ['K'] + ['X'] + len(p) * ['P'] + ['X'] + kgs[ner][p]['label']
                }
                if len(one['text']) == len(one['label']):
                    data.append(one)
    if type == "mini":
        data = data[:200]
    all_save.save(data)
    # 85% / 15% train / dev split.
    f = int(len(data) * 0.85)
    tjson_save.save(data=data[:f])
    dev_json_save.save(data=data[f:])
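
# Sketch of a sample produced by build_dataset_kg, with made-up values.
# mark_word_label is assumed here to tag the object span with labels ending in
# "描述" (e.g. B-描述/M-描述/E-描述); the real tag shape may differ.
def example_kg_sample():
    subject, predicate = "三体", "作者"
    text = "三体的作者是刘慈欣"
    label = ["O", "O", "O", "O", "O", "O", "B-描述", "M-描述", "E-描述"]  # marks "刘慈欣"
    return {
        "text": list(subject + "#" + predicate + "#" + text),
        "label": len(subject) * ["K"] + ["X"] + len(predicate) * ["P"] + ["X"] + label,
    }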
def test():
    """Run prediction on the dev set."""
    config = Config()
    print('Current settings:\n', config)
    if config.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.label_file)
    tagset_size = len(label_dic)
    # Example input sentence (characters joined by spaces):
    # content = " ".join(list("威尔士柯基犬(welsh corgi pembroke)是一种小型犬,它们的胆子很大,也相当机警,能高度警惕地守护家园,是最受欢迎的小型护卫犬之一。"))
    # Build the model once, outside the per-sentence loop.
    model = BERT_LSTM_CRF(config.bert_path, tagset_size, config.bert_embedding,
                          config.rnn_hidden, config.rnn_layer,
                          dropout_ratio=config.dropout_ratio,
                          dropout1=config.dropout1, use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        # model = load_model(model, name=config.load_path)
        # model = load_model(model, name='result/pytorch_model.bin')
    if config.use_cuda:
        model.cuda()
    dev_json_save = Tjson(file_path="data/dev.json")
    for item in dev_json_save.load():
        print("#########" * 5)
        content = " ".join(item['text'])
        print(content)
        print(item['label'])
        input_data = build_input(content=[content], max_length=config.max_length, vocab=vocab)
        input_ids = torch.LongTensor([temp.input_id for temp in input_data])
        input_masks = torch.LongTensor([temp.input_mask for temp in input_data])
        input_dataset = TensorDataset(input_ids, input_masks)
        input_loader = DataLoader(input_dataset, shuffle=True, batch_size=config.batch_size)
        for batch in input_loader:
            inputs, masks = batch
            inputs, masks = Variable(inputs), Variable(masks)
            if config.use_cuda:
                inputs, masks = inputs.cuda(), masks.cuda()
            feats = model(inputs)
            # Viterbi decoding over the CRF layer.
            path_score, best_path = model.crf(feats, masks.bool())
            print("feats", path_score, best_path)
            for item_path in best_path.numpy():
                words = []
                for i, tag_id in enumerate(item_path.tolist()):
                    word_id = inputs.numpy().tolist()[0][i]
                    words.append((list(vocab)[word_id], list(label_dic)[tag_id]))
                print('words', words)