def generate():
    """Build and save entity and context embeddings from a pretrained TransE model.

    Loads the TransE checkpoint for the KG under ``./data/kg/``, zeroes out
    row 0 of the entity table (reserved/padding entity), and writes two arrays:

    * ``./data/kg/entity.npy``  -- the raw TransE entity embeddings
    * ``./data/kg/context.npy`` -- for each entity listed in the sub-KG, the
      mean embedding of its graph neighbourhood (itself plus every entity it
      co-occurs with in a triple, as head or tail)
    """
    # The dataloader is only needed to obtain entity/relation counts so the
    # TransE module is constructed with shapes matching the checkpoint.
    train_dataloader = TrainDataLoader(
        in_path='./data/kg/',
        nbatches=100,
        threads=8,
        sampling_mode="normal",
        bern_flag=1,
        filter_flag=1,
        neg_ent=25,
        neg_rel=0)

    transe = TransE(
        ent_tot=train_dataloader.get_ent_tot(),
        rel_tot=train_dataloader.get_rel_tot(),
        dim=Config.entity_embedding_dim,
        p_norm=1,
        norm_flag=True)
    transe.load_checkpoint('./data/kg/transe.ckpt')

    entity_embedding = transe.get_parameters()['ent_embeddings.weight']
    # Row 0 is treated as the padding entity; force it to zero before saving.
    entity_embedding[0] = 0
    np.save('./data/kg/entity.npy', entity_embedding)

    # BUGFIX: was np.empty_like, which left uninitialized garbage in the rows
    # of entities absent from the sub-KG; those rows were then written to
    # context.npy. zeros_like gives them a well-defined zero vector (and also
    # covers row 0, which the original set explicitly).
    context_embedding = np.zeros_like(entity_embedding)

    # Triples restricted to (head, tail) id columns of the sub-KG.
    relation = pd.read_table('./data/sub_kg/triple2id.txt', header=None)[[0, 1]]
    entity = pd.read_table('./data/sub_kg/entity2name.txt',
                           header=None)[[0]].to_numpy().flatten()
    for e in entity:
        # All triples where e appears as head (col 0) or tail (col 1).
        df = pd.concat([relation[relation[0] == e], relation[relation[1] == e]])
        # De-duplicated neighbourhood of e, including e itself.
        context = list(set(np.append(df.to_numpy().flatten(), e)))
        context_embedding[e] = np.mean(entity_embedding[context, :], axis=0)
    np.save('./data/kg/context.npy', context_embedding)
# NOTE(review): this chunk begins mid-expression -- the TransR(...) constructor
# call opens above this view; the first line here is its trailing kwargs.
                norm_flag=True, rand_init=False)

# Wrap TransR in a margin-based negative-sampling training objective.
model_r = NegativeSampling(model=transr, loss=MarginLoss(margin=4.0), batch_size=train_dataloader.get_batch_size())

# pretrain transe
# trainer = Trainer(model = model_e, data_loader = train_dataloader, train_times = 1000, alpha = 0.5, use_gpu = False)
trainer = Trainer(model=model_e, data_loader=train_dataloader, train_times=1000, alpha=1.0, use_gpu=False)
trainer.run()
parameters = transe.get_parameters()
transe.save_parameters("./result/transr_transe.json")

# train transr
# transr.set_parameters(parameters)
# Hand the pretrained TransE entity table to TransR by direct attribute
# assignment. NOTE(review): this ALIASES the embedding module rather than
# copying its weights -- TransR training will mutate transe's entities too;
# presumably intentional warm-starting, TODO confirm.
transr.ent_embeddings = transe.ent_embeddings
trainer = Trainer(model=model_r, data_loader=train_dataloader, train_times=1000, alpha=0.1, use_gpu=False)
trainer.run()
transr.save_checkpoint('./checkpoint/transr.ckpt')

# Expose final training stats for whatever consumes this script's scope.
epoch = trainer.epoch
loss = trainer.loss
# NOTE(review): chunk begins mid-call -- these are the trailing kwargs of a
# TrainDataLoader(...) constructor opened above this view.
                                 sampling_mode="normal", bern_flag=1, filter_flag=1, neg_ent=25, neg_rel=5)

# define the model
transe = TransE(ent_tot=train_dataloader.get_ent_tot(), rel_tot=train_dataloader.get_rel_tot(), dim=200, p_norm=2, norm_flag=True)
save_path = os.path.join('checkpoint', phase, 'transe.ckpt')
transe.load_checkpoint(save_path)

# Pull the trained lookup tables out of the checkpointed model.
rel_emb = transe.get_parameters()['rel_embeddings.weight']
ent_emb = transe.get_parameters()['ent_embeddings.weight']

# Build name -> embedding-row dictionaries for entities and relations.
e_emb, r_emb = dict(), dict()
with open(entity2id_path, 'r', encoding='utf-8') as f:
    next(f)  # first line is the entry count; skip it
    for line in f:
        tmp = line.split('\t')
        # Joining tmp[:-1] implies the entity name may itself contain tabs.
        entity = ''.join(tmp[:-1])
        # NOTE(review): if names can contain tabs, the id field is tmp[-1],
        # not tmp[1]; tmp[1] is only correct for tab-free names. TODO confirm
        # against the actual entity2id file format.
        e_emb[entity] = ent_emb[int(tmp[1]), :]
with open(relation2id_path, 'r', encoding='utf-8') as f:
    next(f)  # skip count header
    for line in f:
        tmp = line.split('\t')
        r_emb[tmp[0]] = rel_emb[int(tmp[1]), :]
# define the loss function model = NegativeSampling(model=transe, loss=MarginLoss(), batch_size=train_dataloader.get_batch_size()) # train the model trainer = Trainer(model=model, data_loader=train_dataloader, train_times=1000, alpha=0.01, use_gpu=True, opt_method='adagrad') trainer.run() transe.save_checkpoint('./checkpoint/transe.ckpt') embeddings = transe.get_parameters() directory = './models/dbpedia50/transe300/' pathlib.Path(directory).mkdir(exist_ok=True, parents=True) other_name_map = { 'ent_embeddings.weight': 'entities.p', 'rel_embeddings.weight': 'relations.p' } def save_torch_embedding_as_numpy(embedding, filename): with open(filename, "wb") as f: pickle.dump(embedding, f) for emb_name, filename in other_name_map.items():