def run_program(args):
    """Execute a heuristic meta-program for every test user and report metrics.

    Loads the KG, train/test labels, per-user metapath counts and the
    previously inferred scored paths, then for each test user builds and runs
    a heuristic program, keeps the 10 highest-scoring predicted items, and
    logs the evaluation summary.

    Args:
        args: namespace providing `dataset`, `infer_path_data`, `sample_size`
            plus whatever `create_symbolic_model` reads.
    """
    kg = utils.load_kg(args.dataset)
    kg_mask = KGMask(kg)
    train_labels = utils.load_labels(args.dataset, 'train')
    test_labels = utils.load_labels(args.dataset, 'test')
    # Per-user training path frequency per metapath.
    path_counts = utils.load_path_count(args.dataset)
    # Inferred paths with scores, produced by `infer_paths`.
    with open(args.infer_path_data, 'rb') as f:
        raw_paths = pickle.load(f)

    symbolic_model = create_symbolic_model(args, kg, train=False)
    program_exe = MetaProgramExecutor(symbolic_model, kg_mask, args)

    pred_labels = {}
    pbar = tqdm(total=len(test_labels))
    for uid in test_labels:
        # NOTE(review): assumes every test uid also appears in train_labels,
        # raw_paths and path_counts — verify upstream guarantees this.
        program = create_heuristic_program(
            kg.metapaths, raw_paths[uid], path_counts[uid], args.sample_size)
        program_exe.execute(program, uid, train_labels[uid])
        paths = program_exe.collect_results(program)
        # Each result r: r[0] is the entity path (last element = item id),
        # r[1] holds per-hop scores; rank items by mean of the final score.
        tmp = [(r[0][-1], np.mean(r[1][-1])) for r in paths]
        tmp = sorted(tmp, key=lambda x: x[1], reverse=True)[:10]
        pred_labels[uid] = [t[0] for t in tmp]
        pbar.update(1)
    pbar.close()  # fix: progress bar was never closed

    msg = evaluate_with_insufficient_pred(pred_labels, test_labels)
    logger.info(msg)
def estimate_path_count(args):
    """Estimate, per train user, the average path count to each metapath.

    For every user, counts paths (capped at 50 per call) from the user to
    each of their positive items along every metapath, averages over the
    user's items, and persists the result via `utils.save_path_count`.

    Args:
        args: namespace providing `dataset`.
    """
    kg = utils.load_kg(args.dataset)
    num_mp = len(kg.metapaths)
    train_labels = utils.load_labels(args.dataset, 'train')

    counts = {}
    pbar = tqdm(total=len(train_labels))
    for uid in train_labels:
        counts[uid] = np.zeros(num_mp)
        for pid in train_labels[uid]:
            for mpid in range(num_mp):
                # 50 caps the path enumeration per (metapath, user, item).
                counts[uid][mpid] += kg.count_paths_with_target(mpid, uid, pid, 50)
        n_items = len(train_labels[uid])
        if n_items > 0:  # fix: guard against ZeroDivisionError for label-less users
            counts[uid] = counts[uid] / n_items
        pbar.update(1)
    pbar.close()  # fix: progress bar was never closed

    utils.save_path_count(args.dataset, counts)
def main(args):
    """Build data loaders, construct MultiKR on the available device, and train.

    Args:
        args: namespace providing `batch_size`, `shuffle_train`,
            `shuffle_test`, `n_layer`, `hidden_layers`, `dropouts`,
            `output_rec`, `weight_decay`, `lr`, `epochs`.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    n_user, n_item, train_rec, eval_rec, test_rec = load_rating()
    n_entity, n_relation, kg = load_kg()

    # Split (N, 3) triple matrices into column tuples (head, relation, tail)
    # / (user, item, label).
    kg_data = (kg[:, 0], kg[:, 1], kg[:, 2])
    rec_data = (train_rec[:, 0], train_rec[:, 1], train_rec[:, 2])
    rec_val = (eval_rec[:, 0], eval_rec[:, 1], eval_rec[:, 2])

    train_data_kg = TrainSet(kg_data)
    train_loader_kg = DataLoader(train_data_kg, batch_size=args.batch_size, shuffle=args.shuffle_train)
    train_data_rec = TrainSet(rec_data)
    eval_data_rec = TrainSet(rec_val)
    train_loader_rec = DataLoader(train_data_rec, batch_size=args.batch_size, shuffle=args.shuffle_train)
    eval_loader_rec = DataLoader(eval_data_rec, batch_size=args.batch_size, shuffle=args.shuffle_test)

    # +1 on all cardinalities leaves room for index 0 / max-id embedding rows.
    # NOTE(review): embed_dim is set to args.batch_size, tying embedding width
    # to the batch size — likely should be a dedicated args.embed_dim; kept
    # as-is to avoid depending on an attribute that may not exist.
    model = MultiKR(n_user + 1, n_item + 1, n_entity + 1, n_relation + 1,
                    n_layer=args.n_layer, embed_dim=args.batch_size,
                    hidden_layers=args.hidden_layers, dropouts=args.dropouts,
                    output_rec=args.output_rec)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.weight_decay, lr=args.lr)
    loss_function = nn.BCEWithLogitsLoss()  # fix: variable was misspelled `loss_fuction`
    train_model(model, train_loader_rec, train_loader_kg, eval_loader_rec,
                optimizer, loss_function, args.epochs)
def infer_paths(args):
    """Infer top-k scored paths for every (train user, metapath) pair.

    Excludes each user's known positives from inference and pickles the
    nested {uid: {metapath_id: paths}} mapping to `args.infer_path_data`.

    Args:
        args: namespace providing `dataset` and `infer_path_data`, plus
            whatever `create_symbolic_model` reads.
    """
    kg = utils.load_kg(args.dataset)
    model = create_symbolic_model(args, kg, train=False)
    train_labels = utils.load_labels(args.dataset, 'train')
    train_uids = list(train_labels.keys())
    kg_mask = KGMask(kg)

    predicts = {}
    pbar = tqdm(total=len(train_uids))
    for uid in train_uids:
        predicts[uid] = {}
        # enumerate instead of indexing kg.metapaths by range(len(...)).
        for mpid, metapath in enumerate(kg.metapaths):
            paths = model.infer_with_path(metapath, uid, kg_mask,
                                          excluded_pids=train_labels[uid],
                                          topk_paths=20)
            predicts[uid][mpid] = paths
        pbar.update(1)
    pbar.close()  # fix: progress bar was never closed

    with open(args.infer_path_data, 'wb') as f:
        pickle.dump(predicts, f)
def main(args):
    """Assemble loaders and a MultiKR model, then run joint KG/rec training."""
    n_user, n_item, train_rec, eval_rec, test_rec = load_rating()
    n_entity, n_relation, kg = load_kg()

    def columns(triples):
        # Split an (N, 3) matrix into its three columns.
        return triples[:, 0], triples[:, 1], triples[:, 2]

    kg_data = columns(kg)
    rec_data = columns(train_rec)
    rec_val = columns(eval_rec)

    train_data_kg = TrainSet(kg_data)
    train_loader_kg = DataLoader(train_data_kg,
                                 batch_size=args.batch_size,
                                 shuffle=args.shuffle_train)
    train_data_rec = TrainSet(rec_data)
    eval_data_rec = TrainSet(rec_val)
    train_loader_rec = DataLoader(train_data_rec,
                                  batch_size=args.batch_size,
                                  shuffle=args.shuffle_train)
    eval_loader_rec = DataLoader(eval_data_rec,
                                 batch_size=args.batch_size,
                                 shuffle=args.shuffle_test)

    model = MultiKR(
        n_user + 1,
        n_item + 1,
        n_entity + 1,
        n_relation + 1,
        n_layer=args.n_layer,
        embed_dim=args.batch_size,
        hidden_layers=args.hidden_layers,
        dropouts=args.dropouts,
        output_rec=args.output_rec,
    )

    optimizer = torch.optim.Adam(model.parameters(),
                                 weight_decay=args.weight_decay,
                                 lr=args.lr)
    loss_function = nn.BCEWithLogitsLoss()
    train_model(model, train_loader_rec, train_loader_kg, eval_loader_rec,
                optimizer, loss_function, args.epochs)
def main(args, model_path):
    """Evaluate a saved ConvE checkpoint on the test split.

    Loads the vocabulary and model weights, builds the eval loader, and runs
    `ranking_and_hits` under `torch.no_grad()`.

    Args:
        args: namespace providing `multi_gpu`, `num_worker`, `batch_size`,
            plus whatever `ConvE` reads.
        model_path: path to the saved `state_dict` to evaluate.
    """
    print(os.getcwd())
    # fix: message said "start training ..." but this function only evaluates
    print("start evaluation ...")
    print(model_path)
    start = time.time()
    ent_str2id, ent_id2str, rel_str2id, rel_id2str = load_kg()
    print("making vocab is done " + str(time.time() - start))
    n_ent, n_rel = len(ent_str2id), len(rel_str2id)

    model = ConvE(args, n_ent, n_rel)
    model.init()
    if args.multi_gpu:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.cuda()
    print('cuda : ' + str(torch.cuda.is_available()) + ' count : ' + str(torch.cuda.device_count()))

    params = [value.numel() for value in model.parameters()]
    print(params)
    print(sum(params))

    start = time.time()
    # NOTE: `dir` here is a module-level path defined elsewhere in the file
    # (it shadows the builtin) — presumably the dataset directory; verify.
    evalset = KG_EvalSet(dir + '/test_set.txt', args, n_ent)
    print("making evalset is done " + str(time.time() - start))
    # NOTE(review): shuffle=True is pointless for evaluation; kept to avoid
    # changing behavior in case downstream code depends on ordering.
    evalloader = DataLoader(dataset=evalset, num_workers=args.num_worker,
                            batch_size=args.batch_size, shuffle=True)

    model.eval()
    with torch.no_grad():
        start = time.time()
        ranking_and_hits(model, args, evalloader, n_ent, ent_id2str, rel_id2str)
        end = time.time()
        print('eval time used: {} minutes'.format((end - start) / 60))
def main(args, model_path):
    """Train ConvE with validation-loss early stopping and per-epoch ranking eval.

    Builds vocab, datasets and loaders, trains with BCE loss, tracks a
    patience-based early-stopping criterion on validation loss, saves the
    best checkpoint to `model_path`, and runs `ranking_and_hits` each epoch.

    Args:
        args: namespace providing `multi_gpu`, `lr`, `l2`, `num_worker`,
            `batch_size`, `epochs`, plus whatever `ConvE` reads.
        model_path: destination for the best model `state_dict`.
    """
    print(os.getcwd())
    print("start training ...")
    start = time.time()
    ent_str2id, ent_id2str, rel_str2id, rel_id2str = load_kg()
    print("making vocab is done " + str(time.time() - start))
    n_ent, n_rel = len(ent_str2id), len(rel_str2id)

    model = ConvE(args, n_ent, n_rel)
    model.init()
    if args.multi_gpu:
        model = torch.nn.DataParallel(model)
    bce = torch.nn.BCELoss().cuda()
    model.cuda()
    print('cuda : ' + str(torch.cuda.is_available()) + ' count : ' + str(torch.cuda.device_count()))

    params = [value.numel() for value in model.parameters()]
    print(params)
    print(sum(params))

    opt = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)

    start = time.time()
    # NOTE: `dir` is a module-level path defined elsewhere (shadows the builtin).
    dataset = KG_DataSet(dir + '/train_set.txt', args, n_ent)
    print("making train dataset is done " + str(time.time() - start))
    start = time.time()
    evalset = KG_EvalSet(dir + '/test_set.txt', args, n_ent)
    print("making evalset is done " + str(time.time() - start))

    # fix(perf): loaders were rebuilt inside the epoch loop; a DataLoader
    # reshuffles on each iteration, so building them once is equivalent.
    dataloader = DataLoader(dataset=dataset, num_workers=args.num_worker,
                            batch_size=args.batch_size, shuffle=True)
    evalloader = DataLoader(dataset=evalset, num_workers=args.num_worker,
                            batch_size=args.batch_size, shuffle=True)
    n_train = len(dataset)

    prev_loss = 1000
    patience = 0
    early_stop = False
    best_loss = 1000

    for epoch in range(args.epochs):
        print(epoch)
        epoch_loss = 0.0
        n_batches = 0
        epoch_start = time.time()
        model.train()
        tot = 0.0
        for i, data in enumerate(dataloader):
            opt.zero_grad()
            start = time.time()
            head, rel, tail = data
            head = torch.LongTensor(head).cuda()
            rel = torch.LongTensor(rel).cuda()
            # tail is already a dense multi-label target tensor.
            e2_multi = tail.cuda()
            print("e2_multi " + str(time.time() - start) + "\n")
            start = time.time()
            pred = model.forward(head, rel)
            loss = bce(pred, e2_multi)
            loss.backward()
            opt.step()
            batch_loss = torch.sum(loss)
            print("step " + str(time.time() - start) + "\n")
            # .item() detaches the scalar so no CUDA tensors accumulate.
            epoch_loss += batch_loss.item()
            n_batches += 1
            tot += head.size(0)
            print('\r{:>10} epoch {} progress {} loss: {}\n'.format('', epoch, tot / n_train, batch_loss), end='')
        # fix: was `epoch_loss /= batch_size`, i.e. the summed per-batch mean
        # losses were divided by the size of the *last* batch; divide by the
        # number of batches to get the true mean epoch loss.
        epoch_loss /= max(n_batches, 1)
        print('')
        end = time.time()
        time_used = end - epoch_start
        print('one epoch time: {} minutes'.format(time_used / 60))
        print('{} epochs'.format(epoch))
        print('epoch {} loss: {}'.format(epoch + 1, epoch_loss))

        # Validation pass for early stopping.
        model.eval()
        with torch.no_grad():
            valid_loss = 0.0
            for i, data in enumerate(evalloader):
                head, rel, tail, tail_idx = data
                head = torch.LongTensor(head).cuda()
                rel = torch.LongTensor(rel).cuda()
                batch_size = head.size(0)
                e2_multi1 = tail.cuda()
                pred1 = model.forward(head, rel)
                loss1 = bce(pred1, e2_multi1)
                sum_loss = torch.sum(loss1).item()
                # NOTE(review): loss1 is already a batch mean under BCELoss's
                # default reduction, so this extra division skews the scale;
                # kept so early-stopping thresholds behave as before.
                sum_loss /= batch_size
                valid_loss += sum_loss
        print("valid loss : " + str(valid_loss))
        with open(os.getcwd() + '/log_file/log.txt', 'a') as f:
            f.write(str(epoch) + " epochs valid loss : " + str(valid_loss) + "\n")

        # Patience-based early stopping: stop after 3 consecutive epochs
        # where validation loss failed to improve on the previous epoch.
        if valid_loss > prev_loss:
            patience += 1
            if patience > 2:
                early_stop = True
        else:
            patience = 0
        prev_loss = valid_loss
        if early_stop:
            print("{0} epochs Early stopping ...".format(epoch))
            break

        if valid_loss < best_loss:
            best_loss = valid_loss
            print('saving to {0}'.format(model_path))
            torch.save(model.state_dict(), model_path)

        # Per-epoch ranking evaluation on the eval split.
        model.eval()
        with torch.no_grad():
            start = time.time()
            ranking_and_hits(model, args, evalloader, n_ent, epoch)
            end = time.time()
            print('eval time used: {} minutes'.format((end - start) / 60))
random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.cuda: torch.cuda.manual_seed(args.seed) # Load data # adj, features, labels, idx_train, idx_val, idx_test = load_data() # load sensor adj, features, labels, idx_train, idx_val, idx_test = load_data( path='./data/sensor/', dataset='sensor') # Load kg kg_adj, kg_features = load_kg(path='./data/sensor/', dataset='kg') # Model and optimizer if args.sparse: model = SpGAT(nfeat=features.shape[1], nhid=args.hidden, nclass=int(labels.max()) + 1, dropout=args.dropout, nheads=args.nb_heads, alpha=args.alpha) else: model = GAFT(nsize=features.shape[0], kgsize=kg_features.shape[0], nfeat=features.shape[1], kgfeat=kg_features.shape[1], nhid=args.hidden,