def load_test_data(
) -> Tuple[pyg.data.Data, pyg.data.Data, Dict[str, torch.Tensor],
           Dict[str, torch.Tensor]]:
    '''
    Load the ogbl-ddi splits used at test time.

    Returns Tuple
        valid_graph Graph containing all training edges
        test_graph Graph containing all training edges, plus validation edges
        valid_edges Dict of positive and negative edges from validation edge
            split (not in train_graph)
        test_edges Dict of positive and negative edges from test edge split
            (not in valid_graph)
    '''
    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    transform = T.ToSparseTensor(False)
    edge_split = dataset.get_edge_split()
    valid_edges = edge_split['valid']
    test_edges = edge_split['test']

    valid_graph = dataset[0]
    test_graph = valid_graph.clone()

    # Add validation edges (both directions, since the graph is undirected)
    # to *test_graph* so test-time inference can message-pass over the union
    # of train + valid edges.  (The original comment said "valid_graph",
    # which was wrong: valid_graph keeps only the training edges.)
    test_edge_index = torch.cat([
        test_graph.edge_index,
        valid_edges['edge'].T,
        valid_edges['edge'][:, [1, 0]].T
    ], dim=1)
    test_graph.edge_index = test_edge_index

    valid_graph = transform(valid_graph)
    test_graph = transform(test_graph)

    return valid_graph, test_graph, valid_edges, test_edges
def load_training_data() -> Tuple[pyg.data.Data, pyg.data.Data, Dict[
        str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
    '''
    Load the ogbl-ddi splits used during training.

    Returns Tuple
        train_graph Graph containing a subset of the training edges
        valid_graph Graph containing all training edges
        train_edges Dict of positive edges across entire train split
        eval_edges Dict of positive edges from the train split that are held
            out of train_graph (used for train-set evaluation)
        valid_edges Dict of positive and negative edges not in train_graph.
    '''
    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    transform = T.ToSparseTensor(False)
    edge_split = dataset.get_edge_split()
    train_edges = edge_split['train']
    valid_edges = edge_split['valid']

    train_graph = dataset[0]
    valid_graph = train_graph.clone()

    # Partition training edges: hold out as many train edges for evaluation
    # as there are validation edges.  Fixed seed keeps the split reproducible.
    torch.manual_seed(12345)
    perm = torch.randperm(train_edges['edge'].shape[0])
    eval_idxs, train_idxs = perm[:valid_edges['edge'].shape[0]], perm[
        valid_edges['edge'].shape[0]:]
    eval_edges = {'edge': train_edges['edge'][eval_idxs]}
    train_edges = {'edge': train_edges['edge'][train_idxs]}

    # Update graph object to have subset of edges and adj_t matrix.
    # Both edge directions are concatenated since the graph is undirected.
    train_edge_index = torch.cat(
        [train_edges['edge'].T, train_edges['edge'][:, [1, 0]].T], dim=1)
    train_graph.edge_index = train_edge_index

    train_graph = transform(train_graph)
    valid_graph = transform(valid_graph)

    return train_graph, valid_graph, edge_split[
        'train'], eval_edges, valid_edges
def main():
    """Load trained DeeperGCN + LinkPredictor checkpoints and print
    Hits@{10,50,100} results on train/validation/test splits."""
    args = ArgsInit().args

    # Resolve the compute device; fall back to CPU when CUDA is unavailable.
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device('cpu')

    dataset = PygLinkPropPredDataset(name=args.dataset)
    graph = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    node_feats = graph.x.to(device)
    edge_index = graph.edge_index.to(device)

    # Model config derived from the data: one link-prediction task.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = 1
    print(args)

    model = DeeperGCN(args).to(device)
    predictor = LinkPredictor(args).to(device)

    # Restore both checkpoints before running inference.
    for module, ckpt_path in ((model, args.model_load_path),
                              (predictor, args.predictor_load_path)):
        module.load_state_dict(torch.load(ckpt_path)['model_state_dict'])
        module.to(device)

    result = test(model, predictor, node_feats, edge_index, split_edge,
                  evaluator, args.batch_size)

    for k in ('Hits@10', 'Hits@50', 'Hits@100'):
        train_result, valid_result, test_result = result[k]
        print('{}--Train: {}, Validation: {}, Test: {}'.format(
            k, train_result, valid_result, test_result))
def main():
    """Train and evaluate a GCN or GraphSAGE link predictor on ogbl-citation."""
    parser = argparse.ArgumentParser(description='OGBL-Citation (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.0005)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on;
    # the fixed seed keeps the subset reproducible across invocations.
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        # Reuse validation negatives for the train-subset evaluation.
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)
    # Symmetrize the (directed) citation graph before building the adjacency.
    edge_index = to_undirected(edge_index, data.num_nodes)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(-1), args.hidden_channels, args.hidden_channels,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization: D^-1/2 (A + I) D^-1/2.
        adj = adj.set_value(None)
        adj = adj.set_diag()
        deg = adj.sum(dim=1)
        deg_inv_sqrt = deg.pow(-0.5)
        # Isolated nodes have deg 0 -> inf after pow; zero them out.
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, adj, split_edge, optimizer,
                         args.batch_size)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, x, adj, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
def main():
    """Train and evaluate a MAD-style link-prediction model on ogbl-ddi."""
    parser = argparse.ArgumentParser(description='OGBL-DDI')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model', type=str, default='MAD_GCN',
                        choices=[
                            'GCN_Linear', 'SAGE_Linear', 'MAD_GCN',
                            'MAD_SAGE', 'MAD_Model'
                        ])
    parser.add_argument('--train_batch_size', type=int, default=4096)
    parser.add_argument('--test_batch_size', type=int, default=1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=5)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # ToSparseTensor materializes data.adj_t for message passing.
    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    # as many as there are validation edges; seed keeps it reproducible.
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    # Model class is resolved by name from the project's model registry.
    model = models.get_model(args.model)(data.num_nodes, adj_t).to(device)
    print(f"Parameters: {count_parameters(model)}")

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, adj_t, split_edge, optimizer,
                         args.train_batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, adj_t, split_edge, evaluator,
                               args.test_batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

            print(f'Finished epoch {epoch}')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Train and evaluate a GCN or GraphSAGE link predictor on ogbl-citation2."""
    parser = argparse.ArgumentParser(description="OGBL-Citation2 (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.0005)
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-citation2",
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    # Citation edges are directed; symmetrize so messages flow both ways.
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596]
    split_edge["eval_train"] = {
        "source_node": split_edge["train"]["source_node"][idx],
        "target_node": split_edge["train"]["target_node"][idx],
        # Reuse validation negatives for the train-subset evaluation.
        "target_node_neg": split_edge["valid"]["target_node_neg"],
    }

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

        # Pre-compute GCN normalization: D^-1/2 (A + I) D^-1/2.
        adj_t = data.adj_t.set_diag()
        deg = adj_t.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        # Isolated nodes have deg 0 -> inf after pow; zero them out.
        deg_inv_sqrt[deg_inv_sqrt == float("inf")] = 0
        adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
        data.adj_t = adj_t

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation2")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f"Run: {run + 1:02d}, "
                          f"Epoch: {epoch:02d}, "
                          f"Loss: {loss:.4f}, "
                          f"Train: {train_mrr:.4f}, "
                          f"Valid: {valid_mrr:.4f}, "
                          f"Test: {test_mrr:.4f}")

        print("GraphSAGE" if args.use_sage else "GCN")
        logger.print_statistics(run)
    print("GraphSAGE" if args.use_sage else "GCN")
    logger.print_statistics()
def main_get_mask(args, imp_num):
    """Run one iterative-magnitude-pruning (IMP) round with a fixed mask.

    Trains DeeperGCN + LinkPredictor for args.mask_epochs epochs while the
    pruning-mask parameters are frozen, tracking the best results selected
    by validation Hits@50, and prints a per-epoch and final summary.

    Args:
        args: config namespace (dataset, device, lr, batch_size, mask_epochs, ...)
        imp_num: index of the current IMP round (used for logging only)
    """
    device = torch.device("cuda:" + str(args.device))

    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args)

    # Freeze all mask parameters: they gate weights but are not trained here.
    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    predictor = LinkPredictor(args).to(device)
    optimizer = torch.optim.Adam(list(model.parameters()) +
                                 list(predictor.parameters()), lr=args.lr)

    # results[*] are keyed by hits metric; 'epoch' is the best-valid epoch.
    results = {'epoch': 0}
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']
    for key in keys:
        results[key] = {k: 0 for k in hits}

    start_epoch = 1
    for epoch in range(start_epoch, args.mask_epochs + 1):
        t0 = time.time()
        epoch_loss = train.train_fixed(model, predictor, x, edge_index,
                                       split_edge, optimizer,
                                       args.batch_size, args)
        result = train.test(model, predictor, x, edge_index, split_edge,
                            evaluator, args.batch_size, args)
        # Model selection is done on validation Hits@50.
        k = 'Hits@50'
        train_result, valid_result, test_result = result[k]

        if train_result > results['highest_train'][k]:
            results['highest_train'][k] = train_result

        if valid_result > results['highest_valid'][k]:
            # New best validation score: record the paired train/test numbers
            # and the epoch at which they were achieved.
            results['highest_valid'][k] = valid_result
            results['final_train'][k] = train_result
            results['final_test'][k] = test_result
            results['epoch'] = epoch

        epoch_time = (time.time() - t0) / 60
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'IMP:[{}] (GET Mask) Epoch:[{}/{}] LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}] Time:[{:.2f}min]'
              .format(imp_num, epoch, args.mask_epochs, epoch_loss,
                      train_result * 100, valid_result * 100,
                      test_result * 100, results['final_test'][k] * 100,
                      results['epoch'], epoch_time))

    print('-' * 100)
    print("syd : IMP:[{}] (FIX Mask) Final Result Train:[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}]"
          .format(imp_num, results['final_train'][k] * 100,
                  results['highest_valid'][k] * 100,
                  results['final_test'][k] * 100))
    print('-' * 100)
def main():
    """Train and evaluate GCNWithAttention + LinkPredictor on ogbl-collab."""
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=100)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)

    # NOTE(review): the device comes from gpu_setup(args.gpu_id); the
    # --device flag declared above appears unused in this function.
    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-collab')
    data = dataset[0]
    # Collapse [E, 1] edge weights to a float vector before sparsifying.
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    model = GCNWithAttention(data.num_features, args.hidden_channels,
                             args.hidden_channels, args.num_layers,
                             args.dropout, args.k).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    print("model parameters {}".format(
        sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(
        sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(
        sum(p.numel() for p in model.parameters()) +
        sum(p.numel() for p in predictor.parameters())))

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Train and evaluate an MLP link predictor on ogbl-citation2 features."""
    parser = argparse.ArgumentParser(description="OGBL-Citation2 (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--eval_steps", type=int, default=10)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-citation2")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596]
    split_edge["eval_train"] = {
        "source_node": split_edge["train"]["source_node"][idx],
        "target_node": split_edge["train"]["target_node"][idx],
        # Reuse validation negatives for the train-subset evaluation.
        "target_node_neg": split_edge["valid"]["target_node_neg"],
    }

    x = data.x
    if args.use_node_embedding:
        # Optionally concatenate pre-computed node embeddings loaded from
        # disk (the final print labels this variant "Node2vec").
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation2")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch % args.eval_steps == 0:
                result = test(predictor, x, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f"Run: {run + 1:02d}, "
                          f"Epoch: {epoch:02d}, "
                          f"Loss: {loss:.4f}, "
                          f"Train: {train_mrr:.4f}, "
                          f"Valid: {valid_mrr:.4f}, "
                          f"Test: {test_mrr:.4f}")

        print("Node2vec" if args.use_node_embedding else "MLP")
        logger.print_statistics(run)
    print("Node2vec" if args.use_node_embedding else "MLP")
    logger.print_statistics()
def main():
    """Train/evaluate GCN or GraphSAGE with learned node embeddings on ogbl-ddi."""
    parser = argparse.ArgumentParser(description="OGBL-DDI (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--epochs", type=int, default=200)
    parser.add_argument("--eval_steps", type=int, default=5)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-ddi",
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    # as many as there are validation edges; seed keeps it reproducible.
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["edge"].size(0))
    idx = idx[: split_edge["valid"]["edge"].size(0)]
    split_edge["eval_train"] = {"edge": split_edge["train"]["edge"][idx]}

    if args.use_sage:
        model = SAGE(
            args.hidden_channels,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            args.hidden_channels,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

    # Input node features are not used here; instead a free embedding per
    # node is learned jointly with the GNN and the predictor.
    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(
        args.hidden_channels, args.hidden_channels, 1, args.num_layers,
        args.dropout
    ).to(device)

    evaluator = Evaluator(name="ogbl-ddi")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@20": Logger(args.runs, args),
        "Hits@30": Logger(args.runs, args),
    }

    for run in range(args.runs):
        # Re-initialize embeddings and model weights for each independent run.
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters()) +
            list(predictor.parameters()),
            lr=args.lr,
        )

        for epoch in range(1, 1 + args.epochs):
            loss = train(
                model,
                predictor,
                emb.weight,
                adj_t,
                split_edge,
                optimizer,
                args.batch_size,
            )

            if epoch % args.eval_steps == 0:
                results = test(
                    model,
                    predictor,
                    emb.weight,
                    adj_t,
                    split_edge,
                    evaluator,
                    args.batch_size,
                )
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(
                            f"Run: {run + 1:02d}, "
                            f"Epoch: {epoch:02d}, "
                            f"Loss: {loss:.4f}, "
                            f"Train: {100 * train_hits:.2f}%, "
                            f"Valid: {100 * valid_hits:.2f}%, "
                            f"Test: {100 * test_hits:.2f}%"
                        )
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Train/evaluate GCN or GraphSAGE on ogbl-collab with seeded runs and
    optional use of validation edges as message-passing input at test time."""
    parser = argparse.ArgumentParser(description="OGBL-COLLAB (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--use_valedges_as_input", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--epochs", type=int, default=400)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=1)
    parser.add_argument("--seed", type=int, default=1)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-collab")
    data = dataset[0]
    # Keep the dense edge_index around: ToSparseTensor replaces it by adj_t.
    edge_index = data.edge_index
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)

    split_edge = dataset.get_edge_split()

    # Use training + validation edges for inference on test set.
    if args.use_valedges_as_input:
        val_edge_index = split_edge["valid"]["edge"].t()
        full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1)
        data.full_adj_t = SparseTensor.from_edge_index(full_edge_index).t()
        data.full_adj_t = data.full_adj_t.to_symmetric()
    else:
        data.full_adj_t = data.adj_t

    data = data.to(device)

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

    predictor = LinkPredictor(
        args.hidden_channels, args.hidden_channels, 1, args.num_layers,
        args.dropout
    ).to(device)

    evaluator = Evaluator(name="ogbl-collab")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in tqdm(range(args.runs)):
        # Per-run seeding: reproducible yet distinct across runs.
        torch.manual_seed(args.seed + run)
        np.random.seed(args.seed + run)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()), lr=args.lr
        )

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(
                    model, predictor, data, split_edge, evaluator,
                    args.batch_size
                )
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(
                            f"Run: {run + 1:02d}, "
                            f"Epoch: {epoch:02d}, "
                            f"Loss: {loss:.4f}, "
                            f"Train: {100 * train_hits:.2f}%, "
                            f"Valid: {100 * valid_hits:.2f}%, "
                            f"Test: {100 * test_hits:.2f}%"
                        )
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Train/evaluate GCN or GraphSAGE on ogbl-collab with optional use of
    validation edges as message-passing input at test time."""
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--use_valedges_as_input', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=400)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-collab')
    data = dataset[0]
    # Keep the dense edge_index around: ToSparseTensor replaces it by adj_t.
    edge_index = data.edge_index
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)

    split_edge = dataset.get_edge_split()

    # Use training + validation edges for inference on test set.
    if args.use_valedges_as_input:
        val_edge_index = split_edge['valid']['edge'].t()
        full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1)
        data.full_adj_t = SparseTensor.from_edge_index(full_edge_index).t()
        data.full_adj_t = data.full_adj_t.to_symmetric()
    else:
        data.full_adj_t = data.adj_t

    data = data.to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout).to(device)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Matrix-factorization baseline for ogbl-citation2: free node embeddings
    plus an MLP link predictor, evaluated by MRR."""
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (MF)')
    # Flag specs kept in one table: (flag, type, default).
    for flag, flag_type, flag_default in (
            ('--device', int, 0),
            ('--log_steps', int, 1),
            ('--num_layers', int, 3),
            ('--std', float, 0.2),
            ('--hidden_channels', int, 128),
            ('--dropout', float, 0.5),
            ('--batch_size', int, 64 * 1024),
            ('--lr', float, 0.01),
            ('--epochs', int, 300),
            ('--eval_steps', int, 10),
            ('--runs', int, 10)):
        parser.add_argument(flag, type=flag_type, default=flag_default)
    args = parser.parse_args()
    print(args)

    # Resolve the compute device; fall back to CPU when CUDA is unavailable.
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{args.device}')
    else:
        device = torch.device('cpu')

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    graph = dataset[0]

    # Randomly choose a fixed, reproducible subset of training samples for
    # train-split evaluation; validation negatives are reused for it.
    torch.manual_seed(12345)
    sample_idx = torch.randperm(
        split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][sample_idx],
        'target_node': split_edge['train']['target_node'][sample_idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    # 96-dimensional free embedding per node is the "factorization".
    node_emb = torch.nn.Embedding(graph.num_nodes, 96).to(device)
    predictor = LinkPredictor(96, args.hidden_channels, 1, args.num_layers,
                              args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        # Fresh Gaussian init of the embeddings for every independent run.
        torch.nn.init.normal_(node_emb.weight, std=args.std)
        predictor.reset_parameters()
        params = list(node_emb.parameters()) + list(predictor.parameters())
        optimizer = torch.optim.Adam(params, lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, node_emb.weight, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps != 0:
                continue
            result = test(predictor, node_emb.weight, split_edge, evaluator,
                          args.batch_size)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        print('MF')
        logger.print_statistics(run)
    print('MF')
    logger.print_statistics()
def main():
    """Full-batch GCN/GraphSAGE link prediction on ogbl-ppa.

    Evaluated with Hits@{10,50,100} over ``args.runs`` independent runs.
    """
    parser = argparse.ArgumentParser(description="OGBL-PPA (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    # Concatenate a precomputed embedding (embedding.pt) to the node features.
    parser.add_argument("--use_node_embedding", action="store_true")
    # Use GraphSAGE instead of GCN.
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # ToSparseTensor replaces edge_index with an adj_t SparseTensor.
    dataset = PygLinkPropPredDataset(name="ogbl-ppa",
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.x = data.x.to(torch.float)
    if args.use_node_embedding:
        data.x = torch.cat([data.x, torch.load("embedding.pt")], dim=-1)
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

        # Pre-compute GCN normalization.
        # D^{-1/2} (A + I) D^{-1/2}; isolated nodes get weight 0 (inf -> 0).
        adj_t = data.adj_t.set_diag()
        deg = adj_t.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float("inf")] = 0
        adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
        data.adj_t = adj_t

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-ppa")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """MLP link predictor on ogbl-ddi over fixed, precomputed node features.

    Node features are loaded from ``embedding.pt`` (not trained); only the
    LinkPredictor MLP learns. Evaluated with Hits@{10,20,30}.
    """
    parser = argparse.ArgumentParser(description='OGBL-DDI (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    data = dataset[0]
    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    # fixed seed, subset sized to match the validation split.
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    # Frozen node representations; presumably produced by a separate
    # embedding-training script — TODO confirm provenance of embedding.pt.
    x = torch.load('embedding.pt', map_location='cpu').to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, data.edge_index, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def process_PygLinkDataset_homo(self, dataset: PygLinkPropPredDataset):
    """Ingest a homogeneous OGB link-prediction dataset into this object's
    hetero-style fields (one node type "entity", one relation "default").

    Builds a single concatenated edge tensor holding, in this exact order:
    valid pos, valid neg, test pos, test neg, train pos edges; a parallel
    relation-type vector where 1 marks positive and 0 marks negative edges;
    and index ranges (validation/testing/training_idx) selecting each split
    inside the concatenation. The start_idx bookkeeping below depends on
    that concatenation order.
    """
    data = dataset[0]
    self._name = dataset.name
    self.head_node_type = "entity"

    # Single metapath: (entity, default, entity).
    self.metapaths = [(self.head_node_type, "default", self.head_node_type)
                      ]
    self.edge_index_dict = {self.metapaths[0]: data.edge_index}
    self.num_nodes_dict = self.get_num_nodes_dict(self.edge_index_dict)
    self.node_types = list(self.num_nodes_dict.keys())

    # Node features: prefer homogeneous `x`, fall back to `x_dict`, else none.
    if hasattr(data, "x") and data.x is not None:
        self.x_dict = {self.head_node_type: data.x}
    elif hasattr(data, "x_dict") and data.x_dict is not None:
        self.x_dict = data.x_dict
    else:
        self.x_dict = {}

    # Feature dimensionality per node type (empty when no features exist).
    self.node_attr_shape = {
        node_type: x.size(1)
        for node_type, x in self.x_dict.items()
    }

    split_idx = dataset.get_edge_split()
    train_triples, valid_triples, test_triples = split_idx[
        "train"], split_idx["valid"], split_idx["test"]

    self.edge_index = []
    self.edge_index.extend(
        [valid_triples["edge"], valid_triples["edge_neg"]])
    self.edge_index.extend(
        [test_triples["edge"], test_triples["edge_neg"]])
    self.edge_index.extend([train_triples["edge"]])
    self.edge_index = torch.cat(self.edge_index, dim=0)

    self.edge_reltype = []
    self.edge_reltype.extend([
        torch.ones(
            valid_triples["edge"].size(0)),  # Ones correspond to pos edges
        torch.zeros(valid_triples["edge_neg"].size(
            0)),  # Zeroes correspond to neg edges
        torch.ones(test_triples["edge"].size(0)),
        torch.zeros(test_triples["edge_neg"].size(0)),
        torch.ones(train_triples["edge"].size(0))
    ])
    self.edge_reltype = torch.cat(self.edge_reltype, dim=0).to(torch.int)

    # Build train/test/valid idx
    # Offsets of each split inside the concatenated edge tensor.
    self.start_idx = {
        "valid": 0,
        "test": len(valid_triples["edge"]) + len(valid_triples["edge_neg"])
    }
    self.start_idx["train"] = self.start_idx["test"] + len(
        test_triples["edge"]) + len(test_triples["edge_neg"])

    self.validation_idx = torch.arange(self.start_idx["valid"],
                                       self.start_idx["test"])
    self.testing_idx = torch.arange(self.start_idx["test"],
                                    self.start_idx["train"])
    self.training_idx = torch.arange(
        self.start_idx["train"], self.start_idx["train"] +
        train_triples["edge"].size(0))

    # Sanity check: the three ranges must be disjoint and ordered.
    assert self.validation_idx.max() < self.testing_idx.min()
    assert self.testing_idx.max() < self.training_idx.min()
def main_get_mask(args, imp_num, rewind_weight_mask=None, rewind_predict_weight=None, resume_train_ckpt=None):
    """One IMP (iterative magnitude pruning) round: train a DeeperGCN with
    trainable pruning masks on an OGB link dataset, selecting by validation
    Hits@50, then derive the next rewind mask.

    Args:
        args: experiment configuration namespace (device, dataset, lr, ...).
        imp_num: index of the current IMP iteration (logging/checkpoint tag).
        rewind_weight_mask: model rewind state dict from the previous round.
        rewind_predict_weight: predictor rewind state dict (previous round).
        resume_train_ckpt: optional checkpoint dict to resume training from.

    Returns:
        (rewind_weight_mask, rewind_predict_weight) for the next IMP round.
    """
    device = torch.device("cuda:" + str(args.device))
    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args)
    predictor = LinkPredictor(args).to(device)
    # Tiny noise on the trainable masks so gradients can differentiate them.
    pruning.add_trainable_mask_noise(model, args, c=1e-4)
    optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr)

    # Best-so-far bookkeeping per Hits@k metric.
    results = {'epoch': 0 }
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']
    for key in keys:
        results[key] = {k: 0 for k in hits}

    start_epoch = 1
    if resume_train_ckpt:
        # Resume: restore epoch, rewind mask, overlapping model weights and
        # the optimizer state from the checkpoint.
        start_epoch = resume_train_ckpt['epoch']
        rewind_weight_mask = resume_train_ckpt['rewind_weight_mask']
        ori_model_dict = model.state_dict()
        over_lap = {k : v for k, v in resume_train_ckpt['model_state_dict'].items() if k in ori_model_dict.keys()}
        ori_model_dict.update(over_lap)
        model.load_state_dict(ori_model_dict)
        print("Resume at IMP:[{}] epoch:[{}] len:[{}/{}]!".format(imp_num, resume_train_ckpt['epoch'], len(over_lap.keys()), len(ori_model_dict.keys())))
        optimizer.load_state_dict(resume_train_ckpt['optimizer_state_dict'])
        adj_spar, wei_spar = pruning.print_sparsity(model, args)
    else:
        # Fresh round: snapshot the initial weights for later rewinding.
        rewind_weight_mask = copy.deepcopy(model.state_dict())
        rewind_predict_weight = copy.deepcopy(predictor.state_dict())

    for epoch in range(start_epoch, args.mask_epochs + 1):
        t0 = time.time()
        epoch_loss, prune_info_dict = train.train_mask(model, predictor, x, edge_index, split_edge, optimizer, args)
        result = train.test(model, predictor, x, edge_index, split_edge, evaluator, args.batch_size, args)
        # Model selection is done on Hits@50 only.
        k = 'Hits@50'
        train_result, valid_result, test_result = result[k]

        if train_result > results['highest_train'][k]:
            results['highest_train'][k] = train_result

        if valid_result > results['highest_valid'][k]:
            # New best validation score: record the matching train/test
            # results and checkpoint everything for this IMP round.
            results['highest_valid'][k] = valid_result
            results['final_train'][k] = train_result
            results['final_test'][k] = test_result
            results['epoch'] = epoch
            pruning.save_all(model, predictor, rewind_weight_mask, optimizer, imp_num, epoch, args.model_save_path, 'IMP{}_train_ckpt'.format(imp_num))

        epoch_time = (time.time() - t0) / 60
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'IMP:[{}] (GET Mask) Epoch:[{}/{}] LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}] | Adj[{:.3f}%] Wei[{:.3f}%] Time:[{:.2f}min]'
              .format(imp_num, epoch, args.mask_epochs, epoch_loss, train_result * 100, valid_result * 100, test_result * 100, results['final_test'][k] * 100, results['epoch'], prune_info_dict['adj_spar'], prune_info_dict['wei_spar'], epoch_time))

    # Turn the learned masks into the rewind mask for the next IMP round.
    rewind_weight_mask, adj_spar, wei_spar = pruning.change(rewind_weight_mask, model, args)

    print('-' * 100)
    print("INFO : IMP:[{}] (GET MASK) Final Result Train:[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Adj:[{:.3f}%] Wei:[{:.3f}%]"
          .format(imp_num, results['final_train'][k] * 100, results['highest_valid'][k] * 100, results['final_test'][k] * 100, adj_spar, wei_spar))
    print('-' * 100)

    return rewind_weight_mask, rewind_predict_weight
def main():
    """Train DEA-JKNet on ogbl-ddi.

    Builds (or loads) per-node structural features — degree, PageRank,
    clustering, closeness centrality, shortest-path-distance and landing-
    probability features — min-max normalizes them, and trains a DEA_GNN_JK
    model over ``args.runs`` seeded runs, reporting mean/SD Hits.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='DEA-JKNet on ogbl-ddi')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--jk_mode', type=str, default="max",
                        help='JKNet aggregation method [max, mean, lstm, sum, cat] (default: max)')
    parser.add_argument('--remove_batchnorm', action='store_true',
                        help='remove batchnorm layers')
    parser.add_argument('--use_stored_x', action='store_true',
                        help='use precomputed distance encoding and node statistics')
    parser.add_argument('--embed_dim', type=int, default=256,
                        help='initial node embedding dimension')
    parser.add_argument('--gnn_hidden_dim', type=int, default=256,
                        help='GNN(DEA) hidden layer dimension (default: 256)')
    parser.add_argument('--gnn_num_layers', type=int, default=3,
                        help='number of GNN(DEA) message passing layers (default: 3, must be greater than 1)')
    parser.add_argument('--k', type=int, default=2,
                        help='GNN(DEA) number of hops')
    parser.add_argument('--mlp_hidden_dim', type=int, default=256,
                        help='linear hidden layer dimension for edge prediction (default: 256)')
    parser.add_argument('--mlp_num_layers', type=int, default=2,
                        help='number of linear layers for edge prediction (default: 2, must be greater than 1)')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout rate (default: 0.5)')
    parser.add_argument('--batch_size', type=int, default=1024*64,
                        help='input batch size for training (default: 1024*64)')
    parser.add_argument('--epochs', type=int, default=400,
                        help='number of epochs to train (default: 400)')
    parser.add_argument('--runs', type=int, default=10,
                        help='number of runs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.005,
                        help='Adam learning rate (default: 0.005')
    args = parser.parse_args()
    print(args)

    device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    print('Device: {}'.format(device))

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    graph = dataset[0]
    split_idx = dataset.get_edge_split()
    evaluator = Evaluator(name='ogbl-ddi')
    edge_index = graph.edge_index
    adj_t = torch_sparse.SparseTensor.from_edge_index(edge_index)

    print('Train edges:', split_idx['train']['edge'].shape)
    print('Val edges:', split_idx['valid']['edge'].shape)
    print('Val negative edges:', split_idx['valid']['edge_neg'].shape)
    print('Test edges:', split_idx['test']['edge'].shape)
    print('Test negative edges:', split_idx['test']['edge_neg'].shape)

    if not args.use_stored_x:
        # Compute structural node features from scratch (slow).
        print('start computing extra features (~30 min):')
        nx_graph = to_networkx(graph, to_undirected=True)
        nx_degree = nx.degree(nx_graph)
        nx_pagerank = nx.pagerank(nx_graph)
        nx_clustering = nx.clustering(nx_graph)
        nx_centrality = nx.closeness_centrality(nx_graph)
        # S=200 nodes
        # NOTE(review): sampling is NOT seeded (the seed line is commented
        # out), so the anchor set differs between invocations.
        # np.random.seed(0)
        node_subset = np.random.choice(nx_graph.number_of_nodes(), size=200, replace=False)
        spd_feature = get_spd_matrix(G=nx_graph, S=node_subset, max_spd=5)
        # S = all nodes
        lp_feature = get_lp_matrix(adj_t.to_dense(), max_steps=5)
        # Convert to tensor
        tensor_degree = torch.Tensor([t[1] for t in nx_degree]).unsqueeze(1)
        tensor_pagerank = torch.Tensor([t[1] for t in nx_pagerank.items()]).unsqueeze(1)
        tensor_clustering = torch.Tensor([t[1] for t in nx_clustering.items()]).unsqueeze(1)
        tensor_centrality = torch.Tensor([t[1] for t in nx_centrality.items()]).unsqueeze(1)
        tensor_spd = torch.Tensor(spd_feature)
        tensor_lp = torch.Tensor(lp_feature)
        # Concat
        feature_tensor_list = [tensor_degree, tensor_pagerank, tensor_clustering, tensor_centrality, tensor_spd, tensor_lp]
        x_feature = torch.cat(feature_tensor_list, dim=1)
        print('extra feature shape:', x_feature.shape)
    else:
        # Load previously computed features from disk.
        print('load extra features:')
        x_df = pd.read_csv('x_feature.csv')
        x_feature_numpy = x_df.to_numpy()
        x_feature = torch.Tensor(x_feature_numpy)
        print('extra feature shape:', x_feature.shape)

    # Normalize to 0-1 (min-max per feature column; eps avoids div-by-zero).
    x_max = torch.max(x_feature, dim=0, keepdim=True)[0]
    x_min = torch.min(x_feature, dim=0, keepdim=True)[0]
    x_feature = (x_feature - x_min) / (x_max - x_min + 1e-6)

    edge_index = edge_index.to(device)
    adj_t = adj_t.to(device)
    x_feature = x_feature.to(device)

    # GNN input = learnable embedding + structural features; 'cat' JK mode
    # concatenates all layer outputs, so the MLP input grows accordingly.
    gnn_in_dim = args.embed_dim + x_feature.shape[1]
    if args.jk_mode == 'cat':
        mlp_in_dim = args.gnn_hidden_dim * args.gnn_num_layers
    else:
        mlp_in_dim = args.gnn_hidden_dim

    model = DEA_GNN_JK(num_nodes=graph.num_nodes, embed_dim=args.embed_dim,
                       gnn_in_dim=gnn_in_dim, gnn_hidden_dim=args.gnn_hidden_dim,
                       gnn_out_dim=args.gnn_hidden_dim, gnn_num_layers=args.gnn_num_layers,
                       mlp_in_dim=mlp_in_dim, mlp_hidden_dim=args.mlp_hidden_dim,
                       mlp_out_dim=1, mlp_num_layers=args.mlp_num_layers,
                       dropout=args.dropout, gnn_batchnorm=not args.remove_batchnorm,
                       mlp_batchnorm=not args.remove_batchnorm, K=args.k,
                       jk_mode=args.jk_mode).to(device)
    print(model)
    print('Number of parameters:', sum(p.numel() for p in model.parameters()))

    # Multiple runs
    RUNS = args.runs
    best_val_scores = np.zeros((RUNS,))
    best_test_scores = np.zeros((RUNS,))
    for i in range(RUNS):
        # Deterministic per-run seeding (run index + 1).
        random.seed(i + 1)
        torch.manual_seed(i + 1)
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        result = train(model, optimizer, evaluator, graph, x_feature, edge_index, adj_t, split_idx,
                       device=device, batch_size=args.batch_size, num_epochs=args.epochs, save_model=False)
        best_val_scores[i] = result[1]
        best_test_scores[i] = result[2]
        print('Run', i + 1, 'done.')

    log = 'Mean Val Hits: {:.4f}, SD Val Hits: {:.4f}'
    print(log.format(np.mean(best_val_scores), np.std(best_val_scores, ddof=1)))
    log = 'Mean Test Hits: {:.4f}, SD Test Hits: {:.4f}'
    print(log.format(np.mean(best_test_scores), np.std(best_test_scores, ddof=1)))
def main():
    """GCN link prediction on ogbl-citation2 trained with GraphSAINT
    random-walk sampling.

    A run whose loss exceeds 2.0 is treated as diverged and restarted
    (the while-loop retries the same run index until it succeeds).
    """
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=16 * 1024)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--num_steps', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    # Citation graph is directed; symmetrize for message passing.
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    loader = GraphSAINTRandomWalkSampler(data, batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    # while (not for): a diverged run is discarded and retried with the
    # same run index, so exactly args.runs successful runs are logged.
    run_idx = 0
    while run_idx < args.runs:
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()), lr=args.lr)

        run_success = True
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(
                f'Run: {run_idx + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}'
            )
            if loss > 2.:
                # Divergence heuristic: abandon this attempt and retry.
                run_success = False
                logger.reset(run_idx)
                print('Learning failed. Rerun...')
                break

            # Evaluation is expensive; skip it for the first 49 epochs.
            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run_idx, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run_idx + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        print('GraphSAINT')
        if run_success:
            logger.print_statistics(run_idx)
            run_idx += 1

    print('GraphSAINT')
    logger.print_statistics()
def main():
    """Matrix-factorization baseline for ogbl-ppa: a free node embedding
    plus an MLP link predictor, scored with Hits@{10,50,100}."""
    parser = argparse.ArgumentParser(description="OGBL-PPA (MF)")
    # All options are plain scalars, so declare them from a spec table.
    option_specs = [
        ("--device", int, 0),
        ("--log_steps", int, 1),
        ("--num_layers", int, 3),
        ("--hidden_channels", int, 256),
        ("--dropout", float, 0.0),
        ("--batch_size", int, 64 * 1024),
        ("--lr", float, 0.005),
        ("--epochs", int, 1000),
        ("--eval_steps", int, 1),
        ("--runs", int, 10),
    ]
    for flag, value_type, default in option_specs:
        parser.add_argument(flag, type=value_type, default=default)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")

    dataset = PygLinkPropPredDataset(name="ogbl-ppa")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # One trainable embedding row per node is the "factorization".
    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-ppa")
    loggers = {metric: Logger(args.runs, args)
               for metric in ("Hits@10", "Hits@50", "Hits@100")}

    for run in range(args.runs):
        # Re-initialize everything, optimizer included, for each run.
        emb.reset_parameters()
        predictor.reset_parameters()
        trainable = list(emb.parameters()) + list(predictor.parameters())
        optimizer = torch.optim.Adam(trainable, lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(emb.weight, predictor, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(emb.weight, predictor, split_edge, evaluator,
                               args.batch_size)
                for metric, result in results.items():
                    loggers[metric].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for metric, (train_hits, valid_hits,
                                 test_hits) in results.items():
                        print(metric)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")

        for metric, metric_logger in loggers.items():
            print(metric)
            metric_logger.print_statistics(run)

    for metric, metric_logger in loggers.items():
        print(metric)
        metric_logger.print_statistics()
if not args.keep_old: # Backup python files. copy('seal_link_pred.py', args.res_dir) copy('utils.py', args.res_dir) log_file = os.path.join(args.res_dir, 'log.txt') # Save command line input. cmd_input = 'python ' + ' '.join(sys.argv) + '\n' with open(os.path.join(args.res_dir, 'cmd_input.txt'), 'a') as f: f.write(cmd_input) print('Command line input: ' + cmd_input + ' is saved.') with open(log_file, 'a') as f: f.write('\n' + cmd_input) if args.dataset.startswith('ogbl'): dataset = PygLinkPropPredDataset(name=args.dataset) split_edge = dataset.get_edge_split() data = dataset[0] else: path = osp.join('dataset', args.dataset) dataset = Planetoid(path, args.dataset) split_edge = do_edge_split(dataset) print(split_edge) data = dataset[0] #data.edge_index = split_edge['train']['edge'].t() if args.dataset.startswith('ogbl-citation'): args.eval_metric = 'mrr' directed = True elif args.dataset.startswith('ogbl-bvg'): # args.eval_metric = 'auc'
def main():
    """Full-batch GCN/GraphSAGE link prediction on ogbl-ppa using an
    explicit (x, adj) pair instead of a transformed Data object."""
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    # Concatenate a precomputed embedding (embedding.pt) to the node features.
    parser.add_argument('--use_node_embedding', action='store_true')
    # Use GraphSAGE instead of GCN.
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ppa')
    data = dataset[0]
    splitted_edge = dataset.get_edge_split()

    if args.use_node_embedding:
        x = data.x.to(torch.float)
        x = torch.cat([x, torch.load('embedding.pt')], dim=-1)
        x = x.to(device)
    else:
        x = data.x.to(torch.float).to(device)

    edge_index = data.edge_index.to(device)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(-1), args.hidden_channels, args.hidden_channels,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        # D^{-1/2} (A + I) D^{-1/2}; isolated nodes get weight 0 (inf -> 0).
        adj = adj.set_diag()
        deg = adj.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, adj, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, x, adj, splitted_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """GCN link prediction on ogbl-citation trained with Cluster-GCN
    mini-batching (METIS-style graph partitions via ClusterData)."""
    parser = argparse.ArgumentParser(description='OGBL-Citation (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # NOTE(review): 'ogbl-citation' is the legacy name of this dataset.
    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    # Citation graph is directed; symmetrize for message passing.
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            # Evaluation is expensive; skip it for the first 49 epochs.
            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)

    logger.print_statistics()
def main():
    """MLP link predictor on ogbl-ppa over fixed node features.

    Input features are the raw node features concatenated with a
    precomputed embedding loaded from ``embedding.pt``; only the
    LinkPredictor MLP is trained. Evaluated with Hits@{10,50,100}.
    """
    parser = argparse.ArgumentParser(description='OGBL-PPA (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ppa')
    splitted_edge = dataset.get_edge_split()
    data = dataset[0]

    # Frozen input features: raw features + precomputed node embedding.
    x = data.x.to(torch.float)
    embedding = torch.load('embedding.pt', map_location='cpu')
    x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()
        # BUG FIX: build a fresh optimizer for every run. The optimizer was
        # previously created once before the run loop, so Adam's moment
        # estimates leaked from one run into the next even though the
        # predictor's parameters were reset — runs were not independent.
        # This also matches the sibling OGB example scripts.
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, splitted_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
def main():
    """Run the OGBL-COLLAB MLP link-prediction baseline.

    Optionally augments the raw node features with a pretrained embedding
    (``embedding.pt``) when ``--use_node_embedding`` is given. Hits@K
    statistics are aggregated across ``--runs`` independent runs.
    """
    parser = argparse.ArgumentParser(description="OGBL-COLLAB (MLP)")
    for flag, kw in [
        ("--device", dict(type=int, default=0)),
        ("--log_steps", dict(type=int, default=1)),
        ("--use_node_embedding", dict(action="store_true")),
        ("--num_layers", dict(type=int, default=3)),
        ("--hidden_channels", dict(type=int, default=256)),
        ("--dropout", dict(type=float, default=0.0)),
        ("--batch_size", dict(type=int, default=64 * 1024)),
        ("--lr", dict(type=float, default=0.01)),
        ("--epochs", dict(type=int, default=200)),
        ("--eval_steps", dict(type=int, default=1)),
        ("--runs", dict(type=int, default=10)),
    ]:
        parser.add_argument(flag, **kw)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")

    dataset = PygLinkPropPredDataset(name="ogbl-collab")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    features = data.x
    if args.use_node_embedding:
        # Append pretrained node embeddings to the raw features.
        pretrained = torch.load("embedding.pt", map_location="cpu")
        features = torch.cat([features, pretrained], dim=-1)
    features = features.to(device)

    predictor = LinkPredictor(features.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    evaluator = Evaluator(name="ogbl-collab")
    loggers = {metric: Logger(args.runs, args)
               for metric in ("Hits@10", "Hits@50", "Hits@100")}

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, features, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps != 0:
                continue

            results = test(predictor, features, split_edge, evaluator,
                           args.batch_size)
            for metric, triple in results.items():
                loggers[metric].add_result(run, triple)

            if epoch % args.log_steps == 0:
                for metric, (train_hits, valid_hits, test_hits) in results.items():
                    print(metric)
                    print(f"Run: {run + 1:02d}, "
                          f"Epoch: {epoch:02d}, "
                          f"Loss: {loss:.4f}, "
                          f"Train: {100 * train_hits:.2f}%, "
                          f"Valid: {100 * valid_hits:.2f}%, "
                          f"Test: {100 * test_hits:.2f}%")
                print("---")

        for metric in loggers:
            print(metric)
            loggers[metric].print_statistics(run)

    for metric in loggers:
        print(metric)
        loggers[metric].print_statistics()
def main():
    """Train DeeperGCN + LinkPredictor on the OGB link-prediction dataset
    named by ``args.dataset``, tracking best-by-validation Hits@K results
    and checkpointing whenever the validation score improves."""
    args = ArgsInit().save_exp()

    # The TensorBoard writer is only created (and only used below) when
    # --use_tensor_board is set.
    if args.use_tensor_board:
        writer = SummaryWriter(log_dir=args.save)

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available(
        ) else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)

    # Derived model hyper-parameters: input width from the data, and a
    # single output task (edge exists / does not exist).
    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    logging.info('%s' % args)
    model = DeeperGCN(args).to(device)
    predictor = LinkPredictor(args).to(device)
    logging.info(model)
    logging.info(predictor)

    # One optimizer over both the encoder and the link predictor.
    optimizer = torch.optim.Adam(list(model.parameters()) +
                                 list(predictor.parameters()), lr=args.lr)

    # results[category][metric]: running best / final scores per Hits@K.
    results = {}
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']
    for key in keys:
        results[key] = {k: 0 for k in hits}

    start_time = time.time()

    for epoch in range(1, args.epochs + 1):
        epoch_loss = train(model, predictor, x, edge_index, split_edge,
                           optimizer, args.batch_size)
        logging.info('Epoch {}, training loss {:.4f}'.format(
            epoch, epoch_loss))
        model.print_params(epoch=epoch)

        result = test(model, predictor, x, edge_index, split_edge,
                      evaluator, args.batch_size)

        for k in hits:
            # return a tuple
            train_result, valid_result, test_result = result[k]

            # Only Hits@50 is mirrored to TensorBoard.
            if args.use_tensor_board and k == 'Hits@50':
                writer.add_scalar('stats/train_loss', epoch_loss, epoch)
                writer.add_scalar('stats/train_Hits@50', train_result, epoch)
                writer.add_scalar('stats/valid_Hits@50', valid_result, epoch)
                writer.add_scalar('stats/test_Hits@50', test_result, epoch)

            if train_result > results['highest_train'][k]:
                results['highest_train'][k] = train_result

            # "final_*" entries are the train/test scores at the epoch with
            # the best validation score so far; checkpoints are saved at
            # exactly the same point.
            if valid_result > results['highest_valid'][k]:
                results['highest_valid'][k] = valid_result
                results['final_train'][k] = train_result
                results['final_test'][k] = test_result

                save_ckpt(model, optimizer, round(epoch_loss, 4), epoch,
                          args.model_save_path, k, name_post='valid_best')
                save_ckpt(predictor, optimizer, round(epoch_loss, 4), epoch,
                          args.model_save_path, k,
                          name_post='valid_best_link_predictor')

        logging.info(result)

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    time_used = 'Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time)))
    logging.info(time_used)
def main():
    """Train the attention-GCN baseline on OGBL-DDI.

    ogbl-ddi has no input node features, so a trainable embedding table
    (re-initialised every run) feeds the encoder. Hits@{10,20,30} are
    aggregated over ``--runs`` runs.
    """
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    for flag, typ, default in (
        ('--device', int, 0),
        ('--log_steps', int, 1),
        ('--num_layers', int, 2),
        ('--hidden_channels', int, 256),
        ('--dropout', float, 0.5),
        ('--batch_size', int, 64 * 1024),
        ('--lr', float, 0.005),
        ('--epochs', int, 200),
        ('--eval_steps', int, 1),
        ('--runs', int, 10),
        ('--k', int, 50),
        ('--gpu_id', int, 0),
    ):
        parser.add_argument(flag, type=typ, default=default)
    args = parser.parse_args()
    print(args)

    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # Pick a fixed random subset of training edges to evaluate on, the
    # same size as the validation split.
    torch.manual_seed(12345)
    perm = torch.randperm(split_edge['train']['edge'].size(0))
    perm = perm[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][perm]}

    hidden = args.hidden_channels
    model = GCNWithAttention(hidden, hidden, hidden, args.num_layers,
                             args.dropout, args.k).to(device)
    emb = torch.nn.Embedding(data.num_nodes, hidden).to(device)
    predictor = LinkPredictor(hidden, hidden, 1, args.num_layers,
                              args.dropout).to(device)

    model_params = sum(p.numel() for p in model.parameters())
    predictor_params = sum(p.numel() for p in predictor.parameters())
    print("model parameters {}".format(model_params))
    print("predictor parameters {}".format(predictor_params))
    print("total parameters {}".format(
        data.num_nodes * hidden + model_params + predictor_params))

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {name: Logger(args.runs, args)
               for name in ('Hits@10', 'Hits@20', 'Hits@30')}

    for run in range(args.runs):
        # Fresh embedding + model/predictor weights and a fresh optimizer
        # each run.
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters())
            + list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj_t, split_edge,
                         optimizer, args.batch_size)

            if epoch % args.eval_steps != 0:
                continue

            results = test(model, predictor, emb.weight, adj_t, split_edge,
                           evaluator, args.batch_size)
            for name, triple in results.items():
                loggers[name].add_result(run, triple)

            if epoch % args.log_steps == 0:
                for name, (train_hits, valid_hits, test_hits) in results.items():
                    print(name)
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_hits:.2f}%, '
                          f'Valid: {100 * valid_hits:.2f}%, '
                          f'Test: {100 * test_hits:.2f}%')
                print('---')

        for name in loggers:
            print(name)
            loggers[name].print_statistics(run)

    for name in loggers:
        print(name)
        loggers[name].print_statistics()
def main():
    """MLP baseline for OGBL-Citation2 (MRR metric).

    A fixed random subset of 86596 training pairs (sharing the validation
    negatives) is used for train-split evaluation.
    """
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # Fixed random sample of training pairs for train-split evaluation.
    torch.manual_seed(12345)
    sample = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][sample],
        'target_node': split_edge['train']['target_node'][sample],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    features = data.x
    if args.use_node_embedding:
        # Append pretrained node embeddings to the raw features.
        features = torch.cat(
            [features, torch.load('embedding.pt', map_location='cpu')],
            dim=-1)
    features = features.to(device)

    predictor = LinkPredictor(features.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, features, split_edge, optimizer,
                         args.batch_size)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch % args.eval_steps != 0:
                continue

            result = test(predictor, features, split_edge, evaluator,
                          args.batch_size)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        print('Node2vec' if args.use_node_embedding else 'MLP')
        logger.print_statistics(run)

    print('Node2vec' if args.use_node_embedding else 'MLP')
    logger.print_statistics()
def main_fixed_mask(args, imp_num, resume_train_ckpt=None):
    """Resume an IMP (iterative magnitude pruning) round and fine-tune
    DeeperGCN with the pruning masks frozen.

    Args:
        args: experiment configuration (dataset, lr, fix_epochs, device,
            model_save_path, ...).
        imp_num: index of the current IMP round (used in checkpoint
            names and log lines).
        resume_train_ckpt: checkpoint dict holding the rewind
            weights/masks and the predictor state to restore.
            NOTE(review): it is dereferenced unconditionally below, so
            the default of None is only safe if callers always pass it.
    """
    device = torch.device("cuda:" + str(args.device))
    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)

    # Derived hyper-parameters: input width from the data, one output task.
    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args)
    predictor = LinkPredictor(args).to(device)

    # Restore rewind weights + masks and predictor state from the
    # checkpoint; also recover the sparsity figures recorded there.
    rewind_weight_mask, adj_spar, wei_spar = pruning.resume_change(
        resume_train_ckpt, model, args)
    model.load_state_dict(rewind_weight_mask)
    predictor.load_state_dict(resume_train_ckpt['predictor_state_dict'])
    # model.load_state_dict(rewind_weight_mask)
    # predictor.load_state_dict(rewind_predict_weight)
    adj_spar, wei_spar = pruning.print_sparsity(model, args)

    # Freeze every mask tensor so fine-tuning only updates surviving weights.
    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    optimizer = torch.optim.Adam(list(model.parameters()) +
                                 list(predictor.parameters()), lr=args.lr)

    #results = {}
    # Best/final metrics per Hits@K, plus the epoch of the best valid score
    # and the sparsity levels of this IMP round.
    results = {'epoch': 0 }
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']
    for key in keys:
        results[key] = {k: 0 for k in hits}
    results['adj_spar'] = adj_spar
    results['wei_spar'] = wei_spar

    start_epoch = 1

    for epoch in range(start_epoch, args.fix_epochs + 1):
        t0 = time.time()

        epoch_loss = train.train_fixed(model, predictor, x, edge_index,
                                       split_edge, optimizer,
                                       args.batch_size, args)
        result = train.test(model, predictor, x, edge_index, split_edge,
                            evaluator, args.batch_size, args)
        # return a tuple
        # Only Hits@50 drives model selection here.
        k = 'Hits@50'
        train_result, valid_result, test_result = result[k]

        if train_result > results['highest_train'][k]:
            results['highest_train'][k] = train_result

        # Track and checkpoint the epoch with the best validation Hits@50;
        # "final_*" scores are taken at that same epoch.
        if valid_result > results['highest_valid'][k]:
            results['highest_valid'][k] = valid_result
            results['final_train'][k] = train_result
            results['final_test'][k] = test_result
            results['epoch'] = epoch
            pruning.save_all(model, predictor, rewind_weight_mask,
                             optimizer, imp_num, epoch,
                             args.model_save_path,
                             'IMP{}_fixed_ckpt'.format(imp_num))

        epoch_time = (time.time() - t0) / 60
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'IMP:[{}] (FIX Mask) Epoch:[{}/{}] LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}] Time:[{:.2f}min]'
              .format(imp_num, epoch, args.fix_epochs, epoch_loss,
                      train_result * 100, valid_result * 100,
                      test_result * 100, results['final_test'][k] * 100,
                      results['epoch'], epoch_time))

    print("=" * 120)
    print("syd final: IMP:[{}], Train:[{:.2f}] Best Val:[{:.2f}] at epoch:[{}] | Final Test Acc:[{:.2f}] Adj:[{:.2f}%] Wei:[{:.2f}%]"
          .format(imp_num, results['final_train'][k] * 100,
                  results['highest_valid'][k] * 100, results['epoch'],
                  results['final_test'][k] * 100, results['adj_spar'],
                  results['wei_spar']))
    print("=" * 120)
def main():
    """Full-batch GNN (GCN or GraphSAGE) baseline on OGBL-DDI.

    Uses a trainable node-embedding table as input (the dataset has no
    node features). For the GCN path, the symmetric normalization of the
    adjacency is pre-computed once.
    """
    parser = argparse.ArgumentParser(description='OGBL-DDI (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # Fixed random subset of training edges for train-split evaluation,
    # sized to match the validation split.
    torch.manual_seed(12345)
    perm = torch.randperm(split_edge['train']['edge'].size(0))
    perm = perm[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][perm]}

    edge_index = data.edge_index
    adj = SparseTensor(row=edge_index[0], col=edge_index[1]).to(device)

    hidden = args.hidden_channels
    if args.use_sage:
        model = SAGE(hidden, hidden, hidden, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(hidden, hidden, hidden, args.num_layers,
                    args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_diag()
        deg = adj.sum(dim=1)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    emb = torch.nn.Embedding(data.num_nodes, hidden).to(device)
    predictor = LinkPredictor(hidden, hidden, 1, args.num_layers,
                              args.dropout).to(device)
    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {name: Logger(args.runs, args)
               for name in ('Hits@10', 'Hits@20', 'Hits@30')}

    for run in range(args.runs):
        # Fresh weights and a fresh optimizer each run.
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters())
            + list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj, data.edge_index,
                         split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps != 0:
                continue

            results = test(model, predictor, emb.weight, adj, split_edge,
                           evaluator, args.batch_size)
            for name, triple in results.items():
                loggers[name].add_result(run, triple)

            if epoch % args.log_steps == 0:
                for name, (train_hits, valid_hits, test_hits) in results.items():
                    print(name)
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_hits:.2f}%, '
                          f'Valid: {100 * valid_hits:.2f}%, '
                          f'Test: {100 * test_hits:.2f}%')
                print('---')

        for name in loggers:
            print(name)
            loggers[name].print_statistics(run)

    for name in loggers:
        print(name)
        loggers[name].print_statistics()