def load_ogb(name, dataset_dir):
    """Load an OGB dataset (node / graph / link prediction) and attach the
    split attributes the rest of the pipeline expects.

    Args:
        name: OGB dataset name, e.g. 'ogbn-arxiv', 'ogbg-molhiv', 'ogbl-collab'.
        dataset_dir: root directory where the dataset is stored / downloaded.

    Returns:
        The PyG dataset object with split masks / indices set as attributes.

    Raises:
        ValueError: if ``name`` does not start with a known OGB prefix.
    """
    if name[:4] == 'ogbn':
        dataset = PygNodePropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_mask', 'val_mask', 'test_mask']
        for i, key in enumerate(splits.keys()):
            mask = index2mask(splits[key], size=dataset.data.y.shape[0])
            set_dataset_attr(dataset, split_names[i], mask, len(mask))
        # Node-level tasks run on an undirected graph.
        edge_index = to_undirected(dataset.data.edge_index)
        set_dataset_attr(dataset, 'edge_index', edge_index,
                         edge_index.shape[1])
    elif name[:4] == 'ogbg':
        dataset = PygGraphPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = [
            'train_graph_index', 'val_graph_index', 'test_graph_index'
        ]
        for i, key in enumerate(splits.keys()):
            id = splits[key]
            set_dataset_attr(dataset, split_names[i], id, len(id))
    elif name[:4] == "ogbl":
        dataset = PygLinkPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_edge_split()
        id = splits['train']['edge'].T
        if cfg.dataset.resample_negative:
            set_dataset_attr(dataset, 'train_pos_edge_index', id, id.shape[1])
            # todo: applying transform for negative sampling is very slow
            dataset.transform = neg_sampling_transform
        else:
            # Sample a fixed set of negatives once, up front.
            id_neg = negative_sampling(edge_index=id,
                                       num_nodes=dataset.data.num_nodes[0],
                                       num_neg_samples=id.shape[1])
            id_all = torch.cat([id, id_neg], dim=-1)
            label = get_link_label(id, id_neg)
            set_dataset_attr(dataset, 'train_edge_index', id_all,
                             id_all.shape[1])
            set_dataset_attr(dataset, 'train_edge_label', label, len(label))
        # Validation / test splits ship with pre-sampled negative edges.
        id, id_neg = splits['valid']['edge'].T, splits['valid']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'val_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'val_edge_label', label, len(label))
        id, id_neg = splits['test']['edge'].T, splits['test']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'test_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'test_edge_label', label, len(label))
    else:
        # Bug fix: the dataset name was never interpolated into the message.
        raise ValueError('OGB dataset: {} non-exist'.format(name))
    return dataset
def eval_with_partition(args):
    """Evaluate a stored DeeperGCN checkpoint using cluster-partitioned
    inference.

    Args:
        args: parsed CLI namespace; reads ``model_load_path``, ``dataset``,
            ``data_folder``, ``self_loop``, ``eval_cluster_number`` and
            ``partition_method``; ``in_channels`` / ``num_tasks`` are
            written back onto it before model construction.

    Returns:
        Whatever ``test_with_partition`` returns (also printed).
    """
    model_load_path = args.model_load_path
    print("Starting evaluating model stored at", model_load_path)
    # NOTE(review): hard-coded CUDA device, no CPU fallback — confirm a GPU
    # is always available where this runs.
    device = torch.device("cuda")
    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]
    adj = SparseTensor(row=graph.edge_index[0], col=graph.edge_index[1])
    if args.self_loop:
        # Mirror self-loops in both the sparse adjacency and edge_index.
        adj = adj.set_diag()
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)
    # Model hyper-parameters derived from the loaded data.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes
    # print('%s' % args)
    model = DeeperGCN(args).to(device)
    ckpt = torch.load(model_load_path)
    model.load_state_dict(ckpt['model_state_dict'])
    res = test_with_partition(model,
                              graph,
                              adj,
                              split_idx,
                              num_clusters=args.eval_cluster_number,
                              partition_method=args.partition_method,
                              evaluator=evaluator,
                              device=device)
    print(res)
    return res
def eval_model(params):
    """Evaluate one stored DeeperGCN checkpoint on CPU.

    Args:
        params: ``(model_load_path, args)`` tuple. When
            ``args.model_load_path`` is a directory, ``model_load_path`` is
            treated as a file name inside it.

    Returns:
        The test-result dict, augmented with the resolved checkpoint path
        under the key ``"model_load_path"``.
    """
    model_load_path, args = params
    if os.path.isdir(args.model_load_path):
        model_load_dir = args.model_load_path
        model_load_path = os.path.join(model_load_dir, model_load_path)
    print("Starting evaluating model stored at", model_load_path)
    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]
    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)
    # Model hyper-parameters derived from the loaded data.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes
    model = DeeperGCN(args)
    # map_location='cpu' so evaluation works on machines without a GPU.
    ckpt = torch.load(model_load_path, map_location=torch.device('cpu'))
    model.load_state_dict(ckpt['model_state_dict'])
    test_res = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                    evaluator)
    test_res["model_load_path"] = model_load_path
    return test_res
def main():
    """Train a GCN or GraphSAGE model on ogbn-arxiv over multiple runs and
    log train/valid/test accuracy."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--kind', type=str, default="ReLU")
    args = parser.parse_args()
    print(args)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): this unconditionally overrides the CUDA selection above
    # and forces CPU — looks like a leftover debug line; confirm intent.
    device = 'cpu'
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    # Symmetrize the sparse adjacency (arxiv ships as a directed graph).
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)
    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers, args.dropout,
                     kind=args.kind).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers, args.dropout,
                    kind=args.kind).to(device)
    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)
    for run in range(args.runs):
        # Fresh parameters and optimizer state for every run.
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
def main():
    """Train an MLP baseline on ogbn-arxiv node features (optionally
    concatenated with precomputed node embeddings)."""
    parser = argparse.ArgumentParser(description="OGBN-Arxiv (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=500)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)
    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name="ogbn-arxiv")
    split_idx = dataset.get_idx_split()
    data = dataset[0]
    x = data.x
    if args.use_node_embedding:
        # Precomputed embeddings are expected at ./embedding.pt.
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx["train"].to(device)
    model = MLP(
        x.size(-1),
        args.hidden_channels,
        dataset.num_classes,
        args.num_layers,
        args.dropout,
    ).to(device)
    evaluator = Evaluator(name="ogbn-arxiv")
    logger = Logger(args.runs, args)
    for run in range(args.runs):
        # Fresh parameters and optimizer state for every run.
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f"Run: {run + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {100 * train_acc:.2f}%, "
                      f"Valid: {100 * valid_acc:.2f}%, "
                      f"Test: {100 * test_acc:.2f}%")
        logger.print_statistics(run)
    logger.print_statistics()
def get_product_clusters():
    """Load ogbn-products, attach boolean split masks, and partition the
    graph into clusters with ClusterData.

    Returns:
        Tuple of (cluster_data, dataset, data, split_idx).
    """
    dataset_name = "ogbn-products"
    dataset = PygNodePropPredDataset(name=dataset_name)
    print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))
    data = dataset[0]
    print(data)
    split_idx = dataset.get_idx_split()
    # Turn each index vector into a boolean node mask stored on `data`.
    # Note the asymmetric naming: 'valid' split -> 'valid_mask'.
    for split_key, mask_key in (('train', 'train_mask'),
                                ('valid', 'valid_mask'),
                                ('test', 'test_mask')):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[split_idx[split_key]] = True
        data[mask_key] = mask
    cluster_data = ClusterData(data, num_parts=15000, save_dir="dataset")
    return cluster_data, dataset, data, split_idx
def main():
    """Load a trained DeeperGCN checkpoint and report its split accuracies
    on the configured OGB node dataset."""
    args = ArgsInit().args
    dataset = PygNodePropPredDataset(name=args.dataset)
    graph = dataset[0]
    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)
    # Model hyper-parameters derived from the loaded data.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes
    print(args)
    model = DeeperGCN(args)
    print(model)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    result = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                  evaluator)
    print(result)
    model.print_params(final=True)
def main():
    """Train an MLP baseline on ogbn-proteins; node features are built by
    mean-aggregating incident edge features."""
    parser = argparse.ArgumentParser(description='OGBN-Proteins (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    split_idx = dataset.get_idx_split()
    data = dataset[0]
    # Proteins has no raw node features: average each node's outgoing edge
    # features (scatter over the source index, edge_index[0]).
    x = scatter(data.edge_attr, data.edge_index[0], dim=0,
                dim_size=data.num_nodes, reduce='mean').to('cpu')
    if args.use_node_embedding:
        embedding = torch.load('embedding.pt', map_location='cpu')
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)
    # 112 = number of protein-function binary targets in ogbn-proteins.
    model = MLP(x.size(-1), args.hidden_channels, 112, args.num_layers,
                args.dropout).to(device)
    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            # Evaluation is throttled to every `eval_steps` epochs.
            if epoch % args.eval_steps == 0:
                result = test(model, x, y_true, split_idx, evaluator)
                logger.add_result(run, result)
                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
def main():
    """Load ogbn-arxiv and print split statistics; training arguments are
    parsed but (in this visible portion) only the stats are computed."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=3000)
    parser.add_argument('--runs', type=int, default=1)
    args = parser.parse_args()
    print(args)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root='/mnt/ogbdata',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)
    split_idx = dataset.get_idx_split()
    # NOTE(review): split_idx entries are node-index tensors, so
    # .nonzero().size(0) counts non-zero *indices* (node 0 is excluded),
    # and all three ratios are divided by len(train) — confirm this is the
    # intended statistic.
    print(split_idx['train'].nonzero().size(0) / len(split_idx['train']),
          split_idx['valid'].nonzero().size(0) / len(split_idx['train']),
          split_idx['test'].nonzero().size(0) / len(split_idx['train']))
def main():
    """Train an MLP baseline on ogbn-products node features (optionally
    concatenated with precomputed embeddings) and report parameter count."""
    parser = argparse.ArgumentParser(description='OGBN-Products (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]
    x = data.x
    if args.use_node_embedding:
        # Embeddings come from an offline run stored as a numpy array.
        embedding = np.load('./embed_results/embeddings.npy')
        embedding = torch.from_numpy(embedding).float()
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)
    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)
    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
    total_params = sum(p.numel() for p in model.parameters())
    print(f'mlp total params are {total_params}')
def load_ogb_2(dataset):
    """Load an OGB node dataset and convert it into Wang-style tuples.

    Args:
        dataset: OGB dataset name, e.g. 'ogbn-arxiv'.

    Returns:
        (features_2, labels_2, adj_2, train_mask_2, val_mask_2, test_mask_2)
        where features/adj are (indices, values, shape) sparse triples,
        labels are one-hot encoded, and the masks are numpy boolean vectors.
    """
    ## Load the dataset
    ## Setup PyTorch
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)
    dataset = PygNodePropPredDataset(name=dataset,
                                     transform=T.ToSparseTensor())
    ogb_data = dataset[0]
    # TODO: Not sure how to format adj_t...
    ogb_data.adj_t = ogb_data.adj_t.to_symmetric()
    ogb_data = ogb_data.to(device)
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
        "valid"], split_idx["test"]
    # Bug fix: the third assignment target used to be `split_idx`, which
    # clobbered the split dict and left `test_idx` as a torch tensor
    # instead of a numpy array.
    train_idx, valid_idx, test_idx = train_idx.numpy(), valid_idx.numpy(
    ), test_idx.numpy()
    # Convert OGB's data split pytorch index vectors to Wang's data split
    # numpy boolean masks
    train_mask_2 = indexes2booleanvec(ogb_data.num_nodes, train_idx)
    val_mask_2 = indexes2booleanvec(ogb_data.num_nodes, valid_idx)
    test_mask_2 = indexes2booleanvec(ogb_data.num_nodes, test_idx)
    # Add 1's down the diagonal of adj_t
    adj_t = ogb_data.adj_t.to_torch_sparse_coo_tensor()
    adj_t = adj_t + sparse_identity(adj_t.shape[0])
    # Convert OGB's adjacency SparseTensor to Wang's adjacency index
    # matrix (Nx2)
    adj_2_0 = adj_t.coalesce().indices().numpy()
    adj_2_0 = adj_2_0.T.astype('int32')
    adj_2_1 = adj_t.coalesce().values().numpy().astype('float64')
    adj_2_2 = tuple(adj_t.size())
    # TODO: Fix the adjacency matrix, bc it probably is symmetric with
    # identity
    adj_2 = (adj_2_0, adj_2_1, adj_2_2)
    from sklearn.preprocessing import OneHotEncoder
    labels_2 = ogb_data.y.numpy()
    labels_2 = OneHotEncoder(sparse=False).fit_transform(labels_2)
    # TODO: I don't know if this feature vector will work
    # OGB used a skip-gram encoding, whereas Wang's Citeseer just used
    # normalized rows with 1-0 for different words
    x = ogb_data.x + 1.5
    # Row-normalize the (shifted) feature matrix.
    norm_x = np.apply_along_axis(np.linalg.norm, 1, x)
    x = x / norm_x[:, None]
    x = x.to_sparse()
    features_2_0 = x.indices().numpy().T.astype('int32')
    features_2_1 = x.values().numpy()
    features_2_1 = features_2_1.astype('float64')
    features_2_2 = tuple(x.size())
    features_2 = features_2_0, features_2_1, features_2_2
    data2 = features_2, labels_2, adj_2, train_mask_2, val_mask_2, test_mask_2
    return data2
def main_fixed_mask(args):
    """Train a DeeperGCN baseline with a fixed (frozen) pruning mask and
    report the best-validation checkpoint's accuracies.

    Args:
        args: namespace with device, dataset, self_loop, num_layers, lr,
            epochs; ``in_channels`` / ``num_tasks`` are written back onto it.
    """
    device = torch.device("cuda:" + str(args.device))
    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)
    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)
    edge_index = data.edge_index.to(device)
    # Symmetrize and (optionally) add self-loops.
    edge_index = to_undirected(edge_index, data.num_nodes)
    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]
    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes
    model = DeeperGCN(args).to(device)
    # Attach pruning masks but freeze them: this is the "fixed mask"
    # baseline, so mask parameters must not receive gradients.
    pruning.add_mask(model, args.num_layers)
    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    # Track the best-validation epoch and its train/test accuracy.
    results = {'highest_valid': 0, 'final_train': 0, 'final_test': 0,
               'highest_train': 0, 'epoch': 0}
    start_epoch = 1
    for epoch in range(start_epoch, args.epochs + 1):
        epoch_loss = train_fixed(model, x, edge_index, y_true, train_idx,
                                 optimizer, args)
        result = test(model, x, edge_index, y_true, split_idx, evaluator)
        train_accuracy, valid_accuracy, test_accuracy = result
        if valid_accuracy > results['highest_valid']:
            results['highest_valid'] = valid_accuracy
            results['final_train'] = train_accuracy
            results['final_test'] = test_accuracy
            results['epoch'] = epoch
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'Baseline (FIX Mask) Epoch:[{}/{}]\t LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}]'
              .format(epoch, args.epochs, epoch_loss, train_accuracy * 100,
                      valid_accuracy * 100, test_accuracy * 100,
                      results['final_test'] * 100, results['epoch']))
    print("=" * 120)
    print("syd final: Baseline, Train:[{:.2f}] Best Val:[{:.2f}] at epoch:[{}] | Final Test Acc:[{:.2f}]"
          .format(results['final_train'] * 100,
                  results['highest_valid'] * 100,
                  results['epoch'],
                  results['final_test'] * 100))
    print("=" * 120)
def main():
    """Train GraphSAGE on ogbn-products with Cluster-GCN style mini-batches
    (graph partitions loaded via ClusterLoader)."""
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name='ogbn-products')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]
    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask
    # Partition the graph; partitions are cached in the processed dir.
    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)
    # 47 = number of classes in ogbn-products.
    model = SAGE(data.x.size(-1), args.hidden_channels, 47, args.num_layers,
                 args.dropout).to(device)
    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')
        # Full-graph evaluation once per run, after training completes.
        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
def main():
    """Train an MLP over SIGN-precomputed multi-hop features on
    ogbn-products (each split's features are materialized up front)."""
    parser = argparse.ArgumentParser(description='OGBN-Products (SIGN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    # SIGN precomputes k-hop propagated features x1..xK on the Data object.
    data = SIGN(args.num_layers)(dataset[0])  # This might take a while.
    xs = [data.x] + [data[f'x{i}'] for i in range(1, args.num_layers + 1)]
    xs_train = [x[split_idx['train']].to(device) for x in xs]
    xs_valid = [x[split_idx['valid']].to(device) for x in xs]
    xs_test = [x[split_idx['test']].to(device) for x in xs]
    y_train_true = data.y[split_idx['train']].to(device)
    y_valid_true = data.y[split_idx['valid']].to(device)
    y_test_true = data.y[split_idx['test']].to(device)
    model = MLP(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)
    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, xs_train, y_train_true, optimizer)
            train_acc = test(model, xs_train, y_train_true, evaluator)
            valid_acc = test(model, xs_valid, y_valid_true, evaluator)
            test_acc = test(model, xs_test, y_test_true, evaluator)
            result = (train_acc, valid_acc, test_acc)
            logger.add_result(run, result)
            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
def arxiv_data(root):
    """Load ogbn-arxiv with a symmetrized (undirected) edge_index.

    The same loading path is kept for every architecture so results stay
    comparable.

    Args:
        root: dataset root directory.

    Returns:
        (data, split_idx): the graph Data object and the OGB index split.
    """
    dataset = PygNodePropPredDataset(name="ogbn-arxiv", root=root)
    data = dataset[0]
    # Add reverse edges so the directed citation graph becomes undirected.
    data.edge_index = to_undirected(data.edge_index)
    return data, dataset.get_idx_split()
def get_data(args):
    """Load the dataset named by ``args['dataset_name']``, symmetrize its
    sparse adjacency, and move the graph plus split indices onto the
    available device.

    Returns:
        (data, dataset, split_idx, evaluator)
    """
    name = args['dataset_name']
    dataset = PygNodePropPredDataset(name=name, transform=T.ToSparseTensor())
    evaluator = Evaluator(name=name)
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    split_idx = dataset.get_idx_split()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    data = data.to(device)
    # Keep every split index tensor on the same device as the graph.
    for split_name in ('train', 'valid', 'test'):
        split_idx[split_name] = split_idx[split_name].to(device)
    return data, dataset, split_idx, evaluator
def main():
    """Load a trained DeeperGCN checkpoint, evaluate it on all splits of
    the configured OGB node dataset, and print the accuracies."""
    args = ArgsInit().args
    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available(
        ) else torch.device("cpu")
    else:
        device = torch.device('cpu')
    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)
    x = data.x.to(device)
    y_true = data.y.to(device)
    edge_index = data.edge_index.to(device)
    # Symmetrize and (optionally) add self-loops.
    edge_index = to_undirected(edge_index, data.num_nodes)
    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]
    # Model hyper-parameters derived from the loaded data.
    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes
    print(args)
    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)
    result = test(model, x, edge_index, y_true, split_idx, evaluator)
    train_accuracy, valid_accuracy, test_accuracy = result
    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })
    model.print_params(final=True)
def load_data(args, datapath):
    """Load a dataset for link prediction (lp) or node classification (nc).

    For ogbn-arxiv link prediction, positive edges are induced subgraphs of
    each split and negatives are sampled; other datasets go through the
    generic nc/lp loaders. The adjacency is normalized at the end.

    Args:
        args: config namespace (dataset, task, use_feats, split_seed,
            val_prop, test_prop, normalize_adj, normalize_feats).
        datapath: directory for the generic loaders.

    Returns:
        dict with adjacency, features and split edge/index entries.
    """
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        # Restrict edges to each split's node set.
        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)
        # One negative edge sampled per positive edge (default behaviour).
        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)
        data['adj_train'] = to_scipy_sparse_matrix(
            dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'], data[
            'train_edges_false'] = induced_edges_train, neg_edges_train
        data['val_edges'], data[
            'val_edges_false'] = induced_edges_valid, neg_edges_valid
        data['test_edges'], data[
            'test_edges_false'] = induced_edges_test, neg_edges_test
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath,
                            args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            # Re-split edges into train/val/test with sampled negatives.
            adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_edges(
                adj, args.val_prop, args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'], data[
                'train_edges_false'] = train_edges, train_edges_false
            data['val_edges'], data[
                'val_edges_false'] = val_edges, val_edges_false
            data['test_edges'], data[
                'test_edges_false'] = test_edges, test_edges_false
    data['adj_train_norm'], data['features'] = process(data['adj_train'],
                                                       data['features'],
                                                       args.normalize_adj,
                                                       args.normalize_feats)
    if args.dataset == 'airport':
        # Airport features are augmented with structural information.
        data['features'] = augment(data['adj_train'], data['features'])
    return data
def load_ogb_graph(dataset_name):
    """Load an ogbn-* dataset as a DGL graph, caching the converted graph
    and split indices on disk for subsequent calls.

    Args:
        dataset_name: suffix of the OGB name, e.g. 'arxiv' for ogbn-arxiv.

    Returns:
        (graph, features, labels, num_classes, train_idx, valid_idx,
        test_idx). A constant bias column of ones is appended to features.
    """
    if not os.path.isfile('torch_geometric_data/dgl_' + dataset_name):
        # First call: download via OGB, convert to DGL, and cache to disk.
        dataset = PygNodePropPredDataset(name="ogbn-" + dataset_name,
                                         root='torch_geometric_data/')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        edge = dataset[0].edge_index
        num_classes = len(np.unique(dataset[0].y))
        print("Nodes: %d, edges: %d, features: %d, classes: %d. \n" %
              (dataset[0].y.shape[0], len(edge[0]) / 2, len(
                  dataset[0].x[0]), num_classes))
        graph = dgl.DGLGraph((edge[0], edge[1]))
        graph.ndata['features'] = dataset[0].x
        graph.ndata['labels'] = dataset[0].y
        dgl.data.utils.save_graphs('torch_geometric_data/dgl_' + dataset_name,
                                   graph)
        # Cache the split index tensors alongside the OGB download.
        torch.save(
            train_idx, 'torch_geometric_data/ogbn_' + dataset_name +
            '/train_' + dataset_name + '.pt')
        torch.save(
            valid_idx, 'torch_geometric_data/ogbn_' + dataset_name +
            '/valid_' + dataset_name + '.pt')
        torch.save(
            test_idx, 'torch_geometric_data/ogbn_' + dataset_name +
            '/test_' + dataset_name + '.pt')
        labels = graph.ndata.pop('labels')
        features = graph.ndata.pop('features')
        # Append a constant 1s column (bias feature).
        features = torch.hstack([features, torch.ones([features.shape[0], 1])])
        #print(features)
    elif os.path.isfile('torch_geometric_data/dgl_' + dataset_name):
        # Cached path: reload the DGL graph and split indices from disk.
        graph = dgl.data.utils.load_graphs('torch_geometric_data/dgl_' +
                                           dataset_name)[0][0]
        labels = graph.ndata.pop('labels')
        features = graph.ndata.pop('features')
        features = torch.hstack([features, torch.ones([features.shape[0], 1])])
        train_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                               '/train_' + dataset_name + '.pt')
        valid_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                               '/valid_' + dataset_name + '.pt')
        test_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                              '/test_' + dataset_name + '.pt')
    num_classes = len(torch.unique(labels))
    return graph, features, labels, num_classes, train_idx, valid_idx, test_idx
def process_PygNodeDataset_hetero(
    self,
    dataset: PygNodePropPredDataset,
):
    """Ingest a heterogeneous PyG node-property dataset into this object's
    dict-of-node-type attributes (edge_index_dict, x_dict, y_dict, splits).

    Args:
        dataset: a hetero PygNodePropPredDataset; its first Data object is
            consumed.

    Side effects:
        Sets _name, edge_index_dict, num_nodes_dict, node_types, x_dict,
        y_dict, y_index_dict, head_node_type, metapaths, and the
        training/validation/testing index tensors.
    """
    data = dataset[0]
    self._name = dataset.name
    self.edge_index_dict = data.edge_index_dict
    # Fall back to deriving node counts from edges when absent on the data.
    self.num_nodes_dict = data.num_nodes_dict if hasattr(
        data, "num_nodes_dict") else self.get_num_nodes_dict(
            self.edge_index_dict)
    if self.node_types is None:
        self.node_types = list(self.num_nodes_dict.keys())
    # Features may be per-type (x_dict), single-type (x), or absent.
    if hasattr(data, "x_dict"):
        self.x_dict = data.x_dict
    elif hasattr(data, "x"):
        self.x_dict = {self.head_node_type: data.x}
    else:
        self.x_dict = {}
    # Same fallback chain for labels.
    if hasattr(data, "y_dict"):
        self.y_dict = data.y_dict
    elif hasattr(data, "y"):
        self.y_dict = {self.head_node_type: data.y}
    else:
        self.y_dict = {}
    self.y_index_dict = {
        node_type: torch.arange(self.num_nodes_dict[node_type])
        for node_type in self.y_dict.keys()
    }
    # Default head node type: the first labeled type, else the first type.
    if self.head_node_type is None:
        if hasattr(self, "y_dict"):
            self.head_node_type = list(self.y_dict.keys())[0]
        else:
            self.head_node_type = self.node_types[0]
    self.metapaths = list(self.edge_index_dict.keys())
    # Splits are indexed per node type; keep only the head type's indices.
    split_idx = dataset.get_idx_split()
    self.training_idx, self.validation_idx, self.testing_idx = split_idx["train"][self.head_node_type], \
        split_idx["valid"][self.head_node_type], \
        split_idx["test"][self.head_node_type]
def load_data_nc(dataset, use_feats, data_path, split_seed):
    """Load a node-classification dataset into a uniform dict.

    Args:
        dataset: one of 'cora', 'pubmed', 'arxiv', 'disease_nc', 'airport'.
        use_feats: whether to use node features (forwarded to sub-loaders).
        data_path: on-disk dataset directory for the generic loaders.
        split_seed: RNG seed for datasets that are split here.

    Returns:
        dict with adj_train, features, labels and train/val/test indices.

    Raises:
        FileNotFoundError: for unsupported dataset names.
    """
    if dataset in ['cora', 'pubmed']:
        adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
            dataset, use_feats, data_path, split_seed)
    elif dataset == 'arxiv':
        # ogbn-arxiv ships with its own canonical split.
        dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        idx_train, idx_val, idx_test = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        adj = to_scipy_sparse_matrix(dataset[0].edge_index).tocsr()
        features = dataset[0].x
        labels = dataset[0].y
    else:
        if dataset == 'disease_nc':
            adj, features, labels = load_synthetic_data(
                dataset, use_feats, data_path)
            val_prop, test_prop = 0.10, 0.60
        elif dataset == 'airport':
            adj, features, labels = load_data_airport(dataset, data_path,
                                                      return_label=True)
            val_prop, test_prop = 0.15, 0.15
        else:
            raise FileNotFoundError(
                'Dataset {} is not supported.'.format(dataset))
        # These datasets have no canonical split; create one from the seed.
        idx_val, idx_test, idx_train = split_data(labels, val_prop, test_prop,
                                                  seed=split_seed)
    labels = torch.LongTensor(labels)
    data = {
        'adj_train': adj,
        'features': features,
        'labels': labels,
        'idx_train': idx_train,
        'idx_val': idx_val,
        'idx_test': idx_test
    }
    return data
def load_pyg_dataset(dataset_name, root='dataset/'):
    """Load a dataset by prefixed name: 'ogbn-*', 'pyg-*' or 'custom-*'.

    Args:
        dataset_name: '<source>-<name>', e.g. 'ogbn-arxiv', 'pyg-cora'.
        root: download/cache directory.

    Returns:
        (dataset, split_idx, evaluator) triple. For 'pyg' sources a random
        80/10/10 split is generated and the ogbn-arxiv evaluator is reused.
    """
    from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
    source, name = dataset_name.split('-', maxsplit=1)
    assert source in ['ogbn', 'pyg', 'custom']
    if source == 'ogbn':
        dataset = PygNodePropPredDataset(name=dataset_name, root=root)
        return dataset, dataset.get_idx_split(), Evaluator(dataset_name)
    elif source == 'pyg':
        from torch_geometric.datasets import KarateClub, CoraFull
        if name == "karate":
            dataset = KarateClub()
        elif name == "cora":
            dataset = CoraFull(root)
        else:
            raise Exception("Dataset not recognized")
        # PyG datasets have no canonical split: make a random 80/10/10 one.
        # NOTE(review): split is not seeded — different on every call.
        num_nodes = dataset[0].x.shape[0]
        num_train = int(num_nodes * 0.8)
        num_val = int(num_nodes * 0.1)
        perm = np.arange(num_nodes, dtype=int)
        np.random.shuffle(perm)
        split_idx = {
            'train': perm[:num_train],
            'valid': perm[num_train:num_train + num_val],
            'test': perm[num_train + num_val:]
        }
        # Reuse the arxiv evaluator (accuracy metric) for plain PyG data.
        return dataset, split_idx, Evaluator('ogbn-arxiv')
    elif source == "custom":
        from dataset import registry
        dataset = registry[name]()
        split_idx = {
            'train': dataset[0].idx_train,
            'valid': dataset[0].idx_val,
            'test': dataset[0].idx_test
        }
        return dataset, split_idx, CustomEvaluator()
    else:
        raise Exception("Dataset not recognized")
def load_ogb(dataset_name, time_budget, zero_features=True, sample_num=None):
    """Export an OGB node-property dataset to the AutoGraph offline format.

    Args:
        dataset_name: OGB dataset name, e.g. 'ogbn-arxiv'.
        time_budget: time budget written into the exported metadata.
        zero_features: if True, replace node features with all-zeros
            (keeps only graph structure).
        sample_num: optional suffix appended to the output directory name.

    Side effects:
        Writes the converted dataset under ../data-offline/<name>, removing
        any previous export at that location.
    """
    print("*" * 30, "Start!", "*" * 30)
    dataset = PygNodePropPredDataset(name=dataset_name)
    split_idx = dataset.get_idx_split()
    # Train = OGB train split; test = valid + test splits merged.
    train_idx, test_idx = split_idx["train"].numpy().tolist(), split_idx[
        "valid"].numpy().tolist() + split_idx["test"].numpy().tolist()
    print("Train rate {}, test rate {}".format(
        len(train_idx) / (len(train_idx) + len(test_idx)),
        len(test_idx) / (len(train_idx) + len(test_idx))))
    graph = dataset[0]  # pyg graph object
    features, labels = graph.x.numpy(), graph.y.numpy()
    edge_index = graph.edge_index.numpy()
    edge_weight = graph.edge_attr
    print(features.shape, labels.shape, edge_index.shape)
    if zero_features:
        # Bug fix: `np.float` was removed from NumPy (>=1.24); the builtin
        # `float` is the documented replacement (same float64 dtype).
        features = np.zeros((features.shape[0], features.shape[1]),
                            dtype=float)
    if edge_weight is None:
        # Unweighted graph: give every edge weight 1.
        edge_weight = np.ones(edge_index.shape[1])
    adj_matrix = sp.coo_matrix((edge_weight.reshape(-1), edge_index),
                               shape=(labels.shape[0], labels.shape[0]))
    node_indexs = np.arange(labels.shape[0])
    n_class = len(np.unique(labels))
    # output directory control
    # Bug fix: the two path components were string-concatenated *inside*
    # os.path.join, which dropped the separator and produced paths like
    # ".../pkg../data-offline"; pass them as separate arguments instead.
    output_dir = os.path.join(os.path.dirname(__file__), '../data-offline')
    os.makedirs(output_dir, exist_ok=True)
    sample_num_str = "" if sample_num is None else str(sample_num)
    data_dir = os.path.join(output_dir, dataset_name + sample_num_str)
    # Remove any stale export before regenerating.
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    train_dir = os.path.join(data_dir, 'train.data')
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(train_dir, exist_ok=True)
    transform_to_autograph_format(features, labels, adj_matrix, node_indexs,
                                  n_class, data_dir, train_dir, time_budget)
    print("*" * 30, "Finish!", "*" * 30)
def setup_ogb(self):
    """Populate node-classification state from the ogbn-arxiv dataset.

    Sets the evaluation metric, class count, official splits, node
    features/labels, a symmetric sparse adjacency, and — when
    ``self.make_edge_index`` is set — a dense COO ``edge_index``.
    """
    arxiv = PygNodePropPredDataset(name='ogbn-arxiv', root=self.root,
                                   transform=T.ToSparseTensor())
    graph = arxiv[0]

    self.metric = 'Accuracy'
    self.num_classes = arxiv.num_classes
    self.split_idx = arxiv.get_idx_split()

    self.x = graph.x
    self.y = graph.y
    self.num_nodes = graph.num_nodes
    # Symmetrize the adjacency so edges can be traversed in both directions.
    self.adj_t = graph.adj_t.to_symmetric()

    if self.make_edge_index:
        # Rebuild a 2 x E COO edge_index from the SparseTensor storage.
        src = self.adj_t.storage.row()
        dst = self.adj_t.storage.col()
        self.edge_index = torch.stack((src, dst), dim=0)

    self.criterion = torch.nn.CrossEntropyLoss()
def load_proteins_dataset():
    """Load ogbn-proteins with aggregated node features and split masks.

    Returns the PyG Data object with ``x`` built by summing incoming edge
    features per node, float labels, a node-id tensor, and boolean
    ``train_mask``/``valid_mask``/``test_mask`` attributes.
    """
    dataset = PygNodePropPredDataset('ogbn-proteins', root='../data')
    split_idx = dataset.get_idx_split()

    graph = dataset[0]
    graph.node_species = None
    graph.y = graph.y.to(torch.float)
    graph.n_id = torch.arange(graph.num_nodes)

    # Nodes carry no input features of their own: aggregate (sum) the
    # features of each node's incoming edges instead.
    _, dst = graph.edge_index
    graph.x = scatter(graph.edge_attr, dst, 0, dim_size=graph.num_nodes,
                      reduce='add')

    # Convert index-based splits into boolean node masks on the Data object.
    for split_name in ('train', 'valid', 'test'):
        mask = torch.zeros(graph.num_nodes, dtype=torch.bool)
        mask[split_idx[split_name]] = True
        graph[f'{split_name}_mask'] = mask

    return graph
# Reaches around 0.7945 ± 0.0059 test accuracy. import os.path as osp import torch import torch.nn.functional as F from torch.nn import Linear as Lin from tqdm import tqdm from ogb.nodeproppred import PygNodePropPredDataset, Evaluator from torch_geometric.data import NeighborSampler from torch_geometric.nn import GATConv root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products') dataset = PygNodePropPredDataset('ogbn-products', root) split_idx = dataset.get_idx_split() evaluator = Evaluator(name='ogbn-products') data = dataset[0] train_idx = split_idx['train'] train_loader = NeighborSampler(data.edge_index, node_idx=train_idx, sizes=[10, 10, 10], batch_size=512, shuffle=True, num_workers=12) subgraph_loader = NeighborSampler(data.edge_index, node_idx=None, sizes=[-1], batch_size=1024, shuffle=False, num_workers=12)
def main():
    """Train a simple model (MLP/linear/SGC) on an ogbn dataset and save the
    best per-run softmax outputs for later ensembling."""
    import gc  # hoisted out of the run loop; importing per-run was wasteful

    parser = argparse.ArgumentParser(description='gen_models')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='arxiv')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model', type=str, default='mlp')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--use_embeddings', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name=f'ogbn-{args.dataset}',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    x = data.x
    split_idx = dataset.get_idx_split()

    # Re-load without the sparse transform: `preprocess` needs edge_index.
    preprocess_data = PygNodePropPredDataset(name=f'ogbn-{args.dataset}')[0]
    if args.dataset == 'arxiv':
        embeddings = torch.cat([
            preprocess(preprocess_data, 'diffusion', post_fix=args.dataset),
            preprocess(preprocess_data, 'spectral', post_fix=args.dataset)
        ], dim=-1)
    elif args.dataset == 'products':
        embeddings = preprocess(preprocess_data, 'spectral',
                                post_fix=args.dataset)
    else:
        # Previously any other dataset left `embeddings` unbound and crashed
        # with NameError below when --use_embeddings was set.
        embeddings = None

    if args.use_embeddings:
        if embeddings is None:
            raise ValueError(
                f'No precomputed embeddings for ogbn-{args.dataset}')
        x = torch.cat([x, embeddings], dim=-1)

    if args.dataset == 'arxiv':
        # Standardize features per dimension (arxiv only).
        x = (x - x.mean(0)) / x.std(0)

    # NOTE(review): args.dropout is parsed but the MLP is built with a
    # hard-coded 0.5 — confirm whether --dropout should be honored here.
    if args.model == 'mlp':
        model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                    args.num_layers, 0.5,
                    args.dataset == 'products').to(device)
    elif args.model in ('linear', 'plain'):
        # 'linear' and 'plain' intentionally build the same model.
        model = MLPLinear(x.size(-1), dataset.num_classes).to(device)
    elif args.model == 'sgc':
        model = SGC(x.size(-1), dataset.num_classes).to(device)
    else:
        # Previously fell through silently and crashed later with NameError.
        raise ValueError(f'Unknown model: {args.model}')

    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model_dir = prepare_folder(f'{args.dataset}_{args.model}', model)
    evaluator = Evaluator(name=f'ogbn-{args.dataset}')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        gc.collect()
        print(sum(p.numel() for p in model.parameters()))
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        # BUG FIX: was `range(1, args.epochs)`, which trained one epoch
        # fewer than requested; sibling scripts use `range(1, 1 + epochs)`.
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result, out = test(model, x, y_true, split_idx, evaluator)
            train_acc, valid_acc, test_acc = result
            # Track the outputs of the best-validation epoch for saving.
            if valid_acc > best_valid:
                best_valid = valid_acc
                best_out = out.cpu().exp()

            if epoch % 10 == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
            logger.add_result(run, result)

        logger.print_statistics(run)
        torch.save(best_out, f'{model_dir}/{run}.pt')
    logger.print_statistics()
def main():
    """Train GraphSAINT (random-walk sampler) + SAGE on ogbn-products."""
    parser = argparse.ArgumentParser(description='OGBN-Products (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--inductive', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=20000)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--num_steps', type=int, default=30)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=2)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(
        name='ogbn-products', root='/srv/scratch/ogb/datasets/nodeproppred')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # We omit normalization factors here since those are only defined for the
    # inductive learning setup.
    sampler_data = data
    if args.inductive:
        sampler_data = to_inductive(data)

    loader = GraphSAINTRandomWalkSampler(sampler_data,
                                         batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    # Full-neighborhood loader used only for evaluation.
    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=4096, shuffle=False,
                                      num_workers=12)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')

            # Evaluation is expensive: skip the first epochs, then evaluate
            # every `eval_steps` epochs.
            if epoch > 9 and epoch % args.eval_steps == 0:
                result = test(model, data, evaluator, subgraph_loader, device)
                # BUG FIX: `logger.add_result` was called twice per
                # evaluation (before and after the print), double-counting
                # every result in the reported statistics.
                logger.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
def main():
    """Train Cluster-GCN on ogbn-products and report per-run accuracy."""
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # Partition the graph into clusters once (cached in processed_dir) and
    # train on batches of clusters.
    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # Full-neighborhood loader used only for evaluation.
    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=1024, shuffle=False,
                                      num_workers=args.num_workers)

    model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    logger_orig = Logger(args.runs, args)
    # NOTE(review): `adj` is never used below — confirm whether process_adj
    # has required side effects or this call can be dropped.
    adj = process_adj(data)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        # NOTE(review): best_valid/best_out are initialized but never updated
        # or saved — looks like dead code copied from a sibling script.
        best_valid = 0
        best_out = None
        for epoch in range(1, 1 + args.epochs):
            loss, train_acc = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Approx Train Acc: {train_acc:.4f}')

            # Evaluation is expensive: skip early epochs, then evaluate
            # every `eval_steps` epochs.
            if epoch > 19 and epoch % args.eval_steps == 0:
                out, result = test(model, data, evaluator, subgraph_loader,
                                   device)
                # NOTE(review): results are recorded only in logger_orig,
                # yet `logger.print_statistics` is still called below on an
                # empty logger — confirm which logger is intended.
                logger_orig.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
    logger_orig.print_statistics()
def main():
    """SGC pre-processing for ogbn-papers100M.

    Builds a symmetrically normalized adjacency, propagates node features
    through it `num_propagations` times, and saves the embeddings and
    remapped labels/splits of the labeled subset to `output_path`.
    """
    parser = argparse.ArgumentParser(description='OGBN-papers100M (MLP)')
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--num_propagations', type=int, default=3)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    parser.add_argument('--node_emb_path', type=str, default=None)
    parser.add_argument('--output_path', type=str, required=True)
    args = parser.parse_args()

    # SGC pre-processing ######################################################
    dataset = PygNodePropPredDataset(name='ogbn-papers100M',
                                     root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = None
    if args.node_emb_path:
        # Use externally precomputed node embeddings instead of raw features.
        x = np.load(args.node_emb_path)
    else:
        x = data.x.numpy()
    N = data.num_nodes

    print('Making the graph undirected.')
    ### Randomly drop some edges to save computation
    data.edge_index, _ = dropout_adj(data.edge_index, p=args.dropedge_rate,
                                     num_nodes=data.num_nodes)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    print(data)

    row, col = data.edge_index

    print('Computing adj...')
    # Symmetrically normalized adjacency with self-loops:
    # D^{-1/2} (A + I) D^{-1/2}, exported as a scipy CSR matrix.
    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    adj = adj.set_diag()
    deg = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    # Isolated nodes have degree 0 -> inf after pow; zero them out.
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)
    adj = adj.to_scipy(layout='csr')

    train_idx, valid_idx, test_idx = split_idx['train'], split_idx[
        'valid'], split_idx['test']
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    # Remap the labeled subset to a contiguous 0..K-1 index space (train,
    # then valid, then test) so only those rows need to be stored.
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx),
                                    len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(
        len(train_idx) + len(valid_idx),
        len(train_idx) + len(valid_idx) + len(test_idx))

    sgc_dict = {}
    sgc_dict['label'] = data.y.data[all_idx].to(torch.long)
    sgc_dict['split_idx'] = {
        'train': mapped_train_idx,
        'valid': mapped_valid_idx,
        'test': mapped_test_idx
    }

    print('Start SGC processing')
    # SGC: repeatedly left-multiply the features by the normalized adjacency
    # (sparse-dense product runs in scipy).
    for _ in tqdm(range(args.num_propagations)):
        x = adj @ x
    sgc_dict['sgc_embedding'] = torch.from_numpy(x[all_idx]).to(torch.float)
    torch.save(sgc_dict, args.output_path)