def cluster_data(data, num_clusters, batch_size, shuffle=True, verbose=True):
    """Prepares clusters for batching

    Parameters
    ----------
    data : torch_geometric.data.Data
        Graph data object.
    num_clusters : int
        The number of clusters to chop the input graph into.
    batch_size : int
        The number of clusters in each batch
    shuffle : bool, optional
        If true, the ClusterLoader will shuffle clusters, by default True
    verbose : bool, optional
        If true, prints clusters info, by default True

    Returns
    -------
    torch_geometric.data.ClusterLoader
        A loader for training
    """
    clusters = ClusterData(data, num_clusters, recursive=True, save_dir=None)
    if verbose:
        # Print each partitioned subgraph so the user can inspect sizes.
        for cluster in clusters:
            print(cluster)
    # BUG FIX: `shuffle` was accepted and documented but never passed to the
    # loader, so the documented shuffling behavior was silently ignored.
    return ClusterLoader(clusters, batch_size=batch_size, shuffle=shuffle)
def build_sampler(args, data, save_dir):
    """Build the mini-batch sampler selected by ``args.sampler``.

    Returns
    -------
    tuple
        ``(loader, msg)`` where ``msg`` is a human-readable description of
        the chosen sampler.

    Raises
    ------
    KeyError
        If ``args.sampler`` is not a recognized sampler name.
    """
    kind = args.sampler
    if kind == 'rw-my':
        loader = MySAINTSampler(data, batch_size=args.batch_size,
                                sample_type='random_walk', walk_length=2,
                                sample_coverage=1000, save_dir=save_dir)
        return loader, 'Use GraphSaint randomwalk sampler(mysaint sampler)'
    if kind == 'node-my':
        loader = MySAINTSampler(data, sample_type='node',
                                batch_size=args.batch_size * 3, walk_length=2,
                                sample_coverage=1000, save_dir=save_dir)
        return loader, 'Use random node sampler(mysaint sampler)'
    if kind == 'rw':
        loader = GraphSAINTRandomWalkSampler(data, batch_size=args.batch_size,
                                             walk_length=2, num_steps=5,
                                             sample_coverage=1000,
                                             save_dir=save_dir)
        return loader, 'Use GraphSaint randomwalk sampler'
    if kind == 'node':
        loader = GraphSAINTNodeSampler(data, batch_size=args.batch_size * 3,
                                       num_steps=5, sample_coverage=1000,
                                       num_workers=0, save_dir=save_dir)
        return loader, 'Use GraphSaint node sampler'
    if kind == 'edge':
        loader = GraphSAINTEdgeSampler(data, batch_size=args.batch_size,
                                       num_steps=5, sample_coverage=1000,
                                       save_dir=save_dir, num_workers=0)
        return loader, 'Use GraphSaint edge sampler'
    if kind == 'cluster':
        cluster_data = ClusterData(data, num_parts=args.num_parts,
                                   save_dir=save_dir)
        loader = ClusterLoader(cluster_data, batch_size=20, shuffle=True,
                               num_workers=0)
        return loader, 'Use cluster sampler'
    raise KeyError('Sampler type error')
def main():
    # Train a Cluster-GCN style SAGE model on OGBN-Products and log accuracy
    # statistics across several independent runs.
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # Partition the graph with METIS; each batch stitches together
    # `batch_size` clusters into one subgraph.
    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # 47 output channels == number of classes in ogbn-products.
    model = SAGE(data.x.size(-1), args.hidden_channels, 47, args.num_layers,
                 args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')
        # Evaluate once per run, after all epochs finish.
        # NOTE(review): original whitespace was lost; evaluation may have been
        # intended inside the epoch loop — confirm against upstream script.
        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
def _make_graph_sampler(self):
    """Partition this module's graph into 100 clusters and return a
    shuffling ``ClusterLoader`` that yields 5 clusters per batch.

    The graph is moved to CPU because METIS partitioning runs host-side.
    """
    node_ids = torch.arange(0, self.num_nodes)
    graph = Data(
        edge_index=self.edge_index,
        edge_attr=self.edge_weight,
        n_id=node_ids,
        num_nodes=self.num_nodes,
    )
    graph = graph.to('cpu')

    partitions = ClusterData(
        graph,
        num_parts=100,
        recursive=False,
        save_dir=None,
    )
    return ClusterLoader(partitions, batch_size=5, shuffle=True,
                         num_workers=0)
def run_sim(cl, lr, layer):
    """Train a GCN on one cell line's Hi-C graph and return training results.

    Parameters
    ----------
    cl : str
        Cell-line name, interpolated into the fixed data paths below.
    lr : float
        Learning rate forwarded to ``train_model``.
    layer : str
        One of 'arma', 'sage', 'tag' — selects the graph-convolution layer.
    """
    layer_dict = {'arma': ARMAConv, 'sage': SAGEConv, 'tag': TAGConv}
    mat = load_npz(
        '/gpfs/data/rsingh47/jbigness/data/%s/hic_sparse_vcsqrt_oe_edge_v7.npz' % cl)
    hms = np.load(
        '/gpfs/data/rsingh47/jbigness/data/%s/np_hmods_norm_vcsqrt_oe_edge_v7.npy' % cl)
    labs = np.load(
        '/gpfs/data/rsingh47/jbigness/data/%s/np_nodes_lab_genes_vcsqrt_oe_edge_v7.npy' % cl)
    print('Data Loaded')

    # Node ids (last column of `labs`) of the labelled genes.
    mask = torch.tensor(labs[:, -1]).long()
    # Map node id -> row index in `labs`.
    loc = {}
    for i in range(labs[:, -1].shape[0]):
        loc[labs[i, -1]] = i

    # FIX: the original used a conditional *expression* purely for its
    # `append` side effects and tested `i in mask` against a tensor, an
    # O(num_labels) scan per node.  A set gives O(1) membership with
    # identical semantics (same truncated-long values as `mask`).
    labelled = set(mask.tolist())
    y = []
    for i in range(mat.shape[0]):
        # -1 marks unlabelled nodes; labelled nodes get their class column.
        y.append(labs[loc[i], -2] if i in labelled else -1)
    y = torch.tensor(y).long()

    extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)
    G = torch_geometric.data.Data(
        edge_index=extract[0],
        edge_attr=extract[1],
        x=torch.tensor(hms[:mat.shape[0]]).float().reshape(-1, 1, 100, 5),
        y=y)
    cluster_data = ClusterData(G, num_parts=20, recursive=False)
    train_loader = ClusterLoader(cluster_data, batch_size=2, shuffle=False,
                                 num_workers=0)
    print('Data Clustered')

    # Fixed seed so the train/test split is reproducible across calls.
    random.seed(30)
    idx = list(range(labs.shape[0] - 1))
    random.shuffle(idx)
    train_mask = idx[:10000]
    test_mask = idx[10000:]
    net = GCN(94, 500, 400, 100, 50, 2, layer_dict[layer])
    return train_model(net, train_loader, 1500, lr, train_mask, test_mask,
                       mask)
def main():
    # Train a SAGE model on OGBN-Products with the FLAG adversarial
    # perturbation scheme, tracking best validation / final test accuracy.
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    # FLAG-specific hyper-parameters (ascent step size, ascent steps,
    # attack type, amplitude).
    parser.add_argument('--step-size', type=float, default=8e-3)
    parser.add_argument('-m', type=int, default=3)
    parser.add_argument('--test-freq', type=int, default=5)
    parser.add_argument('--attack', type=str, default='flag')
    parser.add_argument('--amp', type=float, default=2)
    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # Cluster-GCN training loader plus a full-neighborhood sampler used
    # only for layer-wise inference at evaluation time.
    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)
    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=1024, shuffle=False,
                                      num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, args.epochs + 1):
            loss, acc = train_flag(model, loader, optimizer, device, args)
            # Evaluate only after a warm-up (epoch > 19), every
            # `test_freq` epochs, and always on the final epoch.
            if epoch > 19 and epoch % args.test_freq == 0 or epoch == args.epochs:
                result = test(model, data, evaluator, subgraph_loader, device)
                train, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst
        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def forward(self, X, edge_index, edge_weight):
    """
    :param X: Input data of shape (batch_size, num_nodes, in_channels)
    :param edge_index: Graph connectivity in COO format with shape(2, num_edges)
    :param edge_weight: Edge feature matrix with shape (num_edges, num_edge_features)
    :return: Output data of shape (batch_size, num_nodes, out_channels)
    """
    # NOTE(review): original whitespace was lost; the nesting below assumes
    # the whole body runs only for tensor input (sz is needed by every
    # branch) — confirm against the upstream file.  Non-tensor X falls
    # through and implicitly returns None.
    if torch.is_tensor(X):
        sz = X.shape
        if self.gcn_partition == 'cluster':
            # Partition the graph into 50 clusters and run the GCN on each
            # subgraph independently, scattering results back via
            # `train_mask` (here abused to carry original node indices).
            out = torch.zeros(sz[0], sz[1], self.out_channels,
                              device=X.device)
            graph_data = Data(edge_index=edge_index, edge_attr=edge_weight,
                              train_mask=torch.arange(0, sz[1]),
                              num_nodes=sz[1]).to('cpu')
            cluster_data = ClusterData(graph_data, num_parts=50,
                                       recursive=False,
                                       save_dir='data/cluster')
            loader = ClusterLoader(cluster_data, batch_size=5, shuffle=True,
                                   num_workers=0)
            for subgraph in loader:
                out[:, subgraph.train_mask] = self.gcn(
                    X[:, subgraph.train_mask],
                    subgraph.edge_index.to(X.device),
                    subgraph.edge_attr.to(X.device))
        elif self.gcn_partition == 'sample':
            # Use NeighborSampler() to iterates over graph nodes in a mini-batch fashion
            # and constructs sampled subgraphs (use cpu for no CUDA version)
            out = torch.zeros(sz[0], sz[1], self.out_channels,
                              device=X.device)
            graph_data = Data(edge_index=edge_index, num_nodes=sz[1]).to('cpu')
            # NOTE(review): this is the legacy (pre-1.4) NeighborSampler
            # data-flow API (size/num_hops, callable loader) — verify the
            # installed torch_geometric version supports it.
            loader = NeighborSampler(graph_data, size=[10, 5], num_hops=2,
                                     batch_size=120, shuffle=True,
                                     add_self_loops=False)
            for data_flow in loader():
                # Two-hop flow: apply gcn1 on the outer block, gcn2 on the
                # inner block, then write back only the sampled nodes.
                block1 = data_flow[0]
                t = self.gcn1(X, edge_index[:, block1.e_id],
                              edge_weight[block1.e_id])
                block2 = data_flow[1]
                part_out = self.gcn2(t, edge_index[:, block2.e_id],
                                     edge_weight[block2.e_id])
                out[:, data_flow.n_id] = part_out[:, data_flow.n_id]
        elif self.batch_training:
            # Full-graph pass; include edge weights only when the conv
            # layer accepts them.
            if self.adj_available:
                out = self.gcn(X, edge_index, edge_weight)
            else:
                out = self.gcn(X, edge_index)
        else:
            # Currently, conv in [GATConv] cannot use argument node_dim for batch training
            # This is a temp solution but it's very very very slow!
            # Costing about 6 times more than batch_training
            batch = self.get_batch(X)
            if self.adj_available:
                out = self.gcn(batch.x, edge_index, edge_weight)
            else:
                out = self.gcn(batch.x, edge_index)
        # Restore the (batch, nodes, channels) layout.
        out = out.view(sz[0], sz[1], -1)
        return out
def get_cluster_batches(cluster_data, batch_size):
    """Return a shuffling ``ClusterLoader`` over pre-partitioned clusters,
    emitting ``batch_size`` clusters per batch on a single worker."""
    return ClusterLoader(
        cluster_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=1,
    )
def main():
    """Cluster-GCN link prediction on an OGB link-property dataset.

    Trains a GCN encoder plus an MLP link predictor over cluster batches,
    evaluating MRR every ``eval_steps`` epochs across ``runs`` repeats.
    """
    parser = argparse.ArgumentParser(
        description='Link Prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name=args.dataset)
    # BUG FIX: `split_edge` is used by the evaluation call below but was
    # never defined in this function, so the first eval epoch raised a
    # NameError.  (The sibling ogbl-citation script defines it this way.)
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    print(data.edge_index.shape, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            # Time each training epoch and print the wall-clock duration.
            t0 = time.time()
            loss = train(model, predictor, loader, optimizer, device,
                         args.negs)
            tt = time.time()
            print(tt - t0)
            if epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)
                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')
        logger.print_statistics(run)
    logger.print_statistics()
def test_cluster_gcn():
    # Unit test for ClusterData / ClusterLoader on a tiny 6-node graph whose
    # adjacency splits cleanly into two 3-node clusters ({0,2,4} and {1,3,5}).
    # The exact expected orderings below depend on the METIS partition result.
    adj = torch.tensor([
        [1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
    ])

    edge_index = adj.nonzero().t()
    x = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    data = Data(edge_index=edge_index, x=x, num_nodes=6)

    cluster_data = ClusterData(data, num_parts=2, log=False)

    # Nodes are permuted so each cluster is contiguous: [0,2,4 | 1,3,5].
    assert cluster_data.partptr.tolist() == [0, 3, 6]
    assert cluster_data.perm.tolist() == [0, 2, 4, 1, 3, 5]
    assert cluster_data.data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    # Permuted adjacency is block-diagonal apart from one inter-cluster edge.
    assert cluster_data.data.adj.to_dense().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    # Indexing ClusterData yields a single (relabelled) cluster subgraph.
    data = cluster_data[0]
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = cluster_data[1]
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    # batch_size=1 without shuffle iterates the clusters in order.
    loader = ClusterLoader(cluster_data, batch_size=1)
    it = iter(loader)

    data = next(it)
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = next(it)
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    # batch_size=2 merges both clusters, restoring inter-cluster edges.
    # Seeds pin the shuffle order for the two expected permutations.
    torch.manual_seed(1)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    torch.manual_seed(2)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.x.tolist() == [
        [1, 1],
        [3, 3],
        [5, 5],
        [0, 0],
        [2, 2],
        [4, 4],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]
def main():
    # Train a GCN on OGBN-Products with Cluster-GCN batching; evaluation
    # uses a full-neighborhood NeighborSampler for exact inference.
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)
    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=1024, shuffle=False,
                                      num_workers=args.num_workers)

    model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    # NOTE(review): `logger` never receives results below (only
    # `logger_orig` does), and `best_valid`/`best_out` are written but
    # never used — looks like leftover scaffolding; confirm upstream.
    logger = Logger(args.runs, args)
    logger_orig = Logger(args.runs, args)

    adj = process_adj(data)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        for epoch in range(1, 1 + args.epochs):
            loss, train_acc = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Approx Train Acc: {train_acc:.4f}')
            # Full evaluation only after a 19-epoch warm-up, every
            # `eval_steps` epochs.
            if epoch > 19 and epoch % args.eval_steps == 0:
                out, result = test(model, data, evaluator, subgraph_loader,
                                   device)
                logger_orig.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
    logger_orig.print_statistics()
def main(args):
    # Evaluate a pre-trained model on an OGB node-property dataset using
    # Cluster-GCN batching; logs metrics to file and TensorBoard.

    # Set up logging and devices
    args.save_dir = get_save_dir(args.save_dir, 'test', training=True)
    log = get_logger(args.save_dir, 'test')
    tboard = SummaryWriter(args.save_dir)
    device, args.gpu_ids = get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    # Scale the batch size with the number of GPUs used by DataParallel.
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get data loader
    log.info('Building dataset...')
    # Download and process data at './dataset/xxx'
    dataset = PygNodePropPredDataset(name=args.dataset, root='dataset/')
    evaluator = Evaluator(name=args.dataset)
    split_idx = dataset.get_idx_split()
    data = dataset[0]
    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask
    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    dataset_loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                                   shuffle=args.data_shuffle,
                                   num_workers=args.num_workers)

    # Get model
    log.info('Building model...')
    model = load_full_model(args.load_path, args.gpu_ids)
    model = nn.DataParallel(model)
    model = model.to(device)
    model.eval()

    # Test
    log.info('Testing...')
    # Evaluate, display the stats and save the model
    dev_results = test(model, dataset_loader, device, evaluator)
    # Log the metrics
    dev_log_message = ''.join('{} - {}; '.format(k, v)
                              for k, v in dev_results.items())
    log.info(f'Testing - {dev_log_message}')
def run():
    # Full train/eval driver for ogbl-citation link prediction.
    # NOTE(review): relies on `data`, `args`, `device`, `split_edge`,
    # `train`, `test`, `GCN`, `LinkPredictor` and `Logger` from the
    # enclosing module scope — confirm they are defined before calling.
    cluster_data = ClusterData(
        data,
        num_parts=args.num_partitions,
        recursive=False,
        save_dir=dataset.processed_dir,
    )
    loader = ClusterLoader(
        cluster_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
    )

    model = GCN(
        data.x.size(-1),
        args.hidden_channels,
        args.hidden_channels,
        args.num_layers,
        args.dropout,
    ).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")
            # Evaluate MRR only after a 49-epoch warm-up, every
            # `eval_steps` epochs.
            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(
                    model,
                    predictor,
                    data,
                    split_edge,
                    evaluator,
                    batch_size=64 * 1024,
                    device=device,
                )
                logger.add_result(run, result)
                train_mrr, valid_mrr, test_mrr = result
                print(f"Run: {run + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {train_mrr:.4f}, "
                      f"Valid: {valid_mrr:.4f}, "
                      f"Test: {test_mrr:.4f}")
        logger.print_statistics(run)
    logger.print_statistics()
# Partition each Reddit split independently; METIS results are cached under
# data/Reddit/{train,val,test}.  Validation/test use 20/1 parts so they are
# evaluated in (near-)whole-graph batches.
train_data = ClusterData(train_data, num_parts=1500, recursive=False,
                         save_dir="data/Reddit/train")
val_data = ClusterData(val_data, num_parts=20, recursive=False,
                       save_dir="data/Reddit/val")
test_data = ClusterData(test_data, num_parts=1, recursive=False,
                        save_dir="data/Reddit/test")
train_loader = ClusterLoader(train_data, batch_size=20, shuffle=True,
                             num_workers=8)
val_loader = ClusterLoader(val_data, batch_size=1, shuffle=False,
                           num_workers=8)
test_loader = ClusterLoader(test_data, batch_size=1, shuffle=False,
                            num_workers=1)

# subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1], batch_size=1024,
#                                   shuffle=False, num_workers=12)


# Custom message-passing layer (body continues beyond this excerpt).
class GCNConvDiagEnhance(MessagePassing):
def main():
    # Cluster-GCN link prediction on ogbl-citation: GCN encoder + MLP link
    # predictor, MRR evaluation on a fixed random subset of training edges.
    parser = argparse.ArgumentParser(description='OGBL-Citation (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    # Citation edges are directed; symmetrize for the GCN encoder.
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')
            # Evaluate only after a 49-epoch warm-up, every `eval_steps`.
            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run, result)
                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')
        logger.print_statistics(run)
    logger.print_statistics()
)
# Label the homogeneous graph: -1 everywhere, real labels only on the
# nodes that came from the "paper" node type.
homo_data.y = node_type.new_full((node_type.size(0), 1), -1)
homo_data.y[local2global["paper"]] = data.y_dict["paper"]

# Train mask covers only "paper" nodes belonging to the training split.
homo_data.train_mask = torch.zeros((node_type.size(0)), dtype=torch.bool)
homo_data.train_mask[local2global["paper"][split_idx["train"]["paper"]]] = True

print(homo_data)

cluster_data = ClusterData(homo_data, num_parts=5000, recursive=True,
                           save_dir=dataset.processed_dir)
train_loader = ClusterLoader(cluster_data, batch_size=500, shuffle=True,
                             num_workers=12)

# Map informations to their canonical type.
x_dict = {}
for key, x in data.x_dict.items():
    x_dict[key2int[key]] = x

num_nodes_dict = {}
for key, N in data.num_nodes_dict.items():
    num_nodes_dict[key2int[key]] = N


# Relational GCN layer (constructor body continues beyond this excerpt).
class RGCNConv(MessagePassing):
    def __init__(self, in_channels, out_channels, num_node_types,
                 num_edge_types):
def test_cluster_gcn():
    # Unit test for ClusterData / ClusterLoader including edge attributes.
    # Edge attrs are set to the edge's original position so the expected
    # permuted adjacency below encodes the reordering of edges directly.
    adj = torch.tensor([
        [1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
    ])

    x = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    edge_index = adj.nonzero(as_tuple=False).t()
    edge_attr = torch.arange(edge_index.size(1))
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    data.num_nodes = 6

    cluster_data = ClusterData(data, num_parts=2, log=False)

    # Nodes are permuted so each cluster is contiguous: [0,2,4 | 1,3,5].
    assert cluster_data.partptr.tolist() == [0, 3, 6]
    assert cluster_data.perm.tolist() == [0, 2, 4, 1, 3, 5]
    assert cluster_data.data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    # Dense adj entries are the original edge ids (0 also means "no edge"
    # except at position [0][0], which holds original edge 0).
    assert cluster_data.data.adj.to_dense().tolist() == [
        [0, 2, 3, 1, 0, 0],
        [8, 9, 10, 0, 0, 0],
        [14, 15, 16, 0, 0, 0],
        [4, 0, 0, 5, 6, 7],
        [0, 0, 0, 11, 12, 13],
        [0, 0, 0, 17, 18, 19],
    ]

    # Indexing yields a single relabelled cluster with its edge attrs.
    data = cluster_data[0]
    assert data.num_nodes == 3
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    assert data.edge_attr.tolist() == [0, 2, 3, 8, 9, 10, 14, 15, 16]

    data = cluster_data[1]
    assert data.num_nodes == 3
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    assert data.edge_attr.tolist() == [5, 6, 7, 11, 12, 13, 17, 18, 19]

    # batch_size=1 without shuffle iterates clusters in order.
    loader = ClusterLoader(cluster_data, batch_size=1)
    iterator = iter(loader)

    data = next(iterator)
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = next(iterator)
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    # batch_size=2 merges both clusters (inter-cluster edges restored);
    # seeds pin the shuffle order for each expected permutation.
    torch.manual_seed(1)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 6
    assert data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    torch.manual_seed(2)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 6
    assert data.x.tolist() == [
        [1, 1],
        [3, 3],
        [5, 5],
        [0, 0],
        [2, 2],
        [4, 4],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    # Single shuffled cluster still has exactly 3 nodes.
    loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 3
# Tag every graph in the val/test datasets with one-hot split masks so the
# merged batch below knows which nodes belong to which split.
val_data_list = [data for data in val_dataset]
for data in val_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

test_data_list = [data for data in test_dataset]
for data in test_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.ones(data.num_nodes, dtype=torch.bool)

# Merge all splits into one big disconnected graph, then cluster it.
data = Batch.from_data_list(train_data_list + val_data_list + test_data_list)

cluster_data = ClusterData(data, num_parts=50, recursive=False,
                           save_dir=dataset.processed_dir)
loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True,
                       num_workers=0)


#Model Structure
# Four-stage ChebConv network with parallel linear projections
# (definition continues beyond this excerpt).
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        dim = 512
        self.gcn1 = ChebConv(in_channels, dim, K=1)
        self.lin1 = nn.Linear(in_channels, dim)
        self.gcn2 = ChebConv(dim, dim, K=1)
        self.lin2 = nn.Linear(dim, dim)
        self.gcn3 = ChebConv(dim, dim, K=1)
        self.lin3 = nn.Linear(dim, dim)
        self.gcn4 = ChebConv(dim, dim, K=1)
        self.lin4 = nn.Linear(dim, dim)
def main():
    # Train a GIN on OGBN-Proteins (multi-label, 112 tasks) with
    # Cluster-GCN batching; ROC-AUC evaluated via the OGB evaluator.
    parser = argparse.ArgumentParser(description='OGBN-Proteins (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_features', action='store_true')
    parser.add_argument('--num_partitions', type=int, default=700)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=128)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(
        data,
        num_parts=args.num_partitions,
        recursive=False,
        save_dir=dataset.processed_dir)

    # ogbn-proteins has no raw node features by default: use constant
    # ones unless --use_node_features is given.
    if not args.use_node_features:
        cluster_data.data.x = torch.ones(cluster_data.data.num_nodes, 1)
    else:
        cluster_data.data.x = cluster_data.data.x.to(torch.float)

    loader = ClusterLoader(
        cluster_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)

    # 112 output channels == number of ogbn-proteins prediction tasks.
    model = GIN(
        cluster_data.data.x.size(-1),
        data.edge_attr.size(-1),
        args.hidden_channels,
        112,
        args.num_layers,
        args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.eval_steps == 0:
                result = test(model, loader, evaluator, device)
                logger.add_result(run, result)
                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Reddit
# NOTE(review): `ClusterDataset` is an old torch_geometric API (replaced by
# ClusterData/ClusterLoader in later releases) — confirm the pinned version.
from torch_geometric.data import ClusterDataset, ClusterLoader
from torch_geometric.nn import SAGEConv

dataset = Reddit('../data/Reddit')

print('Partioning the graph... (this may take a while)')
cluster_dataset = ClusterDataset(dataset, num_parts=1500, save=True)
train_loader = ClusterLoader(cluster_dataset, batch_size=20, shuffle=True,
                             drop_last=True, num_workers=6)
test_loader = ClusterLoader(cluster_dataset, batch_size=20, shuffle=False,
                            num_workers=6)
print('Done!')


# Two-layer GraphSAGE classifier (forward continues beyond this excerpt).
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = SAGEConv(in_channels, 128, normalize=False)
        self.conv2 = SAGEConv(128, out_channels, normalize=False)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(self.conv1(x, edge_index))
# Gather some statistics about the graph. print(f'Number of nodes: {data.num_nodes}') print(f'Number of edges: {data.num_edges}') print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}') print(f'Number of training nodes: {data.train_mask.sum()}') print( f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.3f}' ) print(f'Contains isolated nodes: {data.contains_isolated_nodes()}') print(f'Contains self-loops: {data.contains_self_loops()}') print(f'Is undirected: {data.is_undirected()}') ### Test data loader torch.manual_seed(12345) cluster_data = ClusterData(data, num_parts=128) # 1. Create subgraphs. train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True) # 2. Stochastic partioning scheme. print() total_num_nodes = 0 for step, sub_data in enumerate(train_loader): print(f'Step {step + 1}:') print('=======') print(f'Number of nodes in the current batch: {sub_data.num_nodes}') print(sub_data) print() total_num_nodes += sub_data.num_nodes print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!') class GCN(torch.nn.Module):