def main():
    """Train and test a GNN on IMDB-BINARY with PyTorch Lightning (DDP-spawn).

    Loads the dataset with one-hot degree node features, splits it
    10%/10%/80% into test/val/train, and runs a 50-epoch fit followed by a
    test pass on the best checkpoint (selected by ``val_acc``).
    """
    seed_everything(42)
    root = osp.join('data', 'TUDataset')
    # IMDB-BINARY ships no node features; use one-hot node degree instead
    # (135 is the assumed maximum degree for this dataset).
    dataset = TUDataset(root, 'IMDB-BINARY', pre_transform=T.OneHotDegree(135))

    dataset = dataset.shuffle()
    # 10% test / 10% val / 80% train split.
    test_dataset = dataset[:len(dataset) // 10]
    val_dataset = dataset[len(dataset) // 10:2 * len(dataset) // 10]
    train_dataset = dataset[2 * len(dataset) // 10:]

    datamodule = LightningDataset(train_dataset, val_dataset, test_dataset,
                                  batch_size=64, num_workers=4)

    model = Model(dataset.num_node_features, dataset.num_classes)

    # FIX: fall back to a single CPU process when no GPU is present; the
    # original passed devices=0 with accelerator='gpu', which makes
    # pl.Trainer raise on CPU-only machines.
    devices = torch.cuda.device_count()
    if devices > 0:
        accelerator = 'gpu'
    else:
        accelerator, devices = 'cpu', 1

    strategy = pl.strategies.DDPSpawnStrategy(find_unused_parameters=False)
    checkpoint = pl.callbacks.ModelCheckpoint(monitor='val_acc', save_top_k=1)
    trainer = pl.Trainer(strategy=strategy, accelerator=accelerator,
                         devices=devices, max_epochs=50, log_every_n_steps=5,
                         callbacks=[checkpoint])

    trainer.fit(model, datamodule)
    trainer.test(ckpt_path='best', datamodule=datamodule)
def __init__(self, data_dir):
    """Remember the dataset location and build the node-feature pipeline."""
    super().__init__()
    self.data_dir = data_dir
    # One-hot encode node degrees, then switch to a sparse adjacency layout.
    # NOTE(review): reads `self.num_features` — assumed to be defined on the
    # class before __init__ runs; confirm against the enclosing class.
    steps = [
        T.OneHotDegree(self.num_features - 1),
        T.ToSparseTensor(),
    ]
    self.transform = T.Compose(steps)
def create_one_hot_transform(dataset):
    """Return a OneHotDegree transform sized to the dataset's maximum degree.

    Iterates every graph once while tracking only the running maximum node
    out-degree.  The original kept every per-graph degree tensor alive in a
    list even though only the maximum was ever used, and crashed with a
    RuntimeError on graphs with no edges (``max()`` of an empty tensor).
    """
    max_degree = 0
    for data in dataset:
        deg = degree(data.edge_index[0], dtype=torch.long)
        # Guard empty graphs: `.max()` raises on a zero-element tensor.
        if deg.numel() > 0:
            max_degree = max(max_degree, int(deg.max()))
    return T.OneHotDegree(max_degree)
def get_dataset(name, sparse=True, dataset_div=None):
    """Load a TUDataset with degree features appended to the node attributes.

    Args:
        name: TUDataset name (e.g. 'REDDIT-BINARY').
        sparse: When False, cap graph sizes per-dataset and attach a
            ToDense transform so batches become dense tensors (DiffPool).
        dataset_div: When given, keep only a random 1/dataset_div slice.

    Returns:
        The prepared (and possibly sliced) dataset.

    Raises:
        SystemExit: If the dataset ships no node attributes.
    """
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    # Best-effort copy of a pre-downloaded input directory; failures are
    # logged and TUDataset's own download path is used instead.
    try:
        shutil.copytree('../input/smt', path)
    except shutil.Error as e:
        for src, dst, msg in e.args[0]:
            print(dst, src, msg)
    except FileExistsError as e:
        print(e)

    dataset = TUDataset(path, name, use_node_attr=True)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        print('confirm the data.x do not exists!!')
        # FIX: `exit()` is a site-module convenience not guaranteed to exist;
        # raise SystemExit directly for the same effect.
        raise SystemExit(1)

    # Append degree-based features (OneHotDegree concatenates to existing
    # node attributes by default).
    max_degree = 0
    degs = []
    for data in dataset:
        degs += [degree(data.edge_index[0], dtype=torch.long)]
        max_degree = max(max_degree, degs[-1].max().item())

    if max_degree < 1000:
        dataset.transform = T.OneHotDegree(max_degree)
    else:
        # Huge degree range: use a normalized scalar degree instead.
        deg = torch.cat(degs, dim=0).to(torch.float)
        mean, std = deg.mean().item(), deg.std().item()
        dataset.transform = NormalizedDegree(mean, std)

    if not sparse:
        num_nodes = max_num_nodes = 0
        for data in dataset:
            num_nodes += data.num_nodes
            max_num_nodes = max(data.num_nodes, max_num_nodes)

        # Filter out a few really large graphs in order to apply DiffPool.
        if name == 'REDDIT-BINARY':
            num_nodes = min(int(num_nodes / len(dataset) * 1.5),
                            max_num_nodes)
        else:
            num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)

        indices = []
        for i, data in enumerate(dataset):
            if data.num_nodes <= num_nodes:
                indices.append(i)
        dataset = dataset[torch.tensor(indices)]

        if dataset.transform is None:
            dataset.transform = T.ToDense(num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(num_nodes)])

    # FIX: identity comparison with None (`is not None`), not `!= None`.
    if dataset_div is not None:
        dataset = dataset.shuffle()[:len(dataset) // dataset_div]
    return dataset
def get_dataset(name, sparse=True, cleaned=False):
    """Load either the hit-graph 'node' dataset or a TUDataset by name.

    For TUDatasets without node features, degree-based features are
    attached; with ``sparse=False``, graphs are size-capped and a ToDense
    transform is appended (DiffPool-style batching).

    NOTE(review): the source was whitespace-mangled; the nesting below
    assumes the TU-only processing does NOT apply to the 'node' dataset —
    confirm against the original repository.
    """
    if name == 'node':
        path = osp.join(os.environ['GNN_TRAINING_DATA_ROOT'], name)
        print(path)
        dataset = HitGraphDataset2(path, directed=False, categorical=True)
    else:
        path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                        name)
        dataset = TUDataset(path, name, cleaned=cleaned)
        dataset.data.edge_attr = None
        if dataset.data.x is None:
            # Derive node features from degrees when none are provided.
            max_degree = 0
            degs = []
            for data in dataset:
                degs += [degree(data.edge_index[0], dtype=torch.long)]
                max_degree = max(max_degree, degs[-1].max().item())
            if max_degree < 1000:
                dataset.transform = T.OneHotDegree(max_degree)
            else:
                # Huge degree range: normalized scalar degree instead.
                deg = torch.cat(degs, dim=0).to(torch.float)
                mean, std = deg.mean().item(), deg.std().item()
                dataset.transform = NormalizedDegree(mean, std)
        if not sparse:
            num_nodes = max_num_nodes = 0
            for data in dataset:
                num_nodes += data.num_nodes
                max_num_nodes = max(data.num_nodes, max_num_nodes)
            # Filter out a few really large graphs in order to apply DiffPool.
            if name == 'REDDIT-BINARY':
                num_nodes = min(int(num_nodes / len(dataset) * 1.5),
                                max_num_nodes)
            else:
                num_nodes = min(int(num_nodes / len(dataset) * 5),
                                max_num_nodes)
            indices = []
            for i, data in enumerate(dataset):
                if data.num_nodes <= num_nodes:
                    indices.append(i)
            dataset = dataset[torch.tensor(indices)]
            if dataset.transform is None:
                dataset.transform = T.ToDense(num_nodes)
            else:
                dataset.transform = T.Compose(
                    [dataset.transform, T.ToDense(num_nodes)])
    return dataset
def load_data(dataset, cleaned=False):
    """Load a TUDataset, substituting degree features when none are present."""
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets')
    dataset = TUDataset(path, dataset, cleaned=cleaned)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        # No node labels available: build features from node degrees.
        degs = [degree(g.edge_index[0], dtype=torch.long) for g in dataset]
        max_degree = max((int(d.max()) for d in degs), default=0)

        # Small degree range -> one-hot; huge range -> normalized scalar.
        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            all_deg = torch.cat(degs, dim=0).to(torch.float)
            dataset.transform = NormalizedDegree(all_deg.mean().item(),
                                                 all_deg.std().item())
    return dataset
def graph_kernel_dataset(name, path, sparse=True):
    """Load a graph-kernel TUDataset, adding degree features when needed.

    Args:
        name: TUDataset name.
        path: Root directory for download/caching.
        sparse: When False, cap graph sizes and attach ToDense (DiffPool).

    Returns:
        The prepared dataset.
    """
    dataset = TUDataset(path, name)
    dataset.data.edge_attr = None
    if dataset.data.x is None:
        # Derive node features from degrees when none are provided.
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())
        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            # Huge degree range: use a normalized scalar degree instead.
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)
    if not sparse:
        num_nodes = max_num_nodes = 0
        for data in dataset:
            num_nodes += data.num_nodes
            max_num_nodes = max(data.num_nodes, max_num_nodes)

        # Filter out a few really large graphs in order to apply DiffPool.
        if name == 'REDDIT-BINARY':
            num_nodes = min(int(num_nodes / len(dataset) * 1.5),
                            max_num_nodes)
        else:
            num_nodes = min(int(num_nodes / len(dataset) * 5), max_num_nodes)

        indices = []
        for i, data in enumerate(dataset):
            if data.num_nodes <= num_nodes:
                indices.append(i)
        # BUG FIX: the original used `torch.Tensor(indices)`, which builds a
        # *float* tensor — invalid for dataset indexing.  `torch.tensor`
        # infers int64, matching the sibling loaders in this file.
        dataset = dataset[torch.tensor(indices)]

        if dataset.transform is None:
            dataset.transform = T.ToDense(num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(num_nodes)])
    return dataset
def get_dataset(name, sparse=True):
    """Fetch a TUDataset and prepare node features / dense conversion."""
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    dataset = TUDataset(path, name)
    dataset.data.edge_attr = None

    if dataset.data.x is None:
        # No node labels: derive features from node degrees.
        degs = [degree(g.edge_index[0], dtype=torch.long) for g in dataset]
        max_degree = max((int(d.max()) for d in degs), default=0)
        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            all_deg = torch.cat(degs, dim=0).to(torch.float)
            dataset.transform = NormalizedDegree(all_deg.mean().item(),
                                                 all_deg.std().item())

    if not sparse:
        total_nodes = 0
        max_num_nodes = 0
        for g in dataset:
            total_nodes += g.num_nodes
            max_num_nodes = max(max_num_nodes, g.num_nodes)

        # Cap graph size so dense (DiffPool-style) batching stays feasible;
        # REDDIT-BINARY gets a tighter cap.
        factor = 1.5 if name == 'REDDIT-BINARY' else 5
        num_nodes = min(int(total_nodes / len(dataset) * factor),
                        max_num_nodes)

        keep = [i for i, g in enumerate(dataset) if g.num_nodes <= num_nodes]
        dataset = dataset[torch.tensor(keep)]

        to_dense = T.ToDense(num_nodes)
        if dataset.transform is None:
            dataset.transform = to_dense
        else:
            dataset.transform = T.Compose([dataset.transform, to_dense])

    return dataset
def get_dataset(name, sparse=True, cleaned=False, normalize=False):
    """Load a TUDataset with node-attribute handling.

    Args:
        name: TUDataset name.
        sparse: When False, attach a ToDense transform sized to the largest
            graph so batches become dense tensors.
        cleaned: Use the cleaned TUDataset variant.
        normalize: Standardize existing node features column-wise (ignored
            when the dataset has no features and degree features are used).

    Returns:
        The prepared dataset.
    """
    dataset = TUDataset(os.path.join('./data', name), name,
                        use_node_attr=True, cleaned=cleaned)
    dataset.data.edge_attr = None
    if dataset.data.x is None:
        # Derive node features from degrees when none are provided.
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())
        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            # Huge degree range: use a normalized scalar degree instead.
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)
    elif normalize:
        # Standardize features column-wise.  FIX: use torch's `dim=` keyword
        # (the original used the NumPy alias `axis=`) and clamp the std so
        # constant feature columns do not produce NaNs via a 0/0 division.
        dataset.data.x -= torch.mean(dataset.data.x, dim=0)
        std = torch.std(dataset.data.x, dim=0)
        dataset.data.x /= std.clamp(min=1e-12)
    if not sparse:
        max_num_nodes = 0
        for data in dataset:
            max_num_nodes = max(data.num_nodes, max_num_nodes)
        if dataset.transform is None:
            dataset.transform = T.ToDense(max_num_nodes)
        else:
            dataset.transform = T.Compose(
                [dataset.transform, T.ToDense(max_num_nodes)])
    return dataset
def main():
    """Train and evaluate a GNN on IMDB-BINARY with PyTorch Ignite.

    Builds a supervised trainer plus train/val/test evaluators, per-epoch
    accuracy logging, checkpointing on validation accuracy, and TensorBoard
    logging, then runs 50 epochs.
    """
    seed_everything(42)
    root = osp.join('data', 'TUDataset')
    # IMDB-BINARY ships no node features; one-hot degree (max degree 135).
    dataset = TUDataset(root, 'IMDB-BINARY', pre_transform=T.OneHotDegree(135))

    dataset = dataset.shuffle()
    # 10% test / 10% val / 80% train split.
    test_dataset = dataset[:len(dataset) // 10]
    val_dataset = dataset[len(dataset) // 10:2 * len(dataset) // 10]
    train_dataset = dataset[2 * len(dataset) // 10:]

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=64, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=64, pin_memory=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Model(dataset.num_node_features, dataset.num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    metrics = {'acc': ignite.metrics.Accuracy()}

    def prepare_batch_fn(batch, device, non_blocking):
        # Ignite expects (input, target); the whole Data batch is the input.
        return (batch.to(device, non_blocking=non_blocking),
                batch.y.to(device, non_blocking=non_blocking))

    trainer = ignite.engine.create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        loss_fn=F.cross_entropy,
        device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred, loss: loss.item(),
        amp_mode='amp',
    )

    # Progress bar for each epoch:
    pbar = ignite.contrib.handlers.tqdm_logger.ProgressBar()
    pbar.attach(trainer, output_transform=lambda x: {'loss': x})

    def log_metrics(evaluator, loader, tag):
        # Build a handler that runs `evaluator` over `loader` and prints acc.
        def logger(trainer):
            evaluator.run(loader)
            print(f'{tag:10} Epoch: {trainer.state.epoch:02d}, '
                  f'Acc: {evaluator.state.metrics["acc"]:.4f}')
        return logger

    train_evaluator = ignite.engine.create_supervised_evaluator(
        model=model, metrics=metrics, device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred: (y_pred, y),
        amp_mode='amp',
    )
    trainer.on(ignite.engine.Events.EPOCH_COMPLETED(every=1))(log_metrics(
        train_evaluator, train_loader, 'Training'))

    val_evaluator = ignite.engine.create_supervised_evaluator(
        model=model, metrics=metrics, device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred: (y_pred, y),
        amp_mode='amp',
    )
    trainer.on(ignite.engine.Events.EPOCH_COMPLETED(every=1))(log_metrics(
        val_evaluator, val_loader, 'Validation'))

    test_evaluator = ignite.engine.create_supervised_evaluator(
        model=model, metrics=metrics, device=device,
        prepare_batch=prepare_batch_fn,
        output_transform=lambda x, y, y_pred: (y_pred, y),
        amp_mode='amp',
    )
    trainer.on(ignite.engine.Events.EPOCH_COMPLETED(every=1))(log_metrics(
        test_evaluator, test_loader, 'Test'))

    # Save checkpoint of the model based on Accuracy on the validation set:
    checkpoint_handler = ignite.handlers.Checkpoint(
        {'model': model},
        'runs/gin',
        n_saved=2,
        score_name=list(metrics.keys())[0],
        filename_pattern='best-{global_step}-{score_name}-{score}.pt',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )
    val_evaluator.add_event_handler(ignite.engine.Events.EPOCH_COMPLETED,
                                    checkpoint_handler)

    # Create a tensorboard logger to write logs:
    tb_logger = ignite.contrib.handlers.tensorboard_logger.TensorboardLogger(
        log_dir=osp.join('runs/example', 'tb_logs'))
    tb_logger.attach_output_handler(
        trainer, event_name=ignite.engine.Events.ITERATION_COMPLETED,
        tag='training',
        output_transform=lambda loss: {'loss_iteration': loss})
    tb_logger.attach_output_handler(
        trainer, event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='training', output_transform=lambda loss: {'loss_epoch': loss})
    tb_logger.attach_output_handler(
        train_evaluator, event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='training', metric_names='all',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )
    tb_logger.attach_output_handler(
        val_evaluator, event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='validation', metric_names='all',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )
    tb_logger.attach_output_handler(
        test_evaluator, event_name=ignite.engine.Events.EPOCH_COMPLETED,
        tag='test', metric_names='all',
        global_step_transform=ignite.handlers.global_step_from_engine(trainer),
    )

    trainer.run(train_loader, max_epochs=50)
    # BUG FIX: close the TensorBoard logger *after* training — the original
    # called tb_logger.close() before trainer.run, so the logger was already
    # closed for the entire run and no events were written.
    tb_logger.close()
# NOTE(review): `parser` is created earlier in the file (outside this chunk).
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout rate(default: 0.2)')
args = parser.parse_args()

# Upper bound on node degree used for the one-hot degree features.
max_degree = 1000
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                args.dataset_name)
result_path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'Results',
                       args.dataset_name, 'tmp.txt')

# Node features are one-hot degree encodings, applied lazily on access.
dataset = TUDataset(
    path,
    name=args.dataset_name,
    transform=T.OneHotDegree(max_degree),
)
label = dataset.data.y
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_components = dataset.num_classes

# Model hyper-parameters parsed from '-'-separated CLI strings.
blocks = args.num_blocks
block_in_channels = [int(_) for _ in args.block_in_dim.split('-')]
block_out_channels = [int(_) for _ in args.block_out_dim.split('-')]
affine_dims = [int(_) for _ in args.affine_dim.split('-')]
attention_dims = [int(_) for _ in args.attention_dim.split('-')]
learning_rate = args.lr
dropout_rate = args.dropout
batch_size = args.batch_size
print(args)
from torch_geometric.datasets import TUDataset
import torch_geometric.transforms as T
from torch_geometric.data import DataLoader
from torch_geometric.nn import GINConv, GCNConv, SAGPooling
from torch_geometric.nn import global_max_pool
from torch_scatter import scatter_mean


class HandleNodeAttention(object):
    """Move the raw node feature into an attention target.

    TRIANGLES stores a per-node score in ``data.x``; this transform
    softmaxes it into ``data.attn`` and clears ``data.x`` so the following
    OneHotDegree transform supplies the only node features.
    """
    def __call__(self, data):
        data.attn = torch.softmax(data.x, dim=0).flatten()
        data.x = None
        return data


# TRIANGLES nodes have degree <= 14, hence the fixed one-hot size.
transform = T.Compose([HandleNodeAttention(), T.OneHotDegree(max_degree=14)])

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'TRIANGLES')
dataset = TUDataset(path, name='TRIANGLES', use_node_attr=True,
                    transform=transform)

# Fixed split: 30k train / 5k val / remainder test.
train_loader = DataLoader(dataset[:30000], batch_size=60, shuffle=True)
val_loader = DataLoader(dataset[30000:35000], batch_size=60)
test_loader = DataLoader(dataset[35000:], batch_size=60)


class Net(torch.nn.Module):
    # NOTE(review): this class continues beyond the visible chunk.
    def __init__(self, in_channels):
        super(Net, self).__init__()
        # GIN convolution followed by score-thresholded SAG pooling.
        self.conv1 = GINConv(Seq(Lin(in_channels, 64), ReLU(), Lin(64, 64)))
        self.pool1 = SAGPooling(64, min_score=0.001, GNN=GCNConv)
# Grid over `betas`, repeated `repeats` times; `results[beta]` holds one
# slot per repetition (filled later in the file).
results = {beta: [0] * repeats for beta in betas}
for r in range(repeats):
    for beta in betas:
        accuracies = []
        dataset = TUDataset(path, name=DS).shuffle()
        # Datasets without node features get degree-based ones: one-hot
        # degree when the max degree is small, otherwise a normalized
        # scalar degree.
        if dataset.data.x is None:
            max_degree = 0
            degs = []
            for data in dataset:
                degs += [degree(data.edge_index[0], dtype=torch.long)]
                max_degree = max(max_degree, degs[-1].max().item())
            if max_degree < 1000:
                dataset.transform = T.OneHotDegree(max_degree)
            else:
                deg = torch.cat(degs, dim=0).to(torch.float)
                mean, std = deg.mean().item(), deg.std().item()
                dataset.transform = NormalizedDegree(mean, std)
        try:
            dataset_num_features = dataset.num_features
        except:  # falls back when feature count cannot be determined
            dataset_num_features = 1
        dataloader = DataLoader(dataset, batch_size=batch_size)
        device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        model = GcnInfomax(args.hidden_dim, args.num_gc_layers).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# NOTE(review): the matching `if` branch precedes this chunk; nesting below
# reconstructed from flat source — confirm against the original file.
else:
    loss_fn = F.cross_entropy
    # Class prediction = argmax over logits, detached onto the CPU.
    predict_fn = lambda output: output.max(1, keepdim=True)[1].detach().cpu()

if args.torch_geom:
    if args.degree:
        if args.dataset == 'TRIANGLES':
            # TRIANGLES has a known degree bound of 14.
            max_degree = 14
        else:
            raise NotImplementedError(
                'max_degree value should be specified in advance. '
                'Try running without --torch_geom (-g) and look at dataset statistics printed out by our code.'
            )
    if args.degree:
        transforms.append(T.OneHotDegree(max_degree=max_degree, cat=False))
    dataset = TUDataset('./data/%s/' % args.dataset, name=args.dataset,
                        use_node_attr=args.use_cont_node_attr,
                        transform=T.Compose(transforms))
    train_ids, test_ids = split_ids(args,
                                    rnd_state.permutation(len(dataset)),
                                    folds=n_folds)
else:
    # Non-PyG path: custom reader over the raw dataset files.
    datareader = DataReader(args=args,
                            data_dir='./data/%s/' % args.dataset,
                            rnd_state=rnd_state,
                            folds=n_folds,
                            use_cont_node_attr=args.use_cont_node_attr)
import torch
from torch.nn import BatchNorm1d
from torch.nn.modules import Module
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.datasets import TUDataset
import torch_geometric.transforms as T
from torch_geometric.data import DataLoader
from torch_geometric.nn import global_add_pool, GATConv
from torch_geometric.utils import add_self_loops
from torch_scatter import scatter_add

# Upper bound on node degree for COLLAB's one-hot degree features.
max_degree = 10000
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'COLLAB')
dataset = TUDataset(path, name='COLLAB',
                    transform=T.OneHotDegree(max_degree)).shuffle()
data_index = range(len(dataset))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_components = dataset.num_classes

# Buffers filled by the forward hooks defined below.
node_features = []
graph_features = []
# NOTE(review): this list shadows any `degree` utility imported elsewhere.
degree = []


def append_node_features(module, input, output):
    """Forward hook: keep only the most recent node-feature activations."""
    del node_features[:-1]
    node_features.append(output.tolist())


# NOTE(review): the body of this hook continues beyond the visible chunk.
def append_graph_features(module, input, output):
def gnn_evaluation(gnn, ds_name, layers, hidden, max_num_epochs=200,
                   batch_size=128, start_lr=0.01, min_lr=0.000001, factor=0.5,
                   patience=5, num_repetitions=10, all_std=True):
    """Repeated 10-fold cross-validated evaluation of a GNN on a TUDataset.

    For every repetition, data is reshuffled and split into 10 folds; per
    fold, a 10% validation slice is carved from the training split and the
    best hyper-parameter combination (layers x hidden) is selected by
    validation accuracy, reporting its test accuracy.

    Args:
        gnn: Model factory called as ``gnn(dataset, num_layers, hidden)``.
        ds_name: TUDataset name.
        layers, hidden: Hyper-parameter grids to search over.
        max_num_epochs, batch_size, start_lr, min_lr, factor, patience:
            Training / LR-schedule settings.
        num_repetitions: Number of independent CV repetitions.
        all_std: Also return the std over all individual fold accuracies.

    Returns:
        (mean, std[, std_complete]) of test accuracies in percent.
    """
    # Load dataset and shuffle.
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets',
                    ds_name)
    dataset = TUDataset(path, name=ds_name).shuffle()

    # One-hot degree if node labels are not available.
    # The following if clause is taken from https://github.com/rusty1s/pytorch_geometric/blob/master/benchmark/kernel/datasets.py.
    if dataset.data.x is None:
        max_degree = 0
        degs = []
        for data in dataset:
            degs += [degree(data.edge_index[0], dtype=torch.long)]
            max_degree = max(max_degree, degs[-1].max().item())
        if max_degree < 1000:
            dataset.transform = T.OneHotDegree(max_degree)
        else:
            deg = torch.cat(degs, dim=0).to(torch.float)
            mean, std = deg.mean().item(), deg.std().item()
            dataset.transform = NormalizedDegree(mean, std)

    # Set device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    test_accuracies_all = []
    test_accuracies_complete = []

    for i in range(num_repetitions):
        # Test acc. over all folds.
        test_accuracies = []
        kf = KFold(n_splits=10, shuffle=True)

        # BUG FIX: Dataset.shuffle() returns a new shuffled dataset rather
        # than shuffling in place; the original discarded the result, so
        # every repetition reused the same ordering.
        dataset = dataset.shuffle()

        for train_index, test_index in kf.split(list(range(len(dataset)))):
            # Sample 10% split from training split for validation.
            train_index, val_index = train_test_split(train_index,
                                                      test_size=0.1)
            best_val_acc = 0.0
            best_test = 0.0

            # Split data.
            train_dataset = dataset[train_index.tolist()]
            val_dataset = dataset[val_index.tolist()]
            test_dataset = dataset[test_index.tolist()]

            # Prepare batching.
            train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                      shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size,
                                    shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=batch_size,
                                     shuffle=True)

            # Collect val. and test acc. over all hyperparameter combinations.
            for l in layers:
                for h in hidden:
                    # Setup model.
                    model = gnn(dataset, l, h).to(device)
                    model.reset_parameters()

                    optimizer = torch.optim.Adam(model.parameters(),
                                                 lr=start_lr)
                    # NOTE(review): mode='min' with an *accuracy* metric
                    # means the LR drops whenever accuracy fails to
                    # decrease; this looks like it should be mode='max' (or
                    # step on the validation loss).  Left unchanged —
                    # confirm intent before altering results.
                    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                        optimizer, mode='min', factor=factor,
                        patience=patience, min_lr=0.0000001)

                    for epoch in range(1, max_num_epochs + 1):
                        lr = scheduler.optimizer.param_groups[0]['lr']
                        train(train_loader, model, optimizer, device)
                        val_acc = test(val_loader, model, device)
                        scheduler.step(val_acc)

                        if val_acc > best_val_acc:
                            best_val_acc = val_acc
                            best_test = test(test_loader, model,
                                             device) * 100.0

                        # Break if learning rate is smaller 10**-6.
                        if lr < min_lr:
                            break

            test_accuracies.append(best_test)
            if all_std:
                test_accuracies_complete.append(best_test)

        test_accuracies_all.append(float(np.array(test_accuracies).mean()))

    if all_std:
        return (np.array(test_accuracies_all).mean(),
                np.array(test_accuracies_all).std(),
                np.array(test_accuracies_complete).std())
    else:
        return (np.array(test_accuracies_all).mean(),
                np.array(test_accuracies_all).std())
args = parser.parse_args()
if args.gpu is not None:
    torch.cuda.set_device(args.gpu)

# Known maximum node degrees for datasets that ship no node labels.
max_degrees = {
    'IMDB-BINARY': 135,
    'IMDB-MULTI': 88,
    'COLLAB': 491,
}

transforms = []
# Featureless datasets get a constant node feature; datasets with known
# degree bounds additionally get one-hot degree features.
if 'REDDIT' in args.dataset or args.dataset in max_degrees:
    transforms.append(T.Constant(1))
if args.dataset in max_degrees:
    transforms.append(T.OneHotDegree(max_degrees[args.dataset]))
print('transforms:', transforms)

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                args.dataset)
dataset = TUDataset(path, name=args.dataset, transform=T.Compose(transforms))

# different seeds for different folds so that one particularly good or bad init doesn't affect the results for the whole seed
# multiply folds by 10 so that nets in different seeds are initialised with different seeds
seed = args.seed + 10 * args.fold
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

dataset = dataset.shuffle()
kfold = KFold(n_splits=10)
# Pick the train/test index pair for the requested fold.
train_indices, test_indices = list(kfold.split(range(len(dataset))))[args.fold]