def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
    """Load ogbg-molhiv and expose train/val/test HIVDGL splits plus evaluator.

    `name` is only used for logging and stored on the instance; the dataset
    itself is fixed to ogbg-molhiv.
    """
    t0 = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    self.split_idx = self.dataset.get_idx_split()

    def build_split(key):
        # One HIVDGL wrapper per scaffold-split index set.
        return HIVDGL(self.dataset, self.split_idx[key],
                      norm=norm, pos_enc_dim=pos_enc_dim)

    self.train = build_split('train')
    self.val = build_split('valid')
    self.test = build_split('test')
    self.evaluator = Evaluator(name='ogbg-molhiv')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test),
              len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - t0))
def __init__(self, name, pos_enc_dim=0, norm='none',
             path='dataset/ogbg-molhiv', directions=None, verbose=True,
             **subgraph_params):
    """Load ogbg-molhiv (optionally with precomputed subgraph features).

    Fix: ``directions`` previously defaulted to the mutable list
    ``['subgraphs']``, which is shared across calls; it now defaults to
    ``None`` and a fresh list is created per call (same behavior otherwise).
    """
    if directions is None:
        directions = ['subgraphs']
    start = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    ##### MODIFIED CODE HERE
    if 'subgraphs' in directions:
        # Dataset with substructure counts; `encode` one-hot encodes them
        # according to subgraph_params['id_encoding'].
        self.dataset, self.split_idx = prepare_dataset(
            path, name, **subgraph_params)
        print("One hot encoding substructure counts... ", end='')
        self.dataset, self.d_id = encode(self.dataset,
                                         subgraph_params['id_encoding'])
    else:
        self.dataset = DglGraphPropPredDataset(name=name, root=path)
        self.split_idx = self.dataset.get_idx_split()
        self.d_id = None
    self.train = HIVDGL(self.dataset,
                        self.split_idx['train'],
                        norm=norm,
                        pos_enc_dim=pos_enc_dim,
                        directions=directions,
                        **subgraph_params)
    self.val = HIVDGL(self.dataset,
                      self.split_idx['valid'],
                      norm=norm,
                      pos_enc_dim=pos_enc_dim,
                      directions=directions,
                      **subgraph_params)
    self.test = HIVDGL(self.dataset,
                       self.split_idx['test'],
                       norm=norm,
                       pos_enc_dim=pos_enc_dim,
                       directions=directions,
                       **subgraph_params)
    ##### MODIFIED CODE HERE
    self.evaluator = Evaluator(name='ogbg-molhiv')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test),
              len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
def load_ogbg(name, device=th.device('cpu'), root='/home/eva_share_users/zhuyu'):
    """Load an OGB graph-property-prediction dataset and build dataloaders.

    Node features are augmented with the per-node mean of incoming edge
    features (for ogbg-ppa, which has no node features, the mean alone is
    used as the feature).

    Returns (train_loader, valid_loader, test_loader, in_channels,
    out_channels + 1), where out_channels is the maximum integer label seen.

    NOTE(review): `device` is accepted but unused; kept for caller
    compatibility. Valid/test loaders are shuffled as in the original —
    harmless for metric computation but worth confirming it is intentional.
    """
    from ogb.graphproppred import DglGraphPropPredDataset
    from tqdm import tqdm
    print('load', name)
    data = DglGraphPropPredDataset(name=name, root=root)
    out_channels = 0
    in_channels = 0
    for graph, label in tqdm(data):
        if name == 'ogbg-ppa':
            # No input node features: synthesize as mean of incident edges.
            graph.ndata['feat'] = dgl.ops.copy_e_mean(graph,
                                                      graph.edata['feat'])
        else:
            # Same aggregation, appended to the existing node features.
            # Replaces the original O(N*E) per-node Python loop with one
            # message-passing call; isolated nodes now receive a zero vector
            # instead of NaN from the original 0/0 division.
            mean_ef = dgl.ops.copy_e_mean(graph, graph.edata['feat'].float())
            graph.ndata['feat'] = th.cat((graph.ndata['feat'], mean_ef),
                                         dim=1)
        in_channels = graph.ndata['feat'].shape[1]
        try:
            out_channels = max(out_channels, int(label))
        except (TypeError, ValueError) as err:
            # Was a bare `except:` that dropped into an IPython shell; fail
            # loudly instead so non-interactive runs surface the bad label.
            raise ValueError('non-scalar label encountered: %r' % (label,)) from err
    split_idx = data.get_idx_split()
    print('finish loading', name)
    from dgl.dataloading import GraphDataLoader

    def make_loader(split):
        return GraphDataLoader(data[split_idx[split]],
                               batch_size=256,
                               shuffle=True)

    return (make_loader('train'), make_loader('valid'), make_loader('test'),
            in_channels, out_channels + 1)
class HIVDataset(Dataset):
    """ogbg-molhiv wrapped as train/val/test HIVDGL splits with an evaluator."""

    def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
        """Download/load ogbg-molhiv and build the three scaffold splits."""
        t_start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()
        self.train = HIVDGL(self.dataset, self.split_idx['train'],
                            norm=norm, pos_enc_dim=pos_enc_dim)
        self.val = HIVDGL(self.dataset, self.split_idx['valid'],
                          norm=norm, pos_enc_dim=pos_enc_dim)
        self.test = HIVDGL(self.dataset, self.split_idx['test'],
                           norm=norm, pos_enc_dim=pos_enc_dim)
        self.evaluator = Evaluator(name='ogbg-molhiv')
        if verbose:
            print('train, test, val sizes :', len(self.train),
                  len(self.test), len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - t_start))

    @staticmethod
    def _sqrt_inv_norm(sizes):
        # For each graph of size s, a column of s entries all equal to
        # sqrt(1/s); columns are concatenated over the batch.
        cols = [torch.FloatTensor(s, 1).fill_(1. / float(s)) for s in sizes]
        return torch.cat(cols).sqrt()

    # form a mini batch from a given list of samples = [(graph, label) pairs]
    def collate(self, samples):
        """Batch (graph, label) pairs into one DGL graph plus size norms."""
        graphs, labels = map(list, zip(*samples))
        labels = torch.cat(labels).long()
        snorm_n = self._sqrt_inv_norm([g.number_of_nodes() for g in graphs])
        snorm_e = self._sqrt_inv_norm([g.number_of_edges() for g in graphs])
        return dgl.batch(graphs), labels, snorm_n, snorm_e

    def _add_self_loops(self):
        # function for adding self loops
        # this function will be called only if self_loop flag is True
        for split in (self.train, self.val, self.test):
            split.graph_lists = [self_loop(g) for g in split.graph_lists]
def __init__(self, name, re_split=False, pos_enc_dim=0, norm='none',
             verbose=True):
    """Load ogbg-molhiv; optionally replace the scaffold split.

    When ``re_split`` is True the official split is replaced by a random
    32000 / 4564 / 4563 train/valid/test split.

    Fix: the dataset size was hard-coded as 41127; it is now taken from
    ``len(self.dataset)`` (identical for ogbg-molhiv), and index-range
    comprehensions were replaced by plain slicing.
    """
    start = time.time()
    if verbose:
        print("[I] Loading dataset %s..." % (name))
    self.name = name
    self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    self.split_idx = self.dataset.get_idx_split()
    if re_split:
        # Random permutation of all graph indices; boundaries keep the
        # original fixed sizes (ogbg-molhiv has 41127 graphs).
        ind = list(range(len(self.dataset)))
        rd.shuffle(ind)
        self.split_idx = {
            'test': torch.tensor(ind[36564:]),
            'train': torch.tensor(ind[:32000]),
            'valid': torch.tensor(ind[32000:36564]),
        }
    self.train = HIVDGL(self.dataset, self.split_idx['train'],
                        norm=norm, pos_enc_dim=pos_enc_dim)
    self.val = HIVDGL(self.dataset, self.split_idx['valid'],
                      norm=norm, pos_enc_dim=pos_enc_dim)
    self.test = HIVDGL(self.dataset, self.split_idx['test'],
                       norm=norm, pos_enc_dim=pos_enc_dim)
    self.evaluator = Evaluator(name='ogbg-molhiv')
    if verbose:
        print('train, test, val sizes :', len(self.train), len(self.test),
              len(self.val))
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
class HIVDataset(Dataset):
    """ogbg-molhiv splits (train/val/test) as HIVDGL datasets plus evaluator."""

    def __init__(self, name, verbose=True):
        """Load ogbg-molhiv and build the three scaffold splits."""
        t0 = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()
        self.train, self.val, self.test = (
            HIVDGL(self.dataset, self.split_idx[key])
            for key in ('train', 'valid', 'test'))
        self.evaluator = Evaluator(name='ogbg-molhiv')
        if verbose:
            print('train, test, val sizes :', len(self.train),
                  len(self.test), len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - t0))

    # form a mini batch from a given list of samples = [(graph, label) pairs]
    def collate(self, samples):
        """Batch (graph, label) pairs into (batched_graph, long labels)."""
        graphs, labels = zip(*samples)
        batched_graph = dgl.batch(list(graphs))
        return batched_graph, torch.cat(list(labels)).long()

    def _add_self_loops(self):
        # function for adding self loops
        # this function will be called only if self_loop flag is True
        for split in (self.train, self.val, self.test):
            split.graph_lists = [self_loop(g) for g in split.graph_lists]
def main(args):
    """Train and evaluate a GNN on an ogbg-mol* dataset, logging to Tensorboard."""
    # Seed both torch and numpy for reproducibility.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    # Load dataset and evaluator
    dataset = DglGraphPropPredDataset(name=args.dataset)
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    if args.pos_enc_dim > 0:
        # Add graph positional encodings
        print("Adding PEs...")
        dataset.graphs = [
            add_positional_encoding(g, args.pos_enc_dim)
            for g in tqdm(dataset.graphs)
        ]

    # Basic pre-processing
    if args.dataset == 'ogbg-molpcba':
        # Drop training graphs with no edges (they break message passing).
        print("Removing training graphs with 0 edges...")
        train_split = []
        for idx, g in enumerate(tqdm(dataset.graphs)):
            if idx in split_idx["train"] and g.number_of_edges() != 0:
                train_split.append(idx)
        split_idx["train"] = torch.LongTensor(train_split)

    # Prepare dataloaders
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    # Initialize model, optimizer and scheduler
    if args.gnn in ['gated-gcn', 'gcn', 'mlp']:
        model = GNN_mol(gnn_type=args.gnn,
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        dropout=args.dropout,
                        batch_norm=True,
                        residual=True,
                        pos_enc_dim=args.pos_enc_dim,
                        graph_pooling=args.pooling,
                        virtualnode=args.virtualnode)
        model.to(device)
        print(model)
        total_param = 0
        for param in model.parameters():
            total_param += np.prod(list(param.data.size()))
        print(f'Total parameters: {total_param}')
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        # LR decays when validation loss plateaus (scheduler.step below).
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=args.lr_reduce_factor,
            patience=args.lr_scheduler_patience,
            verbose=True)
    else:
        raise ValueError('Invalid GNN type')

    # Define loss function
    cls_criterion = torch.nn.BCEWithLogitsLoss()

    # Create Tensorboard logger
    start_time_str = time.strftime("%Y%m%dT%H%M%S")
    log_dir = os.path.join(
        "logs", args.dataset,
        f"{args.expt_name}-{args.gnn}-L{args.num_layer}-h{args.emb_dim}-d{args.dropout}-LR{args.lr}",
        f"{start_time_str}-GPU{args.device}")
    tb_logger = SummaryWriter(log_dir)

    # Training loop
    train_curve = []
    valid_curve = []
    test_curve = []
    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        tb_logger.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)

        print('Training...')
        train(model, device, train_loader, optimizer, cls_criterion)

        print('Evaluating...')
        train_loss, train_perf = eval(model, device, train_loader, evaluator,
                                      cls_criterion)
        valid_loss, valid_perf = eval(model, device, valid_loader, evaluator,
                                      cls_criterion)
        # Test loss is intentionally discarded; only the metric is kept.
        _, test_perf = eval(model, device, test_loader, evaluator,
                            cls_criterion)

        # Log statistics to Tensorboard, etc.
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })
        tb_logger.add_scalar('loss/train', train_loss, epoch)
        tb_logger.add_scalar(f'{dataset.eval_metric}/train',
                             train_perf[dataset.eval_metric], epoch)
        tb_logger.add_scalar('loss/valid', valid_loss, epoch)
        tb_logger.add_scalar(f'{dataset.eval_metric}/valid',
                             valid_perf[dataset.eval_metric], epoch)
        tb_logger.add_scalar(f'{dataset.eval_metric}/test',
                             test_perf[dataset.eval_metric], epoch)

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

        if args.lr_scheduler_patience > 0:
            # Reduce LR using scheduler
            scheduler.step(valid_loss)

    # Model selection: epoch with best validation score (max for
    # classification metrics, min otherwise).
    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    torch.save(
        {
            'args': args,
            # NOTE(review): this stores the bound __repr__ method object,
            # not repr(model) — likely a missing call; confirm before use.
            'model': model.__repr__,
            'total_param': total_param,
            'BestEpoch': best_val_epoch,
            'Validation': valid_curve[best_val_epoch],
            'Test': test_curve[best_val_epoch],
            'Train': train_curve[best_val_epoch],
            'BestTrain': best_train,
        }, os.path.join(log_dir, "results.pt"))
def main():
    """CLI entry point: train a GNN baseline on an ogbg-mol* dataset."""
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbgmol* data with Pytorch Geometrics')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=512,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument(
        '--rank',
        type=int,
        default=512,
        help='dimensionality of rank units in GNNs (default: 300)')
    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    parser.add_argument('--lr', type=float, default=0.003)
    parser.add_argument('--wd',
                        type=float,
                        default=5e-5,
                        help='Weight decay (L2 loss on parameters).')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    # The "ogbg-" prefix is prepended here, so --dataset takes the short
    # name (e.g. "molhiv").
    dataset = DglGraphPropPredDataset(name="ogbg-" + args.dataset,
                                      root='torch_geometric_data/')
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(name="ogbg-" + args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    model = GNN(num_tasks=dataset.num_tasks,
                num_layer=args.num_layer,
                emb_dim=args.emb_dim,
                rank=args.rank,
                drop_ratio=args.drop_ratio).to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.wd)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer, dataset.task_type)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    # Model selection on validation: max for classification metrics,
    # min otherwise (e.g. regression losses).
    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
def main():
    """CLI entry point: train a GIN/GCN baseline on ogbg-ppa with DGL."""
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppa with DGL')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help='GNN gin, gcn, gin-virtual, gcn-virtual (default: gin-virtual)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--n_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--hidden_feats',
                        type=int,
                        default=300,
                        help='number of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs for training (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-ppa",
                        help='dataset name (default: ogbg-ppa)')
    parser.add_argument('--filename',
                        type=str,
                        help='filename to output result')
    args = parser.parse_args()

    if args.filename is None:
        # Default the results filename to the model name.
        args.filename = args.gnn

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    # data loading and splitting
    dataset = DglGraphPropPredDataset(name=args.dataset)

    # initialize node features
    # ogbg-ppa graphs carry no input node features; start from zero ids.
    for i in range(len(dataset)):
        dataset[i][0].ndata['h'] = torch.zeros(
            dataset[i][0].number_of_nodes()).long()

    splitted_idx = dataset.get_idx_split()

    # automatic evaluator taking dataset name as input
    evaluator = Evaluator(args.dataset)

    # using collate_dgl
    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl,
                             num_workers=args.num_workers)

    # Map the --gnn flag onto (gnn_type, virtual_node).
    if args.gnn == 'gin':
        gnn_type = 'gin'
        virtual_node = False
    if args.gnn == 'gcn':
        gnn_type = 'gcn'
        virtual_node = False
    if args.gnn == 'gin-virtual':
        gnn_type = 'gin'
        virtual_node = True
    if args.gnn == 'gcn-virtual':
        gnn_type = 'gcn'
        virtual_node = True

    model = GNNOGBPredictor(
        in_edge_feats=dataset[0][0].edata['feat'].shape[-1],
        hidden_feats=args.hidden_feats,
        n_layers=args.n_layers,
        n_tasks=int(dataset.num_classes),
        dropout=args.dropout,
        gnn_type=gnn_type,
        virtual_node=virtual_node).to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    time_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        t0 = time.time()
        train(model, device, train_loader, criterion, optimizer)
        t1 = time.time()
        if epoch >= 3:
            # Exclude the first two warm-up epochs from the timing average.
            time_curve.append(t1 - t0)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })
        if epoch >= 3:
            print('Training Time: ', time_curve[-1])

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    # Model selection on best validation accuracy.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    print('Avg Training Time: ', np.mean(time_curve))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
def main():
    """Train a GIN/GCN (optionally with a virtual node) on an ogbg-mol* dataset.

    Fixes vs. the original: the model was built from ``args.hidden_feats`` and
    ``args.dropout``, but this parser only defines ``--n_hidden`` and
    ``--drop_ratio`` — so model construction raised AttributeError. Also drops
    a redundant second ``model = model.to(device)`` (the model is already
    moved to ``device`` in the constructor chain).
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-mol* data with DGL')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gcn',
        help=
        'gin, or gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--n_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--n_hidden',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--graph_pooling_type',
                        type=str,
                        default="mean",
                        choices=["sum", "mean", "max"],
                        help='type of graph pooling: sum, mean or max')
    parser.add_argument('--feature',
                        type=str,
                        default="full",
                        choices=["full", "simple"],
                        help='full feature or simple feature')
    parser.add_argument(
        '--filename',
        type=str,
        default="gin-virtual.pth",
        help='filename to output result (default: gin-virtual.pth)')
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
        torch.cuda.set_device(args.device)
    else:
        device = torch.device("cpu")

    ### dataloading and splitting
    dataset = DglGraphPropPredDataset(name=args.dataset)
    print('Metric', dataset.eval_metric)
    if args.feature == 'simple':
        print('using simple feature')
    else:
        print('using full feature')
    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    # using collate_dgl
    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl,
                             num_workers=args.num_workers)

    # Map the --gnn flag onto (gnn_type, virtual_node).
    if args.gnn == 'gin':
        gnn_type = 'gin'
        virtual_node = False
    if args.gnn == 'gcn':
        gnn_type = 'gcn'
        virtual_node = False
    if args.gnn == 'gin-virtual':
        gnn_type = 'gin'
        virtual_node = True
    if args.gnn == 'gcn-virtual':
        gnn_type = 'gcn'
        virtual_node = True

    model = GNNOGBPredictor(
        in_edge_feats=dataset[0][0].edata['feat'].shape[-1],
        hidden_feats=args.n_hidden,  # was args.hidden_feats (undefined)
        n_layers=args.n_layers,
        n_tasks=int(dataset.num_classes),
        dropout=args.drop_ratio,  # was args.dropout (undefined)
        gnn_type=gnn_type,
        virtual_node=virtual_node).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    time_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        t0 = time.time()
        train(args.gnn, model, args.feature, device, train_loader, optimizer,
              dataset.task_type)
        if epoch >= 3:
            # Exclude the first two warm-up epochs from the timing average.
            t1 = time.time()
            time_curve.append(t1 - t0)
            print('Training Time: ', time_curve[-1])

        print('Evaluating...')
        train_perf = eval(args.gnn, model, args.feature, device, train_loader,
                          evaluator, dataset.task_type)
        valid_perf = eval(args.gnn, model, args.feature, device, valid_loader,
                          evaluator, dataset.task_type)
        test_perf = eval(args.gnn, model, args.feature, device, test_loader,
                         evaluator, dataset.task_type)
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        # dataset.eval_metric = 'rocauc'
        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    # Model selection on validation: max for classification, min otherwise.
    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    print('Avg Training Time: ', np.mean(time_curve))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
def main():
    """Train a GCN on ogbg-molhiv over multiple runs, logging validation/test
    scores via Logger; evaluation only happens when --eval is set."""
    parser = argparse.ArgumentParser(description='OGBN-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval',
                        action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(name='ogbg-molhiv')

    train_loader = GraphDataLoader(dataset[split_idx["train"]],
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    val_loader = GraphDataLoader(dataset[split_idx["valid"]],
                                 batch_size=args.eval_batch_size,
                                 shuffle=True,
                                 num_workers=0)
    test_loader = GraphDataLoader(dataset[split_idx["test"]],
                                  batch_size=args.eval_batch_size,
                                  shuffle=True,
                                  num_workers=0)

    model = GCN(args.emb_dim,
                num_classes=dataset.num_tasks,
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        # Fresh parameters and optimizer for each independent run.
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            if epoch >= 3:
                # Average epoch time, excluding two warm-up epochs.
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))
            if not args.eval:
                continue

            val_rocauc = test(model, device, val_loader,
                              evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader,
                               evaluator)[dataset.eval_metric]
            # Train score is not computed; logged as 0.0 placeholder.
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.4f} '
                      f'Test: {test_rocauc:.4f}')
        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
def test_datasetsaver():
    """Round-trip test for DatasetSaver: load an existing OGB dataset, save it
    via DatasetSaver, then reload it from the saved meta_dict with every
    backend (library-agnostic, PyG, DGL)."""
    # test on graph classification
    # ogbg-molhiv
    # Switch this constant to exercise a different task category.
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    # Graph tasks store a list of graphs; node/link tasks store one graph.
    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    # Link-prediction datasets have no target labels.
    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))
    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()
    print(meta_dict)

    print('Now testing.')
    # Each dataset is deliberately constructed twice to exercise the
    # meta_dict-based reload/caching path.
    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
class HIVDataset(Dataset):
    """ogbg-molhiv dataset wrapper producing train/val/test ``HIVDGL`` splits.

    When 'subgraphs' is among the requested directions, substructure counts
    are pre-computed via ``prepare_dataset`` and one-hot encoded; otherwise
    the plain OGB dataset with its official split is used.
    """

    def __init__(self, name, pos_enc_dim=0, norm='none',
                 path='dataset/ogbg-molhiv', directions=None,
                 verbose=True, **subgraph_params):
        """Load the dataset, build the three splits and the OGB evaluator.

        Args:
            name: OGB dataset name (e.g. 'ogbg-molhiv').
            pos_enc_dim: Laplacian positional-encoding dimension (0 = off).
            norm: normalisation mode forwarded to ``HIVDGL``.
            path: root directory for the raw/processed dataset.
            directions: feature directions; defaults to ['subgraphs'].
                Default is expressed via ``None`` to avoid the shared
                mutable-default-argument pitfall.
            verbose: print progress and timing information.
            **subgraph_params: forwarded to ``prepare_dataset`` / ``HIVDGL``;
                must contain 'id_encoding' when subgraph counting is used.
        """
        if directions is None:
            directions = ['subgraphs']
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        if 'subgraphs' in directions:
            # Substructure-count pipeline: counts are computed once, then
            # one-hot encoded so models can consume them as features.
            self.dataset, self.split_idx = prepare_dataset(
                path, name, **subgraph_params)
            print("One hot encoding substructure counts... ", end='')
            self.dataset, self.d_id = encode(self.dataset,
                                             subgraph_params['id_encoding'])
        else:
            # Plain OGB pipeline with the official scaffold split.
            self.dataset = DglGraphPropPredDataset(name=name, root=path)
            self.split_idx = self.dataset.get_idx_split()
            self.d_id = None
        self.train = HIVDGL(self.dataset, self.split_idx['train'], norm=norm,
                            pos_enc_dim=pos_enc_dim, directions=directions,
                            **subgraph_params)
        self.val = HIVDGL(self.dataset, self.split_idx['valid'], norm=norm,
                          pos_enc_dim=pos_enc_dim, directions=directions,
                          **subgraph_params)
        self.test = HIVDGL(self.dataset, self.split_idx['test'], norm=norm,
                           pos_enc_dim=pos_enc_dim, directions=directions,
                           **subgraph_params)
        self.evaluator = Evaluator(name='ogbg-molhiv')
        if verbose:
            print('train, test, val sizes :', len(self.train),
                  len(self.test), len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    # form a mini batch from a given list of samples = [(graph, label) pairs]
    def collate(self, samples):
        """Batch (graph, label) pairs; also return sqrt'ed 1/size normalisers
        per node and per edge (used by size-invariant GNN readouts)."""
        graphs, labels = map(list, zip(*samples))
        labels = torch.cat(labels).long()
        tab_sizes_n = [g.number_of_nodes() for g in graphs]
        tab_snorm_n = [
            torch.FloatTensor(size, 1).fill_(1. / float(size))
            for size in tab_sizes_n
        ]
        snorm_n = torch.cat(tab_snorm_n).sqrt()
        tab_sizes_e = [g.number_of_edges() for g in graphs]
        tab_snorm_e = [
            torch.FloatTensor(size, 1).fill_(1. / float(size))
            for size in tab_sizes_e
        ]
        snorm_e = torch.cat(tab_snorm_e).sqrt()
        batched_graph = dgl.batch(graphs)
        return batched_graph, labels, snorm_n, snorm_e

    def _add_self_loops(self):
        # Called only when the self_loop flag is True: rewrites every split's
        # graph list with self-loop-augmented copies.
        self.train.graph_lists = [self_loop(g) for g in self.train.graph_lists]
        self.val.graph_lists = [self_loop(g) for g in self.val.graph_lists]
        self.test.graph_lists = [self_loop(g) for g in self.test.graph_lists]
def main():
    """Train, validate and test a DeeperGCN classifier on an OGB graph
    dataset (accuracy metric), keeping the best model by validation acc."""
    # check cuda
    device = f'cuda:{args.gpu}' if args.gpu >= 0 and torch.cuda.is_available(
    ) else 'cpu'

    # load ogb dataset & evaluator
    dataset = DglGraphPropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    g, _ = dataset[0]
    edge_feat_dim = g.edata['feat'].size()[-1]
    n_classes = int(dataset.num_classes)

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl)

    # load model
    # NOTE(review): node_feat_dim is set to the EDGE feature dimensionality —
    # presumably this dataset (ogbg-ppa style) has no node features and node
    # embeddings are initialised from edge features; confirm before changing.
    model = DeeperGCN(dataset=args.dataset,
                      node_feat_dim=edge_feat_dim,
                      edge_feat_dim=edge_feat_dim,
                      hid_dim=args.hid_dim,
                      out_dim=n_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout,
                      norm=args.norm,
                      beta=args.beta,
                      mlp_layers=args.mlp_layers).to(device)
    print(model)

    opt = optim.Adam(model.parameters(), lr=args.lr)

    # training & validation & testing
    best_acc = 0
    best_model = copy.deepcopy(model)

    print('---------- Training ----------')
    for i in range(args.epochs):
        train_loss = train(model, device, train_loader, opt)
        # Validation (and best-model tracking) only every eval_steps epochs.
        if i % args.eval_steps == 0:
            train_acc = test(model, device, train_loader, evaluator)
            valid_acc = test(model, device, valid_loader, evaluator)
            print(
                f'Epoch {i} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Valid Acc: {valid_acc:.4f}'
            )
            if valid_acc > best_acc:
                best_acc = valid_acc
                best_model = copy.deepcopy(model)
        else:
            print(f'Epoch {i} | Train Loss: {train_loss:.4f}')

    print('---------- Testing ----------')
    # Final test score comes from the snapshot with the best validation acc.
    test_acc = test(best_model, device, test_loader, evaluator)
    print(f'Test Acc: {test_acc}')
def main():
    """Parse arguments and train GraphCPPooling on an OGB graph-classification
    dataset, early-stopping on validation accuracy.

    Reports the test-set score obtained at the best-validation epoch.
    """
    parser = argparse.ArgumentParser(
        description=
        'PyTorch graph convolutional neural net for whole-graph classification'
    )
    parser.add_argument('--dataset', type=str, default="MUTAG",
                        help='name of dataset (default: MUTAG)')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument(
        '--iters_per_epoch', type=int, default=50,
        help='number of iterations per each epoch (default: 50)')
    parser.add_argument('--epochs', type=int, default=350,
                        help='number of epochs to train (default: 350)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument(
        '--seed', type=int, default=0,
        help='random seed for splitting the dataset into 10 (default: 0)')
    parser.add_argument('--hidden_dim', type=int, default=3,
                        help='number of hidden units (default: 64)')
    parser.add_argument('--rank_dim', type=int, default=10,
                        help='number of hidden units (default: 64)')
    parser.add_argument('--final_dropout', type=float, default=0.5,
                        help='final layer dropout (default: 0.5)')
    parser.add_argument(
        '--graph_pooling_type', type=str, default="sum",
        choices=["sum", "average"],
        help='Pooling for over nodes in a graph: sum or average')
    parser.add_argument(
        '--neighbor_pooling_type', type=str, default="sum",
        choices=["sum", "average", "max"],
        help='Pooling for over neighboring nodes: sum, average or max')
    parser.add_argument(
        '--learn_eps', action="store_true",
        help=
        'Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.'
    )
    parser.add_argument(
        '--degree_as_tag', action="store_true",
        help=
        'let the input node features be the degree of nodes (heuristics for unlabeled graph)'
    )
    parser.add_argument('--filename', type=str, default="",
                        help='output file')
    args = parser.parse_args()

    # set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    dataset = DglGraphPropPredDataset(name="ogbg-" + args.dataset,
                                      root='torch_geometric_data/')
    split_idx = dataset.get_idx_split()
    train_graphs = dataset[split_idx["train"]]
    valid_graphs = dataset[split_idx["valid"]]
    test_graphs = dataset[split_idx["test"]]
    # Number of classes = max label + 1 over the whole dataset.
    num_classes = (torch.max(
        torch.LongTensor([dataset[idx][1] for idx in range(len(dataset))])) +
                   1).numpy()

    model = GraphCPPooling(train_graphs[0][0].ndata['feat'].shape[1],
                           args.hidden_dim, args.rank_dim, num_classes,
                           args.final_dropout, device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
    evaluator = Evaluator(name="ogbg-" + args.dataset)

    # Early stopping on validation accuracy with a fixed patience.
    patience = 50
    vacc_mx = 0.0
    curr_step = 0
    for epoch in range(1, args.epochs + 1):
        # NOTE(review): scheduler stepped at epoch start (pre-1.1 PyTorch
        # ordering); kept as-is to preserve the original LR schedule.
        scheduler.step()
        avg_loss = train(args, model, device, train_graphs, optimizer, epoch)
        train_acc, val_acc = validation(args, model, evaluator, device,
                                        train_graphs, valid_graphs, epoch,
                                        args.dataset)
        if val_acc >= vacc_mx:
            curr_step = 0
            # BUGFIX: the reported "best test" score is now computed on the
            # held-out test split; it was previously evaluated on the
            # validation split (test_graphs was built but never used).
            best_test = test(args, model, evaluator, device, test_graphs,
                             epoch, args.dataset)
            vacc_mx = val_acc
            print("Best val: %.4f, best test: %.4f" % (vacc_mx, best_test))
        else:
            curr_step += 1
            if curr_step >= patience:
                break
def train_molhiv(args, device, metrics_dict):
    """Wire up data loaders, model, optimizer and Trainer for ogbg-molhiv,
    run training, and optionally evaluate on the test split.

    Args:
        args: namespace carrying model/optimizer/trainer configuration
            (model_type, model_parameters, transfer_layers, metrics, ...).
        device: torch device to train on.
        metrics_dict: mapping metric-name -> metric object; filtered down
            to ``args.metrics`` for the Trainer.
    """
    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=32,
                              shuffle=True,
                              collate_fn=collate_dgl)
    val_loader = DataLoader(dataset[split_idx["valid"]],
                            batch_size=32,
                            shuffle=False,
                            collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=32,
                             shuffle=False,
                             collate_fn=collate_dgl)

    # Model class is resolved by name; edge features are optional.
    model = globals()[args.model_type](
        node_dim=dataset[0][0].ndata['feat'].shape[1],
        edge_dim=dataset[0][0].edata['feat'].shape[1]
        if args.use_e_features else 0,
        **args.model_parameters)
    print('model trainable params: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    # NOTE(review): collate_function is resolved here but the loaders above
    # all use collate_dgl; kept in case the lookup/constructor has side
    # effects — confirm whether it should be passed to the DataLoaders.
    collate_function = globals()[
        args.collate_function] if args.collate_params == {} else globals()[
            args.collate_function](**args.collate_params)

    metrics = {metric: metrics_dict[metric] for metric in args.metrics}
    tensorboard_functions = {
        function: TENSORBOARD_FUNCTIONS[function]
        for function in args.tensorboard_functions
    }

    # Needs "from torch.optim import *" and "from models import *" to work.
    # Split parameters into transferred (pretrained) vs freshly initialised,
    # so the transferred ones can train with their own learning rate.
    transferred_params = [
        v for k, v in model.named_parameters()
        if any(transfer_name in k for transfer_name in args.transfer_layers)
    ]
    new_params = [
        v for k, v in model.named_parameters() if all(
            transfer_name not in k for transfer_name in args.transfer_layers)
    ]
    # Idiom fix: identity comparison with None (was `== None`).
    transfer_lr = args.optimizer_params[
        'lr'] if args.transferred_lr is None else args.transferred_lr
    optim = globals()[args.optimizer]([{
        'params': new_params
    }, {
        'params': transferred_params,
        'lr': transfer_lr
    }], **args.optimizer_params)

    trainer = Trainer(model=model,
                      args=args,
                      metrics=metrics,
                      main_metric=args.main_metric,
                      main_metric_goal=args.main_metric_goal,
                      optim=optim,
                      loss_func=globals()[args.loss_func](**args.loss_params),
                      device=device,
                      tensorboard_functions=tensorboard_functions,
                      scheduler_step_per_batch=args.scheduler_step_per_batch)
    trainer.train(train_loader, val_loader)
    if args.eval_on_test:
        trainer.evaluation(test_loader, data_split='test')
def main():
    """GNN baselines on ogbg-mol* data with DGL: parse args, train for
    args.epochs, evaluate all splits each epoch, log curves to TensorBoard
    and report the test score at the best-validation epoch."""
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbgmol* data with DGL')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='Cheb_net',
                        help='GNN (default: Cheb_net)')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr', type=float, default=1e-3,
                        help='learning rate (default: 1e-3)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = DglGraphPropPredDataset(name=args.dataset)

    if not os.path.exists('results'):
        os.makedirs('results')
    # TensorBoard run directory: results/<filename>logs/<dataset>/<gnn>
    writer = SummaryWriter(log_dir='results/' + args.filename + 'logs/' +
                           args.dataset + '/' + args.gnn)

    split_idx = dataset.get_idx_split()

    ### automatic evaluator: takes the dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl,
                              pin_memory=True)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl,
                              pin_memory=True)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl,
                             pin_memory=True)

    if args.gnn in ['gated-gcn', 'mlp', 'Cheb_net']:
        model = GNN(gnn_type=args.gnn,
                    num_tasks=dataset.num_tasks,
                    num_layer=args.num_layer,
                    emb_dim=args.emb_dim,
                    dropout=args.dropout,
                    batch_norm=True,
                    residual=True,
                    graph_pooling="mean")
        model.to(device)
    else:
        raise ValueError('Invalid GNN type')
    print(model)

    # Count trainable-tensor elements for a quick model-size report.
    total_param = 0
    for param in model.parameters():
        total_param += np.prod(list(param.data.size()))
    print(f'Total parameters: {total_param}')

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # Per-epoch metric curves for all three splits.
    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer, dataset.task_type)

        print('Evaluating...')
        # NOTE: `eval` here is the project's evaluation helper (it shadows
        # the builtin at module scope).
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])
        writer.add_scalar('Val', valid_perf[dataset.eval_metric], epoch)
        writer.add_scalar('Test', test_perf[dataset.eval_metric], epoch)
        writer.add_scalar('Train', train_perf[dataset.eval_metric], epoch)

    # Higher-is-better for classification metrics, lower-is-better otherwise.
    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
    writer.add_scalar('Best Val', valid_curve[best_val_epoch], best_val_epoch)
    writer.add_scalar('Best Test', test_curve[best_val_epoch], best_val_epoch)
    writer.add_scalar('Best Train', train_curve[best_val_epoch],
                      best_val_epoch)
    writer.add_scalar('BestTrain', best_train)
    writer.close()
def run(args):
    """Train a HierarchicalPathNetwork on ogbg-molhiv and write ROC-AUC
    scores (validation and test) to '<args.out>.csv'.

    Trains with BCE on sigmoid outputs, reduces the LR on validation-loss
    plateaus, and early-stops once the LR falls to <= 1% of its initial
    value.
    """
    from ogb.graphproppred import DglGraphPropPredDataset, Evaluator, collate_dgl
    from torch.utils.data import DataLoader
    dataset = DglGraphPropPredDataset(name="ogbg-molhiv")

    import os
    # Cache the (expensive) heterograph conversion on disk.
    if not os.path.exists("heterographs.bin"):
        dataset.graphs = [hpno.heterograph(graph) for graph in dataset.graphs]
        from dgl.data.utils import save_graphs
        save_graphs("heterographs.bin", dataset.graphs)
    else:
        from dgl.data.utils import load_graphs
        dataset.graphs = load_graphs("heterographs.bin")[0]

    evaluator = Evaluator(name="ogbg-molhiv")
    in_features = 9   # ogbg-molhiv atom-feature dimensionality
    out_features = 1  # single binary task
    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=128,
                              drop_last=True,
                              shuffle=True,
                              collate_fn=collate_dgl)
    # Validation/test are evaluated as a single full-split batch each.
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=len(split_idx["valid"]),
                              shuffle=False,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=len(split_idx["test"]),
                             shuffle=False,
                             collate_fn=collate_dgl)

    model = hpno.HierarchicalPathNetwork(
        in_features=in_features,
        out_features=args.hidden_features,
        hidden_features=args.hidden_features,
        depth=args.depth,
        readout=hpno.GraphReadout(
            in_features=args.hidden_features,
            out_features=out_features,
            hidden_features=args.hidden_features,
        )
    )

    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate,
                                 weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, "min", factor=0.5, patience=20)
    # Hoisted: one loss module for the whole run instead of constructing
    # torch.nn.BCELoss() anew on every batch/epoch (identical results).
    loss_fn = torch.nn.BCELoss()

    for idx_epoch in range(args.n_epochs):
        print(idx_epoch, flush=True)
        model.train()
        for g, y in train_loader:
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            optimizer.zero_grad()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            # NOTE(review): BCE on explicit sigmoid outputs;
            # BCEWithLogitsLoss would be more numerically stable but
            # slightly changes numerics, so it is left as-is.
            loss = loss_fn(
                input=y_hat.sigmoid(),
                target=y,
            )
            loss.backward()
            optimizer.step()
        model.eval()
        with torch.no_grad():
            g, y = next(iter(valid_loader))
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            loss = loss_fn(
                input=y_hat.sigmoid(),
                target=y,
            )
        scheduler.step(loss)
        # Early stop once the plateau scheduler has decayed the LR 100x.
        if optimizer.param_groups[0]["lr"] <= 0.01 * args.learning_rate:
            break

    # Final evaluation on CPU; no gradients needed (saves memory vs the
    # original, which built an autograd graph during evaluation).
    model = model.cpu()
    with torch.no_grad():
        g, y = next(iter(valid_loader))
        rocauc_vl = evaluator.eval(
            {
                "y_true": y.float(),
                "y_pred": model.forward(
                    g, g.nodes['n1'].data["feat"].float()).sigmoid()
            }
        )["rocauc"]

        g, y = next(iter(test_loader))
        rocauc_te = evaluator.eval(
            {
                "y_true": y.float(),
                "y_pred": model.forward(
                    g, g.nodes['n1'].data["feat"].float()).sigmoid()
            }
        )["rocauc"]

    import pandas as pd
    df = pd.DataFrame(
        {
            args.data: {
                "rocauc_te": rocauc_te,
                "rocauc_vl": rocauc_vl,
            }
        }
    )
    df.to_csv("%s.csv" % args.out)
def main():
    """Train, validate and test DeeperGCN on an OGB molecule dataset
    (ROC-AUC metric), keeping the best model by validation AUC and
    reporting average per-epoch training time."""
    # check cuda
    device = f'cuda:{args.gpu}' if args.gpu >= 0 and torch.cuda.is_available(
    ) else 'cpu'

    # load ogb dataset & evaluator
    dataset = DglGraphPropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    g, _ = dataset[0]
    node_feat_dim = g.ndata['feat'].size()[-1]
    edge_feat_dim = g.edata['feat'].size()[-1]
    n_classes = dataset.num_tasks

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl)

    # load model
    model = DeeperGCN(dataset=args.dataset,
                      node_feat_dim=node_feat_dim,
                      edge_feat_dim=edge_feat_dim,
                      hid_dim=args.hid_dim,
                      out_dim=n_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout,
                      learn_beta=args.learn_beta).to(device)
    print(model)

    opt = optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = nn.BCEWithLogitsLoss()

    # training & validation & testing
    best_auc = 0
    best_model = copy.deepcopy(model)
    # Per-epoch wall-clock times; the first 5 epochs are treated as warm-up
    # and excluded from the average.
    times = []

    print('---------- Training ----------')
    for i in range(args.epochs):
        t1 = time.time()
        train_loss = train(model, device, train_loader, opt, loss_fn)
        t2 = time.time()
        if i >= 5:
            times.append(t2 - t1)

        train_auc = test(model, device, train_loader, evaluator)
        valid_auc = test(model, device, valid_loader, evaluator)

        print(
            f'Epoch {i} | Train Loss: {train_loss:.4f} | Train Auc: {train_auc:.4f} | Valid Auc: {valid_auc:.4f}'
        )

        if valid_auc > best_auc:
            best_auc = valid_auc
            best_model = copy.deepcopy(model)

    print('---------- Testing ----------')
    test_acc = test(best_model, device, test_loader, evaluator)
    print(f'Test Auc: {test_acc}')
    # BUGFIX: guard the average — with args.epochs <= 5 `times` stays empty
    # and the original division raised ZeroDivisionError.
    if times:
        print('Times/epoch: ', sum(times) / len(times))