Example #1
    def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim)

        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #2
    def __init__(self,
                 name,
                 pos_enc_dim=0,
                 norm='none',
                 path='dataset/ogbg-molhiv',
                 directions=['subgraphs'],
                 verbose=True,
                 **subgraph_params):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name

        ##### MODIFIED CODE HERE
        if 'subgraphs' in directions:
            self.dataset, self.split_idx = prepare_dataset(
                path, name, **subgraph_params)
            print("One hot encoding substructure counts... ", end='')
            self.dataset, self.d_id = encode(self.dataset,
                                             subgraph_params['id_encoding'])
        else:
            self.dataset = DglGraphPropPredDataset(name=name, root=path)
            self.split_idx = self.dataset.get_idx_split()
            self.d_id = None

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim,
                            directions=directions,
                            **subgraph_params)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim,
                          directions=directions,
                          **subgraph_params)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim,
                           directions=directions,
                           **subgraph_params)
        ##### MODIFIED CODE HERE
        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #3
def load_ogbg(name,
              device=th.device('cpu'),
              root='/home/eva_share_users/zhuyu'):
    from ogb.graphproppred import DglGraphPropPredDataset

    print('load', name)
    data = DglGraphPropPredDataset(name=name, root=root)
    from tqdm import tqdm
    out_channels = 0
    for graph in tqdm(data):
        if name == 'ogbg-ppa':
            graph[0].ndata['feat'] = dgl.ops.copy_e_mean(
                graph[0], graph[0].edata['feat'])
        else:
            ef = graph[0].edata['feat']
            edge = graph[0].edges()[1]
            # Per-node mean of incoming edge features; this loop is O(N*E)
            # and divides by zero for nodes with in-degree 0 (see the
            # vectorized sketch after this example).
            H = th.zeros(graph[0].num_nodes(), 3)
            for i in range(graph[0].num_nodes()):
                mask = th.eq(edge, i)
                H[i, :] += th.matmul(mask.float(), ef.float())
                H[i, :] /= graph[0].in_degrees(i)
            graph[0].ndata['feat'] = th.cat((graph[0].ndata['feat'], H), dim=1)
        in_channels = graph[0].ndata['feat'].shape[1]
        try:
            out_channels = max(out_channels, int(graph[1]))
        except (TypeError, ValueError):
            # Skip labels that cannot be cast to a scalar int
            # (e.g. multi-task or NaN targets).
            pass

    split_idx = data.get_idx_split()
    print('finish loading', name)
    from dgl.dataloading import GraphDataLoader
    train_loader = GraphDataLoader(
        data[split_idx['train']],
        batch_size=256,
        shuffle=True,
    )
    valid_loader = GraphDataLoader(
        data[split_idx['valid']],
        batch_size=256,
        shuffle=True,
    )
    test_loader = GraphDataLoader(
        data[split_idx['test']],
        batch_size=256,
        shuffle=True,
    )
    return train_loader, valid_loader, test_loader, in_channels, out_channels + 1
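The per-node loop above computes the mean of each node's incoming edge features one node at a time. DGL ships a fused operator for exactly this aggregation (the ogbg-ppa branch already uses it), and it returns zeros for isolated nodes instead of dividing by zero; a sketch of the equivalent vectorized form:

import dgl
import torch as th

def append_mean_incoming_edge_feats(g):
    # Mean of edata['feat'] over each node's incoming edges, in one call.
    H = dgl.ops.copy_e_mean(g, g.edata['feat'].float())
    g.ndata['feat'] = th.cat((g.ndata['feat'], H), dim=1)
    return g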
Example #4
class HIVDataset(Dataset):
    def __init__(self, name, pos_enc_dim=0, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim)

        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    # form a mini batch from a given list of samples = [(graph, label) pairs]
    def collate(self, samples):
        # The input samples is a list of pairs (graph, label).
        graphs, labels = map(list, zip(*samples))
        labels = torch.cat(labels).long()
        tab_sizes_n = [graphs[i].number_of_nodes() for i in range(len(graphs))]
        tab_snorm_n = [
            torch.FloatTensor(size, 1).fill_(1. / float(size))
            for size in tab_sizes_n
        ]
        snorm_n = torch.cat(tab_snorm_n).sqrt()
        tab_sizes_e = [graphs[i].number_of_edges() for i in range(len(graphs))]
        tab_snorm_e = [
            torch.FloatTensor(size, 1).fill_(1. / float(size))
            for size in tab_sizes_e
        ]
        snorm_e = torch.cat(tab_snorm_e).sqrt()
        batched_graph = dgl.batch(graphs)

        return batched_graph, labels, snorm_n, snorm_e

    def _add_self_loops(self):
        # function for adding self loops
        # this function will be called only if self_loop flag is True

        self.train.graph_lists = [self_loop(g) for g in self.train.graph_lists]
        self.val.graph_lists = [self_loop(g) for g in self.val.graph_lists]
        self.test.graph_lists = [self_loop(g) for g in self.test.graph_lists]
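Because collate returns the batched graph together with per-graph node/edge normalizers, it plugs directly into a standard PyTorch DataLoader; a minimal sketch:

from torch.utils.data import DataLoader

dataset = HIVDataset('ogbg-molhiv')
train_loader = DataLoader(dataset.train,
                          batch_size=128,
                          shuffle=True,
                          collate_fn=dataset.collate)
for batched_graph, labels, snorm_n, snorm_e in train_loader:
    break  # one batch; the model forward pass would go here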
Example #5
    def __init__(self,
                 name,
                 re_split=False,
                 pos_enc_dim=0,
                 norm='none',
                 verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()
        if re_split:
            ind = [i for i in range(41127)]
            rd.shuffle(ind)
            self.split_idx = {
                'test': torch.tensor([ind[i] for i in range(36564, 41127)]),
                'train': torch.tensor([ind[i] for i in range(32000)]),
                'valid': torch.tensor([ind[i] for i in range(32000, 36564)])
            }

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim)

        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #6
class HIVDataset(Dataset):
    def __init__(self, name, verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
        self.split_idx = self.dataset.get_idx_split()

        self.train = HIVDGL(self.dataset, self.split_idx['train'])
        self.val = HIVDGL(self.dataset, self.split_idx['valid'])
        self.test = HIVDGL(self.dataset, self.split_idx['test'])

        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test), len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    # form a mini batch from a given list of samples = [(graph, label) pairs]
    def collate(self, samples):
        # The input samples is a list of pairs (graph, label).
        graphs, labels = map(list, zip(*samples))
        labels = torch.cat(labels).long()
        batched_graph = dgl.batch(graphs)

        return batched_graph, labels

    def _add_self_loops(self):
        # function for adding self loops
        # this function will be called only if self_loop flag is True

        self.train.graph_lists = [self_loop(g) for g in self.train.graph_lists]
        self.val.graph_lists = [self_loop(g) for g in self.val.graph_lists]
        self.test.graph_lists = [self_loop(g) for g in self.test.graph_lists]
Example #7
def main(args):

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    # Load dataset and evaluator
    dataset = DglGraphPropPredDataset(name=args.dataset)
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    if args.pos_enc_dim > 0:
        # Add graph positional encodings
        print("Adding PEs...")
        dataset.graphs = [
            add_positional_encoding(g, args.pos_enc_dim)
            for g in tqdm(dataset.graphs)
        ]

    # Basic pre-processing
    if args.dataset == 'ogbg-molpcba':
        print("Removing training graphs with 0 edges...")
        train_split = []
        for idx, g in enumerate(tqdm(dataset.graphs)):
            if idx in split_idx["train"] and g.number_of_edges() != 0:
                train_split.append(idx)
        split_idx["train"] = torch.LongTensor(train_split)

    # Prepare dataloaders
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    # Initialize model, optimizer and scheduler
    if args.gnn in ['gated-gcn', 'gcn', 'mlp']:
        model = GNN_mol(gnn_type=args.gnn,
                        num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        dropout=args.dropout,
                        batch_norm=True,
                        residual=True,
                        pos_enc_dim=args.pos_enc_dim,
                        graph_pooling=args.pooling,
                        virtualnode=args.virtualnode)
        model.to(device)
        print(model)
        total_param = 0
        for param in model.parameters():
            total_param += np.prod(list(param.data.size()))
        print(f'Total parameters: {total_param}')

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=args.lr_reduce_factor,
            patience=args.lr_scheduler_patience,
            verbose=True)
    else:
        raise ValueError('Invalid GNN type')

    # Define loss function
    cls_criterion = torch.nn.BCEWithLogitsLoss()

    # Create Tensorboard logger
    start_time_str = time.strftime("%Y%m%dT%H%M%S")
    log_dir = os.path.join(
        "logs", args.dataset,
        f"{args.expt_name}-{args.gnn}-L{args.num_layer}-h{args.emb_dim}-d{args.dropout}-LR{args.lr}",
        f"{start_time_str}-GPU{args.device}")
    tb_logger = SummaryWriter(log_dir)

    # Training loop
    train_curve = []
    valid_curve = []
    test_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        tb_logger.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)

        print('Training...')
        train(model, device, train_loader, optimizer, cls_criterion)

        print('Evaluating...')
        train_loss, train_perf = eval(model, device, train_loader, evaluator,
                                      cls_criterion)
        valid_loss, valid_perf = eval(model, device, valid_loader, evaluator,
                                      cls_criterion)
        _, test_perf = eval(model, device, test_loader, evaluator,
                            cls_criterion)

        # Log statistics to Tensorboard, etc.
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        tb_logger.add_scalar('loss/train', train_loss, epoch)
        tb_logger.add_scalar(f'{dataset.eval_metric}/train',
                             train_perf[dataset.eval_metric], epoch)
        tb_logger.add_scalar('loss/valid', valid_loss, epoch)
        tb_logger.add_scalar(f'{dataset.eval_metric}/valid',
                             valid_perf[dataset.eval_metric], epoch)
        tb_logger.add_scalar(f'{dataset.eval_metric}/test',
                             test_perf[dataset.eval_metric], epoch)

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

        if args.lr_scheduler_patience > 0:
            # Reduce LR using scheduler
            scheduler.step(valid_loss)

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    torch.save(
        {
            'args': args,
            'model': repr(model),
            'total_param': total_param,
            'BestEpoch': best_val_epoch,
            'Validation': valid_curve[best_val_epoch],
            'Test': test_curve[best_val_epoch],
            'Train': train_curve[best_val_epoch],
            'BestTrain': best_train,
        }, os.path.join(log_dir, "results.pt"))
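The train helper is not shown in this snippet; a hypothetical reconstruction matching the call train(model, device, train_loader, optimizer, cls_criterion), with the NaN-label masking that OGB's mol datasets require:

import torch

def train(model, device, loader, optimizer, cls_criterion):
    # Hypothetical sketch: the signature is taken from the call site above,
    # and collate_dgl is assumed to yield (batched_graph, labels).
    model.train()
    for batched_graph, labels in loader:
        batched_graph = batched_graph.to(device)
        labels = labels.to(device).float()
        optimizer.zero_grad()
        logits = model(batched_graph)  # forward signature assumed
        is_labeled = labels == labels  # NaN != NaN masks missing labels
        loss = cls_criterion(logits[is_labeled], labels[is_labeled])
        loss.backward()
        optimizer.step()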
Example #8
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-mol* data with DGL')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=512,
        help='dimensionality of hidden units in GNNs (default: 512)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="molhiv",
                        help='dataset name without the ogbg- prefix (default: molhiv)')
    parser.add_argument(
        '--rank',
        type=int,
        default=512,
        help='dimensionality of rank units in GNNs (default: 512)')
    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    parser.add_argument('--lr', type=float, default=0.003)
    parser.add_argument('--wd',
                        type=float,
                        default=5e-5,
                        help='Weight decay (L2 loss on parameters).')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = DglGraphPropPredDataset(name="ogbg-" + args.dataset,
                                      root='torch_geometric_data/')

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(name="ogbg-" + args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    model = GNN(num_tasks=dataset.num_tasks,
                num_layer=args.num_layer,
                emb_dim=args.emb_dim,
                rank=args.rank,
                drop_ratio=args.drop_ratio).to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.wd)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer, dataset.task_type)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if args.filename:
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
Example #9
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppa with DGL')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gin-virtual',
        help='GNN gin, gcn, gin-virtual, gcn-virtual (default: gin-virtual)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--n_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--hidden_feats',
                        type=int,
                        default=300,
                        help='number of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs for training (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-ppa",
                        help='dataset name (default: ogbg-ppa)')
    parser.add_argument('--filename',
                        type=str,
                        help='filename to output result')
    args = parser.parse_args()

    if args.filename is None:
        args.filename = args.gnn

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    # data loading and splitting
    dataset = DglGraphPropPredDataset(name=args.dataset)
    # initialize node features
    for i in range(len(dataset)):
        dataset[i][0].ndata['h'] = torch.zeros(
            dataset[i][0].number_of_nodes()).long()
    splitted_idx = dataset.get_idx_split()

    # automatic evaluator taking dataset name as input
    evaluator = Evaluator(args.dataset)

    # using collate_dgl
    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        gnn_type = 'gin'
        virtual_node = False
    if args.gnn == 'gcn':
        gnn_type = 'gcn'
        virtual_node = False
    if args.gnn == 'gin-virtual':
        gnn_type = 'gin'
        virtual_node = True
    if args.gnn == 'gcn-virtual':
        gnn_type = 'gcn'
        virtual_node = True

    model = GNNOGBPredictor(
        in_edge_feats=dataset[0][0].edata['feat'].shape[-1],
        hidden_feats=args.hidden_feats,
        n_layers=args.n_layers,
        n_tasks=int(dataset.num_classes),
        dropout=args.dropout,
        gnn_type=gnn_type,
        virtual_node=virtual_node).to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    time_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        t0 = time.time()
        train(model, device, train_loader, criterion, optimizer)
        t1 = time.time()
        if epoch >= 3:
            time_curve.append(t1 - t0)
        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })
        if epoch >= 3:
            print('Training Time: ', time_curve[-1])

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    print('Avg Training Time: ', np.mean(time_curve))
    if args.filename:
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
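The chained gnn_type ifs in this example (repeated in the next one) leave gnn_type and virtual_node undefined for any other --gnn value; a table-driven sketch that fails fast on unknown values:

GNN_VARIANTS = {
    'gin': ('gin', False),
    'gcn': ('gcn', False),
    'gin-virtual': ('gin', True),
    'gcn-virtual': ('gcn', True),
}
try:
    gnn_type, virtual_node = GNN_VARIANTS[args.gnn]
except KeyError:
    raise ValueError(f'Invalid GNN type: {args.gnn}')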
Example #10
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-mol* data with DGL')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn',
        type=str,
        default='gcn',
        help='gin, gin-virtual, gcn, or gcn-virtual (default: gcn)')
    parser.add_argument('--drop_ratio',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--n_layers',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--n_hidden',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--graph_pooling_type',
                        type=str,
                        default="mean",
                        choices=["sum", "mean", "max"],
                        help='type of graph pooling: sum, mean or max')
    parser.add_argument('--feature',
                        type=str,
                        default="full",
                        choices=["full", "simple"],
                        help='full feature or simple feature')
    parser.add_argument(
        '--filename',
        type=str,
        default="gin-virtual.pth",
        help='filename to output result (default: gin-virtual.pth)')
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
        torch.cuda.set_device(args.device)
    else:
        device = torch.device("cpu")

    ### dataloading and splitting
    dataset = DglGraphPropPredDataset(name=args.dataset)
    print('Metric', dataset.eval_metric)
    if args.feature == 'simple':
        print('using simple feature')
    else:
        print('using full feature')

    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    # using collate_dgl
    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        gnn_type = 'gin'
        virtual_node = False
    if args.gnn == 'gcn':
        gnn_type = 'gcn'
        virtual_node = False
    if args.gnn == 'gin-virtual':
        gnn_type = 'gin'
        virtual_node = True
    if args.gnn == 'gcn-virtual':
        gnn_type = 'gcn'
        virtual_node = True

    model = GNNOGBPredictor(
        in_edge_feats=dataset[0][0].edata['feat'].shape[-1],
        hidden_feats=args.n_hidden,
        n_layers=args.n_layers,
        n_tasks=int(dataset.num_classes),
        dropout=args.drop_ratio,
        gnn_type=gnn_type,
        virtual_node=virtual_node).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    time_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        t0 = time.time()
        train(args.gnn, model, args.feature, device, train_loader, optimizer,
              dataset.task_type)
        if epoch >= 3:
            t1 = time.time()
            time_curve.append(t1 - t0)
            print('Training Time: ', time_curve[-1])

        print('Evaluating...')
        train_perf = eval(args.gnn, model, args.feature, device, train_loader,
                          evaluator, dataset.task_type)
        valid_perf = eval(args.gnn, model, args.feature, device, valid_loader,
                          evaluator, dataset.task_type)
        test_perf = eval(args.gnn, model, args.feature, device, test_loader,
                         evaluator, dataset.task_type)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })
        # dataset.eval_metric = 'rocauc'
        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    print('Avg Training Time: ', np.mean(time_curve))

    if args.filename:
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
Example #11
def main():
    parser = argparse.ArgumentParser(description='OGBG-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval',
                        action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(name='ogbg-molhiv')
    train_loader = GraphDataLoader(dataset[split_idx["train"]],
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    val_loader = GraphDataLoader(dataset[split_idx["valid"]],
                                 batch_size=args.eval_batch_size,
                                 shuffle=True,
                                 num_workers=0)
    test_loader = GraphDataLoader(dataset[split_idx["test"]],
                                  batch_size=args.eval_batch_size,
                                  shuffle=True,
                                  num_workers=0)

    model = GCN(args.emb_dim,
                num_classes=dataset.num_tasks,
                num_layers=args.num_layers,
                dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            val_rocauc = test(model, device, val_loader,
                              evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader,
                               evaluator)[dataset.eval_metric]
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.4f} '
                      f'Test: {test_rocauc:.4f}')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
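Logger is not defined in this snippet; a minimal hypothetical stand-in that satisfies the add_result/print_statistics calls above (OGB's example repositories ship a fuller version):

import torch

class Logger:
    # Hypothetical minimal stand-in; results are (train, valid, test) triples.
    def __init__(self, runs, args=None):
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        self.results[run].append(result)

    def print_statistics(self, run=None):
        runs = range(len(self.results)) if run is None else [run]
        for r in runs:
            if not self.results[r]:
                continue
            res = torch.tensor(self.results[r])
            best = res[:, 1].argmax().item()  # epoch with the best validation score
            print(f'Run {r + 1:02d}: '
                  f'Highest Valid: {res[:, 1].max():.4f}, '
                  f'Final Test: {res[best, 2]:.4f}')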
Example #12
def test_datasetsaver():
    # Exercise DatasetSaver on one task category at a time; set test_task
    # to 'graph', 'node', 'link', 'heteronode' or 'heterolink'.
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()

    print(meta_dict)

    print('Now testing.')

    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())

    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
Example #13
class HIVDataset(Dataset):
    def __init__(self,
                 name,
                 pos_enc_dim=0,
                 norm='none',
                 path='dataset/ogbg-molhiv',
                 directions=['subgraphs'],
                 verbose=True,
                 **subgraph_params):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name

        ##### MODIFIED CODE HERE
        if 'subgraphs' in directions:
            self.dataset, self.split_idx = prepare_dataset(
                path, name, **subgraph_params)
            print("One hot encoding substructure counts... ", end='')
            self.dataset, self.d_id = encode(self.dataset,
                                             subgraph_params['id_encoding'])
        else:
            self.dataset = DglGraphPropPredDataset(name=name, root=path)
            self.split_idx = self.dataset.get_idx_split()
            self.d_id = None

        self.train = HIVDGL(self.dataset,
                            self.split_idx['train'],
                            norm=norm,
                            pos_enc_dim=pos_enc_dim,
                            directions=directions,
                            **subgraph_params)
        self.val = HIVDGL(self.dataset,
                          self.split_idx['valid'],
                          norm=norm,
                          pos_enc_dim=pos_enc_dim,
                          directions=directions,
                          **subgraph_params)
        self.test = HIVDGL(self.dataset,
                           self.split_idx['test'],
                           norm=norm,
                           pos_enc_dim=pos_enc_dim,
                           directions=directions,
                           **subgraph_params)
        ##### MODIFIED CODE HERE
        self.evaluator = Evaluator(name='ogbg-molhiv')

        if verbose:
            print('train, test, val sizes :', len(self.train), len(self.test),
                  len(self.val))
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    # form a mini batch from a given list of samples = [(graph, label) pairs]
    def collate(self, samples):
        # The input samples is a list of pairs (graph, label).
        graphs, labels = map(list, zip(*samples))
        labels = torch.cat(labels).long()
        tab_sizes_n = [graphs[i].number_of_nodes() for i in range(len(graphs))]
        tab_snorm_n = [
            torch.FloatTensor(size, 1).fill_(1. / float(size))
            for size in tab_sizes_n
        ]
        snorm_n = torch.cat(tab_snorm_n).sqrt()
        tab_sizes_e = [graphs[i].number_of_edges() for i in range(len(graphs))]
        tab_snorm_e = [
            torch.FloatTensor(size, 1).fill_(1. / float(size))
            for size in tab_sizes_e
        ]
        snorm_e = torch.cat(tab_snorm_e).sqrt()
        batched_graph = dgl.batch(graphs)

        return batched_graph, labels, snorm_n, snorm_e

    def _add_self_loops(self):
        # function for adding self loops
        # this function will be called only if self_loop flag is True

        self.train.graph_lists = [self_loop(g) for g in self.train.graph_lists]
        self.val.graph_lists = [self_loop(g) for g in self.val.graph_lists]
        self.test.graph_lists = [self_loop(g) for g in self.test.graph_lists]
Example #14
def main():
    # check cuda
    use_gpu = args.gpu >= 0 and torch.cuda.is_available()
    device = f'cuda:{args.gpu}' if use_gpu else 'cpu'

    # load ogb dataset & evaluator
    dataset = DglGraphPropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    g, _ = dataset[0]
    edge_feat_dim = g.edata['feat'].size()[-1]
    n_classes = int(dataset.num_classes)

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl)

    # load model
    model = DeeperGCN(dataset=args.dataset,
                      node_feat_dim=edge_feat_dim,
                      edge_feat_dim=edge_feat_dim,
                      hid_dim=args.hid_dim,
                      out_dim=n_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout,
                      norm=args.norm,
                      beta=args.beta,
                      mlp_layers=args.mlp_layers).to(device)

    print(model)

    opt = optim.Adam(model.parameters(), lr=args.lr)

    # training & validation & testing
    best_acc = 0
    best_model = copy.deepcopy(model)

    print('---------- Training ----------')
    for i in range(args.epochs):
        train_loss = train(model, device, train_loader, opt)

        if i % args.eval_steps == 0:
            train_acc = test(model, device, train_loader, evaluator)
            valid_acc = test(model, device, valid_loader, evaluator)

            print(
                f'Epoch {i} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Valid Acc: {valid_acc:.4f}'
            )

            if valid_acc > best_acc:
                best_acc = valid_acc
                best_model = copy.deepcopy(model)
        else:
            print(f'Epoch {i} | Train Loss: {train_loss:.4f}')

    print('---------- Testing ----------')
    test_acc = test(best_model, device, test_loader, evaluator)
    print(f'Test Acc: {test_acc}')
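A hypothetical sketch of the test helper assumed above, deferring the metric to the OGB Evaluator (evaluator.eval takes a dict with y_true and y_pred and returns the dataset's metric, e.g. rocauc for ogbg-molhiv):

import torch

@torch.no_grad()
def test(model, device, loader, evaluator):
    # Hypothetical sketch: signature taken from the call sites above.
    model.eval()
    y_true, y_pred = [], []
    for batched_graph, labels in loader:
        logits = model(batched_graph.to(device))  # forward signature assumed
        y_true.append(labels.view(logits.shape).cpu())
        y_pred.append(logits.cpu())
    scores = evaluator.eval({'y_true': torch.cat(y_true),
                             'y_pred': torch.cat(y_pred)})
    return scores[evaluator.eval_metric]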
Example #15
def main():
    parser = argparse.ArgumentParser(
        description=
        'PyTorch graph convolutional neural net for whole-graph classification'
    )
    parser.add_argument('--dataset',
                        type=str,
                        default="MUTAG",
                        help='name of dataset (default: MUTAG)')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument(
        '--iters_per_epoch',
        type=int,
        default=50,
        help='number of iterations per each epoch (default: 50)')
    parser.add_argument('--epochs',
                        type=int,
                        default=350,
                        help='number of epochs to train (default: 350)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument(
        '--seed',
        type=int,
        default=0,
        help='random seed for splitting the dataset into 10 (default: 0)')
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=3,
                        help='number of hidden units (default: 3)')
    parser.add_argument('--rank_dim',
                        type=int,
                        default=10,
                        help='number of rank units (default: 10)')
    parser.add_argument('--final_dropout',
                        type=float,
                        default=0.5,
                        help='final layer dropout (default: 0.5)')
    parser.add_argument(
        '--graph_pooling_type',
        type=str,
        default="sum",
        choices=["sum", "average"],
        help='Pooling for over nodes in a graph: sum or average')
    parser.add_argument(
        '--neighbor_pooling_type',
        type=str,
        default="sum",
        choices=["sum", "average", "max"],
        help='Pooling for over neighboring nodes: sum, average or max')
    parser.add_argument(
        '--learn_eps',
        action="store_true",
        help=
        'Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.'
    )
    parser.add_argument(
        '--degree_as_tag',
        action="store_true",
        help=
        'let the input node features be the degree of nodes (heuristics for unlabeled graph)'
    )
    parser.add_argument('--filename', type=str, default="", help='output file')
    args = parser.parse_args()

    #set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    #graphs, num_classes = load_data(args.dataset, args.degree_as_tag)
    dataset = DglGraphPropPredDataset(name="ogbg-" + args.dataset,
                                      root='torch_geometric_data/')
    split_idx = dataset.get_idx_split()
    train_graphs = dataset[split_idx["train"]]
    valid_graphs = dataset[split_idx["valid"]]
    test_graphs = dataset[split_idx["test"]]
    num_classes = (torch.max(
        torch.LongTensor([dataset[idx][1]
                          for idx in range(len(dataset))])) + 1).numpy()


    model = GraphCPPooling(train_graphs[0][0].ndata['feat'].shape[1],
                           args.hidden_dim, args.rank_dim, num_classes,
                           args.final_dropout, device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
    evaluator = Evaluator(name="ogbg-" + args.dataset)

    patience = 50
    vacc_mx = 0.0
    curr_step = 0
    for epoch in range(1, args.epochs + 1):
        avg_loss = train(args, model, device, train_graphs, optimizer, epoch)
        # Step the LR schedule once per epoch, after the optimizer updates.
        scheduler.step()
        train_acc, val_acc = validation(args, model, evaluator, device,
                                        train_graphs, valid_graphs, epoch,
                                        args.dataset)

        if val_acc >= vacc_mx:
            curr_step = 0
            # evaluate on the held-out test split at the best validation point
            best_test = test(args, model, evaluator, device, test_graphs,
                             epoch, args.dataset)
            vacc_mx = val_acc
            print("Best val: %.4f, best test: %.4f" % (vacc_mx, best_test))
        else:
            curr_step += 1
            if curr_step >= patience:
                break
Example #16
def train_molhiv(args, device, metrics_dict):
    dataset = DglGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=32,
                              shuffle=True,
                              collate_fn=collate_dgl)
    val_loader = DataLoader(dataset[split_idx["valid"]],
                            batch_size=32,
                            shuffle=False,
                            collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=32,
                             shuffle=False,
                             collate_fn=collate_dgl)

    model = globals()[args.model_type](
        node_dim=dataset[0][0].ndata['feat'].shape[1],
        edge_dim=dataset[0][0].edata['feat'].shape[1]
        if args.use_e_features else 0,
        **args.model_parameters)
    print('model trainable params: ',
          sum(p.numel() for p in model.parameters() if p.requires_grad))
    collate_function = globals()[
        args.collate_function] if args.collate_params == {} else globals()[
            args.collate_function](**args.collate_params)

    metrics = {metric: metrics_dict[metric] for metric in args.metrics}
    tensorboard_functions = {
        function: TENSORBOARD_FUNCTIONS[function]
        for function in args.tensorboard_functions
    }

    # Needs "from torch.optim import *" and "from models import *" to work
    transferred_params = [
        v for k, v in model.named_parameters()
        if any(transfer_name in k for transfer_name in args.transfer_layers)
    ]
    new_params = [
        v for k, v in model.named_parameters() if all(
            transfer_name not in k for transfer_name in args.transfer_layers)
    ]
    transfer_lr = (args.optimizer_params['lr']
                   if args.transferred_lr is None else args.transferred_lr)
    optim = globals()[args.optimizer]([{
        'params': new_params
    }, {
        'params': transferred_params,
        'lr': transfer_lr
    }], **args.optimizer_params)
    trainer = Trainer(model=model,
                      args=args,
                      metrics=metrics,
                      main_metric=args.main_metric,
                      main_metric_goal=args.main_metric_goal,
                      optim=optim,
                      loss_func=globals()[args.loss_func](**args.loss_params),
                      device=device,
                      tensorboard_functions=tensorboard_functions,
                      scheduler_step_per_batch=args.scheduler_step_per_batch)
    trainer.train(train_loader, val_loader)

    if args.eval_on_test:
        trainer.evaluation(test_loader, data_split='test')
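
# Hypothetical usage (not part of the original snippet): assumes an
# argparse.Namespace exposing the fields referenced above and a metrics_dict
# mapping metric names to metric callables.
#
#   args = load_train_config()          # hypothetical config helper
#   train_molhiv(args, device=torch.device('cuda:0'),
#                metrics_dict={'rocauc': ROCAUCMetric()})  # ROCAUCMetric assumed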
Example #17
0
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-mol* data with DGL')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn',
                        type=str,
                        default='Cheb_net',
                        help='GNN (default: Cheb_net)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer',
        type=int,
        default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim',
        type=int,
        default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-3,
                        help='learning rate (default: 1e-3)')
    parser.add_argument('--num_workers',
                        type=int,
                        default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--filename',
                        type=str,
                        default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device(f"cuda:{args.device}"
                          if torch.cuda.is_available() else "cpu")

    ### automatic dataloading and splitting
    dataset = DglGraphPropPredDataset(name=args.dataset)

    os.makedirs('results', exist_ok=True)
    writer = SummaryWriter(log_dir=os.path.join(
        'results', args.filename, 'logs', args.dataset, args.gnn))

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl,
                              pin_memory=True)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl,
                              pin_memory=True)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl,
                             pin_memory=True)

    if args.gnn in ['gated-gcn', 'mlp', 'Cheb_net']:
        model = GNN(gnn_type=args.gnn,
                    num_tasks=dataset.num_tasks,
                    num_layer=args.num_layer,
                    emb_dim=args.emb_dim,
                    dropout=args.dropout,
                    batch_norm=True,
                    residual=True,
                    graph_pooling="mean")
        model.to(device)
    else:
        raise ValueError('Invalid GNN type')

    print(model)
    total_param = 0
    for param in model.parameters():
        total_param += np.prod(list(param.data.size()))
    print(f'Total parameters: {total_param}')

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer, dataset.task_type)

        print('Evaluating...')
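        # Score all three splits with the official OGB evaluator each epoch.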
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

        writer.add_scalar('Val', valid_perf[dataset.eval_metric], epoch)
        writer.add_scalar('Test', test_perf[dataset.eval_metric], epoch)
        writer.add_scalar('Train', train_perf[dataset.eval_metric], epoch)

    # Model selection on validation: maximize for classification metrics
    # (e.g. ROC-AUC), minimize for regression metrics (e.g. RMSE).
    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if args.filename != '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)

    writer.add_scalar('Best Val', valid_curve[best_val_epoch], best_val_epoch)
    writer.add_scalar('Best Test', test_curve[best_val_epoch], best_val_epoch)
    writer.add_scalar('Best Train', train_curve[best_val_epoch],
                      best_val_epoch)
    writer.add_scalar('BestTrain', best_train)
    writer.close()
Example #18
0
def run(args):
    from ogb.graphproppred import DglGraphPropPredDataset, Evaluator, collate_dgl
    from torch.utils.data import DataLoader

    dataset = DglGraphPropPredDataset(name="ogbg-molhiv")

    import os
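    # Cache the converted heterographs on disk; rebuilding them is expensive.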
    if not os.path.exists("heterographs.bin"):
        dataset.graphs = [hpno.heterograph(graph) for graph in dataset.graphs]
        from dgl.data.utils import save_graphs
        save_graphs("heterographs.bin", dataset.graphs)
    else:
        from dgl.data.utils import load_graphs
        dataset.graphs = load_graphs("heterographs.bin")[0]

    evaluator = Evaluator(name="ogbg-molhiv")
    in_features = 9
    out_features = 1

    split_idx = dataset.get_idx_split()
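    # Train in minibatches; evaluate validation and test in one full-split
    # batch each, so next(iter(loader)) below yields the whole split.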
    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=128, drop_last=True, shuffle=True, collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=len(split_idx["valid"]), shuffle=False, collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=len(split_idx["test"]), shuffle=False, collate_fn=collate_dgl)

    model = hpno.HierarchicalPathNetwork(
        in_features=in_features,
        out_features=args.hidden_features,
        hidden_features=args.hidden_features,
        depth=args.depth,
        readout=hpno.GraphReadout(
            in_features=args.hidden_features,
            out_features=out_features,
            hidden_features=args.hidden_features,
        )
    )

    if torch.cuda.is_available():
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(), args.learning_rate, weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", factor=0.5, patience=20)

    for idx_epoch in range(args.n_epochs):
        print(idx_epoch, flush=True)
        model.train()
        for g, y in train_loader:
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            optimizer.zero_grad()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
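            # BCELoss expects probabilities, hence the explicit sigmoid on the
            # model's logits below.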
            loss = torch.nn.BCELoss()(
                input=y_hat.sigmoid(),
                target=y,
            )
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            g, y = next(iter(valid_loader))
            y = y.float()
            if torch.cuda.is_available():
                g = g.to("cuda:0")
                y = y.cuda()
            y_hat = model.forward(g, g.nodes['n1'].data["feat"].float())
            loss = torch.nn.BCELoss()(
                input=y_hat.sigmoid(),
                target=y,
            )
            scheduler.step(loss)

        if optimizer.param_groups[0]["lr"] <= 0.01 * args.learning_rate: break

    model = model.cpu()
    g, y = next(iter(valid_loader))
    rocauc_vl = evaluator.eval(
        {
            "y_true": y.float(),
            "y_pred": model.forward(g, g.nodes['n1'].data["feat"].float()).sigmoid()
        }
    )["rocauc"]

    g, y = next(iter(test_loader))
    rocauc_te = evaluator.eval(
        {
            "y_true": y.float(),
            "y_pred": model.forward(g, g.nodes['n1'].data["feat"].float()).sigmoid()
        }
    )["rocauc"]

    import pandas as pd
    df = pd.DataFrame(
        {
            args.data: {
                "rocauc_te": rocauc_te,
                "rocauc_vl": rocauc_vl,
            }
        }
    )

    df.to_csv("%s.csv" % args.out)
Example #19
0
def main():
    # check cuda
    device = (f'cuda:{args.gpu}'
              if args.gpu >= 0 and torch.cuda.is_available() else 'cpu')

    # load ogb dataset & evaluator
    dataset = DglGraphPropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    g, _ = dataset[0]
    node_feat_dim = g.ndata['feat'].size()[-1]
    edge_feat_dim = g.edata['feat'].size()[-1]
    n_classes = dataset.num_tasks

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              collate_fn=collate_dgl)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              collate_fn=collate_dgl)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             collate_fn=collate_dgl)

    # load model
    model = DeeperGCN(dataset=args.dataset,
                      node_feat_dim=node_feat_dim,
                      edge_feat_dim=edge_feat_dim,
                      hid_dim=args.hid_dim,
                      out_dim=n_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout,
                      learn_beta=args.learn_beta).to(device)

    print(model)

    opt = optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = nn.BCEWithLogitsLoss()

    # training & validation & testing
    # Keep a snapshot of the model with the best validation AUC for testing.
    best_auc = 0
    best_model = copy.deepcopy(model)
    times = []

    print('---------- Training ----------')
    for i in range(args.epochs):
        t1 = time.time()
        train_loss = train(model, device, train_loader, opt, loss_fn)
        t2 = time.time()

        # Exclude the first five epochs from timing to skip warm-up overhead.
        if i >= 5:
            times.append(t2 - t1)

        train_auc = test(model, device, train_loader, evaluator)
        valid_auc = test(model, device, valid_loader, evaluator)

        print(
            f'Epoch {i} | Train Loss: {train_loss:.4f} | Train Auc: {train_auc:.4f} | Valid Auc: {valid_auc:.4f}'
        )

        if valid_auc > best_auc:
            best_auc = valid_auc
            best_model = copy.deepcopy(model)

    print('---------- Testing ----------')
    test_auc = test(best_model, device, test_loader, evaluator)
    print(f'Test Auc: {test_auc}')
    if times:
        print('Times/epoch: ', sum(times) / len(times))
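
# A minimal sketch of the test() helper used above (signature taken from the
# calls; the model's forward signature is an assumption, not confirmed):
#
# @torch.no_grad()
# def test(model, device, loader, evaluator):
#     model.eval()
#     ys, preds = [], []
#     for g, labels in loader:
#         g = g.to(device)
#         logits = model(g, g.edata['feat'], g.ndata['feat'])  # assumed forward
#         preds.append(logits.cpu())
#         ys.append(labels)
#     return evaluator.eval({'y_true': torch.cat(ys),
#                            'y_pred': torch.cat(preds)})['rocauc']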