Example #1
def load_ogb(name, dataset_dir):
    if name[:4] == 'ogbn':
        dataset = PygNodePropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_mask', 'val_mask', 'test_mask']
        for i, key in enumerate(splits.keys()):
            mask = index2mask(splits[key], size=dataset.data.y.shape[0])
            set_dataset_attr(dataset, split_names[i], mask, len(mask))
        edge_index = to_undirected(dataset.data.edge_index)
        set_dataset_attr(dataset, 'edge_index', edge_index,
                         edge_index.shape[1])

    elif name[:4] == 'ogbg':
        dataset = PygGraphPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = [
            'train_graph_index', 'val_graph_index', 'test_graph_index'
        ]
        for i, key in enumerate(splits.keys()):
            id = splits[key]
            set_dataset_attr(dataset, split_names[i], id, len(id))

    elif name[:4] == "ogbl":
        dataset = PygLinkPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_edge_split()

        id = splits['train']['edge'].T
        if cfg.dataset.resample_negative:
            set_dataset_attr(dataset, 'train_pos_edge_index', id, id.shape[1])
            # todo: applying transform for negative sampling is very slow
            dataset.transform = neg_sampling_transform
        else:
            id_neg = negative_sampling(edge_index=id,
                                       num_nodes=dataset.data.num_nodes[0],
                                       num_neg_samples=id.shape[1])
            id_all = torch.cat([id, id_neg], dim=-1)
            label = get_link_label(id, id_neg)
            set_dataset_attr(dataset, 'train_edge_index', id_all,
                             id_all.shape[1])
            set_dataset_attr(dataset, 'train_edge_label', label, len(label))

        id, id_neg = splits['valid']['edge'].T, splits['valid']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'val_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'val_edge_label', label, len(label))

        id, id_neg = splits['test']['edge'].T, splits['test']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'test_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'test_edge_label', label, len(label))

    else:
        raise ValueError("OGB dataset '{}' does not exist".format(name))

    return dataset
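Example #1 relies on two small helpers defined elsewhere in GraphGym. To run it standalone, here is a minimal sketch of index2mask and get_link_label, assuming the usual GraphGym semantics (boolean split masks; 1/0 labels for positive/negative edges):

import torch

def index2mask(index, size):
    # Boolean mask of length `size` that is True at the given indices.
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = True
    return mask

def get_link_label(pos_edge_index, neg_edge_index):
    # Binary link-prediction labels: 1 for positive edges, 0 for negative,
    # in the same order as torch.cat([pos_edge_index, neg_edge_index], dim=-1).
    num_links = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_label = torch.zeros(num_links, dtype=torch.float)
    link_label[:pos_edge_index.size(1)] = 1.
    return link_label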
Example #2
def eval_with_partition(args):
    model_load_path = args.model_load_path
    print("Starting evaluating model stored at", model_load_path)

    device = torch.device("cuda")

    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]
    adj = SparseTensor(row=graph.edge_index[0], col=graph.edge_index[1])
    if args.self_loop:
        adj = adj.set_diag()
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    # print('%s' % args)

    model = DeeperGCN(args).to(device)
    ckpt = torch.load(model_load_path)
    model.load_state_dict(ckpt['model_state_dict'])

    res = test_with_partition(model,
                              graph,
                              adj,
                              split_idx,
                              num_clusters=args.eval_cluster_number,
                              partition_method=args.partition_method,
                              evaluator=evaluator,
                              device=device)
    print(res)
    return res
Example #3
def eval_model(params):
    model_load_path, args = params
    if os.path.isdir(args.model_load_path):
        model_load_dir = args.model_load_path
        model_load_path = os.path.join(model_load_dir, model_load_path)
    print("Starting evaluating model stored at", model_load_path)

    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]

    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args)
    ckpt = torch.load(model_load_path, map_location=torch.device('cpu'))
    model.load_state_dict(ckpt['model_state_dict'])
    test_res = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                    evaluator)
    test_res["model_load_path"] = model_load_path

    return test_res
Example #4
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--kind', type=str, default="ReLU")
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'  # NOTE: forces CPU and makes the CUDA selection above dead code
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers,
                     args.dropout, kind=args.kind).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout, kind=args.kind).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
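The train and test helpers in Example #4 are defined elsewhere in the script. For orientation, a sketch of a matching train step, assuming import torch.nn.functional as F and a model whose forward takes (x, adj_t) and returns log-probabilities; this mirrors the standard OGB example code but is not taken from this file:

import torch.nn.functional as F

def train(model, data, train_idx, optimizer):
    # One full-batch gradient step on the training nodes.
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.adj_t)[train_idx]
    loss = F.nll_loss(out, data.y.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()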
Example #5
def main():
    parser = argparse.ArgumentParser(description="OGBN-Arxiv (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=500)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name="ogbn-arxiv")
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx["train"].to(device)

    model = MLP(
        x.size(-1),
        args.hidden_channels,
        dataset.num_classes,
        args.num_layers,
        args.dropout,
    ).to(device)

    evaluator = Evaluator(name="ogbn-arxiv")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f"Run: {run + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {100 * train_acc:.2f}%, "
                      f"Valid: {100 * valid_acc:.2f}%, "
                      f"Test: {100 * test_acc:.2f}%")

        logger.print_statistics(run)
    logger.print_statistics()
Example #6
def get_product_clusters():
    dataset_name = "ogbn-products"
    dataset = PygNodePropPredDataset(name=dataset_name)

    print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))

    data = dataset[0]
    print(data)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train']
    val_idx = split_idx['valid']
    test_idx = split_idx['test']

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    data['train_mask'] = train_mask

    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask[val_idx] = True
    data['valid_mask'] = val_mask

    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask[test_idx] = True
    data['test_mask'] = test_mask

    cluster_data = ClusterData(data, num_parts=15000, save_dir="dataset")
    return cluster_data, dataset, data, split_idx
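The three mask blocks in Example #6 repeat one pattern; Examples #13 and #28 compress the same idea into a loop. An equivalent compact form (a sketch; the resulting keys 'train_mask', 'valid_mask' and 'test_mask' are identical, since those are the split_idx keys):

for key, idx in split_idx.items():
    mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    mask[idx] = True
    data[f'{key}_mask'] = mask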
Example #7
def main():

    args = ArgsInit().args

    dataset = PygNodePropPredDataset(name=args.dataset)
    graph = dataset[0]

    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    print(args)

    model = DeeperGCN(args)

    print(model)

    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    result = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                  evaluator)
    print(result)
    model.print_params(final=True)
Example #8
def main():
    parser = argparse.ArgumentParser(description='OGBN-Proteins (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = scatter(data.edge_attr, data.edge_index[0], dim=0,
                dim_size=data.num_nodes, reduce='mean').to('cpu')

    if args.use_node_embedding:
        embedding = torch.load('embedding.pt', map_location='cpu')
        x = torch.cat([x, embedding], dim=-1)

    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, 112, args.num_layers,
                args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)

            if epoch % args.eval_steps == 0:
                result = test(model, x, y_true, split_idx, evaluator)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Example #9
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=3000)
    parser.add_argument('--runs', type=int, default=1)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root='/mnt/ogbdata',
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    print(split_idx['train'].nonzero().size(0) / len(split_idx['train']),
          split_idx['valid'].nonzero().size(0) / len(split_idx['train']),
          split_idx['test'].nonzero().size(0) / len(split_idx['train']))
Example #10
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x
    if args.use_node_embedding:
        embedding = np.load('./embed_results/embeddings.npy')
        embedding = torch.from_numpy(embedding).float()
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()

    total_params = sum(p.numel() for p in model.parameters())
    print(f'mlp total params are {total_params}')
Example #11
def load_ogb_2(dataset):
    ## Load the dataset

    ## Setup PyTorch
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    dataset = PygNodePropPredDataset(name=dataset,
                                     transform=T.ToSparseTensor())
    ogb_data = dataset[0]
    # TODO: Not sure how to format adj_t...
    ogb_data.adj_t = ogb_data.adj_t.to_symmetric()
    ogb_data = ogb_data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx["train"].numpy()
    valid_idx = split_idx["valid"].numpy()
    test_idx = split_idx["test"].numpy()

    # Convert OGB's data split pytorch index vectors to Wang's data split numpy boolean masks
    train_mask_2 = indexes2booleanvec(ogb_data.num_nodes, train_idx)
    val_mask_2 = indexes2booleanvec(ogb_data.num_nodes, valid_idx)
    test_mask_2 = indexes2booleanvec(ogb_data.num_nodes, test_idx)

    # Add 1's down the diagonal of adj_t
    adj_t = ogb_data.adj_t.to_torch_sparse_coo_tensor()
    adj_t = adj_t + sparse_identity(adj_t.shape[0])
    # Convert OGB's adjacency SparseTensor to Wang's adjacency index matrix (Nx2)
    adj_2_0 = adj_t.coalesce().indices().numpy()
    adj_2_0 = adj_2_0.T.astype('int32')
    ##adj_2_0 = np.vstack((adj_2_0, np.array([[i,i] for i in range(ogb_data.num_nodes)])))
    adj_2_1 = adj_t.coalesce().values().numpy().astype('float64')
    adj_2_2 = tuple(adj_t.size())
    #TODO: Fix the adjacency matrix, bc it probably is symmetric with identity
    adj_2 = (adj_2_0, adj_2_1, adj_2_2)

    from sklearn.preprocessing import OneHotEncoder
    labels_2 = ogb_data.y.numpy()
    labels_2 = OneHotEncoder(sparse=False).fit_transform(labels_2)

    #TODO: I don't know if this feature vector will work
    # OGB used a skip-gram encoding,
    # whereas Wang's Citeseer just used normalized rows with 1-0 for different words
    x = ogb_data.x + 1.5
    norm_x = np.apply_along_axis(np.linalg.norm, 1, x)
    x = x / norm_x[:, None]
    x = x.to_sparse()
    features_2_0 = x.indices().numpy().T.astype('int32')
    features_2_1 = x.values().numpy()
    #features_2_1 = 1.5 + features_2_1
    features_2_1 = features_2_1.astype('float64')
    features_2_2 = tuple(x.size())
    features_2 = features_2_0, features_2_1, features_2_2

    data2 = features_2, labels_2, adj_2, train_mask_2, val_mask_2, test_mask_2

    return data2
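Example #11 calls indexes2booleanvec and sparse_identity, which are not shown. Hypothetical implementations consistent with how they are used above (both bodies are assumptions, not from the original source):

import numpy as np
import torch

def indexes2booleanvec(size, indices):
    # Hypothetical helper: length-`size` boolean mask, True at `indices`.
    mask = np.zeros(size, dtype=bool)
    mask[indices] = True
    return mask

def sparse_identity(n):
    # Hypothetical helper: n x n identity as a torch sparse COO tensor.
    idx = torch.arange(n)
    return torch.sparse_coo_tensor(torch.stack([idx, idx]),
                                   torch.ones(n), (n, n))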
Example #12
def main_fixed_mask(args):

    device = torch.device("cuda:" + str(args.device))
    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)

    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args.num_layers)
    
    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    results = {'highest_valid': 0, 'final_train': 0, 'final_test': 0, 'highest_train': 0, 'epoch': 0}
    
    start_epoch = 1
    for epoch in range(start_epoch, args.epochs + 1):
    
        epoch_loss = train_fixed(model, x, edge_index, y_true, train_idx, optimizer, args)
        result = test(model, x, edge_index, y_true, split_idx, evaluator)
        train_accuracy, valid_accuracy, test_accuracy = result

        if valid_accuracy > results['highest_valid']:
            results['highest_valid'] = valid_accuracy
            results['final_train'] = train_accuracy
            results['final_test'] = test_accuracy
            results['epoch'] = epoch

        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'Baseline (FIX Mask) Epoch:[{}/{}]\t LOSS:[{:.4f}] Train:[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}]'
              .format(epoch, args.epochs, epoch_loss,
                      train_accuracy * 100, valid_accuracy * 100,
                      test_accuracy * 100, results['final_test'] * 100,
                      results['epoch']))
    print("=" * 120)
    print("syd final: Baseline, Train:[{:.2f}]  Best Val:[{:.2f}] at epoch:[{}] | Final Test Acc:[{:.2f}]"
        .format(            results['final_train'] * 100,
                            results['highest_valid'] * 100,
                            results['epoch'],
                            results['final_test'] * 100))
    print("=" * 120)
Example #13
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, 47, args.num_layers,
                 args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')
        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
Example #14
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (SIGN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = SIGN(args.num_layers)(dataset[0])  # This might take a while.

    xs = [data.x] + [data[f'x{i}'] for i in range(1, args.num_layers + 1)]
    xs_train = [x[split_idx['train']].to(device) for x in xs]
    xs_valid = [x[split_idx['valid']].to(device) for x in xs]
    xs_test = [x[split_idx['test']].to(device) for x in xs]

    y_train_true = data.y[split_idx['train']].to(device)
    y_valid_true = data.y[split_idx['valid']].to(device)
    y_test_true = data.y[split_idx['test']].to(device)

    model = MLP(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, xs_train, y_train_true, optimizer)

            train_acc = test(model, xs_train, y_train_true, evaluator)
            valid_acc = test(model, xs_valid, y_valid_true, evaluator)
            test_acc = test(model, xs_test, y_test_true, evaluator)
            result = (train_acc, valid_acc, test_acc)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Example #15
def arxiv_data(root):
    # keep the same data loading logic for all architectures
    dataset = PygNodePropPredDataset(
        name="ogbn-arxiv",
        root=root,  # transform=T.ToSparseTensor(),
    )
    data = dataset[0]
    # data.adj_t = data.adj_t.to_symmetric()
    data.edge_index = to_undirected(data.edge_index)
    split_idx = dataset.get_idx_split()

    return data, split_idx
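A short usage sketch for Example #15 (the root path is an assumption):

data, split_idx = arxiv_data(root='dataset/')  # 'dataset/' is an assumed location
print(data.num_nodes, data.edge_index.size(1), split_idx['train'].numel())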
Example #16
def get_data(args):
    dataset = PygNodePropPredDataset(name=args['dataset_name'], transform=T.ToSparseTensor())
    evaluator = Evaluator(name=args['dataset_name'])

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    split_idx = dataset.get_idx_split()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    data = data.to(device)
    for setname in ['train', 'valid', 'test']:
        split_idx[setname] = split_idx[setname].to(device)

    return data, dataset, split_idx, evaluator
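A usage sketch for Example #16; only 'dataset_name' is read from the args mapping, so a plain dict suffices:

data, dataset, split_idx, evaluator = get_data({'dataset_name': 'ogbn-arxiv'})
print(data.adj_t.sizes(), split_idx['train'].numel())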
Example #17
def main():

    args = ArgsInit().args

    if args.use_gpu:
        device = torch.device("cuda:" +
                              str(args.device)) if torch.cuda.is_available(
                              ) else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)

    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    print(args)

    model = DeeperGCN(args)

    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    result = test(model, x, edge_index, y_true, split_idx, evaluator)
    train_accuracy, valid_accuracy, test_accuracy = result

    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })

    model.print_params(final=True)
Example #18
def load_data(args, datapath):
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]
        valid_idx = split_idx["valid"]
        test_idx = split_idx["test"]
        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)
        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)
        data['adj_train'] = to_scipy_sparse_matrix(
            dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'] = induced_edges_train
        data['train_edges_false'] = neg_edges_train
        data['val_edges'] = induced_edges_valid
        data['val_edges_false'] = neg_edges_valid
        data['test_edges'] = induced_edges_test
        data['test_edges_false'] = neg_edges_test
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath,
                            args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_edges(
                adj, args.val_prop, args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'] = train_edges
            data['train_edges_false'] = train_edges_false
            data['val_edges'] = val_edges
            data['val_edges_false'] = val_edges_false
            data['test_edges'] = test_edges
            data['test_edges_false'] = test_edges_false
    data['adj_train_norm'], data['features'] = process(data['adj_train'],
                                                       data['features'],
                                                       args.normalize_adj,
                                                       args.normalize_feats)
    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
Example #19
def load_ogb_graph(dataset_name):
    if not os.path.isfile('torch_geometric_data/dgl_' + dataset_name):
        dataset = PygNodePropPredDataset(name="ogbn-" + dataset_name,
                                         root='torch_geometric_data/')
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]
        valid_idx = split_idx["valid"]
        test_idx = split_idx["test"]
        edge = dataset[0].edge_index
        num_classes = len(np.unique(dataset[0].y))
        print("Nodes: %d, edges: %d, features: %d, classes: %d. \n" %
              (dataset[0].y.shape[0], len(edge[0]) / 2, len(
                  dataset[0].x[0]), num_classes))
        graph = dgl.DGLGraph((edge[0], edge[1]))
        graph.ndata['features'] = dataset[0].x
        graph.ndata['labels'] = dataset[0].y
        dgl.data.utils.save_graphs('torch_geometric_data/dgl_' + dataset_name,
                                   graph)
        torch.save(
            train_idx, 'torch_geometric_data/ogbn_' + dataset_name +
            '/train_' + dataset_name + '.pt')
        torch.save(
            valid_idx, 'torch_geometric_data/ogbn_' + dataset_name +
            '/valid_' + dataset_name + '.pt')
        torch.save(
            test_idx, 'torch_geometric_data/ogbn_' + dataset_name + '/test_' +
            dataset_name + '.pt')
        labels = graph.ndata.pop('labels')
        features = graph.ndata.pop('features')
        features = torch.hstack([features, torch.ones([features.shape[0], 1])])
        #print(features)
    else:
        graph = dgl.data.utils.load_graphs('torch_geometric_data/dgl_' +
                                           dataset_name)[0][0]
        labels = graph.ndata.pop('labels')
        features = graph.ndata.pop('features')
        features = torch.hstack([features, torch.ones([features.shape[0], 1])])
        train_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                               '/train_' + dataset_name + '.pt')
        valid_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                               '/valid_' + dataset_name + '.pt')
        test_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                              '/test_' + dataset_name + '.pt')
        num_classes = len(torch.unique(labels))

    return graph, features, labels, num_classes, train_idx, valid_idx, test_idx
Example #20
    def process_PygNodeDataset_hetero(
        self,
        dataset: PygNodePropPredDataset,
    ):
        data = dataset[0]
        self._name = dataset.name
        self.edge_index_dict = data.edge_index_dict
        self.num_nodes_dict = (data.num_nodes_dict
                               if hasattr(data, "num_nodes_dict") else
                               self.get_num_nodes_dict(self.edge_index_dict))

        if self.node_types is None:
            self.node_types = list(self.num_nodes_dict.keys())

        if hasattr(data, "x_dict"):
            self.x_dict = data.x_dict
        elif hasattr(data, "x"):
            self.x_dict = {self.head_node_type: data.x}
        else:
            self.x_dict = {}

        if hasattr(data, "y_dict"):
            self.y_dict = data.y_dict
        elif hasattr(data, "y"):
            self.y_dict = {self.head_node_type: data.y}
        else:
            self.y_dict = {}

        self.y_index_dict = {
            node_type: torch.arange(self.num_nodes_dict[node_type])
            for node_type in self.y_dict.keys()
        }

        if self.head_node_type is None:
            if hasattr(self, "y_dict"):
                self.head_node_type = list(self.y_dict.keys())[0]
            else:
                self.head_node_type = self.node_types[0]

        self.metapaths = list(self.edge_index_dict.keys())

        split_idx = dataset.get_idx_split()
        self.training_idx = split_idx["train"][self.head_node_type]
        self.validation_idx = split_idx["valid"][self.head_node_type]
        self.testing_idx = split_idx["test"][self.head_node_type]
Example #21
def load_data_nc(dataset, use_feats, data_path, split_seed):
    if dataset in ['cora', 'pubmed']:
        adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
            dataset, use_feats, data_path, split_seed)
    elif dataset == 'arxiv':
        dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        idx_train = split_idx["train"]
        idx_val = split_idx["valid"]
        idx_test = split_idx["test"]
        adj = to_scipy_sparse_matrix(dataset[0].edge_index).tocsr()
        features = dataset[0].x
        labels = dataset[0].y
    else:
        if dataset == 'disease_nc':
            adj, features, labels = load_synthetic_data(
                dataset, use_feats, data_path)
            val_prop, test_prop = 0.10, 0.60
        elif dataset == 'airport':
            adj, features, labels = load_data_airport(dataset,
                                                      data_path,
                                                      return_label=True)
            val_prop, test_prop = 0.15, 0.15
        else:
            raise FileNotFoundError(
                'Dataset {} is not supported.'.format(dataset))
        idx_val, idx_test, idx_train = split_data(labels,
                                                  val_prop,
                                                  test_prop,
                                                  seed=split_seed)

    labels = torch.LongTensor(labels)
    data = {
        'adj_train': adj,
        'features': features,
        'labels': labels,
        'idx_train': idx_train,
        'idx_val': idx_val,
        'idx_test': idx_test
    }
    return data
Example #22
def load_pyg_dataset(dataset_name, root='dataset/'):
    from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
    source, name = dataset_name.split('-', maxsplit=1)
    assert source in ['ogbn', 'pyg', 'custom']
    if source == 'ogbn':
        dataset = PygNodePropPredDataset(name=dataset_name, root=root)
        return dataset, dataset.get_idx_split(), Evaluator(dataset_name)
    elif source == 'pyg':
        from torch_geometric.datasets import KarateClub, CoraFull
        if name == "karate":
            dataset = KarateClub()
        elif name == "cora":
            dataset = CoraFull(root)
        else:
            raise Exception("Dataset not recognized")

        num_nodes = dataset[0].x.shape[0]
        num_train = int(num_nodes * 0.8)
        num_val = int(num_nodes * 0.1)

        perm = np.arange(num_nodes, dtype=int)
        np.random.shuffle(perm)
        split_idx = {
            'train': perm[:num_train],
            'valid': perm[num_train:num_train + num_val],
            'test': perm[num_train + num_val:]
        }
        return dataset, split_idx, Evaluator('ogbn-arxiv')
    elif source == "custom":
        from dataset import registry
        dataset = registry[name]()
        split_idx = {
            'train': dataset[0].idx_train,
            'valid': dataset[0].idx_val,
            'test': dataset[0].idx_test
        }
        return dataset, split_idx, CustomEvaluator()

    else:
        raise Exception("Dataset not recognized")
Example #23
def load_ogb(dataset_name, time_budget, zero_features=True, sample_num=None):
    print("*" * 30, "Start!", "*" * 30)
    dataset = PygNodePropPredDataset(name=dataset_name)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx["train"].numpy().tolist()
    test_idx = (split_idx["valid"].numpy().tolist() +
                split_idx["test"].numpy().tolist())
    print("Train rate {}, test rate {}".format(
        len(train_idx) / (len(train_idx) + len(test_idx)),
        len(test_idx) / (len(train_idx) + len(test_idx))))

    graph = dataset[0]  # pyg graph object
    features, labels = graph.x.numpy(), graph.y.numpy()
    edge_index = graph.edge_index.numpy()
    edge_weight = graph.edge_attr
    print(features.shape, labels.shape, edge_index.shape)
    if zero_features:
        features = np.zeros((features.shape[0], features.shape[1]),
                            dtype=np.float64)
    if edge_weight is None:
        edge_weight = np.ones(edge_index.shape[1])

    adj_matrix = sp.coo_matrix((edge_weight.reshape(-1), edge_index),
                               shape=(labels.shape[0], labels.shape[0]))
    node_indexs = np.arange(labels.shape[0])
    n_class = len(np.unique(labels))
    # output directory control
    output_dir = os.path.join(os.path.dirname(__file__), '..', 'data-offline')
    os.makedirs(output_dir, exist_ok=True)
    sample_num_str = "" if sample_num is None else str(sample_num)
    data_dir = os.path.join(output_dir, dataset_name + sample_num_str)
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
    train_dir = os.path.join(data_dir, 'train.data')
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(train_dir, exist_ok=True)
    transform_to_autograph_format(features, labels, adj_matrix, node_indexs,
                                  n_class, data_dir, train_dir, time_budget)
    print("*" * 30, "Finish!", "*" * 30)
Example #24
    def setup_ogb(self):

        dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                         root=self.root,
                                         transform=T.ToSparseTensor())
        data = dataset[0]

        self.metric = 'Accuracy'
        self.num_classes = dataset.num_classes
        self.split_idx = dataset.get_idx_split()

        self.x = data.x
        self.y = data.y
        self.adj_t = data.adj_t.to_symmetric()
        self.num_nodes = data.num_nodes

        if self.make_edge_index:
            row = self.adj_t.storage.row()
            col = self.adj_t.storage.col()
            self.edge_index = torch.stack((row, col), dim=0)

        self.criterion = torch.nn.CrossEntropyLoss()
Example #25
def load_proteins_dataset():
    dataset = PygNodePropPredDataset('ogbn-proteins', root='../data')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]
    data.node_species = None
    data.y = data.y.to(torch.float)
    data.n_id = torch.arange(data.num_nodes)

    # Initialize features of nodes by aggregating edge features.
    row, col = data.edge_index
    data.x = scatter(data.edge_attr,
                     col,
                     0,
                     dim_size=data.num_nodes,
                     reduce='add')

    # Set split indices to masks.
    for split in ['train', 'valid', 'test']:
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[splitted_idx[split]] = True
        data[f'{split}_mask'] = mask

    return data
Example #26
# Reaches around 0.7945 ± 0.0059 test accuracy.

import os.path as osp

import torch
import torch.nn.functional as F
from torch.nn import Linear as Lin
from tqdm import tqdm
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
from torch_geometric.data import NeighborSampler
from torch_geometric.nn import GATConv

root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products')
dataset = PygNodePropPredDataset('ogbn-products', root)
split_idx = dataset.get_idx_split()
evaluator = Evaluator(name='ogbn-products')
data = dataset[0]

train_idx = split_idx['train']
train_loader = NeighborSampler(data.edge_index,
                               node_idx=train_idx,
                               sizes=[10, 10, 10],
                               batch_size=512,
                               shuffle=True,
                               num_workers=12)
subgraph_loader = NeighborSampler(data.edge_index,
                                  node_idx=None,
                                  sizes=[-1],
                                  batch_size=1024,
                                  shuffle=False,
                                  num_workers=12)
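Example #26 stops after building the samplers. A sketch of how this old-style NeighborSampler is typically consumed (the loop body is an assumption, not part of the original):

for batch_size, n_id, adjs in train_loader:
    # n_id maps the sampled nodes back to the full graph; adjs holds one
    # (edge_index, e_id, size) triple per layer (three here, matching sizes).
    pass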
Example #27
def main():
    parser = argparse.ArgumentParser(description='gen_models')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='arxiv')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model', type=str, default='mlp')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--use_embeddings', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)

    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name=f'ogbn-{args.dataset}',
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()

    x = data.x

    split_idx = dataset.get_idx_split()
    preprocess_data = PygNodePropPredDataset(name=f'ogbn-{args.dataset}')[0]
    if args.dataset == 'arxiv':
        embeddings = torch.cat([
            preprocess(preprocess_data, 'diffusion', post_fix=args.dataset),
            preprocess(preprocess_data, 'spectral', post_fix=args.dataset),
        ], dim=-1)
    elif args.dataset == 'products':
        embeddings = preprocess(preprocess_data,
                                'spectral',
                                post_fix=args.dataset)

    if args.use_embeddings:
        x = torch.cat([x, embeddings], dim=-1)

    if args.dataset == 'arxiv':
        x = (x - x.mean(0)) / x.std(0)

    if args.model == 'mlp':
        model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                    args.num_layers, 0.5,
                    args.dataset == 'products').to(device)
    elif args.model == 'linear':
        model = MLPLinear(x.size(-1), dataset.num_classes).to(device)
    elif args.model == 'plain':
        model = MLPLinear(x.size(-1), dataset.num_classes).to(device)
    elif args.model == 'sgc':
        model = SGC(x.size(-1), dataset.num_classes).to(device)

    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model_dir = prepare_folder(f'{args.dataset}_{args.model}', model)

    evaluator = Evaluator(name=f'ogbn-{args.dataset}')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        import gc
        gc.collect()
        print(sum(p.numel() for p in model.parameters()))
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        for epoch in range(1, args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result, out = test(model, x, y_true, split_idx, evaluator)
            train_acc, valid_acc, test_acc = result
            if valid_acc > best_valid:
                best_valid = valid_acc
                best_out = out.cpu().exp()

            if (epoch % 10 == 0):
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
            logger.add_result(run, result)

        logger.print_statistics(run)
        torch.save(best_out, f'{model_dir}/{run}.pt')

    logger.print_statistics()
Example #28
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--inductive', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=20000)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--num_steps', type=int, default=30)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=2)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(
        name='ogbn-products', root='/srv/scratch/ogb/datasets/nodeproppred')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # We omit normalization factors here since those are only defined for the
    # inductive learning setup.
    sampler_data = data
    if args.inductive:
        sampler_data = to_inductive(data)

    loader = GraphSAINTRandomWalkSampler(sampler_data,
                                         batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=4096,
                                      shuffle=False,
                                      num_workers=12)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')

            if epoch > 9 and epoch % args.eval_steps == 0:
                result = test(model, data, evaluator, subgraph_loader, device)
                logger.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Example #29
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=1024, shuffle=False,
                                      num_workers=args.num_workers)

    model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    logger_orig = Logger(args.runs, args)
   
    adj = process_adj(data)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        for epoch in range(1, 1 + args.epochs):
            loss, train_acc = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Approx Train Acc: {train_acc:.4f}')

            if epoch > 19 and epoch % args.eval_steps == 0:
                out, result = test(model, data, evaluator, subgraph_loader, device)
                logger_orig.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
    logger_orig.print_statistics()
Example #30
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (MLP)')
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--num_propagations', type=int, default=3)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    parser.add_argument('--node_emb_path', type=str, default=None)
    parser.add_argument('--output_path', type=str, required=True)
    args = parser.parse_args()

    # SGC pre-processing ######################################################

    dataset = PygNodePropPredDataset(name='ogbn-papers100M',
                                     root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = None
    if args.node_emb_path:
        x = np.load(args.node_emb_path)
    else:
        x = data.x.numpy()
    N = data.num_nodes

    print('Making the graph undirected.')
    ### Randomly drop some edges to save computation
    data.edge_index, _ = dropout_adj(data.edge_index,
                                     p=args.dropedge_rate,
                                     num_nodes=data.num_nodes)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    print(data)

    row, col = data.edge_index

    print('Computing adj...')

    adj = SparseTensor(row=row, col=col, sparse_sizes=(N, N))
    adj = adj.set_diag()
    deg = adj.sum(dim=1).to(torch.float)
    deg_inv_sqrt = deg.pow(-0.5)
    deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
    adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    adj = adj.to_scipy(layout='csr')

    train_idx = split_idx['train']
    valid_idx = split_idx['valid']
    test_idx = split_idx['test']
    all_idx = torch.cat([train_idx, valid_idx, test_idx])
    mapped_train_idx = torch.arange(len(train_idx))
    mapped_valid_idx = torch.arange(len(train_idx),
                                    len(train_idx) + len(valid_idx))
    mapped_test_idx = torch.arange(
        len(train_idx) + len(valid_idx),
        len(train_idx) + len(valid_idx) + len(test_idx))

    sgc_dict = {}
    sgc_dict['label'] = data.y.data[all_idx].to(torch.long)
    sgc_dict['split_idx'] = {
        'train': mapped_train_idx,
        'valid': mapped_valid_idx,
        'test': mapped_test_idx
    }

    print('Start SGC processing')
    for _ in tqdm(range(args.num_propagations)):
        x = adj @ x
    sgc_dict['sgc_embedding'] = torch.from_numpy(x[all_idx]).to(torch.float)
    torch.save(sgc_dict, args.output_path)
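For reference, the propagation loop at the end of Example #30 has no learnable parameters; given the symmetric normalization built above, it performs the SGC pre-processing step:

# What the loop computes (K = args.num_propagations):
#   x  <-  (D^{-1/2} (A + I) D^{-1/2})^K  x
# i.e. K rounds of symmetrically normalized feature propagation.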