Example #1
def cluster_data(data, num_clusters, batch_size, shuffle=True, verbose=True):
    """Prepares clusters for batching

    Parameters
    ----------
    data : torch_geometric.data.Data
        Graph data object.
    num_clusters : int
        The number of clusters to chop the input graph into.
    batch_size : int
        The number of clusters in each batch.
    shuffle : bool, optional
        If True, the ClusterLoader shuffles the clusters, by default True.
    verbose : bool, optional
        If True, prints cluster info, by default True.

    Returns
    -------
    torch_geometric.data.ClusterLoader
        A loader for training
    """
    clusters = ClusterData(data, num_clusters, recursive=True, save_dir=None)
    if verbose:
        for cluster in clusters:
            print(cluster)
    return ClusterLoader(clusters, batch_size=batch_size, shuffle=shuffle)
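A minimal usage sketch for the helper above (the toy graph, feature sizes, and import paths are assumptions, not part of the original; in recent PyG releases ClusterData and ClusterLoader live in torch_geometric.loader, in older ones in torch_geometric.data):

import torch
from torch_geometric.data import Data
from torch_geometric.loader import ClusterData, ClusterLoader

# Tiny undirected toy graph: four nodes on a cycle.
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3, 3, 0],
                           [1, 0, 2, 1, 3, 2, 0, 3]])
toy = Data(edge_index=edge_index, x=torch.randn(4, 16), num_nodes=4)

loader = cluster_data(toy, num_clusters=2, batch_size=1, verbose=False)
for batch in loader:
    print(batch)  # each batch is the subgraph induced by one cluster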
Example #2
def build_sampler(args, data, save_dir):
    if args.sampler == 'rw-my':
        msg = 'Use GraphSAINT random-walk sampler (MySAINT sampler)'
        loader = MySAINTSampler(data, batch_size=args.batch_size, sample_type='random_walk',
                                walk_length=2, sample_coverage=1000, save_dir=save_dir)
    elif args.sampler == 'node-my':
        msg = 'Use random node sampler (MySAINT sampler)'
        loader = MySAINTSampler(data, sample_type='node', batch_size=args.batch_size * 3,
                                walk_length=2, sample_coverage=1000, save_dir=save_dir)
    elif args.sampler == 'rw':
        msg = 'Use GraphSAINT random-walk sampler'
        loader = GraphSAINTRandomWalkSampler(data, batch_size=args.batch_size, walk_length=2,
                                             num_steps=5, sample_coverage=1000,
                                             save_dir=save_dir)
    elif args.sampler == 'node':
        msg = 'Use GraphSAINT node sampler'
        loader = GraphSAINTNodeSampler(data, batch_size=args.batch_size * 3,
                                       num_steps=5, sample_coverage=1000,
                                       num_workers=0, save_dir=save_dir)

    elif args.sampler == 'edge':
        msg = 'Use GraphSAINT edge sampler'
        loader = GraphSAINTEdgeSampler(data, batch_size=args.batch_size,
                                       num_steps=5, sample_coverage=1000,
                                       save_dir=save_dir, num_workers=0)
    elif args.sampler == 'cluster':
        msg = 'Use cluster sampler'
        cluster_data = ClusterData(data, num_parts=args.num_parts, save_dir=save_dir)
        loader = ClusterLoader(cluster_data, batch_size=20, shuffle=True,
                               num_workers=0)
    else:
        raise KeyError(f'Unknown sampler type: {args.sampler}')

    return loader, msg
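A hedged usage sketch (the argument names are inferred from the branches above; `data` is a torch_geometric Data object, and MySAINTSampler is project-specific code not shown here):

from types import SimpleNamespace

# Hypothetical arguments mirroring what build_sampler reads.
args = SimpleNamespace(sampler='rw', batch_size=2000)
loader, msg = build_sampler(args, data, save_dir='./saint_stats')
print(msg)
for batch in loader:
    # With sample_coverage > 0, GraphSAINT batches carry node_norm and
    # edge_norm weights for the normalized loss.
    pass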
Example #3
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')
        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
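The train helper is not shown in this example. A minimal sketch consistent with the call train(model, loader, optimizer, device), assuming the model returns log-probabilities and ogbn-products labels of shape [N, 1]:

import torch.nn.functional as F

def train(model, loader, optimizer, device):
    model.train()
    total_loss = total_nodes = 0
    for batch in loader:
        batch = batch.to(device)
        if batch.train_mask.sum() == 0:
            continue  # this cluster batch holds no labeled training nodes
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        loss = F.nll_loss(out[batch.train_mask],
                          batch.y.squeeze(1)[batch.train_mask])
        loss.backward()
        optimizer.step()
        n = int(batch.train_mask.sum())
        total_loss += float(loss) * n
        total_nodes += n
    return total_loss / total_nodes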
Example #4
    def _make_graph_sampler(self):
        graph = Data(
            edge_index=self.edge_index, edge_attr=self.edge_weight, 
            n_id=torch.arange(0, self.num_nodes), num_nodes=self.num_nodes
        ).to('cpu')

        cluster_data = ClusterData(
            graph, num_parts=100, recursive=False, save_dir=None
        )

        cluster_loader = ClusterLoader(cluster_data, batch_size=5, shuffle=True, num_workers=0)

        return cluster_loader
Example #5
def run_sim(cl, lr, layer):
    layer_dict = {'arma': ARMAConv, 'sage': SAGEConv, 'tag': TAGConv}
    mat = load_npz(
        '/gpfs/data/rsingh47/jbigness/data/%s/hic_sparse_vcsqrt_oe_edge_v7.npz'
        % cl)
    hms = np.load(
        '/gpfs/data/rsingh47/jbigness/data/%s/np_hmods_norm_vcsqrt_oe_edge_v7.npy'
        % cl)
    labs = np.load(
        '/gpfs/data/rsingh47/jbigness/data/%s/np_nodes_lab_genes_vcsqrt_oe_edge_v7.npy'
        % cl)

    print('Data Loaded')

    mask = torch.tensor(labs[:, -1]).long()
    loc = {}
    for i in range(labs[:, -1].shape[0]):
        loc[labs[i, -1]] = i
    y = []
    for i in range(mat.shape[0]):
        if i in mask:
            y.append(labs[loc[i], -2])
        else:
            y.append(-1)
    y = torch.tensor(y).long()
    extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)
    G = torch_geometric.data.Data(
        edge_index=extract[0],
        edge_attr=extract[1],
        x=torch.tensor(hms[:mat.shape[0]]).float().reshape(-1, 1, 100, 5),
        y=y)

    cluster_data = ClusterData(G, num_parts=20, recursive=False)
    train_loader = ClusterLoader(cluster_data,
                                 batch_size=2,
                                 shuffle=False,
                                 num_workers=0)

    print('Data Clustered')

    random.seed(30)
    idx = list(range(labs.shape[0] - 1))
    random.shuffle(idx)
    train_mask = idx[:10000]
    test_mask = idx[10000:]

    net = GCN(94, 500, 400, 100, 50, 2, layer_dict[layer])
    return train_model(net, train_loader, 1500, lr, train_mask, test_mask,
                       mask)
Example #6
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)

    parser.add_argument('--step-size', type=float, default=8e-3)
    parser.add_argument('-m', type=int, default=3)
    parser.add_argument('--test-freq', type=int, default=5)
    parser.add_argument('--attack', type=str, default='flag')
    parser.add_argument('--amp', type=float, default=2)

    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss, acc = train_flag(model, loader, optimizer, device, args)
            if (epoch > 19 and epoch % args.test_freq == 0) or epoch == args.epochs:
                result = test(model, data, evaluator, subgraph_loader, device)
                train, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run {run + 1:02d}: val: {best_val:.4f}, test: {final_test:.4f}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
Example #7
    def forward(self, X, edge_index, edge_weight):
        """
        :param X: Input data of shape (batch_size, num_nodes, in_channels)
        :param edge_index: Graph connectivity in COO format with shape(2, num_edges)
        :param edge_weight: Edge feature matrix with shape (num_edges, num_edge_features)
        :return: Output data of shape (batch_size, num_nodes, out_channels)
        """
        sz = X.shape  # (batch_size, num_nodes, in_channels)
        if self.gcn_partition == 'cluster':
            out = torch.zeros(sz[0], sz[1], self.out_channels, device=X.device)
            graph_data = Data(edge_index=edge_index,
                              edge_attr=edge_weight,
                              train_mask=torch.arange(0, sz[1]),
                              num_nodes=sz[1]).to('cpu')
            cluster_data = ClusterData(graph_data,
                                       num_parts=50,
                                       recursive=False,
                                       save_dir='data/cluster')
            loader = ClusterLoader(cluster_data,
                                   batch_size=5,
                                   shuffle=True,
                                   num_workers=0)

            for subgraph in loader:
                out[:, subgraph.train_mask] = self.gcn(
                    X[:, subgraph.train_mask],
                    subgraph.edge_index.to(X.device),
                    subgraph.edge_attr.to(X.device))

        elif self.gcn_partition == 'sample':
            # Use NeighborSampler() to iterate over graph nodes in a
            # mini-batch fashion and construct sampled subgraphs
            # (runs on the CPU when CUDA is unavailable)
            out = torch.zeros(sz[0], sz[1], self.out_channels, device=X.device)
            graph_data = Data(edge_index=edge_index, num_nodes=sz[1]).to('cpu')
            loader = NeighborSampler(graph_data,
                                     size=[10, 5],
                                     num_hops=2,
                                     batch_size=120,
                                     shuffle=True,
                                     add_self_loops=False)

            for data_flow in loader():
                block1 = data_flow[0]
                t = self.gcn1(X, edge_index[:, block1.e_id],
                              edge_weight[block1.e_id])
                block2 = data_flow[1]
                part_out = self.gcn2(t, edge_index[:, block2.e_id],
                                     edge_weight[block2.e_id])
                out[:, data_flow.n_id] = part_out[:, data_flow.n_id]

        elif self.batch_training:
            if self.adj_available:
                out = self.gcn(X, edge_index, edge_weight)
            else:
                out = self.gcn(X, edge_index)

        else:
            # Currently, convs such as GATConv cannot accept a node_dim
            # argument for batch training. This is a temporary workaround,
            # but it is very slow (roughly 6x the cost of batch training).
            batch = self.get_batch(X)
            if self.adj_available:
                out = self.gcn(batch.x, edge_index, edge_weight)
            else:
                out = self.gcn(batch.x, edge_index)
            out = out.view(sz[0], sz[1], -1)

        return out
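Note the trick in the 'cluster' branch above: train_mask is set to torch.arange(0, sz[1]), so after partitioning, subgraph.train_mask carries each node's original index and out[:, subgraph.train_mask] scatters cluster outputs back into global node order. A standalone sketch of the same idea (all names here are illustrative, not from the original):

import torch
from torch_geometric.data import Data
from torch_geometric.loader import ClusterData, ClusterLoader

# Undirected ring graph of 100 nodes.
num_nodes, feat_dim = 100, 8
row = torch.arange(num_nodes)
edge_index = torch.stack([torch.cat([row, (row + 1) % num_nodes]),
                          torch.cat([(row + 1) % num_nodes, row])])
graph = Data(edge_index=edge_index, num_nodes=num_nodes)
graph.n_id = torch.arange(num_nodes)  # survives the cluster permutation

out = torch.zeros(num_nodes, feat_dim)
for sub in ClusterLoader(ClusterData(graph, num_parts=10), batch_size=2):
    # sub.n_id maps local node positions back to the global ordering,
    # exactly like subgraph.train_mask in the forward() above.
    out[sub.n_id] = torch.randn(sub.num_nodes, feat_dim)  # stand-in for a GNN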
Example #8
def get_cluster_batches(cluster_data, batch_size):
    loader = ClusterLoader(cluster_data,
                           batch_size=batch_size,
                           shuffle=True,
                           num_workers=1)
    return loader
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='Link Prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name=args.dataset)
    split_edge = dataset.get_edge_split()  # used by test() below
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    print(data.edge_index.shape, data.num_nodes)

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = GCN(data.x.size(-1),
                args.hidden_channels,
                args.hidden_channels,
                args.num_layers,
                args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, predictor, loader, optimizer, device,
                         args.negs)
            tt = time.time()
            print(f'Epoch time: {tt - t0:.2f}s')

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #10
def test_cluster_gcn():
    adj = torch.tensor([
        [1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
    ])

    edge_index = adj.nonzero().t()
    x = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    data = Data(edge_index=edge_index, x=x, num_nodes=6)

    cluster_data = ClusterData(data, num_parts=2, log=False)

    assert cluster_data.partptr.tolist() == [0, 3, 6]
    assert cluster_data.perm.tolist() == [0, 2, 4, 1, 3, 5]
    assert cluster_data.data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert cluster_data.data.adj.to_dense().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    data = cluster_data[0]
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = cluster_data[1]
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    loader = ClusterLoader(cluster_data, batch_size=1)
    it = iter(loader)

    data = next(it)
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = next(it)
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    torch.manual_seed(1)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    torch.manual_seed(2)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.x.tolist() == [
        [1, 1],
        [3, 3],
        [5, 5],
        [0, 0],
        [2, 2],
        [4, 4],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]
Example #11
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=1024, shuffle=False,
                                      num_workers=args.num_workers)

    model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    logger_orig = Logger(args.runs, args)
   
    adj = process_adj(data)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        for epoch in range(1, 1 + args.epochs):
            loss, train_acc = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Approx Train Acc: {train_acc:.4f}')

            if epoch > 19 and epoch % args.eval_steps == 0:
                out, result = test(model, data, evaluator, subgraph_loader, device)
                logger_orig.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
    logger_orig.print_statistics()
Example #12
def main(args):

    # Set up logging and devices
    args.save_dir = get_save_dir(args.save_dir, 'test', training=True)
    log = get_logger(args.save_dir, 'test')
    tboard = SummaryWriter(args.save_dir)
    device, args.gpu_ids = get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get data loader
    log.info('Building dataset...')
    # Download and process data at './dataset/xxx'
    dataset = PygNodePropPredDataset(name=args.dataset, root='dataset/')
    evaluator = Evaluator(name=args.dataset)

    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    dataset_loader = ClusterLoader(cluster_data,
                                   batch_size=args.batch_size,
                                   shuffle=args.data_shuffle,
                                   num_workers=args.num_workers)

    # Get model
    log.info('Building model...')
    model = load_full_model(args.load_path, args.gpu_ids)
    model = nn.DataParallel(model)

    model = model.to(device)
    model.eval()

    # Test
    log.info('Testing...')

    # Evaluate, display the stats and save the model
    dev_results = test(model, dataset_loader, device, evaluator)

    # Log the metrics
    dev_log_message = ''.join('{} - {}; '.format(k, v)
                              for k, v in dev_results.items())

    log.info(f'Testing - {dev_log_message}')
Example #13
def run():

    cluster_data = ClusterData(
        data,
        num_parts=args.num_partitions,
        recursive=False,
        save_dir=dataset.processed_dir,
    )

    loader = ClusterLoader(
        cluster_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
    )

    model = GCN(
        data.x.size(-1),
        args.hidden_channels,
        args.hidden_channels,
        args.num_layers,
        args.dropout,
    ).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(
                    model,
                    predictor,
                    data,
                    split_edge,
                    evaluator,
                    batch_size=64 * 1024,
                    device=device,
                )
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f"Run: {run + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {train_mrr:.4f}, "
                      f"Valid: {valid_mrr:.4f}, "
                      f"Test: {test_mrr:.4f}")

        logger.print_statistics(run)
    logger.print_statistics()
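run() above reads several module-level globals (args, data, dataset, split_edge, device, train, test) that are not shown. A hedged sketch of the setup it assumes, modeled on Example #15 below, which looks like the full version of the same script:

# Hedged sketch of the module-level setup run() assumes.
device = torch.device(f'cuda:{args.device}'
                      if torch.cuda.is_available() else 'cpu')
dataset = PygLinkPropPredDataset(name='ogbl-citation')
split_edge = dataset.get_edge_split()
data = dataset[0]
data.edge_index = to_undirected(data.edge_index, data.num_nodes)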
Example #14
train_data = ClusterData(train_data,
                         num_parts=1500,
                         recursive=False,
                         save_dir="data/Reddit/train")
val_data = ClusterData(val_data,
                       num_parts=20,
                       recursive=False,
                       save_dir="data/Reddit/val")
test_data = ClusterData(test_data,
                        num_parts=1,
                        recursive=False,
                        save_dir="data/Reddit/test")

train_loader = ClusterLoader(train_data,
                             batch_size=20,
                             shuffle=True,
                             num_workers=8)
val_loader = ClusterLoader(val_data,
                           batch_size=1,
                           shuffle=False,
                           num_workers=8)
test_loader = ClusterLoader(test_data,
                            batch_size=1,
                            shuffle=False,
                            num_workers=1)

# subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1], batch_size=1024,
#                                   shuffle=False, num_workers=12)
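A hedged sketch of how these per-split loaders might be consumed (the model, optimizer, and loss are assumptions, not from the original; each split was clustered separately, so every node in a train batch is a training node):

import torch
import torch.nn.functional as F

def run_epoch(model, optimizer):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        loss = F.cross_entropy(out, batch.y)
        loss.backward()
        optimizer.step()

    model.eval()
    correct = total = 0
    with torch.no_grad():
        for batch in val_loader:
            pred = model(batch.x, batch.edge_index).argmax(dim=-1)
            correct += int((pred == batch.y).sum())
            total += batch.num_nodes
    return correct / max(total, 1)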


class GCNConvDiagEnhance(MessagePassing):
Example #15
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #16

homo_data.y = node_type.new_full((node_type.size(0), 1), -1)
homo_data.y[local2global["paper"]] = data.y_dict["paper"]

homo_data.train_mask = torch.zeros((node_type.size(0)), dtype=torch.bool)
homo_data.train_mask[local2global["paper"][split_idx["train"]["paper"]]] = True

print(homo_data)

cluster_data = ClusterData(homo_data,
                           num_parts=5000,
                           recursive=True,
                           save_dir=dataset.processed_dir)
train_loader = ClusterLoader(cluster_data,
                             batch_size=500,
                             shuffle=True,
                             num_workers=12)

# Map information to its canonical (integer) type.
x_dict = {}
for key, x in data.x_dict.items():
    x_dict[key2int[key]] = x

num_nodes_dict = {}
for key, N in data.num_nodes_dict.items():
    num_nodes_dict[key2int[key]] = N


class RGCNConv(MessagePassing):
    def __init__(self, in_channels, out_channels, num_node_types,
                 num_edge_types):
Example #17
def test_cluster_gcn():
    adj = torch.tensor([
        [1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
    ])

    x = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    edge_index = adj.nonzero(as_tuple=False).t()
    edge_attr = torch.arange(edge_index.size(1))
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    data.num_nodes = 6

    cluster_data = ClusterData(data, num_parts=2, log=False)

    assert cluster_data.partptr.tolist() == [0, 3, 6]
    assert cluster_data.perm.tolist() == [0, 2, 4, 1, 3, 5]
    assert cluster_data.data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert cluster_data.data.adj.to_dense().tolist() == [
        [0, 2, 3, 1, 0, 0],
        [8, 9, 10, 0, 0, 0],
        [14, 15, 16, 0, 0, 0],
        [4, 0, 0, 5, 6, 7],
        [0, 0, 0, 11, 12, 13],
        [0, 0, 0, 17, 18, 19],
    ]

    data = cluster_data[0]
    assert data.num_nodes == 3
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    assert data.edge_attr.tolist() == [0, 2, 3, 8, 9, 10, 14, 15, 16]

    data = cluster_data[1]
    assert data.num_nodes == 3
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    assert data.edge_attr.tolist() == [5, 6, 7, 11, 12, 13, 17, 18, 19]

    loader = ClusterLoader(cluster_data, batch_size=1)
    iterator = iter(loader)

    data = next(iterator)
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = next(iterator)
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    torch.manual_seed(1)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 6
    assert data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    torch.manual_seed(2)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 6
    assert data.x.tolist() == [
        [1, 1],
        [3, 3],
        [5, 5],
        [0, 0],
        [2, 2],
        [4, 4],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 3
Example #18
# The start of this snippet is truncated; the train split presumably
# mirrors the val/test handling below:
train_data_list = [data for data in train_dataset]
for data in train_data_list:
    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

val_data_list = [data for data in val_dataset]
for data in val_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

test_data_list = [data for data in test_dataset]
for data in test_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.ones(data.num_nodes, dtype=torch.bool)

data = Batch.from_data_list(train_data_list + val_data_list + test_data_list)
cluster_data = ClusterData(data, num_parts=50, recursive=False,
                           save_dir=dataset.processed_dir)
loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True,
                       num_workers=0)


# Model structure
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        dim = 512
        self.gcn1 = ChebConv(in_channels, dim, K=1)
        self.lin1 = nn.Linear(in_channels, dim)
        self.gcn2 = ChebConv(dim, dim, K=1)
        self.lin2 = nn.Linear(dim, dim)
        self.gcn3 = ChebConv(dim, dim, K=1)
        self.lin3 = nn.Linear(dim, dim)
        self.gcn4 = ChebConv(dim, dim, K=1)
        self.lin4 = nn.Linear(dim, dim)
Example #19
def main():
    parser = argparse.ArgumentParser(description='OGBN-Proteins (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_features', action='store_true')
    parser.add_argument('--num_partitions', type=int, default=700)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=128)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(
        data,
        num_parts=args.num_partitions,
        recursive=False,
        save_dir=dataset.processed_dir)

    if not args.use_node_features:
        cluster_data.data.x = torch.ones(cluster_data.data.num_nodes, 1)
    else:
        cluster_data.data.x = cluster_data.data.x.to(torch.float)

    loader = ClusterLoader(
        cluster_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)

    model = GIN(
        cluster_data.data.x.size(-1), data.edge_attr.size(-1),
        args.hidden_channels, 112, args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)

            if epoch % args.eval_steps == 0:
                result = test(model, loader, evaluator, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Example #20
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Reddit
from torch_geometric.data import ClusterData, ClusterLoader
from torch_geometric.nn import SAGEConv

dataset = Reddit('../data/Reddit')
data = dataset[0]

print('Partitioning the graph... (this may take a while)')
cluster_data = ClusterData(data, num_parts=1500,
                           save_dir=dataset.processed_dir)
train_loader = ClusterLoader(cluster_data,
                             batch_size=20,
                             shuffle=True,
                             drop_last=True,
                             num_workers=6)
test_loader = ClusterLoader(cluster_data,
                            batch_size=20,
                            shuffle=False,
                            num_workers=6)
print('Done!')


class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        self.conv1 = SAGEConv(in_channels, 128, normalize=False)
        self.conv2 = SAGEConv(128, out_channels, normalize=False)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.2, training=self.training)
        x = F.relu(self.conv1(x, edge_index))
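        # (Truncated in the source; a plausible completion mirroring the
        # two-layer model above -- an assumption, not the original code.)
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=-1)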
Example #21
# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.3f}'
)
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

### Test data loader
torch.manual_seed(12345)
cluster_data = ClusterData(data, num_parts=128)  # 1. Create subgraphs.
train_loader = ClusterLoader(cluster_data, batch_size=32,
                             shuffle=True)  # 2. Stochastic partitioning scheme.

print()
total_num_nodes = 0
for step, sub_data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
    print(sub_data)
    print()
    total_num_nodes += sub_data.num_nodes

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')


class GCN(torch.nn.Module):
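    # (The class body is truncated in the source; below is a minimal
    # two-layer sketch, an assumption rather than the original code.
    # It presumes `from torch_geometric.nn import GCNConv` and
    # `import torch.nn.functional as F`.)
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        return self.conv2(x, edge_index)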