Example #1
def get_product_clusters():
    dataset_name = "ogbn-products"
    dataset = PygNodePropPredDataset(name=dataset_name)

    print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))

    data = dataset[0]
    print(data)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train']
    val_idx = split_idx['valid']
    test_idx = split_idx['test']

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    data['train_mask'] = train_mask

    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask[val_idx] = True
    data['valid_mask'] = val_mask

    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask[test_idx] = True
    data['test_mask'] = test_mask

    cluster_data = ClusterData(data, num_parts=15000, save_dir="dataset")
    return cluster_data, dataset, data, split_idx
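A minimal consumption sketch (not part of the original snippet; the batch size and import location are assumptions) showing how the returned cluster_data might be fed to a ClusterLoader:

from torch_geometric.data import ClusterLoader

cluster_data, dataset, data, split_idx = get_product_clusters()
loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True, num_workers=0)

for batch in loader:
    # Each batch merges several partitions into one subgraph; the boolean masks
    # created above are permuted and sliced along with the nodes.
    print(batch, 'training nodes in this batch:', int(batch.train_mask.sum()))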
Example #2
def build_sampler(args, data, save_dir):
    if args.sampler == 'rw-my':
        msg = 'Use GraphSaint randomwalk sampler (mysaint sampler)'
        loader = MySAINTSampler(data, batch_size=args.batch_size, sample_type='random_walk',
                                walk_length=2, sample_coverage=1000, save_dir=save_dir)
    elif args.sampler == 'node-my':
        msg = 'Use random node sampler (mysaint sampler)'
        loader = MySAINTSampler(data, sample_type='node', batch_size=args.batch_size * 3,
                                walk_length=2, sample_coverage=1000, save_dir=save_dir)
    elif args.sampler == 'rw':
        msg = 'Use GraphSaint randomwalk sampler'
        loader = GraphSAINTRandomWalkSampler(data, batch_size=args.batch_size, walk_length=2,
                                             num_steps=5, sample_coverage=1000,
                                             save_dir=save_dir)
    elif args.sampler == 'node':
        msg = 'Use GraphSaint node sampler'
        loader = GraphSAINTNodeSampler(data, batch_size=args.batch_size * 3,
                                       num_steps=5, sample_coverage=1000, num_workers=0, save_dir=save_dir)

    elif args.sampler == 'edge':
        msg = 'Use GraphSaint edge sampler'
        loader = GraphSAINTEdgeSampler(data, batch_size=args.batch_size,
                                       num_steps=5, sample_coverage=1000,
                                       save_dir=save_dir, num_workers=0)
    elif args.sampler == 'cluster':
        msg = 'Use cluster sampler'
        cluster_data = ClusterData(data, num_parts=args.num_parts, save_dir=save_dir)
        loader = ClusterLoader(cluster_data, batch_size=20, shuffle=True,
                               num_workers=0)
    else:
        raise KeyError('Sampler type error')

    return loader, msg
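A hypothetical call site for build_sampler (not from the original source): argparse.Namespace stands in for the script's parsed arguments, and the data object and save directory are assumed to exist.

import argparse

args = argparse.Namespace(sampler='cluster', batch_size=64, num_parts=1500)
loader, msg = build_sampler(args, data, save_dir='dataset/partitions')
print(msg)
for batch in loader:
    print(batch)
    break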
Example #3
def cluster_data(data, num_clusters, batch_size, shuffle=True, verbose=True):
    """Prepares clusters for batching

    Parameters
    ----------
    data : torch_geometric.data.Data
        Graph data object.
    num_clusters : int
        The number of clusters to chop the input graph into.
    batch_size : int
        The number of clusters in each batch
    shuffle : bool, optional
        If true, the ClusterLoader will shuffle clusters, by default True
    verbose : bool, optional
        If true, prints clusters info, by default True

    Returns
    -------
    torch_geometric.data.ClusterLoader
        A loader for training
    """
    clusters = ClusterData(data, num_clusters, recursive=True, save_dir=None)
    if verbose:
        for cluster in clusters:
            print(cluster)
    return ClusterLoader(clusters, batch_size=batch_size)
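A brief usage sketch (an assumption, not from the original source), here with a small Planetoid graph as input:

from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='data/Cora', name='Cora')
loader = cluster_data(dataset[0], num_clusters=32, batch_size=4, verbose=False)
for batch in loader:
    print(batch)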
Example #4
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, 47, args.num_layers,
                 args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')
        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
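The train helper used above is not shown in this example. A plausible minimal version, assuming the model returns log-probabilities and following the usual Cluster-GCN pattern of computing the loss only on the train_mask nodes of each cluster batch, might look like this:

import torch.nn.functional as F

def train(model, loader, optimizer, device):
    model.train()
    total_loss = total_examples = 0
    for batch in loader:
        batch = batch.to(device)
        if batch.train_mask.sum() == 0:
            continue  # a cluster may contain no training nodes
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        # ogbn-products labels have shape [num_nodes, 1], hence the squeeze
        loss = F.nll_loss(out[batch.train_mask], batch.y[batch.train_mask].squeeze(1))
        loss.backward()
        optimizer.step()
        num_examples = int(batch.train_mask.sum())
        total_loss += float(loss) * num_examples
        total_examples += num_examples
    return total_loss / total_examples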
Example #5
    def _make_graph_sampler(self):
        graph = Data(
            edge_index=self.edge_index, edge_attr=self.edge_weight, 
            n_id=torch.arange(0, self.num_nodes), num_nodes=self.num_nodes
        ).to('cpu')

        cluster_data = ClusterData(
            graph, num_parts=100, recursive=False, save_dir=None
        )

        cluster_loader = ClusterLoader(cluster_data, batch_size=5, shuffle=True, num_workers=0)

        return cluster_loader
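A note on the n_id attribute above (an inference, not stated in the original code): storing n_id = torch.arange(num_nodes) on the graph means that, after ClusterData permutes the nodes, every batch still carries its original node indices. A sketch of how the returned loader might be used:

cluster_loader = model._make_graph_sampler()  # hypothetical instance of the surrounding class
for subgraph in cluster_loader:
    original_idx = subgraph.n_id       # original node ids of the nodes in this batch
    edge_index = subgraph.edge_index   # relabelled to the range [0, subgraph.num_nodes)
    edge_weight = subgraph.edge_attr
    # e.g. gather the matching rows of a full feature matrix: x_sub = x_full[original_idx]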
Example #6
    def process_cluster_data(self, data):
        """
        Data processing for ClusterSelfGNN. First, the data object is clustered into the number of partitions
        specified by this class. Then, we randomly sample a number of clusters and merge them together. Finally, data
        augmentation is applied to each of the final clusters. This is a simple strategy, motivated by ClusterGCN and
        employed to improve the scalability of SelfGNN.

        :param data: A PyTorch Geometric Data object
        :return: a list of Data objects depending on the final number of clusters.
        """
        data_list = []
        clusters = []
        num_parts, cluster_size = self.num_parts, self.num_parts // self.final_parts

        # Cluster the data
        cd = ClusterData(data, num_parts=num_parts)
        for i in range(1, cd.partptr.shape[0]):
            cls_nodes = cd.perm[cd.partptr[i - 1]: cd.partptr[i]]
            clusters.append(cls_nodes)

        # Randomly merge clusters and apply transformation
        np.random.shuffle(clusters)
        for i in tqdm(range(0, len(clusters), cluster_size), "Processing clusters"):
            end = i + cluster_size if len(clusters) - i > cluster_size else len(clusters)
            cls_nodes = torch.cat(clusters[i:end]).unique()

            x = data.x[cls_nodes]
            y = data.y[cls_nodes]
            train_mask = data.train_mask[cls_nodes]
            dev_mask = data.val_mask[cls_nodes]
            test_mask = data.test_mask[cls_nodes]
            edge_index, edge_attr = subgraph(cls_nodes, data.edge_index, relabel_nodes=True)
            view1data = Data(edge_index=edge_index, x=x, edge_attr=edge_attr, num_nodes=cls_nodes.shape[0])
            view2data = view1data if self.augumentation is None else self.augumentation(view1data)
            if not hasattr(view2data, "edge_attr") or view2data.edge_attr is None:
                view2data.edge_attr = torch.ones(view2data.edge_index.shape[1])
            diff = abs(view2data.x.shape[1] - view1data.x.shape[1])
            if diff > 0:
                smaller_data = view1data if view1data.x.shape[1] < view2data.x.shape[1] else view2data
                smaller_data.x = F.pad(smaller_data.x, pad=(0, diff))
                view1data.x = F.normalize(view1data.x)
                view2data.x = F.normalize(view2data.x)
            new_data = Data(y=y, x=view1data.x, x2=view2data.x, edge_index=view1data.edge_index,
                            edge_index2=view2data.edge_index,
                            edge_attr=view1data.edge_attr, edge_attr2=view2data.edge_attr, train_mask=train_mask,
                            dev_mask=dev_mask, test_mask=test_mask, num_nodes=cls_nodes.shape[0], nodes=cls_nodes)
            data_list.append(new_data)
        print()
        return data_list
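A hypothetical way to consume the returned list of merged-cluster Data objects (the processor instance and the import location are assumptions):

from torch_geometric.data import DataLoader

data_list = processor.process_cluster_data(data)
loader = DataLoader(data_list, batch_size=1, shuffle=True)
for batch in loader:
    view1 = (batch.x, batch.edge_index, batch.edge_attr)
    view2 = (batch.x2, batch.edge_index2, batch.edge_attr2)
    # feed the two augmented views to the SelfGNN encoder here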
Example #7
def run_sim(cl, lr, layer):
    layer_dict = {'arma': ARMAConv, 'sage': SAGEConv, 'tag': TAGConv}
    mat = load_npz(
        '/gpfs/data/rsingh47/jbigness/data/%s/hic_sparse_vcsqrt_oe_edge_v7.npz'
        % cl)
    hms = np.load(
        '/gpfs/data/rsingh47/jbigness/data/%s/np_hmods_norm_vcsqrt_oe_edge_v7.npy'
        % cl)
    labs = np.load(
        '/gpfs/data/rsingh47/jbigness/data/%s/np_nodes_lab_genes_vcsqrt_oe_edge_v7.npy'
        % cl)

    print('Data Loaded')

    mask = torch.tensor(labs[:, -1]).long()
    loc = {}
    for i in range(labs[:, -1].shape[0]):
        loc[labs[i, -1]] = i
    y = []
    for i in range(mat.shape[0]):
        y.append(labs[loc[i], -2]) if i in mask else y.append(-1)
    y = torch.tensor(y).long()
    extract = torch_geometric.utils.from_scipy_sparse_matrix(mat)
    G = torch_geometric.data.Data(
        edge_index=extract[0],
        edge_attr=extract[1],
        x=torch.tensor(hms[:mat.shape[0]]).float().reshape(-1, 1, 100, 5),
        y=y)

    cluster_data = ClusterData(G, num_parts=20, recursive=False)
    train_loader = ClusterLoader(cluster_data,
                                 batch_size=2,
                                 shuffle=False,
                                 num_workers=0)

    print('Data Clustered')

    random.seed(30)
    idx = list(range(labs.shape[0] - 1))
    random.shuffle(idx)
    train_mask = idx[:10000]
    test_mask = idx[10000:]

    net = GCN(94, 500, 400, 100, 50, 2, layer_dict[layer])
    return train_model(net, train_loader, 1500, lr, train_mask, test_mask,
                       mask)
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='Link Prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    split_edge = dataset.get_edge_split()  # needed by test() below
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    print(data.edge_index.shape, data.num_nodes)

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = GCN(data.x.size(-1),
                args.hidden_channels,
                args.hidden_channels,
                args.num_layers,
                args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, predictor, loader, optimizer, device,
                         args.negs)
            tt = time.time()
            print(tt - t0)

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #9
    def forward(self, X, edge_index, edge_weight):
        """
        :param X: Input data of shape (batch_size, num_nodes, in_channels)
        :param edge_index: Graph connectivity in COO format with shape (2, num_edges)
        :param edge_weight: Edge feature matrix with shape (num_edges, num_edge_features)
        :return: Output data of shape (batch_size, num_nodes, out_channels)
        """
        if torch.is_tensor(X):
            sz = X.shape
        if self.gcn_partition == 'cluster':
            out = torch.zeros(sz[0], sz[1], self.out_channels, device=X.device)
            graph_data = Data(edge_index=edge_index,
                              edge_attr=edge_weight,
                              train_mask=torch.arange(0, sz[1]),
                              num_nodes=sz[1]).to('cpu')
            cluster_data = ClusterData(graph_data,
                                       num_parts=50,
                                       recursive=False,
                                       save_dir='data/cluster')
            loader = ClusterLoader(cluster_data,
                                   batch_size=5,
                                   shuffle=True,
                                   num_workers=0)

            for subgraph in loader:
                out[:, subgraph.train_mask] = self.gcn(
                    X[:, subgraph.train_mask],
                    subgraph.edge_index.to(X.device),
                    subgraph.edge_attr.to(X.device))

        elif self.gcn_partition == 'sample':
            # Use NeighborSampler() to iterate over graph nodes in a mini-batch fashion
            # and construct sampled subgraphs (runs on CPU for the non-CUDA version)
            out = torch.zeros(sz[0], sz[1], self.out_channels, device=X.device)
            graph_data = Data(edge_index=edge_index, num_nodes=sz[1]).to('cpu')
            loader = NeighborSampler(graph_data,
                                     size=[10, 5],
                                     num_hops=2,
                                     batch_size=120,
                                     shuffle=True,
                                     add_self_loops=False)

            for data_flow in loader():
                block1 = data_flow[0]
                t = self.gcn1(X, edge_index[:, block1.e_id],
                              edge_weight[block1.e_id])
                block2 = data_flow[1]
                part_out = self.gcn2(t, edge_index[:, block2.e_id],
                                     edge_weight[block2.e_id])
                out[:, data_flow.n_id] = part_out[:, data_flow.n_id]

        elif self.batch_training:
            if self.adj_available:
                out = self.gcn(X, edge_index, edge_weight)
            else:
                out = self.gcn(X, edge_index)

        else:
            # Currently, convs such as GATConv cannot use the node_dim argument for batch training.
            # This is a temporary workaround, but it is very slow,
            # costing about 6 times more than batch_training.
            batch = self.get_batch(X)
            if self.adj_available:
                out = self.gcn(batch.x, edge_index, edge_weight)
            else:
                out = self.gcn(batch.x, edge_index)
            out = out.view(sz[0], sz[1], -1)

        return out
Example #10
# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.3f}'
)
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

### Test data loader
torch.manual_seed(12345)
cluster_data = ClusterData(data, num_parts=128)  # 1. Create subgraphs.
train_loader = ClusterLoader(cluster_data, batch_size=32,
                             shuffle=True)  # 2. Stochastic partitioning scheme.

print()
total_num_nodes = 0
for step, sub_data in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of nodes in the current batch: {sub_data.num_nodes}')
    print(sub_data)
    print()
    total_num_nodes += sub_data.num_nodes

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')
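A small follow-up check (an addition, not part of the original snippet): because the boolean masks on data are split along with the nodes, summing train_mask over all batches recovers the count for the full graph.

num_train = 0
for sub_data in train_loader:
    num_train += int(sub_data.train_mask.sum())
print(f'Training nodes across all batches: {num_train} '
      f'(full graph: {int(data.train_mask.sum())})')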
Example #11
def main():
    parser = argparse.ArgumentParser(description='OGBN-Proteins (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_features', action='store_true')
    parser.add_argument('--num_partitions', type=int, default=700)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=128)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(
        data,
        num_parts=args.num_partitions,
        recursive=False,
        save_dir=dataset.processed_dir)

    if not args.use_node_features:
        cluster_data.data.x = torch.ones(cluster_data.data.num_nodes, 1)
    else:
        cluster_data.data.x = cluster_data.data.x.to(torch.float)

    loader = ClusterLoader(
        cluster_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers)

    model = GIN(
        cluster_data.data.x.size(-1), data.edge_attr.size(-1),
        args.hidden_channels, 112, args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)

            if epoch % args.eval_steps == 0:
                result = test(model, loader, evaluator, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Example #12
def run():

    cluster_data = ClusterData(
        data,
        num_parts=args.num_partitions,
        recursive=False,
        save_dir=dataset.processed_dir,
    )

    loader = ClusterLoader(
        cluster_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
    )

    model = GCN(
        data.x.size(-1),
        args.hidden_channels,
        args.hidden_channels,
        args.num_layers,
        args.dropout,
    ).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(
                    model,
                    predictor,
                    data,
                    split_edge,
                    evaluator,
                    batch_size=64 * 1024,
                    device=device,
                )
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f"Run: {run + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {train_mrr:.4f}, "
                      f"Valid: {valid_mrr:.4f}, "
                      f"Test: {test_mrr:.4f}")

        logger.print_statistics(run)
    logger.print_statistics()
Example #13
    edge_attr=edge_type,
    node_type=node_type,
    local_node_idx=local_node_idx,
    num_nodes=node_type.size(0),
)

homo_data.y = node_type.new_full((node_type.size(0), 1), -1)
homo_data.y[local2global["paper"]] = data.y_dict["paper"]

homo_data.train_mask = torch.zeros((node_type.size(0)), dtype=torch.bool)
homo_data.train_mask[local2global["paper"][split_idx["train"]["paper"]]] = True

print(homo_data)

cluster_data = ClusterData(homo_data,
                           num_parts=5000,
                           recursive=True,
                           save_dir=dataset.processed_dir)
train_loader = ClusterLoader(cluster_data,
                             batch_size=500,
                             shuffle=True,
                             num_workers=12)

# Map information to its canonical type.
x_dict = {}
for key, x in data.x_dict.items():
    x_dict[key2int[key]] = x

num_nodes_dict = {}
for key, N in data.num_nodes_dict.items():
    num_nodes_dict[key2int[key]] = N
Example #14
if __name__ == '__main__':
    adj, features, labels, mask_train,mask_test, \
    y_test_oneclass, mask_test_oneclass,  mask_train1, = load_data('/content/drive/My Drive/NAS-GCN-SAR/data')

    ### Some preprocessing of the input data
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    edge_index, edge_weight = from_scipy_sparse_matrix(adj)
    features = torch.from_numpy(features).float()
    labels = torch.from_numpy(labels).float()
    mask_test_oneclass = torch.from_numpy(
        np.array(mask_test_oneclass)).to(device)
    y_test_oneclass = torch.from_numpy(np.array(y_test_oneclass)).to(device)

    ##data loader
    data = Data(x=features, edge_index=edge_index, y=labels)
    data.train_mask = torch.from_numpy(mask_train)
    data.test_mask = torch.from_numpy(mask_test)
    cluster_data = ClusterData(data, num_parts=1024, recursive=False)
    train_loader = ClusterLoader(cluster_data,
                                 batch_size=64,
                                 shuffle=True,
                                 num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=12)

    main()
Example #15
train_data_list = [data for data in train_dataset]
for data in train_data_list:
    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
val_data_list = [data for data in val_dataset]
for data in val_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

test_data_list = [data for data in test_dataset]
for data in test_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.ones(data.num_nodes, dtype=torch.bool)

data = Batch.from_data_list(train_data_list + val_data_list + test_data_list)
cluster_data = ClusterData(data, num_parts=50, recursive=False,
                            save_dir=dataset.processed_dir)
loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True,
                        num_workers=0)  


#Model Structure
class Net(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Net, self).__init__()
        dim = 512
        self.gcn1 = ChebConv(in_channels, dim, K=1)
        self.lin1 = nn.Linear(in_channels, dim)
        self.gcn2 = ChebConv(dim, dim, K=1)
        self.lin2 = nn.Linear(dim, dim)
        self.gcn3 = ChebConv(dim, dim, K=1)
        self.lin3 = nn.Linear(dim, dim)
Example #16
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GENConv
from gcn import add_features_, dataloader
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import ClusterData, ClusterLoader
from sklearn.metrics import mean_squared_error, r2_score

num_epochs = 20
data_, G = dataloader()
data_ = add_features_(data_, G)
dataset = data_
print(dataset)
# dataset = InMemoryDataset.collate(data)
cluster_data = ClusterData(data_, num_parts=50, recursive=False)
test_mask = cluster_data
train_loader = ClusterLoader(cluster_data,
                             batch_size=5,
                             shuffle=True,
                             num_workers=12)


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 32)
        # self.conv2 = GCNConv(16, dataset.num_classes)
        self.conv3 = GCNConv(32, 16)
        self.conv2 = GCNConv(16, dataset.y.shape[1])
Example #17
def main(args):

    # Set up logging and devices
    args.save_dir = get_save_dir(args.save_dir, 
        args.name + '-' + args.dataset + '-' + str(args.hidden_dim) + '-' + str(args.max_forward_iterations) + '-' + args.reg_loss_type + '-' + args.embed_type + '-' + args.init_type, 
        training=True)
    log = get_logger(args.save_dir, args.name)
    tboard = SummaryWriter(args.save_dir)
    device, args.gpu_ids = get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get data loader
    log.info('Building dataset...')
    dataset, split_idx, evaluator = load_pyg_dataset(args.dataset)
    data = dataset[0]    
    # Attach the node idx to the data
    data['orig_node_idx'] = torch.arange(data.x.shape[0])

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask


    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    dataset_loader = CustomClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=args.data_shuffle, num_workers=args.num_workers,
                           normalize_adj_matrix=args.normalize_adj_matrix)

    num_nodes = data.num_nodes

    # If the node features is a zero tensor with dimension one
    # re-create it here as a (num_nodes, num_nodes) sparse identity matrix
    if data.num_node_features == 1 and torch.equal(data['x'], torch.zeros(data.num_nodes, data.num_node_features)):
        node_features = sp.identity(data.num_nodes/len(dataset_loader))
        node_features = sparse_mx_to_torch_sparse_tensor(node_features).float()
        data.x = node_features 
        
    # Get model
    log.info('Building model...')

    # Create the model, optimizer and checkpoint
    model_class = str_to_attribute(sys.modules['models'], args.name)
    model = model_class(data.x.shape[-1], dataset.num_classes, args, log, orig_num_nodes=num_nodes)
    
    model = DataParallelWrapper(model)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model = load_model(model, args.load_path, args.gpu_ids)
    else:
        # Reset parameters only if not loading from checkpoint
        model.reset_parameters()

    model = model.to(device)
    model.train()

    # Get optimizer and scheduler
    parameters = [p for p in model.parameters() if p.requires_grad]
    if args.optimizer == 'Adam':
        optimizer = optim.Adam(parameters, args.learning_rate, 
        weight_decay=args.weight_decay)
    elif args.optimizer == 'SGD':
        optimizer = optim.SGD(parameters, args.learning_rate, 
        momentum=args.momentum, weight_decay=args.weight_decay)
    elif args.optimizer == 'Adadelta':
        optimizer = optim.Adadelta(parameters, args.learning_rate,
                                   weight_decay=args.weight_decay)
    elif args.optimizer == 'Adamax':
        optimizer = optim.Adamax(parameters, args.learning_rate,
                                 weight_decay=args.weight_decay)

    # Get saver
    saver = CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Train
    log.info('Training...')
    with tqdm.tqdm(total=args.num_epochs) as progress_bar:
        for epoch in range(args.num_epochs):

            # Train and display the stats
            train_results = train(model, dataset_loader, optimizer, device, evaluator, args)
            
            # Log the metrics
            train_log_message = ''.join('{} - {}; '.format(k, v) for k, v in train_results.items())
            

            # Visualize in TensorBoard
            for k, v in train_results.items():
                tboard.add_scalar(f'train/{k}', v, epoch)

            # Evaluate, display the stats and save the model
            dev_results = evaluate(model, dataset_loader, device, evaluator, args)

            # Save the model
            saver.save(epoch, model, dev_results[args.metric_name], device)

            # Log the metrics
            dev_log_message = ''.join('{} - {}; '.format(k, v) for k, v in dev_results.items())

            # Visualize in TensorBoard
            for k, v in dev_results.items():
                tboard.add_scalar(f'eval/{k}', v, epoch)

            log.info(f'Epoch: {epoch} - Training - {train_log_message} - Evaluating - {dev_log_message}')

            progress_bar.update(1)
            progress_bar.set_postfix(eval_loss=dev_results['loss'])
Example #18
val_data.edge_index = torch_geometric.utils.subgraph(data.val_mask,
                                                     data.edge_index,
                                                     relabel_nodes=True)[0]
val_data.mask = data.val_mask
print(val_data)
test_data = Data()
test_data.x = data.x[data.test_mask]
test_data.y = data.y[data.test_mask]
test_data.edge_index = torch_geometric.utils.subgraph(data.test_mask,
                                                      data.edge_index,
                                                      relabel_nodes=True)[0]
test_data.mask = data.test_mask
print(test_data)

train_data = ClusterData(train_data,
                         num_parts=1500,
                         recursive=False,
                         save_dir="data/Reddit/train")
val_data = ClusterData(val_data,
                       num_parts=20,
                       recursive=False,
                       save_dir="data/Reddit/val")
test_data = ClusterData(test_data,
                        num_parts=1,
                        recursive=False,
                        save_dir="data/Reddit/test")

train_loader = ClusterLoader(train_data,
                             batch_size=20,
                             shuffle=True,
                             num_workers=8)
val_loader = ClusterLoader(val_data,
Example #19
def main(args):

    # Set up logging and devices
    args.save_dir = get_save_dir(args.save_dir, 'test', training=True)
    log = get_logger(args.save_dir, 'test')
    tboard = SummaryWriter(args.save_dir)
    device, args.gpu_ids = get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get data loader
    log.info('Building dataset...')
    # Download and process data at './dataset/xxx'
    dataset = PygNodePropPredDataset(name=args.dataset, root='dataset/')
    evaluator = Evaluator(name=args.dataset)

    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    dataset_loader = ClusterLoader(cluster_data,
                                   batch_size=args.batch_size,
                                   shuffle=args.data_shuffle,
                                   num_workers=args.num_workers)

    # Get model
    log.info('Building model...')
    model = load_full_model(args.load_path, args.gpu_ids)
    model = nn.DataParallel(model)

    model = model.to(device)
    model.eval()

    # Test
    log.info('Testing...')

    # Evaluate, display the stats and save the model
    dev_results = test(model, dataset_loader, device, evaluator)

    # Log the metrics
    dev_log_message = ''.join('{} - {}; '.format(k, v)
                              for k, v in dev_results.items())

    log.info(f'Testing - {dev_log_message}')
Example #20
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)

    parser.add_argument('--step-size', type=float, default=8e-3)
    parser.add_argument('-m', type=int, default=3)
    parser.add_argument('--test-freq', type=int, default=5)
    parser.add_argument('--attack', type=str, default='flag')
    parser.add_argument('--amp', type=float, default=2)

    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss, acc = train_flag(model, loader, optimizer, device, args)
            if epoch > 19 and epoch % args.test_freq == 0 or epoch == args.epochs:
                result = test(model, data, evaluator, subgraph_loader, device)
                train, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
Example #21
    data = dataset[0]
    dataset_test(data)

    if args.multi_gpu:
        # Unit test: GPU number verification

        # Prepare model
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        model = parse_model_name(args.model, dataset)
        model = DataParallel(model)
        model = model.to(device)

        #Split graph into subgraphs
        if args.subgraph_scheme == 'cluster':
            # Split data into subgraphs using cluster methods
            data_list = list(ClusterData(data, num_parts=args.num_parts))
        elif args.subgraph_scheme == 'neighbor':
            data_list = list(
                NeighborSubgraphLoader(data,
                                       batch_size=args.neighbor_batch_size))
            print(
                f'Using neighbor sampling | number of subgraphs: {len(data_list)}'
            )

        # Run the model for each batch size setups
        batch_sizes = np.array(list(range(1, 65))) * 4
        batch_running_time = []
        for batch_size in batch_sizes:
            batch_size = int(batch_size)
            loader = DataListLoader(data_list,
                                    batch_size=batch_size,
Example #22
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #23
def test_cluster_gcn():
    adj = torch.tensor([
        [1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
    ])

    x = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    edge_index = adj.nonzero(as_tuple=False).t()
    edge_attr = torch.arange(edge_index.size(1))
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    data.num_nodes = 6

    cluster_data = ClusterData(data, num_parts=2, log=False)

    assert cluster_data.partptr.tolist() == [0, 3, 6]
    assert cluster_data.perm.tolist() == [0, 2, 4, 1, 3, 5]
    assert cluster_data.data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert cluster_data.data.adj.to_dense().tolist() == [
        [0, 2, 3, 1, 0, 0],
        [8, 9, 10, 0, 0, 0],
        [14, 15, 16, 0, 0, 0],
        [4, 0, 0, 5, 6, 7],
        [0, 0, 0, 11, 12, 13],
        [0, 0, 0, 17, 18, 19],
    ]

    data = cluster_data[0]
    assert data.num_nodes == 3
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    assert data.edge_attr.tolist() == [0, 2, 3, 8, 9, 10, 14, 15, 16]

    data = cluster_data[1]
    assert data.num_nodes == 3
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    assert data.edge_attr.tolist() == [5, 6, 7, 11, 12, 13, 17, 18, 19]

    loader = ClusterLoader(cluster_data, batch_size=1)
    iterator = iter(loader)

    data = next(iterator)
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = next(iterator)
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    torch.manual_seed(1)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 6
    assert data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    torch.manual_seed(2)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 6
    assert data.x.tolist() == [
        [1, 1],
        [3, 3],
        [5, 5],
        [0, 0],
        [2, 2],
        [4, 4],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    loader = ClusterLoader(cluster_data, batch_size=1, shuffle=True)
    data = next(iter(loader))
    assert data.num_nodes == 3
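A trivial driver (not part of the original test) to run the check outside of a pytest session:

if __name__ == '__main__':
    test_cluster_gcn()
    print('test_cluster_gcn passed')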
Example #24
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    subgraph_loader = NeighborSampler(data.edge_index, sizes=[-1],
                                      batch_size=1024, shuffle=False,
                                      num_workers=args.num_workers)

    model = GCN(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)
    logger_orig = Logger(args.runs, args)
   
    adj = process_adj(data)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        for epoch in range(1, 1 + args.epochs):
            loss, train_acc = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Approx Train Acc: {train_acc:.4f}')

            if epoch > 19 and epoch % args.eval_steps == 0:
                out, result = test(model, data, evaluator, subgraph_loader, device)
                logger_orig.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        logger.print_statistics(run)
    logger.print_statistics()
    logger_orig.print_statistics()
Example #25
    def process_cluster_data(self, data):
        """
        Augmented view data generation based on clustering.

        :param data:
        :return:
        """
        data_list = []
        clusters = []
        num_parts, cluster_size = self.num_parts, self.num_parts // self.final_parts

        # Cluster the data
        cd = ClusterData(data, num_parts=num_parts)
        for i in range(1, cd.partptr.shape[0]):
            cls_nodes = cd.perm[cd.partptr[i - 1]:cd.partptr[i]]
            clusters.append(cls_nodes)

        # Randomly merge clusters and apply transformation
        np.random.shuffle(clusters)
        for i in range(0, len(clusters), cluster_size):
            end = i + cluster_size if len(
                clusters) - i > cluster_size else len(clusters)
            cls_nodes = torch.cat(clusters[i:end]).unique()
            sys.stdout.write(
                f'\rProcessing cluster {i + 1}/{len(clusters)} with {self.final_parts} nodes'
            )
            sys.stdout.flush()

            x = data.x[cls_nodes]
            y = data.y[cls_nodes]
            train_mask = data.train_mask[cls_nodes]
            dev_mask = data.val_mask[cls_nodes]
            test_mask = data.test_mask[cls_nodes]
            edge_index, edge_attr = subgraph(cls_nodes,
                                             data.edge_index,
                                             relabel_nodes=True)
            data = Data(edge_index=edge_index,
                        x=x,
                        edge_attr=edge_attr,
                        num_nodes=cls_nodes.shape[0])
            view1data, view2data = self.augumentation(data)
            if not hasattr(view1data,
                           "edge_attr") or view1data.edge_attr is None:
                view1data.edge_attr = torch.ones(view1data.edge_index.shape[1])
            if not hasattr(view2data,
                           "edge_attr") or view2data.edge_attr is None:
                view2data.edge_attr = torch.ones(view2data.edge_index.shape[1])
            diff = abs(view2data.x.shape[1] - view1data.x.shape[1])
            if diff > 0:
                smaller_data = view1data if view1data.x.shape[
                    1] < view2data.x.shape[1] else view2data
                smaller_data.x = F.pad(smaller_data.x, pad=(0, diff))
                view1data.x = F.normalize(view1data.x)
                view2data.x = F.normalize(view2data.x)
            print(view1data)
            print(view2data)
            new_data = Data(y=y,
                            x1=view1data.x,
                            x2=view2data.x,
                            edge_index1=view1data.edge_index,
                            edge_index2=view2data.edge_index,
                            edge_attr1=view1data.edge_attr,
                            edge_attr2=view2data.edge_attr,
                            train_mask=train_mask,
                            dev_mask=dev_mask,
                            test_mask=test_mask,
                            num_nodes=cls_nodes.shape[0],
                            nodes=cls_nodes)
            data_list.append(new_data)
        print()
        return data_list
Example #26
def test_cluster_gcn():
    adj = torch.tensor([
        [1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
        [1, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 1],
    ])

    edge_index = adj.nonzero().t()
    x = torch.Tensor([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    data = Data(edge_index=edge_index, x=x, num_nodes=6)

    cluster_data = ClusterData(data, num_parts=2, log=False)

    assert cluster_data.partptr.tolist() == [0, 3, 6]
    assert cluster_data.perm.tolist() == [0, 2, 4, 1, 3, 5]
    assert cluster_data.data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert cluster_data.data.adj.to_dense().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    data = cluster_data[0]
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = cluster_data[1]
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    loader = ClusterLoader(cluster_data, batch_size=1)
    it = iter(loader)

    data = next(it)
    assert data.x.tolist() == [[0, 0], [2, 2], [4, 4]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    data = next(it)
    assert data.x.tolist() == [[1, 1], [3, 3], [5, 5]]
    assert data.edge_index.tolist() == [[0, 0, 0, 1, 1, 1, 2, 2, 2],
                                        [0, 1, 2, 0, 1, 2, 0, 1, 2]]

    torch.manual_seed(1)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.x.tolist() == [
        [0, 0],
        [2, 2],
        [4, 4],
        [1, 1],
        [3, 3],
        [5, 5],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]

    torch.manual_seed(2)
    loader = ClusterLoader(cluster_data, batch_size=2, shuffle=True)
    data = next(iter(loader))
    assert data.x.tolist() == [
        [1, 1],
        [3, 3],
        [5, 5],
        [0, 0],
        [2, 2],
        [4, 4],
    ]
    assert to_dense_adj(data.edge_index).squeeze().tolist() == [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]
Example #27
    labels = torch.from_numpy(labels).float()
    mask_test_oneclass = torch.from_numpy(np.array(mask_test_oneclass))
    y_test_oneclass = torch.from_numpy(np.array(y_test_oneclass))

    ##data loader
    data = Data(x=features, edge_index=edge_index, y=labels)
    data.train_mask = torch.from_numpy(mask_train)
    data.test_mask = torch.from_numpy(mask_test)
    data.mask_test_oneclass = mask_test_oneclass
    data.y_test_oneclass = y_test_oneclass
    total_test_oneclass = []
    for i in range(args.classes):
        total_test_oneclass.append(mask_test_oneclass[i].sum())

    cluster_data = ClusterData(data,
                               num_parts=2000,
                               recursive=False,
                               save_dir='./data')
    train_loader = ClusterLoader(cluster_data,
                                 batch_size=150,
                                 shuffle=True,
                                 num_workers=12)
    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=1024,
                                      shuffle=False,
                                      num_workers=12)

    ###### NOTE: the network structure (genotype) is selected here
    genotype = eval("genotypes.%s" % args.arch)  # eval() evaluates a string expression and returns its value.
    model = Network(args.init_channels,
                    args.classes,