Example #1
def gin_reddit():
    # data = GINDataset('REDDITBINARY', self_loop=False)
    data = GINDataset('REDDITMULTI5K', self_loop=False)

    graphs = data.graphs
    batched_graph = dgl.batch(graphs)
    
    # node_encoding = wl_without_node_feature(big_graph, 4)
    node_encoding = linear_swl_without_node_feature(batched_graph, 3)
    graphs_encoding = node_to_graph_encoding(batched_graph, node_encoding)
    eq_cls = equivalence_class(graphs_encoding) 
    print(eq_cls_classification_error(eq_cls, data.labels))
    exit()
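
The WL-style helpers used above (wl_without_node_feature, linear_swl_without_node_feature, node_to_graph_encoding, equivalence_class) are project-specific and not shown here. As a rough illustration only, a plain 1-WL colour refinement for graphs without node features could look like the hypothetical sketch below; the function name and adjacency handling are assumptions, not the original helpers.

def wl_colors_without_node_feature(g, num_rounds):
    # hypothetical sketch: every node starts with the same colour, then each
    # round re-colours a node from its own colour plus the sorted multiset of
    # its neighbours' colours (dense relabelling keeps colour ids small)
    src, dst = g.edges()
    neighbors = [[] for _ in range(g.num_nodes())]
    for u, v in zip(src.tolist(), dst.tolist()):
        neighbors[v].append(u)

    colors = [0] * g.num_nodes()
    for _ in range(num_rounds):
        signatures = [
            (colors[v], tuple(sorted(colors[u] for u in neighbors[v])))
            for v in range(g.num_nodes())
        ]
        relabel = {sig: i for i, sig in enumerate(sorted(set(signatures)))}
        colors = [relabel[sig] for sig in signatures]
    return colors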
Example #2
def main():

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--repeat", type=int, default=10)
    parser.add_argument('--dataset', type=str, choices=['MUTAG', 'COLLAB', 'IMDBBINARY', 'IMDBMULTI', 'NCI1', 'PROTEINS', 'PTC', 'REDDITBINARY', 'REDDITMULTI5K'], default='MUTAG')

    args = parser.parse_args()

    device = torch.device('cuda')
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_], [g[1] for g in dataset_])
    
    # 1. split dataset [fix split]
    dataids = list(range(len(dataset)))
    random.seed(2021)
    random.shuffle(dataids)
    
    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8: fold * 9]]
    test_dataset = dataset[dataids[fold * 9: ]]

    trainloader = GraphDataLoader(train_dataset, batch_size=32, shuffle=True)
    valloader = GraphDataLoader(val_dataset, batch_size=32, shuffle=False)
    testloader = GraphDataLoader(test_dataset, batch_size=32, shuffle=False)

    accs = []
    for seed in tqdm(range(args.repeat)):
        # set up the seeds for this repetition
        set_seed(seed)

        model = GIN(
            5, 2, dataset_.dim_nfeats, 64, dataset_.gclasses, 0.5, False,
            "sum", "sum").to(device)

        criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
        optimizer = optim.Adam(model.parameters(), lr=0.0001)

        model = train(model, trainloader, valloader, optimizer, criterion, 100, device)
        acc = eval_net(model, testloader, device)
        accs.append(acc)

    print('{:.2f} ~ {:.2f}'.format(np.mean(accs) * 100, np.std(accs) * 100))
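
set_seed is called above (and again in Example #10) but never defined in these snippets. A minimal sketch, assuming it should make Python, NumPy and PyTorch deterministic for one repetition:

def set_seed(seed):
    # hypothetical helper: seed every RNG the training loop touches
    import random
    import numpy as np
    import torch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)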
Example #3
def main(args):

    # set up seeds, args.seed supported
    torch.manual_seed(seed=args.seed)
    np.random.seed(seed=args.seed)

    is_cuda = not args.disable_cuda and torch.cuda.is_available()

    if is_cuda:
        args.device = torch.device("cuda:" + str(args.device))
        torch.cuda.manual_seed_all(seed=args.seed)
    else:
        args.device = torch.device("cpu")

    dataset = GINDataset(args.dataset, not args.learn_eps,
                         args.degree_as_nlabel)
    trainloader, validloader = GINDataLoader(
        dataset,
        batch_size=args.batch_size,
        device=args.device,
        seed=args.seed,
        shuffle=True,
        split_name='fold10',
        fold_idx=args.fold_idx).train_valid_loader()
    # or split_name='rand', split_ratio=0.7

    model = GIN(args.num_layers, args.num_mlp_layers, dataset.dim_nfeats,
                args.hidden_dim, dataset.gclasses, args.final_dropout,
                args.learn_eps, args.graph_pooling_type,
                args.neighbor_pooling_type).to(args.device)

    criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # it's not cost-effective to handle the cursor and init 0
    # https://stackoverflow.com/a/23121189
    tbar = tqdm(range(args.epochs),
                unit="epoch",
                position=3,
                ncols=0,
                file=sys.stdout)
    vbar = tqdm(range(args.epochs),
                unit="epoch",
                position=4,
                ncols=0,
                file=sys.stdout)
    lrbar = tqdm(range(args.epochs),
                 unit="epoch",
                 position=5,
                 ncols=0,
                 file=sys.stdout)

    for epoch, _, _ in zip(tbar, vbar, lrbar):

        train(args, model, trainloader, optimizer, criterion, epoch)
        scheduler.step()

        train_loss, train_acc = eval_net(args, model, trainloader, criterion)
        tbar.set_description(
            'train set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                train_loss, 100. * train_acc))

        valid_loss, valid_acc = eval_net(args, model, validloader, criterion)
        vbar.set_description(
            'valid set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                valid_loss, 100. * valid_acc))

        if args.filename != "":
            with open(args.filename, 'a') as f:
                f.write(
                    '%s %s %s %s %s' %
                    (args.dataset, args.learn_eps, args.neighbor_pooling_type,
                     args.graph_pooling_type, epoch))
                f.write("\n")
                f.write("%f %f %f %f" %
                        (train_loss, train_acc, valid_loss, valid_acc))
                f.write("\n")

        lrbar.set_description("Learning eps with learn_eps={}: {}".format(
            args.learn_eps,
            [layer.eps.data.item() for layer in model.ginlayers]))

    tbar.close()
    vbar.close()
    lrbar.close()
Example #4
    test_acc = evaluate(model, test_loader, device)
    print('Test acc: {:.4f}'.format(test_acc))
    dist.destroy_process_group()


###############################################################################
# Finally we load the dataset and launch the processes.
#
# .. note::
#
#    You will need to use ``dgl.multiprocessing`` instead of the Python
#    ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to
#    Python’s built-in ``multiprocessing`` except that it handles the
#    subtleties between forking and multithreading in Python.
#

if __name__ == '__main__':
    import dgl.multiprocessing as mp

    from dgl.data import GINDataset

    num_gpus = 4
    procs = []
    dataset = GINDataset(name='IMDBBINARY', self_loop=False)
    for rank in range(num_gpus):
        p = mp.Process(target=main, args=(rank, num_gpus, dataset))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
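
The per-process setup that pairs with the dist.destroy_process_group() call above is not included in the snippet. A sketch of what each worker might run before training, assuming a torch.distributed process group over num_gpus workers; the backend and address below are placeholders, not taken from the original:

import torch.distributed as dist

def init_process_group(world_size, rank):
    # hypothetical sketch: join a process group so every worker can
    # synchronise gradients; swap 'gloo' for 'nccl' on multi-GPU setups
    dist.init_process_group(
        backend='gloo',
        init_method='tcp://127.0.0.1:12345',
        world_size=world_size,
        rank=rank)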
Example #5
                relabel = {n: i for i, n in enumerate(c)}
                sub = nx.subgraph(g, nbunch=c)
                sub = nx.relabel_nodes(sub, relabel)
                embedder.fit([sub])
                embCC.append(embedder.get_embedding()[0])

            embCC = np.array(embCC)
            embCC = np.concatenate(tuple(embCC), axis=0)
            embCC = np.random.choice(embCC, size=500, replace=False)
            data.append(embCC)
            y.append(l)

    return np.array(data), np.array(y)
    

data = GINDataset(name='PROTEINS', self_loop=True)
Loader = GDataLoader(data, 1, collate)
train_loader, test_loader = Loader.train_valid_loader()


test_emd, test_y = DGLtoemd(test_loader)

train_emd, train_y = DGLtoemd(train_loader)
print('all size data ', len(train_emd) + len(test_emd))
print('Data done')

initial_idx = np.random.choice(range(len(train_emd)), size=50, replace=False)
X_init = train_emd[initial_idx]
y_init = train_y[initial_idx]

X_pool = np.delete(train_emd, initial_idx, axis=0)
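
The snippet stops right after carving an initial labelled set and a pool out of the training embeddings, which follows the usual modAL active-learning setup. Whether the original actually continues with modAL is an assumption; a hypothetical next step could be:

from modAL.models import ActiveLearner
from sklearn.ensemble import RandomForestClassifier

y_pool = np.delete(train_y, initial_idx, axis=0)

# hypothetical continuation: fit on the seed set, then query the most
# uncertain pool samples for labelling
learner = ActiveLearner(estimator=RandomForestClassifier(),
                        X_training=X_init, y_training=y_init)
query_idx, query_inst = learner.query(X_pool)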
Example #6
    batched_graph.ndata['graph_id'] = graph_id

    return batched_graph, batched_labels


if __name__ == '__main__':

    # Step 1: Prepare graph data   ===================================== #
    args = argument()
    print(args)

    log_interval = 1

    # load dataset from dgl.data.GINDataset
    dataset = GINDataset(args.dataname, False)

    # get graphs and labels
    graphs, labels = map(list, zip(*dataset))

    # generate a full-graph with all examples for evaluation
    wholegraph = dgl.batch(graphs)
    wholegraph.ndata['attr'] = wholegraph.ndata['attr'].to(th.float32)

    # create dataloader for batch training
    dataloader = GraphDataLoader(dataset,
                                 batch_size=args.batch_size,
                                 collate_fn=collate,
                                 drop_last=False,
                                 shuffle=True)
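
The collate function passed to GraphDataLoader is only partly visible (its last lines open this snippet and reappear in Example #9). A minimal sketch of a complete version, assuming each sample is a (graph, label) pair from GINDataset:

def collate(samples):
    # batch the graphs and labels, then tag every node with the id of the
    # graph it belongs to, matching the truncated tail shown above
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    batched_labels = th.tensor(labels)

    graph_id = th.arange(len(graphs))
    graph_id = dgl.broadcast_nodes(batched_graph, graph_id)
    batched_graph.ndata['graph_id'] = graph_id

    return batched_graph, batched_labels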
Example #7
                        default='MUTAG')
    parser.add_argument('--dataset_seed', type=int, default=2021)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--repeat', type=int, default=50)
    parser.add_argument('--model',
                        type=str,
                        choices=['gin', 'gat', 'gcn', 'sage', 'topk'],
                        default='gin')
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--epoch', type=int, default=100)

    args = parser.parse_args()

    # seed = 100
    # dataset = build_dataset_from_name('mutag')
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_],
                                 [g[1] for g in dataset_])

    # 1. split dataset [fix split]
    dataids = list(range(len(dataset)))
    random.seed(args.dataset_seed)
    random.shuffle(dataids)

    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8:fold * 9]]
    test_dataset = dataset[dataids[fold * 9:]]

    dataset = DatasetAbstraction.build_from_train_val(train_dataset,
                                                      val_dataset,
Example #8
if args.dataset.lower() in ['cora', 'cora_path', 'citeseer', 'pubmed', 'regular'] and not args.graph_level:
    if args.dataset.lower() == 'cora':
        dataset = CoraDataset()
    elif args.dataset.lower() == 'cora_path':
        dataset = Cora_Count_Path()
    elif args.dataset.lower() == 'regular':
        dataset = RegularGraph_Count_Path(1000, 6, length_path=3)
    else:
        dataset = CitationGraphDataset(args.dataset.lower())

    train_mask = torch.BoolTensor(dataset.train_mask)
    # val_mask = torch.BoolTensor(dataset.val_mask)
    test_mask = torch.BoolTensor(dataset.test_mask)
    graph = dataset[0].to(device)
elif args.dataset.lower() in ['imdbbinary', 'imdbmulti', 'redditbinary', 'redditmulti5k' , 'collab'] and args.graph_level:
    dataset = GINDataset(args.dataset.upper(), self_loop=args.self_loop)
    random_permutation = list(permutation(len(dataset)))
    train_mask, test_mask = random_permutation[:int(0.9 * len(dataset))], random_permutation[int(0.9 * len(dataset)):]
    train_loader = DataLoader(Subset(dataset, train_mask), batch_size=32, shuffle=True, collate_fn=collate)
    test_loader = DataLoader(Subset(dataset, test_mask), batch_size=32, shuffle=False, collate_fn=collate)
else:
    print('Either dataset or task is wrong!', 'Dataset:', args.dataset.lower(), 'Graph-level:', args.graph_level)
    assert False

# model
if args.op_base not in ['adj', 'laplacian', 'chebyshev']:
    print("Wrong operator base!")
    assert False

if args.graph_level:
    params_dict = {'input_dim': 1,
Example #9
    graph_id = th.arange(n_graphs)
    graph_id = dgl.broadcast_nodes(batched_graph, graph_id)

    batched_graph.ndata['graph_id'] = graph_id

    return batched_graph, batched_labels


if __name__ == '__main__':

    # Step 1: Prepare graph data   ===================================== #
    args = argument()
    print(args)

    # load dataset from dgl.data.GINDataset
    dataset = GINDataset(args.dataname, self_loop=False)

    # get graphs and labels
    graphs, labels = map(list, zip(*dataset))

    # generate a full-graph with all examples for evaluation
    wholegraph = dgl.batch(graphs)
    wholegraph.ndata['attr'] = wholegraph.ndata['attr'].to(th.float32)

    # create dataloader for batch training
    dataloader = GraphDataLoader(dataset,
                                 batch_size=args.batch_size,
                                 collate_fn=collate,
                                 drop_last=False,
                                 shuffle=True)
Example #10
def main(args):

    device = torch.device(args.device)
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_],
                                 [g[1] for g in dataset_])

    # 1. split dataset [fix split]
    dataids = list(range(len(dataset)))
    random.seed(args.dataset_seed)
    random.shuffle(dataids)

    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8:fold * 9]]
    test_dataset = dataset[dataids[fold * 9:]]

    trainloader = GraphDataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True)
    valloader = GraphDataLoader(val_dataset,
                                batch_size=args.batch_size,
                                shuffle=False)
    testloader = GraphDataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False)

    accs = []
    for seed in tqdm(range(args.repeat)):
        # set up the seeds for this repetition
        set_seed(seed)

        if args.model == 'gin':
            model = AutoGIN(
                num_features=dataset_.dim_nfeats,
                num_classes=dataset_.gclasses,
                device=device,
            ).from_hyper_parameter({
                "num_layers": 5,
                "hidden": [64, 64, 64, 64],
                "dropout": 0.5,
                "act": "relu",
                "eps": "False",
                "mlp_layers": 2,
                "neighbor_pooling_type": "sum",
                "graph_pooling_type": "sum"
            }).model
        elif args.model == 'topkpool':
            model = AutoTopkpool(
                num_features=dataset_.dim_nfeats,
                num_classes=dataset_.gclasses,
                device=device,
            ).from_hyper_parameter({
                "num_layers": 5,
                "hidden": [64, 64, 64, 64],
                "dropout": 0.5
            }).model

        model = model.to(device)

        criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

        model = train(model, trainloader, valloader, optimizer, criterion,
                      args.epoch, device)
        acc = eval_net(model, testloader, device)
        accs.append(acc)

    print('{:.2f} ~ {:.2f}'.format(np.mean(accs) * 100, np.std(accs) * 100))
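
eval_net and train are used in Examples #2 and #10 but not defined in any of the snippets. A minimal sketch of the evaluation side, assuming the model follows the DGL GIN convention of forward(batched_graph, node_features) and that batches arrive as (batched_graph, labels) pairs:

def eval_net(model, dataloader, device):
    # hypothetical helper: classification accuracy over a GraphDataLoader
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batched_graph, labels in dataloader:
            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            feats = batched_graph.ndata['attr'].float()
            logits = model(batched_graph, feats)
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.shape[0]
    model.train()
    return correct / total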