def gin_reddit():
    # data = GINDataset('REDDITBINARY', self_loop=False)
    data = GINDataset('REDDITMULTI5K', self_loop=False)
    graphs = data.graphs
    batched_graph = dgl.batch(graphs)
    # node_encoding = wl_without_node_feature(batched_graph, 4)
    node_encoding = linear_swl_without_node_feature(batched_graph, 3)
    graphs_encoding = node_to_graph_encoding(batched_graph, node_encoding)
    eq_cls = equivalence_class(graphs_encoding)
    print(eq_cls_classification_error(eq_cls, data.labels))
    exit()
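# The two helpers above are not defined in this snippet. A minimal sketch of
# what `equivalence_class` and `eq_cls_classification_error` could look like,
# assuming `graphs_encoding` is a list of hashable per-graph encodings and the
# error counts graphs whose label disagrees with the majority of their class:
from collections import Counter, defaultdict

def equivalence_class(graphs_encoding):
    # group graph indices that received identical encodings
    classes = defaultdict(list)
    for idx, enc in enumerate(graphs_encoding):
        classes[enc].append(idx)
    return list(classes.values())

def eq_cls_classification_error(eq_cls, labels):
    # graphs whose label differs from the majority label of their
    # equivalence class can never be separated by the encoding
    errors = 0
    for cls in eq_cls:
        cls_labels = [int(labels[i]) for i in cls]
        errors += len(cls) - Counter(cls_labels).most_common(1)[0][1]
    return errors / sum(len(cls) for cls in eq_cls)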
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--repeat", type=int, default=10)
    parser.add_argument('--dataset', type=str,
                        choices=['MUTAG', 'COLLAB', 'IMDBBINARY', 'IMDBMULTI', 'NCI1',
                                 'PROTEINS', 'PTC', 'REDDITBINARY', 'REDDITMULTI5K'],
                        default='MUTAG')
    args = parser.parse_args()

    device = torch.device('cuda')
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_], [g[1] for g in dataset_])

    # 1. split the dataset with a fixed seed (8/1/1 train/val/test)
    dataids = list(range(len(dataset)))
    random.seed(2021)
    random.shuffle(dataids)
    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8:fold * 9]]
    test_dataset = dataset[dataids[fold * 9:]]
    trainloader = GraphDataLoader(train_dataset, batch_size=32, shuffle=True)
    valloader = GraphDataLoader(val_dataset, batch_size=32, shuffle=False)
    testloader = GraphDataLoader(test_dataset, batch_size=32, shuffle=False)

    accs = []
    for seed in tqdm(range(args.repeat)):
        # set up seeds; args.seed supported
        set_seed(seed)
        model = GIN(5, 2, dataset_.dim_nfeats, 64, dataset_.gclasses,
                    0.5, False, "sum", "sum").to(device)
        criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
        optimizer = optim.Adam(model.parameters(), lr=0.0001)
        model = train(model, trainloader, valloader, optimizer, criterion, 100, device)
        acc = eval_net(model, testloader, device)
        accs.append(acc)
    print('{:.2f} ~ {:.2f}'.format(np.mean(accs) * 100, np.std(accs) * 100))
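# `set_seed` is called above but not defined here; a standard sketch that
# covers the Python, NumPy, and (CUDA) PyTorch RNGs would be:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)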
def main(args):
    # set up seeds; args.seed supported
    torch.manual_seed(seed=args.seed)
    np.random.seed(seed=args.seed)

    is_cuda = not args.disable_cuda and torch.cuda.is_available()
    if is_cuda:
        args.device = torch.device("cuda:" + str(args.device))
        torch.cuda.manual_seed_all(seed=args.seed)
    else:
        args.device = torch.device("cpu")

    dataset = GINDataset(args.dataset, not args.learn_eps, args.degree_as_nlabel)

    trainloader, validloader = GINDataLoader(
        dataset, batch_size=args.batch_size, device=args.device,
        seed=args.seed, shuffle=True,
        split_name='fold10', fold_idx=args.fold_idx).train_valid_loader()
    # or split_name='rand', split_ratio=0.7

    model = GIN(
        args.num_layers, args.num_mlp_layers,
        dataset.dim_nfeats, args.hidden_dim, dataset.gclasses,
        args.final_dropout, args.learn_eps,
        args.graph_pooling_type, args.neighbor_pooling_type).to(args.device)

    criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # it's not cost-effective to handle the cursor and init 0
    # https://stackoverflow.com/a/23121189
    tbar = tqdm(range(args.epochs), unit="epoch", position=3, ncols=0, file=sys.stdout)
    vbar = tqdm(range(args.epochs), unit="epoch", position=4, ncols=0, file=sys.stdout)
    lrbar = tqdm(range(args.epochs), unit="epoch", position=5, ncols=0, file=sys.stdout)

    for epoch, _, _ in zip(tbar, vbar, lrbar):
        train(args, model, trainloader, optimizer, criterion, epoch)
        scheduler.step()

        train_loss, train_acc = eval_net(args, model, trainloader, criterion)
        tbar.set_description(
            'train set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                train_loss, 100. * train_acc))

        valid_loss, valid_acc = eval_net(args, model, validloader, criterion)
        vbar.set_description(
            'valid set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                valid_loss, 100. * valid_acc))

        if not args.filename == "":
            with open(args.filename, 'a') as f:
                f.write('%s %s %s %s %s' % (
                    args.dataset, args.learn_eps, args.neighbor_pooling_type,
                    args.graph_pooling_type, epoch))
                f.write("\n")
                f.write("%f %f %f %f" % (train_loss, train_acc, valid_loss, valid_acc))
                f.write("\n")

        lrbar.set_description("Learning eps with learn_eps={}: {}".format(
            args.learn_eps, [layer.eps.data.item() for layer in model.ginlayers]))

    tbar.close()
    vbar.close()
    lrbar.close()
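# `eval_net(args, model, dataloader, criterion)` is assumed here to return a
# (mean loss, accuracy) pair; a minimal sketch consistent with the calls above:
def eval_net(args, model, dataloader, criterion):
    model.eval()
    total, total_loss, total_correct = 0, 0.0, 0
    with torch.no_grad():
        for graphs, labels in dataloader:
            graphs = graphs.to(args.device)
            labels = labels.to(args.device)
            feat = graphs.ndata['attr'].float()
            logits = model(graphs, feat)
            # criterion uses mean reduction, so scale back to a sum
            total_loss += criterion(logits, labels).item() * len(labels)
            total_correct += (logits.argmax(dim=1) == labels).sum().item()
            total += len(labels)
    model.train()
    return total_loss / total, total_correct / total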
    test_acc = evaluate(model, test_loader, device)
    print('Test acc: {:.4f}'.format(test_acc))
    dist.destroy_process_group()


###############################################################################
# Finally we load the dataset and launch the processes.
#
# .. note::
#
#    You will need to use ``dgl.multiprocessing`` instead of the Python
#    ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to
#    Python’s built-in ``multiprocessing`` except that it handles the
#    subtleties between forking and multithreading in Python.
#

if __name__ == '__main__':
    import dgl.multiprocessing as mp
    from dgl.data import GINDataset

    num_gpus = 4
    procs = []
    dataset = GINDataset(name='IMDBBINARY', self_loop=False)
    for rank in range(num_gpus):
        p = mp.Process(target=main, args=(rank, num_gpus, dataset))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
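###############################################################################
# For reference, a minimal sketch of what the per-process entry point launched
# above typically does (the tutorial defines the real ``main`` earlier; the
# name ``main_sketch`` and the ``init_method`` address are assumptions):

import torch
import torch.distributed as dist

def main_sketch(rank, world_size, dataset):
    # one worker per GPU: join the process group and pin this process's device
    dist.init_process_group('nccl', init_method='tcp://127.0.0.1:12345',
                            world_size=world_size, rank=rank)
    torch.cuda.set_device(rank)
    device = torch.device('cuda', rank)
    # ... build the model and loaders, wrap the model in
    # DistributedDataParallel(model, device_ids=[rank], output_device=rank),
    # train, then call evaluate(model, test_loader, device) as shown above.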
            # (fragment: body of DGLtoemd, iterating the connected components
            # c of each DGL-to-networkx graph g with label l)
            relabel = {n: i for i, n in enumerate(c)}
            sub = nx.subgraph(g, nbunch=c)
            sub = nx.relabel_nodes(sub, relabel)
            embedder.fit([sub])
            embCC.append(embedder.get_embedding()[0])
        embCC = np.array(embCC)
        embCC = np.concatenate(tuple(embCC), axis=0)
        embCC = np.random.choice(embCC, size=500, replace=False)
        data.append(embCC)
        y.append(l)
    return np.array(data), np.array(y)


data = GINDataset(name='PROTEINS', self_loop=True)
Loader = GDataLoader(data, 1, collate)
train_loader, test_loader = Loader.train_valid_loader()
test_emd, test_y = DGLtoemd(test_loader)
train_emd, train_y = DGLtoemd(train_loader)
print('all size data ', len(train_emd) + len(test_emd))
print('Data done')

initial_idx = np.random.choice(range(len(train_emd)), size=50, replace=False)
X_init = train_emd[initial_idx]
y_init = train_y[initial_idx]
X_pool = np.delete(train_emd, initial_idx, axis=0)
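# The `embedder` above matches karateclub's Graph2Vec interface (fit on a list
# of networkx graphs, then get_embedding); a minimal setup, with the
# hyperparameter values as assumptions, would be:
from karateclub import Graph2Vec

embedder = Graph2Vec(wl_iterations=2, dimensions=128, epochs=10)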
    batched_graph.ndata['graph_id'] = graph_id
    return batched_graph, batched_labels


if __name__ == '__main__':
    # Step 1: Prepare graph data ===================================== #
    args = argument()
    print(args)
    log_interval = 1

    # load dataset from dgl.data.GINDataset
    dataset = GINDataset(args.dataname, False)

    # get graphs and labels
    graphs, labels = map(list, zip(*dataset))

    # generate a full graph with all examples for evaluation
    wholegraph = dgl.batch(graphs)
    wholegraph.ndata['attr'] = wholegraph.ndata['attr'].to(th.float32)

    # create dataloader for batch training
    dataloader = GraphDataLoader(dataset,
                                 batch_size=args.batch_size,
                                 collate_fn=collate,
                                 drop_last=False,
                                 shuffle=True)
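# The fragment above begins inside `collate`; a minimal sketch of the whole
# function, assuming the usual graph-ID tagging used for per-graph readouts:
def collate(samples):
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    batched_labels = th.tensor(labels)
    # tag every node with the index of the graph it came from
    graph_id = dgl.broadcast_nodes(batched_graph, th.arange(len(graphs)))
    batched_graph.ndata['graph_id'] = graph_id
    return batched_graph, batched_labels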
                        default='MUTAG')
    parser.add_argument('--dataset_seed', type=int, default=2021)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--repeat', type=int, default=50)
    parser.add_argument('--model', type=str,
                        choices=['gin', 'gat', 'gcn', 'sage', 'topk'], default='gin')
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--epoch', type=int, default=100)
    args = parser.parse_args()

    # seed = 100
    # dataset = build_dataset_from_name('mutag')
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_], [g[1] for g in dataset_])

    # 1. split the dataset with a fixed seed (8/1/1 train/val/test)
    dataids = list(range(len(dataset)))
    random.seed(args.dataset_seed)
    random.shuffle(dataids)
    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8:fold * 9]]
    test_dataset = dataset[dataids[fold * 9:]]
    dataset = DatasetAbstraction.build_from_train_val(train_dataset, val_dataset,
if args.dataset.lower() in ['cora', 'cora_path', 'citeseer', 'pubmed', 'regular'] \
        and not args.graph_level:
    if args.dataset.lower() == 'cora':
        dataset = CoraDataset()
    elif args.dataset.lower() == 'cora_path':
        dataset = Cora_Count_Path()
    elif args.dataset.lower() == 'regular':
        dataset = RegularGraph_Count_Path(1000, 6, length_path=3)
    else:
        dataset = CitationGraphDataset(args.dataset.lower())
    train_mask = torch.BoolTensor(dataset.train_mask)
    # val_mask = torch.BoolTensor(dataset.val_mask)
    test_mask = torch.BoolTensor(dataset.test_mask)
    graph = dataset[0].to(device)
elif args.dataset.lower() in ['imdbbinary', 'imdbmulti', 'redditbinary',
                              'redditmulti5k', 'collab'] and args.graph_level:
    dataset = GINDataset(args.dataset.upper(), self_loop=args.self_loop)
    random_permutation = list(permutation(len(dataset)))
    train_mask = random_permutation[:int(0.9 * len(dataset))]
    test_mask = random_permutation[int(0.9 * len(dataset)):]
    train_loader = DataLoader(Subset(dataset, train_mask), batch_size=32,
                              shuffle=True, collate_fn=collate)
    test_loader = DataLoader(Subset(dataset, test_mask), batch_size=32,
                             shuffle=False, collate_fn=collate)
else:
    print('Either dataset or task is wrong!',
          'Dataset:', args.dataset.lower(), 'Graph-level:', args.graph_level)
    assert False

# model
if args.op_base not in ['adj', 'laplacian', 'chebyshev']:
    print("Wrong operator base!")
    assert False

if args.graph_level:
    params_dict = {'input_dim': 1,
    graph_id = th.arange(n_graphs)
    graph_id = dgl.broadcast_nodes(batched_graph, graph_id)
    batched_graph.ndata['graph_id'] = graph_id
    return batched_graph, batched_labels


if __name__ == '__main__':
    # Step 1: Prepare graph data ===================================== #
    args = argument()
    print(args)

    # load dataset from dgl.data.GINDataset
    dataset = GINDataset(args.dataname, self_loop=False)

    # get graphs and labels
    graphs, labels = map(list, zip(*dataset))

    # generate a full graph with all examples for evaluation
    wholegraph = dgl.batch(graphs)
    wholegraph.ndata['attr'] = wholegraph.ndata['attr'].to(th.float32)

    # create dataloader for batch training
    dataloader = GraphDataLoader(dataset,
                                 batch_size=args.batch_size,
                                 collate_fn=collate,
                                 drop_last=False,
                                 shuffle=True)
def main(args):
    device = torch.device(args.device)
    dataset_ = GINDataset(args.dataset, False)
    dataset = DatasetAbstraction([g[0] for g in dataset_], [g[1] for g in dataset_])

    # 1. split the dataset with a fixed seed (8/1/1 train/val/test)
    dataids = list(range(len(dataset)))
    random.seed(args.dataset_seed)
    random.shuffle(dataids)
    fold = int(len(dataset) * 0.1)
    train_dataset = dataset[dataids[:fold * 8]]
    val_dataset = dataset[dataids[fold * 8:fold * 9]]
    test_dataset = dataset[dataids[fold * 9:]]
    trainloader = GraphDataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valloader = GraphDataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
    testloader = GraphDataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

    accs = []
    for seed in tqdm(range(args.repeat)):
        # set up seeds; args.seed supported
        set_seed(seed)
        if args.model == 'gin':
            model = AutoGIN(
                num_features=dataset_.dim_nfeats,
                num_classes=dataset_.gclasses,
                device=device,
            ).from_hyper_parameter({
                "num_layers": 5,
                "hidden": [64, 64, 64, 64],
                "dropout": 0.5,
                "act": "relu",
                "eps": "False",
                "mlp_layers": 2,
                "neighbor_pooling_type": "sum",
                "graph_pooling_type": "sum"
            }).model
        elif args.model == 'topkpool':
            model = AutoTopkpool(
                num_features=dataset_.dim_nfeats,
                num_classes=dataset_.gclasses,
                device=device,
            ).from_hyper_parameter({
                "num_layers": 5,
                "hidden": [64, 64, 64, 64],
                "dropout": 0.5
            }).model
        model = model.to(device)
        criterion = nn.CrossEntropyLoss()  # default reduction is 'mean'
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        model = train(model, trainloader, valloader, optimizer, criterion, args.epoch, device)
        acc = eval_net(model, testloader, device)
        accs.append(acc)
    print('{:.2f} ~ {:.2f}'.format(np.mean(accs) * 100, np.std(accs) * 100))
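# A minimal sketch of the `train` helper called above, assuming it keeps the
# weights with the best validation accuracy (the early-selection policy, and
# an `eval_net(model, loader, device)` returning accuracy, are assumptions):
import copy

def train(model, trainloader, valloader, optimizer, criterion, epochs, device):
    best_acc, best_state = 0.0, copy.deepcopy(model.state_dict())
    for _ in range(epochs):
        model.train()
        for batched_graph, labels in trainloader:
            batched_graph = batched_graph.to(device)
            labels = labels.to(device)
            logits = model(batched_graph, batched_graph.ndata['attr'].float())
            loss = criterion(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        val_acc = eval_net(model, valloader, device)
        if val_acc > best_acc:
            best_acc, best_state = val_acc, copy.deepcopy(model.state_dict())
    # restore the checkpoint that did best on the validation split
    model.load_state_dict(best_state)
    return model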