Example #1
def test_edge_coarsening(idtype, g, weight, relabel):
    num_nodes = g.num_nodes()
    g = dgl.to_bidirected(g)
    g = g.astype(idtype).to(F.ctx())
    edge_weight = None
    if weight:
        edge_weight = F.abs(F.randn((g.num_edges(),))).to(F.ctx())
    node_labels = neighbor_matching(g, edge_weight, relabel_idx=relabel)
    unique_ids, counts = th.unique(node_labels, return_counts=True)
    num_result_ids = unique_ids.size(0)

    # shape correct
    assert node_labels.shape == (g.num_nodes(),)

    # all nodes marked
    assert F.reduce_sum(node_labels < 0).item() == 0

    # number of unique node ids correct.
    assert num_result_ids >= num_nodes // 2 and num_result_ids <= num_nodes

    # each unique id has <= 2 nodes
    assert F.reduce_sum(counts > 2).item() == 0

    # if two nodes have the same id, they must be neighbors
    idxs = F.arange(0, num_nodes, idtype)
    for l in unique_ids:
        l = l.item()
        idx = idxs[(node_labels == l)]
        if idx.size(0) == 2:
            u, v = idx[0].item(), idx[1].item()
            assert g.has_edges_between(u, v)
Example #2
    def __getitem__(self, index):
        # Read the graph and label
        G, group_labels, entity_labels, entity_links = self.read_annotations(
            self.files[index])

        # Convert to DGL format
        node_label = torch.stack(
            [torch.tensor(v['position']) for k, v in G.nodes.items()]).float()
        node_label = (node_label - node_label.mean(0)) / node_label.std(0)

        node_word = torch.stack(
            [torch.tensor(v['w_embed']) for k, v in G.nodes.items()]).float()
        node_entity = torch.stack(
            [torch.tensor(v['entity']) for k, v in G.nodes.items()]).float()
        # ENSURE BIDIRECTED
        g_in = dgl.transform.knn_graph(node_label, 10)
        g_in = dgl.to_bidirected(g_in)

        g_in.ndata['position'] = node_label.float()
        g_in.ndata['w_embed'] = node_word.float()
        g_in.ndata['entity'] = node_entity

        input_edges = torch.stack(g_in.edges()).t().tolist()
        edges = list(map(tuple, input_edges))
        target_edges = group_labels
        group_labels = torch.tensor([target_edges[e] for e in edges])

        return g_in, group_labels, entity_labels, entity_links
Example #3
def makedgl(num, pos):
    G = dgl.DGLGraph()
    G.add_nodes(num)
    G.add_edges(G.nodes(), G.nodes())  # add a self-loop on every node
    G.add_edges(*zip(*pos))  # add the (u, v) edge pairs given in pos
    G = dgl.to_bidirected(G) 
    G = dgl.graph(G.edges(), 'user', 'frd')
    print('-> Graph G has %d nodes' % G.number_of_nodes(), 'with %d edges' % (G.number_of_edges()/2)) 
    return G
Example #4
 def _build_loc(self):
     self.loc_g = dgl.DGLGraph()
     self.loc_g.add_nodes(self.node_num)
     self.loc_g.add_edges(
         self.edges[:, 0],
         self.edges[:, 1],
         data={'edge_type': torch.tensor(self.edges[:, 2]).long()})
     self.loc_g = dgl.to_bidirected(self.loc_g)
     self.loc_g.ndata['pos'] = self.coordinates
     self.loc_g.ndata['nodes'] = self.nidx
Example #5
 def _test(in_readonly, out_readonly):
     elist = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 1), (2, 2)]
     num_edges = 7
     g = dgl.DGLGraph(elist, readonly=in_readonly)
     elist.append((1, 2))
     elist = set(elist)
     big = dgl.to_bidirected(g, out_readonly)
     assert big.number_of_edges() == num_edges
     src, dst = big.edges()
     eset = set(zip(list(F.asnumpy(src)), list(F.asnumpy(dst))))
     assert eset == set(elist)
Example #6
def create_graph(path):
    '''Build the graph.'''
    edges_data = pd.read_csv(path)
    src = edges_data['Src'].to_numpy()
    dst = edges_data['Dst'].to_numpy()
    weight = th.tensor(edges_data['W'])
    weight = th.cat((weight, weight))
    g = dgl.graph((src, dst))
    g = dgl.to_bidirected(g)
    g.edata['w'] = weight
    return g
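A note on the example above: dgl.to_bidirected() does not keep edge features, which is why the weight tensor is duplicated by hand, and the assignment assumes the returned edge order lines up with the concatenated weights, which is not guaranteed in general. A minimal sketch, assuming DGL >= 0.6, of dgl.add_reverse_edges with copy_edata=True, which copies edge data onto the reverse edges directly:

import dgl
import torch as th

src, dst = th.tensor([0, 1, 2]), th.tensor([1, 2, 0])
g = dgl.graph((src, dst))
g.edata['w'] = th.abs(th.randn(g.num_edges()))
# add_reverse_edges appends the reverse of every edge and copies 'w' onto it,
# so no manual th.cat((weight, weight)) is needed
g = dgl.add_reverse_edges(g, copy_edata=True)
assert g.num_edges() == 6 and 'w' in g.edata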
Example #7
 def forward(self, g):
     h = g.ndata['x']
     # make the graph bidirected so messages can flow in both directions along every edge
     g = dgl.to_bidirected(g, True)
     for conv in self.layers:
         h = conv(g, h)
     g.ndata['h'] = h
     mN = dgl.mean_nodes(g, 'h')
     PI = self.policy(g.ndata['h'])
     V = self.value(mN)
     g.ndata.pop('h')
     return PI, V
Example #8
def build_karate_club_graph():
    src = np.array([
        1, 2, 2, 3, 3, 3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 10, 10, 10, 11,
        12, 12, 13, 13, 13, 13, 16, 16, 17, 17, 19, 19, 21, 21, 25, 25, 27, 27,
        27, 28, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
        32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
        33, 33
    ])
    dst = np.array([
        0, 0, 1, 0, 1, 2, 0, 0, 0, 4, 5, 0, 1, 2, 3, 0, 2, 2, 0, 4, 5, 0, 0, 3,
        0, 1, 2, 3, 5, 6, 0, 1, 0, 1, 0, 1, 23, 24, 2, 23, 24, 2, 23, 26, 1, 8,
        0, 24, 25, 28, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 8, 9, 13, 14,
        15, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32
    ])
    return dgl.to_bidirected(dgl.graph((src, dst)))
Example #9
def preprocess(graph):
    global n_node_feats

    # make bidirected
    feat = graph.ndata["feat"]
    graph = dgl.to_bidirected(graph)
    graph.ndata["feat"] = feat

    # add self-loop
    print(f"Total edges before adding self-loop {graph.number_of_edges()}")
    graph = graph.remove_self_loop().add_self_loop()
    print(f"Total edges after adding self-loop {graph.number_of_edges()}")

    graph.create_formats_()

    return graph
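The manual save/restore of the "feat" tensor around dgl.to_bidirected() above can be folded into the call itself. A minimal sketch, assuming DGL >= 0.5, of the copy_ndata flag (edge features are still dropped):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
g.ndata["feat"] = torch.randn(3, 4)
# copy_ndata=True carries node features through the conversion,
# replacing the manual save/restore of graph.ndata["feat"]
g = dgl.to_bidirected(g, copy_ndata=True)
assert "feat" in g.ndata and g.num_edges() == 4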
Example #10
def run_tests():
    """
    Tests the custom_send_and_recv against DGL's send_and_recv given a graph and some initial node features
    DO NOT EDIT THIS FUNCTION
    """
    def test_node_feats(g, node_feats, test_num):
        tf.debugging.assert_equal(
            perform_message_passing(g, node_feats, custom_send_and_recv, True),
            perform_message_passing(g, node_feats, g.send_and_recv),
            "SNR implementation failed for test " + str(test_num))

    # test on a batch of graphs (size= batch_size)
    batch_size = 3
    batched_graphs = []
    cur_batch = []

    test_dict = get_test_edges()

    for key in test_dict:
        u, v = test_dict[key]
        g = dgl.DGLGraph((u, v))
        g = dgl.to_bidirected(g)

        if len(cur_batch) == batch_size:
            batched_graphs.append(cur_batch)
            cur_batch = []
        cur_batch.append(g)

        node_feats_tests = gen_node_feats_tests(g)
        for i in range(len(node_feats_tests)):
            test_node_feats(g, node_feats_tests[i], i + 1)
            print(f"{key} Test #{i+1} passed")

    for k in range(len(batched_graphs)):
        g = dgl.batch(batched_graphs[k])
        node_feats_tests = gen_node_feats_tests(g)
        for i in range(len(node_feats_tests)):
            test_node_feats(g, node_feats_tests[i], i + 1)
            print("Batched Graph Test " + str(k + 1) + "." + str(i + 1) +
                  " passed")

    print("All tests passed! 🎉🎉🎉")
Example #11
def load_data(name, ogb_root, device):
    if name in ('ogbn-products', 'ogbn-arxiv'):
        data = DglNodePropPredDataset(name, ogb_root)
        g, labels = data[0]
        if name == 'ogbn-arxiv':
            g = dgl.to_bidirected(g, copy_ndata=True)
            feat = g.ndata['feat']
            feat = (feat - feat.mean(dim=0)) / feat.std(dim=0)
            g.ndata['feat'] = feat
        g = g.to(device)
        labels = labels.squeeze(dim=1).to(device)
        split_idx = data.get_idx_split()
        train_idx = split_idx['train'].to(device)
        val_idx = split_idx['valid'].to(device)
        test_idx = split_idx['test'].to(device)
        return g, labels, data.num_classes, train_idx, val_idx, test_idx
    else:
        data = load_citation_dataset(name)
        g = data[0].to(device)
        train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
        val_idx = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
        test_idx = g.ndata['test_mask'].nonzero(as_tuple=True)[0]
        return g, g.ndata[
            'label'], data.num_classes, train_idx, val_idx, test_idx
Example #12
      # Make sure each muon in the pair only appears once 
      m_pair1 = l_r.split("::")
      m_pair2 = l_s.split("::")
      
      edges_s.append(i_s)
      edges_r.append(i_r)

  break

# Create the graph object 
import torch
import dgl

u, v = torch.tensor(edges_s), torch.tensor(edges_r)
g = dgl.graph((u, v))
g = dgl.to_bidirected(g)

# Draw the graph using networkx
import networkx as nx
import matplotlib.pyplot as plt

# Plotting stuff 
nx_g = g.to_networkx().to_undirected()
pos = nx.spring_layout(nx_g, seed = 1)
plt.figure(figsize = (8, 8))
plt.axis('off')
nx.draw_networkx(nx_g, pos = pos, node_size=50, cmap = plt.get_cmap("coolwarm"), node_color = torch.tensor(truth), edge_color = 'k', arrows = False, with_labels = True)
plt.savefig("network.png")

# ///////////////////////////// Prepare the data ////////////////////////////////////////// #
Masks = dict()
Example #13
def main():
    # check cuda
    device = f'cuda:{args.gpu}' if torch.cuda.is_available() and args.gpu >= 0 else 'cpu'
    # load data
    dataset = DglNodePropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    split_idx = dataset.get_idx_split()
    g, labels = dataset[0] # graph: DGLGraph object, label: torch tensor of shape (num_nodes, num_tasks)
    
    if args.dataset == 'ogbn-arxiv':
        g = dgl.to_bidirected(g, copy_ndata=True)
        
        feat = g.ndata['feat']
        feat = (feat - feat.mean(0)) / feat.std(0)
        g.ndata['feat'] = feat

    g = g.to(device)
    feats = g.ndata['feat']
    labels = labels.to(device)

    # load masks for train / validation / test
    train_idx = split_idx["train"].to(device)
    valid_idx = split_idx["valid"].to(device)
    test_idx = split_idx["test"].to(device)

    n_features = feats.size()[-1]
    n_classes = dataset.num_classes
    
    # load model
    if args.model == 'mlp':
        model = MLP(n_features, args.hid_dim, n_classes, args.num_layers, args.dropout)
    elif args.model == 'linear':
        model = MLPLinear(n_features, n_classes)
    else:
        raise NotImplementedError(f'Model {args.model} is not supported.')

    model = model.to(device)
    print(f'Model parameters: {sum(p.numel() for p in model.parameters())}')

    if args.pretrain:
        print('---------- Before ----------')
        model.load_state_dict(torch.load(f'base/{args.dataset}-{args.model}.pt'))
        model.eval()

        y_soft = model(feats).exp()

        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')

        print('---------- Correct & Smoothing ----------')
        cs = CorrectAndSmooth(num_correction_layers=args.num_correction_layers,
                              correction_alpha=args.correction_alpha,
                              correction_adj=args.correction_adj,
                              num_smoothing_layers=args.num_smoothing_layers,
                              smoothing_alpha=args.smoothing_alpha,
                              smoothing_adj=args.smoothing_adj,
                              autoscale=args.autoscale,
                              scale=args.scale)
        
        mask_idx = torch.cat([train_idx, valid_idx])
        y_soft = cs.correct(g, y_soft, labels[mask_idx], mask_idx)
        y_soft = cs.smooth(g, y_soft, labels[mask_idx], mask_idx)
        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')
    else:
        opt = optim.Adam(model.parameters(), lr=args.lr)

        best_acc = 0
        best_model = copy.deepcopy(model)

        # training
        print('---------- Training ----------')
        for i in range(args.epochs):

            model.train()
            opt.zero_grad()

            logits = model(feats)
            
            train_loss = F.nll_loss(logits[train_idx], labels.squeeze(1)[train_idx])
            train_loss.backward()

            opt.step()
            
            model.eval()
            with torch.no_grad():
                logits = model(feats)
                
                y_pred = logits.argmax(dim=-1, keepdim=True)

                train_acc = evaluate(y_pred, labels, train_idx, evaluator)
                valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)

                print(f'Epoch {i} | Train loss: {train_loss.item():.4f} | Train acc: {train_acc:.4f} | Valid acc {valid_acc:.4f}')

                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model = copy.deepcopy(model)
        
        # testing & saving model
        print('---------- Testing ----------')
        best_model.eval()
        
        logits = best_model(feats)
        
        y_pred = logits.argmax(dim=-1, keepdim=True)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Test acc: {test_acc:.4f}')

        if not os.path.exists('base'):
            os.makedirs('base')

        torch.save(best_model.state_dict(), f'base/{args.dataset}-{args.model}.pt')
Example #14
    elif args.dataset == 'ogb-product':
        g, _ = load_ogb('ogbn-products')
    elif args.dataset == 'ogb-paper100M':
        g, _ = load_ogb('ogbn-papers100M')
    print('load {} takes {:.3f} seconds'.format(args.dataset,
                                                time.time() - start))
    print('|V|={}, |E|={}'.format(g.number_of_nodes(), g.number_of_edges()))
    print('train: {}, valid: {}, test: {}'.format(
        th.sum(g.ndata['train_mask']), th.sum(g.ndata['val_mask']),
        th.sum(g.ndata['test_mask'])))
    if args.balance_train:
        balance_ntypes = g.ndata['train_mask']
    else:
        balance_ntypes = None

    if args.undirected:
        sym_g = dgl.to_bidirected(g, readonly=True)
        for key in g.ndata:
            sym_g.ndata[key] = g.ndata[key]
        g = sym_g

    dgl.distributed.partition_graph(
        g,
        args.dataset,
        args.num_parts,
        args.output,
        part_method=args.part_method,
        balance_ntypes=balance_ntypes,
        balance_edges=args.balance_edges,
        num_trainers_per_machine=args.num_trainers_per_machine)
Example #15
    node_label_num = label_max_index-label_min_index+1
    ## data processing
    # to avoid dropping isolated nodes, add a self-loop for every node to the edge index before building the graph
    self_loop = torch.arange(source_data.x.shape[0])
    self_loop = self_loop.unsqueeze(1).repeat(1,2)
    src_edge_index_sl = torch.cat([source_data.edge_index.T,self_loop]).T #[2,N]

    self_loop = torch.arange(target_data.x.shape[0])
    self_loop = self_loop.unsqueeze(1).repeat(1,2)
    tgt_edge_index_sl = torch.cat([target_data.edge_index.T,self_loop]).T #[2,N]
    del self_loop
    ## generate train graph
    source_graph = dgl.to_simple(dgl.graph((src_edge_index_sl[0],src_edge_index_sl[1])))
    target_graph = dgl.to_simple(dgl.graph((tgt_edge_index_sl[0],tgt_edge_index_sl[1])))
    ## make the edge index bidirected
    source_graph = dgl.to_bidirected(source_graph)
    target_graph = dgl.to_bidirected(target_graph)
    src_edge_index_sl = torch.vstack([source_graph.edges()[0],source_graph.edges()[1]])
    tgt_edge_index_sl = torch.vstack([target_graph.edges()[0],target_graph.edges()[1]])
    ##generate all node pair label
    source_node_num = source_data.x.shape[0]
    target_node_num = target_data.x.shape[0]
    source_node_feat = source_data.x
    target_node_feat = target_data.x
    source_node_label = source_data.y
    target_node_label = target_data.y
    del source_data,target_data
    src_all_node_pair,src_all_node_pair_label,max_np_label =generate_all_node_pair(source_node_num,src_edge_index_sl,source_node_label,
                                                                                    node_label_num,source_graph.adjacency_matrix()) # tensor,tensor
    src_all_node_pair = src_all_node_pair.view(-1,2)
    src_all_node_pair_label = src_all_node_pair_label.view(-1)
Example #16
                new_dst = src

            else:
                new_src = src
                new_dst = dst

            np_type = int((node_label_num + node_label_num -
                           (new_src - 1)) * ((new_src - 1) + 1) / 2 +
                          (new_dst - new_src) + 1) - 1
            mapping_M[np_type, src] = 1

    mapping_M[pos_np_type_num:, :] = mapping_M[:pos_np_type_num, :]
    return mapping_M


if __name__ == "__main__":
    node_label = torch.tensor([0, 1, 2, 0, 2])
    edge_index = torch.tensor([[0, 0, 0, 1, 1, 2, 2, 3, 4],
                               [0, 2, 4, 1, 4, 2, 3, 3, 4]])
    g = dgl.to_bidirected(dgl.graph((edge_index[0], edge_index[1])))
    # ntype_etype_mapping = torch.zeros([3,3],dtype=torch.long)
    # i = 1
    # for src_node_type in range(0,3):
    #     for tgt_node_type in range(src_node_type,3):
    #         ntype_etype_mapping[src_node_type,tgt_node_type] = i
    #         ntype_etype_mapping[tgt_node_type,src_node_type] = i
    #         i+=1
    max_np_label, node_pair_label, max_pos_np_label, max_neg_np_label = generate_all_node_pair_minus_class(
        5, edge_index, node_label, 3, g.adjacency_matrix())
    M = generate_mapping_M_minus_class(4, 20, 10)
    #print(node_pair_label,max_np_label)
Example #17
def preprocess_data(dataset, train_ratio):

    if dataset in ['cora', 'citeseer', 'pubmed']:

        edge = np.loadtxt('../low_freq/{}.edge'.format(dataset),
                          dtype=int).tolist()
        feat = np.loadtxt('../low_freq/{}.feature'.format(dataset))
        labels = np.loadtxt('../low_freq/{}.label'.format(dataset), dtype=int)
        train = np.loadtxt('../low_freq/{}.train'.format(dataset), dtype=int)
        val = np.loadtxt('../low_freq/{}.val'.format(dataset), dtype=int)
        test = np.loadtxt('../low_freq/{}.test'.format(dataset), dtype=int)
        nclass = len(set(labels.tolist()))
        print(dataset, nclass)

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.remove_self_loop(g)
        g = dgl.to_bidirected(g)

        feat = normalize_features(feat)
        feat = torch.FloatTensor(feat)
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(train)
        val = torch.LongTensor(val)
        test = torch.LongTensor(test)

        return g, nclass, feat, labels, train, val, test

    elif 'syn' in dataset:
        edge = np.loadtxt('../syn/{}.edge'.format(dataset), dtype=int).tolist()
        labels = np.loadtxt('../syn/{}.lab'.format(dataset), dtype=int)
        features = np.loadtxt('../syn/{}.feat'.format(dataset), dtype=float)

        n = labels.shape[0]
        idx = [i for i in range(n)]
        random.shuffle(idx)
        idx_train = np.array(idx[:100])
        idx_test = np.array(idx[100:])

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))

        c1 = 0
        c2 = 0
        lab = labels.tolist()
        for e in edge:
            if lab[e[0]] == lab[e[1]]:
                c1 += 1
            else:
                c2 += 1
        print(c1 / len(edge), c2 / len(edge))

        #normalization will make features degenerated
        #features = normalize_features(features)
        features = torch.FloatTensor(features)

        nclass = 2
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(idx_train)
        test = torch.LongTensor(idx_test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, train, test

    elif dataset in ['film']:
        graph_adjacency_list_file_path = '../high_freq/{}/out1_graph_edges.txt'.format(
            dataset)
        graph_node_features_and_labels_file_path = '../high_freq/{}/out1_node_feature_label.txt'.format(
            dataset)

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        if dataset == 'film':
            with open(graph_node_features_and_labels_file_path
                      ) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    feature_blank = np.zeros(932, dtype=np.uint16)
                    feature_blank[np.array(line[1].split(','),
                                           dtype=np.uint16)] = 1
                    graph_node_features_dict[int(line[0])] = feature_blank
                    graph_labels_dict[int(line[0])] = int(line[2])
        else:
            with open(graph_node_features_and_labels_file_path
                      ) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert (len(line) == 3)
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    graph_node_features_dict[int(line[0])] = np.array(
                        line[1].split(','), dtype=np.uint8)
                    graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 2)
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        row, col = np.where(adj.todense() > 0)

        U = row.tolist()
        V = col.tolist()
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.to_bidirected(g)
        g = dgl.remove_self_loop(g)

        features = np.array([
            features for _, features in sorted(G.nodes(data='features'),
                                               key=lambda x: x[0])
        ],
                            dtype=float)
        labels = np.array([
            label
            for _, label in sorted(G.nodes(data='label'), key=lambda x: x[0])
        ],
                          dtype=int)

        n = labels.shape[0]
        idx = [i for i in range(n)]
        #random.shuffle(idx)
        r0 = int(n * train_ratio)
        r1 = int(n * 0.6)
        r2 = int(n * 0.8)

        idx_train = np.array(idx[:r0])
        idx_val = np.array(idx[r1:r2])
        idx_test = np.array(idx[r2:])

        features = normalize_features(features)
        features = torch.FloatTensor(features)

        nclass = 5
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(idx_train)
        val = torch.LongTensor(idx_val)
        test = torch.LongTensor(idx_test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, val, test

    # datasets in Geom-GCN
    elif dataset in ['cornell', 'texas', 'wisconsin', 'chameleon', 'squirrel']:

        graph_adjacency_list_file_path = '../high_freq/{}/out1_graph_edges.txt'.format(
            dataset)
        graph_node_features_and_labels_file_path = '../high_freq/{}/out1_node_feature_label.txt'.format(
            dataset)

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        with open(graph_node_features_and_labels_file_path
                  ) as graph_node_features_and_labels_file:
            graph_node_features_and_labels_file.readline()
            for line in graph_node_features_and_labels_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 3)
                assert (int(line[0]) not in graph_node_features_dict
                        and int(line[0]) not in graph_labels_dict)
                graph_node_features_dict[int(line[0])] = np.array(
                    line[1].split(','), dtype=np.uint8)
                graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert (len(line) == 2)
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        features = np.array([
            features for _, features in sorted(G.nodes(data='features'),
                                               key=lambda x: x[0])
        ])
        labels = np.array([
            label
            for _, label in sorted(G.nodes(data='label'), key=lambda x: x[0])
        ])

        features = normalize_features(features)

        g = DGLGraph(adj)
        g = dgl.to_simple(g)
        g = dgl.to_bidirected(g)
        g = dgl.remove_self_loop(g)

        n = len(labels.tolist())
        idx = [i for i in range(n)]
        #random.shuffle(idx)
        r0 = int(n * train_ratio)
        r1 = int(n * 0.6)
        r2 = int(n * 0.8)
        train = np.array(idx[:r0])
        val = np.array(idx[r1:r2])
        test = np.array(idx[r2:])

        nclass = len(set(labels.tolist()))
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(train)
        val = torch.LongTensor(val)
        test = torch.LongTensor(test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, val, test

    # datasets in FAGCN
    elif dataset in ['new_chameleon', 'new_squirrel']:
        edge = np.loadtxt('../high_freq/{}/edges.txt'.format(dataset),
                          dtype=int)
        labels = np.loadtxt('../high_freq/{}/labels.txt'.format(dataset),
                            dtype=int).tolist()
        features = np.loadtxt('../high_freq/{}/features.txt'.format(dataset),
                              dtype=float)

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.to_bidirected(g)
        g = dgl.remove_self_loop(g)

        n = len(labels)
        idx = [i for i in range(n)]
        #random.shuffle(idx)
        r0 = int(n * train_ratio)
        r1 = int(n * 0.6)
        r2 = int(n * 0.8)
        train = np.array(idx[:r0])
        val = np.array(idx[r1:r2])
        test = np.array(idx[r2:])

        features = normalize_features(features)
        features = torch.FloatTensor(features)

        nclass = 3
        labels = torch.LongTensor(labels)
        train = torch.LongTensor(train)
        val = torch.LongTensor(val)
        test = torch.LongTensor(test)
        print(dataset, nclass)

        return g, nclass, features, labels, train, val, test
Example #18
def preprocess_data(dataset, train_percentage):
    import dgl

    # Modified from AAAI21 FA-GCN
    if dataset in ['cora', 'citeseer', 'pubmed']:
        load_default_split = train_percentage <= 0
        edge = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.edge',
                          dtype=int).tolist()
        features = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.feature')
        labels = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.label',
                            dtype=int)
        if load_default_split:
            train = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.train',
                               dtype=int)
            val = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.val', dtype=int)
            test = np.loadtxt(f'{DATA_PATH}/{dataset}/{dataset}.test',
                              dtype=int)
        else:
            train, val, test = stratified_train_test_split(
                np.arange(len(labels)), labels, len(labels), train_percentage)
        nclass = len(set(labels.tolist()))
        print(dataset, nclass)

        U = [e[0] for e in edge]
        V = [e[1] for e in edge]
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.remove_self_loop(g)
        g = dgl.to_bidirected(g)

        features = normalize_features(features)
        features = th.FloatTensor(features)
        labels = th.LongTensor(labels)
        train = th.LongTensor(train)
        val = th.LongTensor(val)
        test = th.LongTensor(test)

    elif dataset in ['airport', 'blogcatalog', 'flickr']:
        load_default_split = train_percentage <= 0
        adj_orig = pickle.load(
            open(f'{DATA_PATH}/{dataset}/{dataset}_adj.pkl', 'rb'))  # sparse
        features = pickle.load(
            open(f'{DATA_PATH}/{dataset}/{dataset}_features.pkl',
                 'rb'))  # sparase
        labels = pickle.load(
            open(f'{DATA_PATH}/{dataset}/{dataset}_labels.pkl',
                 'rb'))  # tensor
        if th.is_tensor(labels):
            labels = labels.numpy()

        if load_default_split:
            tvt_nids = pickle.load(
                open(f'{DATA_PATH}/{dataset}/{dataset}_tvt_nids.pkl',
                     'rb'))  # 3 array
            train = tvt_nids[0]
            val = tvt_nids[1]
            test = tvt_nids[2]
        else:
            train, val, test = stratified_train_test_split(
                np.arange(len(labels)), labels, len(labels), train_percentage)
        nclass = len(set(labels.tolist()))
        print(dataset, nclass)

        adj_orig = adj_orig.tocoo()
        U = adj_orig.row.tolist()
        V = adj_orig.col.tolist()
        g = dgl.graph((U, V))
        g = dgl.to_simple(g)
        g = dgl.remove_self_loop(g)
        g = dgl.to_bidirected(g)

        if dataset in ['airport']:
            features = normalize_features(features)

        if sp.issparse(features):
            features = torch.FloatTensor(features.toarray())
        else:
            features = th.FloatTensor(features)

        labels = th.LongTensor(labels)
        train = th.LongTensor(train)
        val = th.LongTensor(val)
        test = th.LongTensor(test)

    elif dataset in ['arxiv']:
        dataset = DglNodePropPredDataset(name='ogbn-arxiv',
                                         root='data/ogb_arxiv')
        split_idx = dataset.get_idx_split()
        train, val, test = split_idx["train"], split_idx["valid"], split_idx[
            "test"]
        g, labels = dataset[0]
        features = g.ndata['feat']
        nclass = 40
        labels = labels.squeeze()
        g = dgl.to_bidirected(g)
    if dataset in ['citeseer']:
        g = dgl.add_self_loop(g)
    return g, features, features.shape[1], nclass, labels, train, val, test
Example #19
def main():
    parser = argparse.ArgumentParser(
        description='Link prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglLinkPropPredDataset(name=args.dataset)
    split_edge = dataset.get_edge_split()  # edge split used by test() below

    # Manually add self-loops, since GCN would otherwise wash out the features of isolated nodes.
    n_nodes = dataset[0].number_of_nodes()
    g_data = dgl.add_self_loop(dataset[0])
    g_data = dgl.to_bidirected(g_data)

    for k in dataset[0].node_attr_schemes().keys():
        g_data.ndata[k] = dataset[0].ndata[k]
    print(g_data.number_of_nodes(), g_data.number_of_edges())

    g_data.create_formats_()

    cluster_dataset = ClusterIterDataset(args.dataset,
                                         g_data,
                                         args.num_partitions,
                                         use_pp=False)
    cluster_iterator = DataLoader(cluster_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=partial(subgraph_collate_fn,
                                                     g_data,
                                                     negs=args.negs))

    model = GCN(g_data.ndata['feat'].size(-1),
                args.hidden_channels,
                args.hidden_channels,
                args.num_layers,
                args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0
        for epoch in range(1, 1 + args.epochs):
            loss, c_epoch_time, c_to_device_time, c_ff_time, c_pred_loss_time, c_bp_time, c_io_time, c_memory, c_part1 = train(
                model, predictor, cluster_iterator, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')
            epoch_time += c_epoch_time
            to_device_time += c_to_device_time
            ff_time += c_ff_time
            pred_loss_time += c_pred_loss_time
            bp_time += c_bp_time
            io_time += c_io_time
            part_1 += c_part1
            memory = max(memory, c_memory[0])

            if epoch % args.eval_steps == 0:
                print('Ave')
                print('epoch time: ', epoch_time / args.eval_steps)
                print('to_device_time: ', to_device_time / args.eval_steps)
                print('ff_time: ', ff_time / args.eval_steps)
                print('part1_time: ', part_1 / args.eval_steps)
                print('pred_loss_time: ', pred_loss_time / args.eval_steps)
                print('bp_time: ', bp_time / args.eval_steps)
                print('io_time: ', io_time / args.eval_steps)
                print('max memory', memory)
                print('\n')
                epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0

                result = test(model, predictor, g_data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #20
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GraphSAGE Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval", action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    g, labels = dataset[0]
    feats = jax.device_put(
            g.ndata['feat'],
            jax.devices()[0]
    )

    g = g.to(jax.devices("cpu")[0])

    g = dgl.to_bidirected(g)
    g = g.int()
    g = g.to(jax.devices()[0])

    train_idx = split_idx['train'].numpy()

    _model = GraphSAGE.partial(in_feats=feats.shape[-1],
                      hidden_feats=args.hidden_channels,
                      out_feats=dataset.num_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout)

    _, initial_params = _model.init(jax.random.PRNGKey(0), g, feats)
    model = nn.Model(_model, initial_params)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    dur = []
    for run in range(args.runs):
        _, initial_params = _model.init(jax.random.PRNGKey(0), g, feats)
        model = nn.Model(_model, initial_params)
        optimizer = flax.optim.Adam(args.lr).create(model)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            optimizer, loss = train(model, g, feats, labels, train_idx, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))
            if not args.eval:
                continue

            result = test(model, g, feats, labels, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #21
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GAT Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument("--num-layers", type=int, default=3,
                        help="number of hidden layers")
    parser.add_argument("--lr", type=float, default=0.0029739421726400865,
                        help="learning rate")
    parser.add_argument('--weight-decay', type=float, default=2.4222556964495987e-05,
                        help="weight decay")
    parser.add_argument("--num-hidden", type=int, default=16,
                        help="number of hidden units")
    parser.add_argument("--dropout", type=float, default=0.18074706609292976,
                        help="Dropout to use")
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval", action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    g, labels = dataset[0]
    feats = g.ndata['feat'].to(device)
    labels = labels.to(device)
    train_idx = split_idx['train'].to(device)

    g = dgl.to_bidirected(g)
    g = dgl.add_self_loop(g)
    g = g.int().to(device)
    print(g)

    model = GAT(num_layers=args.num_layers,
                in_feats=feats.size(-1),
                num_hidden=args.num_hidden,
                num_classes=dataset.num_classes,
                heads=[4, 4, 4],
                feat_drop=args.dropout,
                attn_drop=args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, g, feats, labels, train_idx, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            result = test(model, g, feats, labels, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #22
def main():
    parser = argparse.ArgumentParser(
        description='OGBN-Arxiv (GraphSAGE Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval",
                        action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    g, labels = dataset[0]
    feats = g.ndata['feat']
    g = dgl.to_bidirected(g)
    g = g.int().to(device)
    feats, labels = feats.to(device), labels.to(device)
    train_idx = split_idx['train'].to(device)

    model = GraphSAGE(in_feats=feats.size(-1),
                      hidden_feats=args.hidden_channels,
                      out_feats=dataset.num_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, g, feats, labels, train_idx, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))
            if not args.eval:
                continue

            result = test(model, g, feats, labels, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #23
    T_3_1.append(sig[2] * sig[0])
    T_3_2.append(sig[2] * sig[1])
    T_3_4.append(sig[2] * sig[3])

    T_4_1.append(sig[3] * sig[0])
    T_4_2.append(sig[3] * sig[1])
    T_4_3.append(sig[3] * sig[2])

# Create the edges: every ordered pair among the four muons mu1..mu4 (6 undirected pairs, 12 directed edges, one per direction)
u, v = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3,
                     3]), torch.tensor([1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2])
g = dgl.graph((u, v))

# Make sure the graph is bidirected
edge_pred_graph = dgl.to_bidirected(g)

# Populate the node and edge features with the per-muon data
edge_pred_graph.ndata["Charge"] = torch.tensor([Mu1_C, Mu2_C, Mu3_C, Mu4_C])
edge_pred_graph.edata["InvMass"] = torch.tensor([
    IM_1_2, IM_1_3, IM_1_4, IM_2_1, IM_2_3, IM_2_4, IM_3_1, IM_3_2, IM_3_4,
    IM_4_1, IM_4_2, IM_4_3
])
edge_pred_graph.edata["C1xC2"] = torch.tensor([
    CxC_1_2, CxC_1_3, CxC_1_4, CxC_2_1, CxC_2_3, CxC_2_4, CxC_3_1, CxC_3_2,
    CxC_3_4, CxC_4_1, CxC_4_2, CxC_4_3
])

# Give each edge a label
edge_pred_graph.edata["label"] = torch.randn(12)
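A caveat on the snippet above, not part of the original code: dgl.to_bidirected() may coalesce and reorder edges, so hand-built edge feature lists such as InvMass and C1xC2 should be built in the edge order of the returned graph. A small check of that order on the same toy edge list:

import dgl
import torch

u = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
v = torch.tensor([1, 2, 3, 0, 2, 3, 0, 1, 3, 0, 1, 2])
g = dgl.to_bidirected(dgl.graph((u, v)))
src, dst = g.edges()
# print the (src, dst) pairs in the order DGL stores them; build the
# edge feature tensors in exactly this order before assigning to g.edata
print(list(zip(src.tolist(), dst.tolist())))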
Example #24
def test_neighbor_sampler_dataloader():
    g = dgl.heterograph({('user', 'follow', 'user'): ([0, 0, 0, 1, 1], [1, 2, 3, 3, 4])},
                        {'user': 6}).long()
    g = dgl.to_bidirected(g).to(F.ctx())
    g.ndata['feat'] = F.randn((6, 8))
    g.edata['feat'] = F.randn((10, 4))
    reverse_eids = F.tensor([5, 6, 7, 8, 9, 0, 1, 2, 3, 4], dtype=F.int64)
    g_sampler1 = dgl.dataloading.MultiLayerNeighborSampler([2, 2], return_eids=True)
    g_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)

    hg = dgl.heterograph({
         ('user', 'follow', 'user'): ([0, 0, 0, 1, 1, 1, 2], [1, 2, 3, 0, 2, 3, 0]),
         ('user', 'followed-by', 'user'): ([1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2]),
         ('user', 'play', 'game'): ([0, 1, 1, 3, 5], [0, 1, 2, 0, 2]),
         ('game', 'played-by', 'user'): ([0, 1, 2, 0, 2], [0, 1, 1, 3, 5])
    }).long().to(F.ctx())
    for ntype in hg.ntypes:
        hg.nodes[ntype].data['feat'] = F.randn((hg.number_of_nodes(ntype), 8))
    for etype in hg.canonical_etypes:
        hg.edges[etype].data['feat'] = F.randn((hg.number_of_edges(etype), 4))
    hg_sampler1 = dgl.dataloading.MultiLayerNeighborSampler(
        [{'play': 1, 'played-by': 1, 'follow': 2, 'followed-by': 1}] * 2, return_eids=True)
    hg_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)
    reverse_etypes = {'follow': 'followed-by', 'followed-by': 'follow', 'play': 'played-by', 'played-by': 'play'}

    collators = []
    graphs = []
    nids = []
    modes = []
    for seeds, sampler in product(
            [F.tensor([0, 1, 2, 3, 5], dtype=F.int64), F.tensor([4, 5], dtype=F.int64)],
            [g_sampler1, g_sampler2]):
        collators.append(dgl.dataloading.NodeCollator(g, seeds, sampler))
        graphs.append(g)
        nids.append({'user': seeds})
        modes.append('node')

        collators.append(dgl.dataloading.EdgeCollator(g, seeds, sampler))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, exclude='self'))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('link')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, exclude='self', negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('link')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids,
            negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('link')

    for seeds, sampler in product(
            [{'user': F.tensor([0, 1, 3, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)},
             {'user': F.tensor([4, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)}],
            [hg_sampler1, hg_sampler2]):
        collators.append(dgl.dataloading.NodeCollator(hg, seeds, sampler))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('node')

    for seeds, sampler in product(
            [{'follow': F.tensor([0, 1, 3, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)},
             {'follow': F.tensor([4, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)}],
            [hg_sampler1, hg_sampler2]):
        collators.append(dgl.dataloading.EdgeCollator(hg, seeds, sampler))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            hg, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('link')

        collators.append(dgl.dataloading.EdgeCollator(
            hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes,
            negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('link')

    for _g, nid, collator, mode in zip(graphs, nids, collators, modes):
        dl = DataLoader(
            collator.dataset, collate_fn=collator.collate, batch_size=2, shuffle=True, drop_last=False)
        assert isinstance(iter(dl), Iterator)
        _check_neighbor_sampling_dataloader(_g, nid, dl, mode, collator)
Example #25
def main():
    parser = argparse.ArgumentParser(description='OGBN (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--project', type=str, default='lcgnn')
    parser.add_argument('--dataset', type=str, default='flickr')
    parser.add_argument('--model', type=str, default='gcn')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=4)
    parser.add_argument('--num_heads', type=int, default=1)
    parser.add_argument('--ego_size', type=int, default=64)
    parser.add_argument('--hidden_size', type=int, default=64)
    parser.add_argument('--input_dropout', type=float, default=0.2)
    parser.add_argument('--hidden_dropout', type=float, default=0.4)
    parser.add_argument('--weight_decay', type=float, default=0.0005)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--early_stopping', type=int, default=20)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--eval_batch_size', type=int, default=512)
    parser.add_argument('--batch_norm', type=int, default=1)
    parser.add_argument('--residual', type=int, default=1)
    parser.add_argument('--linear_layer', type=int, default=1)
    parser.add_argument('--num_workers',
                        type=int,
                        default=4,
                        help='number of workers')
    parser.add_argument("--optimizer",
                        type=str,
                        default='adamw',
                        choices=['adam', 'adamw'],
                        help="optimizer")
    parser.add_argument('--warmup', type=int, default=0)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--load_path', type=str, default='')
    parser.add_argument('--exp_name', type=str, default='')
    args = parser.parse_args()
    print(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    para_dic = {
        '': args.model,
        'nl': args.num_layers,
        'nh': args.num_heads,
        'es': args.ego_size,
        'hs': args.hidden_size,
        'id': args.input_dropout,
        'hd': args.hidden_dropout,
        'bs': args.batch_size,
        'op': args.optimizer,
        'lr': args.lr,
        'wd': args.weight_decay,
        'bn': args.batch_norm,
        'rs': args.residual,
        'll': args.linear_layer,
        'sd': args.seed
    }
    para_dic['warm'] = args.warmup
    exp_name = get_exp_name(args.dataset, para_dic, args.exp_name)

    wandb_name = exp_name.replace('_sd' + str(args.seed), '')
    wandb.init(name=wandb_name, project=args.project)
    wandb.config.update(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    if args.dataset == 'papers100M':
        dataset = MyNodePropPredDataset(name=args.dataset)
    elif args.dataset in ['flickr', 'reddit', 'yelp', 'amazon']:
        dataset = SAINTDataset(name=args.dataset)
    else:
        dataset = DglNodePropPredDataset(name=f'ogbn-{args.dataset}')

    split_idx = dataset.get_idx_split()
    train_idx = set(split_idx['train'].cpu().numpy())
    valid_idx = set(split_idx['valid'].cpu().numpy())
    test_idx = set(split_idx['test'].cpu().numpy())

    tmp_ego_size = 256 if args.dataset == 'products' else args.ego_size
    if args.ego_size < 64:
        tmp_ego_size = 64
    ego_graphs_unpadded = np.load(
        f'data/{args.dataset}-lc-ego-graphs-{tmp_ego_size}.npy',
        allow_pickle=True)
    conds_unpadded = np.load(
        f'data/{args.dataset}-lc-conds-{tmp_ego_size}.npy', allow_pickle=True)

    ego_graphs_train, ego_graphs_valid, ego_graphs_test = [], [], []
    cut_train, cut_valid, cut_test = [], [], []

    for i, ego_graph in enumerate(ego_graphs_unpadded):
        idx = ego_graph[0]
        assert len(ego_graph) == len(conds_unpadded[i])
        if len(ego_graph) > args.ego_size:
            ego_graph = ego_graph[:args.ego_size]
            conds_unpadded[i] = conds_unpadded[i][:args.ego_size]
        cut_position = np.argmin(conds_unpadded[i])
        cut = torch.zeros(len(ego_graph), dtype=torch.float32)
        cut[:cut_position + 1] = 1.0
        cut = cut.unsqueeze(1)
        if idx in train_idx:
            ego_graphs_train.append(ego_graph)
            cut_train.append(cut)
        elif idx in valid_idx:
            ego_graphs_valid.append(ego_graph)
            cut_valid.append(cut)
        elif idx in test_idx:
            ego_graphs_test.append(ego_graph)
            cut_test.append(cut)
        else:
            print(f"{idx} not in train/valid/test idx")

    num_classes = dataset.num_classes

    if isinstance(dataset, DglNodePropPredDataset):
        data = dataset[0]
        graph = dgl.remove_self_loop(data[0])
        graph = dgl.add_self_loop(graph)
        if args.dataset == 'arxiv' or args.dataset == 'papers100M':
            # these citation graphs are directed: symmetrize them and carry the
            # node features over to the bidirected copy
            temp_graph = dgl.to_bidirected(graph)
            temp_graph.ndata['feat'] = graph.ndata['feat']
            graph = temp_graph
        data = (graph, data[1].long())

        graph = data[0]
        graph.ndata['labels'] = data[1]
    elif isinstance(dataset, SAINTDataset):
        data = dataset[0]
        edge_index = data.edge_index
        graph = dgl.DGLGraph((edge_index[0], edge_index[1]))
        graph = dgl.remove_self_loop(graph)
        graph = dgl.add_self_loop(graph)
        graph.ndata['feat'] = data.x
        label = data.y
        if len(label.shape) == 1:
            label = label.unsqueeze(1)
        data = (graph, label)
    else:
        raise NotImplementedError

    train_dataset = NodeClassificationDataset(data, ego_graphs_train,
                                              cut_train)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=batcher(),
                              pin_memory=True)

    valid_dataset = NodeClassificationDataset(data, ego_graphs_valid,
                                              cut_valid)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.eval_batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=batcher(),
                              pin_memory=True)

    test_dataset = NodeClassificationDataset(data, ego_graphs_test, cut_test)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.eval_batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=batcher(),
                             pin_memory=True)

    model = GNNModel(conv_type=args.model,
                     input_size=graph.ndata['feat'].shape[1] + 1,  # +1, presumably for the cut indicator built above
                     hidden_size=args.hidden_size,
                     num_layers=args.num_layers,
                     num_classes=num_classes,
                     batch_norm=args.batch_norm,
                     residual=args.residual,
                     idropout=args.input_dropout,
                     dropout=args.hidden_dropout,
                     linear_layer=args.linear_layer,
                     num_heads=args.num_heads).to(device)

    wandb.watch(model, log='all')

    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print('model parameters:', pytorch_total_params)

    if not os.path.exists('saved'):
        os.mkdir('saved')

    model.reset_parameters()

    if args.load_path:
        model.load_state_dict(torch.load(args.load_path,
                                         map_location='cuda:0'))

        valid_acc, valid_loss = test(model, valid_loader, device, args)
        valid_output = f'Valid: {100 * valid_acc:.2f}% '

        cor_train_acc, _ = test(model, train_loader, device, args)

        cor_test_acc, cor_test_loss = test(model, test_loader, device, args)
        train_output = f'Train: {100 * cor_train_acc:.2f}%, '
        test_output = f'Test: {100 * cor_test_acc:.2f}%'

        print(train_output + valid_output + test_output)
        return

    best_val_acc = 0
    cor_train_acc = 0
    cor_test_acc = 0
    cor_test_loss = 0  # pre-initialized so the wandb.log call below never hits an unbound name
    patience = 0

    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.weight_decay)
    else:
        raise NotImplementedError
    if args.warmup > 0:
        optimizer = NoamOptim(
            optimizer,
            args.hidden_size if args.hidden_size > 0 else data.x.size(1),
            n_warmup_steps=args.warmup,
            init_lr=args.lr)

    for epoch in range(1, 1 + args.epochs):
        # lp = LineProfiler()
        # lp_wrapper = lp(train)
        # loss = lp_wrapper(model, train_loader, device, optimizer, args)
        # lp.print_stats()
        loss = train(model, train_loader, device, optimizer, args)

        train_output = valid_output = test_output = ''
        if epoch >= 10 and epoch % args.log_steps == 0:
            valid_acc, valid_loss = test(model, valid_loader, device, args)
            valid_output = f'Valid: {100 * valid_acc:.2f}% '

            if valid_acc > best_val_acc:
                best_val_acc = valid_acc
                # cor_train_acc, _ = test(model, train_loader, device, args)
                cor_test_acc, cor_test_loss = test(model, test_loader, device,
                                                   args)
                # train_output = f'Train: {100 * cor_train_acc:.2f}%, '
                test_output = f'Test: {100 * cor_test_acc:.2f}%'
                patience = 0
                try:
                    torch.save(model.state_dict(), 'saved/' + exp_name + '.pt')
                    wandb.save('saved/' + exp_name + '.pt')
                except FileNotFoundError as e:
                    print(e)
            else:
                patience += 1
                if patience >= args.early_stopping:
                    print('Early stopping...')
                    break
            wandb.log({
                'Train Loss': loss,
                'Valid Acc': valid_acc,
                'best_val_acc': best_val_acc,
                'cor_test_acc': cor_test_acc,
                'LR': get_lr(optimizer),
                'Valid Loss': valid_loss,
                'cor_test_loss': cor_test_loss
            })
        else:
            wandb.log({'Train Loss': loss, 'LR': get_lr(optimizer)})
        # train_output +
        print(f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, ' + valid_output + test_output)
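
# --- A standalone sketch, not part of the script above ---
# The arxiv/papers100M branch symmetrizes the graph with dgl.to_bidirected and
# then copies ndata['feat'] over by hand. Assuming a DGL release whose
# to_bidirected accepts copy_ndata (as the test in the next example uses), the
# same effect can be obtained in one call:
import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0])))
g.ndata['feat'] = torch.randn(3, 4)

bg_manual = dgl.to_bidirected(g)
bg_manual.ndata['feat'] = g.ndata['feat']        # manual copy, as in the script

bg_auto = dgl.to_bidirected(g, copy_ndata=True)  # node data carried over automatically
assert torch.equal(bg_manual.ndata['feat'], bg_auto.ndata['feat'])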
Example #26
0
def test_to_bidirected():
    # homogeneous graph
    g = dgl.graph((F.tensor([0, 1, 3, 1]), F.tensor([1, 2, 0, 2])))
    g.ndata['h'] = F.tensor([[0.], [1.], [2.], [1.]])
    g.edata['h'] = F.tensor([[3.], [4.], [5.], [6.]])
    bg = dgl.to_bidirected(g, copy_ndata=True, copy_edata=True)
    u, v = g.edges()
    ub, vb = bg.edges()
    assert F.array_equal(F.cat([u, v], dim=0), ub)
    assert F.array_equal(F.cat([v, u], dim=0), vb)
    assert F.array_equal(g.ndata['h'], bg.ndata['h'])
    assert F.array_equal(F.cat([g.edata['h'], g.edata['h']], dim=0), bg.edata['h'])
    bg.ndata['hh'] = F.tensor([[0.], [1.], [2.], [1.]])
    assert ('hh' in g.ndata) is False
    bg.edata['hh'] = F.tensor([[0.], [1.], [2.], [1.], [0.], [1.], [2.], [1.]])
    assert ('hh' in g.edata) is False

    # do not share ndata and edata
    bg = dgl.to_bidirected(g, copy_ndata=False, copy_edata=False)
    ub, vb = bg.edges()
    assert F.array_equal(F.cat([u, v], dim=0), ub)
    assert F.array_equal(F.cat([v, u], dim=0), vb)
    assert ('h' in bg.ndata) is False
    assert ('h' in bg.edata) is False

    # zero edge graph
    g = dgl.graph([])
    bg = dgl.to_bidirected(g, copy_ndata=True, copy_edata=True)

    # heterogeneous graph
    g = dgl.heterograph({
        ('user', 'wins', 'user'): (F.tensor([0, 2, 0, 2, 2]), F.tensor([1, 1, 2, 1, 0])),
        ('user', 'plays', 'game'): (F.tensor([1, 2, 1]), F.tensor([2, 1, 1])),
        ('user', 'follows', 'user'): (F.tensor([1, 2, 1]), F.tensor([0, 0, 0]))
    })
    g.nodes['game'].data['hv'] = F.ones((3, 1))
    g.nodes['user'].data['hv'] = F.ones((3, 1))
    g.edges['wins'].data['h'] = F.tensor([0, 1, 2, 3, 4])
    bg = dgl.to_bidirected(g, copy_ndata=True, copy_edata=True, ignore_bipartite=True)
    assert F.array_equal(g.nodes['game'].data['hv'], bg.nodes['game'].data['hv'])
    assert F.array_equal(g.nodes['user'].data['hv'], bg.nodes['user'].data['hv'])
    u, v = g.all_edges(order='eid', etype=('user', 'wins', 'user'))
    ub, vb = bg.all_edges(order='eid', etype=('user', 'wins', 'user'))
    assert F.array_equal(F.cat([u, v], dim=0), ub)
    assert F.array_equal(F.cat([v, u], dim=0), vb)
    assert F.array_equal(F.cat([g.edges['wins'].data['h'], g.edges['wins'].data['h']], dim=0),
                         bg.edges['wins'].data['h'])
    u, v = g.all_edges(order='eid', etype=('user', 'follows', 'user'))
    ub, vb = bg.all_edges(order='eid', etype=('user', 'follows', 'user'))
    assert F.array_equal(F.cat([u, v], dim=0), ub)
    assert F.array_equal(F.cat([v, u], dim=0), vb)
    u, v = g.all_edges(order='eid', etype=('user', 'plays', 'game'))
    ub, vb = bg.all_edges(order='eid', etype=('user', 'plays', 'game'))
    assert F.array_equal(u, ub)
    assert F.array_equal(v, vb)
    assert len(bg.edges['plays'].data) == 0
    assert len(bg.edges['follows'].data) == 0

    # do not share ndata and edata
    bg = dgl.to_bidirected(g, copy_ndata=False, copy_edata=False, ignore_bipartite=True)
    assert len(bg.edges['wins'].data) == 0
    assert len(bg.edges['plays'].data) == 0
    assert len(bg.edges['follows'].data) == 0
    assert len(bg.nodes['game'].data) == 0
    assert len(bg.nodes['user'].data) == 0
    u, v = g.all_edges(order='eid', etype=('user', 'wins', 'user'))
    ub, vb = bg.all_edges(order='eid', etype=('user', 'wins', 'user'))
    assert F.array_equal(F.cat([u, v], dim=0), ub)
    assert F.array_equal(F.cat([v, u], dim=0), vb)
    u, v = g.all_edges(order='eid', etype=('user', 'follows', 'user'))
    ub, vb = bg.all_edges(order='eid', etype=('user', 'follows', 'user'))
    assert F.array_equal(F.cat([u, v], dim=0), ub)
    assert F.array_equal(F.cat([v, u], dim=0), vb)
    u, v = g.all_edges(order='eid', etype=('user', 'plays', 'game'))
    ub, vb = bg.all_edges(order='eid', etype=('user', 'plays', 'game'))
    assert F.array_equal(u, ub)
    assert F.array_equal(v, vb)
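
# --- A standalone usage sketch, my own, not part of the test above ---
# The core guarantee the assertions above build on: after dgl.to_bidirected,
# every edge (u, v) has a matching reverse edge (v, u).
import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 3]), torch.tensor([1, 2, 0])))
bg = dgl.to_bidirected(g)
u, v = bg.edges()
assert bool(bg.has_edges_between(v, u).all())  # every reverse edge is present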
Example #27
0
def test_neighbor_sampler_dataloader():
    g = dgl.graph([(0, 1), (0, 2), (0, 3), (1, 3), (1, 4)],
            'user', 'follow', num_nodes=6).long()
    g = dgl.to_bidirected(g)
    reverse_eids = F.tensor([5, 6, 7, 8, 9, 0, 1, 2, 3, 4], dtype=F.int64)  # edge i and edge i + 5 are mutual reverses; see the sketch after this example
    g_sampler1 = dgl.dataloading.MultiLayerNeighborSampler([2, 2], return_eids=True)
    g_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)

    hg = dgl.heterograph({
        ('user', 'follow', 'user'): [(0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3), (2, 0)],
        ('user', 'followed-by', 'user'): [(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)],
        ('user', 'play', 'game'): [(0, 0), (1, 1), (1, 2), (3, 0), (5, 2)],
        ('game', 'played-by', 'user'): [(0, 0), (1, 1), (2, 1), (0, 3), (2, 5)]}).long()
    hg_sampler1 = dgl.dataloading.MultiLayerNeighborSampler(
        [{'play': 1, 'played-by': 1, 'follow': 2, 'followed-by': 1}] * 2, return_eids=True)
    hg_sampler2 = dgl.dataloading.MultiLayerFullNeighborSampler(2, return_eids=True)
    reverse_etypes = {'follow': 'followed-by', 'followed-by': 'follow', 'play': 'played-by', 'played-by': 'play'}

    collators = []
    graphs = []
    nids = []
    modes = []
    for seeds, sampler in product(
            [F.tensor([0, 1, 2, 3, 5], dtype=F.int64), F.tensor([4, 5], dtype=F.int64)],
            [g_sampler1, g_sampler2]):
        collators.append(dgl.dataloading.NodeCollator(g, seeds, sampler))
        graphs.append(g)
        nids.append({'user': seeds})
        modes.append('node')

        collators.append(dgl.dataloading.EdgeCollator(g, seeds, sampler))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('link')

        collators.append(dgl.dataloading.EdgeCollator(
            g, seeds, sampler, exclude='reverse_id', reverse_eids=reverse_eids,
            negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(g)
        nids.append({'follow': seeds})
        modes.append('link')

    for seeds, sampler in product(
            [{'user': F.tensor([0, 1, 3, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)},
             {'user': F.tensor([4, 5], dtype=F.int64), 'game': F.tensor([0, 1, 2], dtype=F.int64)}],
            [hg_sampler1, hg_sampler2]):
        collators.append(dgl.dataloading.NodeCollator(hg, seeds, sampler))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('node')

    for seeds, sampler in product(
            [{'follow': F.tensor([0, 1, 3, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)},
             {'follow': F.tensor([4, 5], dtype=F.int64), 'play': F.tensor([1, 3], dtype=F.int64)}],
            [hg_sampler1, hg_sampler2]):
        collators.append(dgl.dataloading.EdgeCollator(hg, seeds, sampler))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('edge')

        collators.append(dgl.dataloading.EdgeCollator(
            hg, seeds, sampler, negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('link')

        collators.append(dgl.dataloading.EdgeCollator(
            hg, seeds, sampler, exclude='reverse_types', reverse_etypes=reverse_etypes,
            negative_sampler=dgl.dataloading.negative_sampler.Uniform(2)))
        graphs.append(hg)
        nids.append(seeds)
        modes.append('link')

    for _g, nid, collator, mode in zip(graphs, nids, collators, modes):
        dl = DataLoader(
            collator.dataset, collate_fn=collator.collate, batch_size=2, shuffle=True, drop_last=False)
        _check_neighbor_sampling_dataloader(_g, nid, dl, mode)
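
# --- A standalone sketch, my own, not part of the test above ---
# How the hard-coded reverse_eids tensor in this test can be derived, assuming
# (as the previous example's test asserts for its DGL version) that
# dgl.to_bidirected lists the m original edges first and their m reverses after
# them; this only holds when the input has no edge whose reverse is already
# present.
import torch

m = 5  # number of edges in the directed graph before dgl.to_bidirected
reverse_eids = torch.cat([torch.arange(m, 2 * m), torch.arange(0, m)])
print(reverse_eids)  # tensor([5, 6, 7, 8, 9, 0, 1, 2, 3, 4]) -- same as above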