Esempio n. 1
0
class COLLABDataset(Dataset):
    """Container for the OGB ``ogbl-collab`` link-prediction dataset.

    After construction the instance exposes the DGL graph (``graph``), the raw
    edge split (``split_edge``), the individual positive/negative edge sets
    and the OGB ``Evaluator``.

    Note that ``name`` is only stored and printed; the dataset that gets
    loaded is always ``'ogbl-collab'``.
    """

    def __init__(self, name):
        t_begin = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')

        # The dataset holds a single DGL graph.
        graph = self.dataset[0]
        self.graph = graph

        # Edge feature = edge weight and edge year joined along dim 1.
        edge_columns = [graph.edata['edge_weight'], graph.edata['edge_year']]
        graph.edata['feat'] = torch.cat(edge_columns, dim=1)

        splits = self.dataset.get_edge_split()
        self.split_edge = splits
        train_part, valid_part, test_part = splits['train'], splits['valid'], splits['test']
        self.train_edges = train_part['edge']        # positive train edges
        self.val_edges = valid_part['edge']          # positive val edges
        self.val_edges_neg = valid_part['edge_neg']  # negative val edges
        self.test_edges = test_part['edge']          # positive test edges
        self.test_edges_neg = test_part['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-collab')

        print("[I] Finished loading.")
        elapsed = time.time() - t_begin
        print("[I] Data load time: {:.4f}s".format(elapsed))

    def _add_positional_encodings(self, pos_enc_dim):
        """Replace ``self.graph`` with the output of ``positional_encoding``.

        Per the original comment this adds a graph positional encoding based
        on Laplacian eigenvectors; ``pos_enc_dim`` is forwarded unchanged.
        """
        encoded_graph = positional_encoding(self.graph, pos_enc_dim)
        self.graph = encoded_graph
Esempio n. 2
0
class DDIDataset(Dataset):
    """Container for the OGB ``ogbl-ddi`` link-prediction dataset.

    After construction the instance exposes the DGL graph (``graph``), the raw
    edge split (``split_edge``), the individual positive/negative edge sets
    and the OGB ``Evaluator``.

    Note that ``name`` is only stored and printed; the dataset that gets
    loaded is always ``'ogbl-ddi'``.
    """

    def __init__(self, name):
        t_begin = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-ddi')

        # The dataset holds a single DGL graph.
        self.graph = self.dataset[0]

        splits = self.dataset.get_edge_split()
        self.split_edge = splits
        train_part, valid_part, test_part = splits['train'], splits['valid'], splits['test']
        self.train_edges = train_part['edge']        # positive train edges
        self.val_edges = valid_part['edge']          # positive val edges
        self.val_edges_neg = valid_part['edge_neg']  # negative val edges
        self.test_edges = test_part['edge']          # positive test edges
        self.test_edges_neg = test_part['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-ddi')

        print("[I] Finished loading.")
        elapsed = time.time() - t_begin
        print("[I] Data load time: {:.4f}s".format(elapsed))

    def _add_positional_encodings(self, pos_enc_dim):
        """Replace ``self.graph`` with the output of ``positional_encoding``.

        Per the original comment this adds a graph positional encoding based
        on Laplacian eigenvectors; ``pos_enc_dim`` is forwarded unchanged.
        """
        encoded_graph = positional_encoding(self.graph, pos_enc_dim)
        self.graph = encoded_graph
Esempio n. 3
0
def _pair_nodes(src, dst):
    """Stack two equally long 1-D node-id tensors into an ``(N, 2)`` edge tensor."""
    return th.stack((src, dst), dim=1)


def load_ogbl(name,
              device=th.device('cpu'),
              root='/home/eva_share_users/zhuyu'):
    """Load an OGB link-prediction dataset and return its graph and edge split.

    For ``'ogbl-citation2'`` the split, which is read here through the
    ``source_node`` / ``target_node`` / ``target_node_neg`` keys, is
    additionally converted to explicit edge lists:

    * ``split['edge']`` for train/valid/test: ``(N, 2)`` tensor of
      ``(source, target)`` pairs;
    * ``split['neg_edge']`` for valid/test: ``(N * K, 2)`` tensor in which
      row ``i * K + j`` is ``(source_node[i], target_node_neg[i, j])``.

    Args:
        name: OGB dataset name passed to ``DglLinkPropPredDataset``.
        device: Accepted for interface compatibility; not used by this
            function.
        root: Directory passed to ``DglLinkPropPredDataset`` as ``root``.
            The default is a machine-specific path.

    Returns:
        Tuple ``(graph, splitted_idx)``: the first item of the dataset and
        the (possibly augmented) result of ``get_edge_split()``.
    """
    from ogb.linkproppred import DglLinkPropPredDataset

    print('load', name)
    data = DglLinkPropPredDataset(name=name, root=root)
    print('finish loading', name)
    splitted_idx = data.get_edge_split()
    if name == 'ogbl-citation2':
        for part in ('train', 'valid', 'test'):
            split = splitted_idx[part]
            split['edge'] = _pair_nodes(split['source_node'],
                                        split['target_node'])
        for part in ('valid', 'test'):
            split = splitted_idx[part]
            neg_targets = split['target_node_neg']
            # Each source must be repeated consecutively, once per negative
            # target, so that it lines up with the row-major flattening of
            # ``target_node_neg``.  ``Tensor.repeat`` would tile the whole
            # source vector instead and pair sources with the wrong negatives.
            neg_sources = split['source_node'].repeat_interleave(
                neg_targets.size(1))
            split['neg_edge'] = _pair_nodes(neg_sources,
                                            neg_targets.reshape(-1))
    graph = data[0]
    return graph, splitted_idx
Esempio n. 4
0
def load_ogb_dataset(dataset):
    """
    Load an OGB link-property-prediction dataset.

    Args:
        dataset(str): dataset name handed to ``DglLinkPropPredDataset``
            (ogbl-collab, ogbl-ddi, ogbl-citation)

    Returns:
        graph(DGLGraph): first item of the loaded dataset
        split_edge(dict): result of ``get_edge_split()``

    """
    loaded = DglLinkPropPredDataset(name=dataset)
    edge_splits = loaded.get_edge_split()
    return loaded[0], edge_splits
Esempio n. 5
0
def prepare_train_labels() -> Tuple[dgl.DGLHeteroGraph, Tensor, Tensor, Tensor]:
    """Build the ogbl-collab training graph and the labelled edge tensors.

    Returns:
        ``(train_graph, train_labels, valid_labels, test_labels)`` where

        * ``train_graph`` contains every training edge in both directions and
          carries the ``feat`` node data of the original graph;
        * ``train_labels`` is the training edge tensor with an extra column
          of ones appended;
        * ``valid_labels`` / ``test_labels`` come from
          ``get_label_from_split`` applied to the respective split.
    """
    ogb_data = DglLinkPropPredDataset(name="ogbl-collab")
    splits = ogb_data.get_edge_split()
    full_graph: dgl.DGLGraph = ogb_data[0]

    pos_edges = splits["train"]["edge"]
    heads, tails = pos_edges[:, 0], pos_edges[:, 1]

    # Insert each training edge once per direction.
    both_way_src = torch.cat([heads, tails], dim=0)
    both_way_dst = torch.cat([tails, heads], dim=0)
    train_graph = dgl.graph(
        data=(both_way_src, both_way_dst),
        num_nodes=full_graph.number_of_nodes(),
    )
    train_graph.ndata["feat"] = full_graph.ndata["feat"]

    # Third column marks every training edge with label 1.
    ones_column = torch.ones(len(pos_edges), 1).long()
    train_labels = torch.cat([pos_edges, ones_column], dim=1)

    valid_labels = get_label_from_split(splits["valid"])
    test_labels = get_label_from_split(splits["test"])
    return train_graph, train_labels, valid_labels, test_labels
Esempio n. 6
0
def prepare_ogb(name):
    """Load an OGB link dataset and draw train/valid/test samples from it.

    Args:
        name: dataset name passed positionally to ``DglLinkPropPredDataset``.

    Returns:
        ``(train_data, valid_data, test_data, num_nodes, num_rels)``:
        the three outputs of ``sample_data`` (the training one converted with
        ``.numpy()``), the node count of the dataset graph and the number of
        distinct values in the training split's ``'relation'`` entry.
    """
    ogb_data = DglLinkPropPredDataset(name)
    splits = ogb_data.get_edge_split()
    train_part = splits["train"]
    valid_part = splits["valid"]
    test_part = splits["test"]

    # Only the node count of the graph is needed.
    num_nodes = ogb_data[0].number_of_nodes()

    train_data = sample_data(1500000, train_part, sampling=True).numpy()
    valid_data = sample_data(50000, valid_part)
    test_data = sample_data(50000, test_part)

    distinct_relations = torch.unique(train_part['relation'])
    num_rels = len(distinct_relations)

    return train_data, valid_data, test_data, num_nodes, num_rels
Esempio n. 7
0
class COLLABDataset(Dataset):
    """``ogbl-collab`` dataset whose nodes carry Laplacian eigenvector features.

    After construction the instance exposes the DGL graph (``graph``) with
    ``ndata['eig']`` and ``edata['feat']`` set, the raw edge split
    (``split_edge``), the individual positive/negative edge sets and the OGB
    ``Evaluator``.

    ``name`` is only stored and printed; the dataset that gets loaded is
    always ``'ogbl-collab'``.
    """

    # Laplacian variants understood by ``_add_eig``.
    _EIG_NORMS = ('none', 'sym', 'walk')

    def __init__(self, name, norm='none', verbose=True):
        """Load the dataset and attach eigenvector and edge features.

        Args:
            name: label stored in ``self.name`` and used in the log output.
            norm: Laplacian variant forwarded to ``_add_eig``
                (``'none'``, ``'sym'`` or ``'walk'``).
            verbose: when true, print progress and timing messages.
        """
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')

        self.graph = self.dataset[0]  # single DGL graph
        self._add_eig(norm=norm, number=6)

        # Create edge feat by concatenating weight and year
        self.graph.edata['feat'] = torch.cat(
            [self.graph.edata['edge_weight'], self.graph.edata['edge_year']],
            dim=1
        )

        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']  # positive train edges
        self.val_edges = self.split_edge['valid']['edge']  # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']  # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-collab')
        if verbose:
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    def _add_positional_encodings(self, pos_enc_dim, norm):
        """Replace ``self.graph`` with the output of ``positional_encoding``.

        Per the original comment this is a graph positional encoding based on
        Laplacian eigenvectors.  It is not called by ``__init__``.
        """
        self.graph = positional_encoding(self.graph, pos_enc_dim, norm)

    @staticmethod
    def _component_laplacian(g, node_list, norm):
        """Return the sparse Laplacian of ``g`` with rows ordered as ``node_list``.

        ``norm`` selects ``D - A`` (``'none'``), ``D^-1/2 (D - A) D^-1/2``
        (``'sym'``) or ``D^-1 (D - A)`` (``'walk'``).  Every node of ``g``
        must have non-zero degree for the two normalised variants.
        """
        A = nx.adjacency_matrix(g, nodelist=node_list).astype(float)
        degrees = [deg for _, deg in g.degree(node_list)]
        D = sp.diags(degrees)
        if norm == 'none':
            return D - A
        if norm == 'sym':
            D_norm = sp.diags([deg ** (-0.5) for deg in degrees])
            return D_norm * (D - A) * D_norm
        # norm == 'walk'
        D_norm = sp.diags([deg ** (-1) for deg in degrees])
        return D_norm * (D - A)

    def _add_eig(self, norm='none', number=6):
        """Store per-node Laplacian eigenvector features in ``graph.ndata['eig']``.

        The undirected ogbl-collab graph is split into connected components
        and each component is treated on its own:

        * more than two nodes: the ``k = min(size - 2, number)`` eigenvectors
          with smallest-real-part eigenvalues are computed, each divided by
          its maximum entry, and written to the first ``k`` columns;
        * exactly two nodes: the first node's row is set to zeros;
        * everything else (including unused columns) keeps the initial
          value of one.

        Args:
            norm: ``'none'``, ``'sym'`` or ``'walk'`` (see
                ``_component_laplacian``).
            number: number of feature columns.

        Raises:
            ValueError: if ``norm`` is not one of the supported values.
        """
        if norm not in self._EIG_NORMS:
            raise ValueError(
                "norm must be one of %s, got %r" % (self._EIG_NORMS, norm))

        dataset = LinkPropPredDataset(name='ogbl-collab')
        graph = dataset[0]
        # The features are attached to self.graph, so its node count is the
        # one that has to match (235868 for ogbl-collab).
        num_nodes = self.graph.number_of_nodes()

        G = nx.Graph()
        G.add_nodes_from(range(num_nodes))
        G.add_edges_from(zip(graph['edge_index'][0], graph['edge_index'][1]))

        EigVec_global = np.ones((num_nodes, number))
        for component in nx.connected_components(G):
            node_list = list(component)
            size = len(node_list)
            if size < 2:
                # Isolated node: keep the default row.  Skipping here also
                # avoids raising 0 to a negative power for 'sym' / 'walk'.
                continue
            if size == 2:
                EigVec_global[node_list[0], :number] = 0.0
                continue

            L = self._component_laplacian(G.subgraph(node_list), node_list, norm)
            k = min(size - 2, number)
            EigVal, EigVec = sp.linalg.eigs(L, k=k, which='SR', tol=0)
            EigVec = EigVec[:, EigVal.argsort()]
            EigVec = EigVec / np.max(EigVec, 0)
            # eigs returns complex arrays; keep the real part explicitly
            # instead of relying on an implicit (warning-raising) cast.
            EigVec_global[node_list, :k] = EigVec.real
        self.graph.ndata['eig'] = torch.from_numpy(EigVec_global).float()
Esempio n. 8
0
])
# Script-level setup: create the snapshot directory, log run information,
# load the ogbl-citation dataset and extend its graph with extra edges.
# (`log_dir`, `args` and `model_file` are defined outside this excerpt.)

# Create "<log_dir>/snapshot" if it does not exist yet.
snapshot_dir = os.path.join(log_dir, "snapshot")
if not os.path.isdir(snapshot_dir):
    os.makedirs(snapshot_dir)
print("Process Id:", os.getpid())
print(os.path.join(log_dir, sys.argv[0]))
print(args)
# Copy this script (as "train.py") and the model source file into log_dir,
# presumably so the exact code of the run is kept with its logs.
shutil.copyfile(__file__, os.path.join(log_dir, "train.py"))
shutil.copyfile(model_file + ".py", os.path.join(log_dir, model_file + ".py"))

# Print the input/output formats the OGB evaluator reports for this dataset.
evaluator = Evaluator(name="ogbl-citation")
print(evaluator.expected_input_format)
print(evaluator.expected_output_format)

dataset = DglLinkPropPredDataset(name="ogbl-citation")
split_edge = dataset.get_edge_split()
num_worker = 16  # not used within this excerpt
train_edge, valid_edge, test_edge = split_edge["train"], split_edge[
    "valid"], split_edge["test"]
graph = dataset[0]
# Keep an untouched copy before the graph is modified below.
origin_graph = copy.deepcopy(graph)
# NOTE(review): `readonly(False)` looks like the legacy DGLGraph call that
# makes the graph mutable; confirm against the installed DGL version.
graph.readonly(False)
# Add, for every existing edge, an edge with source and destination swapped.
graph.add_edges(graph.edges()[1], graph.edges()[0])
# Add one self-loop (i -> i) for every node.
graph.add_edges(
    torch.arange(0, graph.number_of_nodes()).long(),
    torch.arange(0, graph.number_of_nodes()).long())
graph.edata["etype"] = torch.cat([
    torch.ones(
        (graph.number_of_edges() - graph.number_of_nodes()) // 2).long(),
    (torch.ones(
        (graph.number_of_edges() - graph.number_of_nodes()) // 2) * 2).long(),
def main():
    """Train and evaluate a full-batch GNN link predictor on ogbl-ppa.

    Parses the command line, loads the dataset, adds one self-loop per node,
    builds a GCN (or GraphSAGE with ``--use_sage``) plus a Hadamard link
    predictor, and then runs ``--runs`` independent trainings of ``--epochs``
    epochs each.  Hits@10/50/100 results are collected in ``Logger`` objects
    and their statistics are printed after every run and at the end.
    """
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    # (flag, keyword arguments) pairs, registered in exactly this order.
    cli_options = (
        ('--use_gpu', {
            'action': 'store_true',
            'help': 'Use gpu for computation (default: False)'}),
        ('--log_steps', {
            'type': int, 'default': 1,
            'help': 'Print training progress every {log_steps} epochs (default: 1)'}),
        ('--use_sage', {
            'action': 'store_true',
            'help': 'Use GraphSAGE rather than GCN (default: False)'}),
        ('--num_layers', {
            'type': int, 'default': 3,
            'help': 'Number of GNN layers to use as well as '
                    'linear layers to use for final link prediction (default: 3)'}),
        ('--hidden_feats', {
            'type': int, 'default': 256,
            'help': 'Size for hidden representations (default: 256)'}),
        ('--dropout', {
            'type': float, 'default': 0.0,
            'help': 'Dropout (default: 0.0)'}),
        ('--batch_size', {
            'type': int, 'default': 64 * 1024,
            'help': 'Batch size to use for link prediction (default: 64 * 1024)'}),
        ('--lr', {
            'type': float, 'default': 0.01,
            'help': 'Learning rate (default: 0.01)'}),
        ('--epochs', {
            'type': int, 'default': 20,
            'help': 'Number of epochs for training (default: 20)'}),
        ('--eval_steps', {
            'type': int, 'default': 1,
            'help': 'Evaluate hits@100 every {eval_steps} epochs (default: 1)'}),
        ('--runs', {
            'type': int, 'default': 10,
            'help': 'Number of random experiments to perform (default: 10)'}),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    print(args)

    use_cuda = args.use_gpu and torch.cuda.is_available()
    device = torch.device('cuda:0' if use_cuda else 'cpu')

    dataset = DglLinkPropPredDataset(name='ogbl-ppa')
    # Get DGLGraph and add a self-loop to every node.
    data = dataset[0]
    data.readonly(False)
    data.add_edges(data.nodes(), data.nodes())
    data = data.to(device)
    splitted_edge = dataset.get_edge_split()
    x = data.ndata['feat'].float().to(device)

    depth = args.num_layers
    layer_sizes = [args.hidden_feats] * depth
    # ReLU after every layer except the last one.
    activations = [F.relu] * (depth - 1) + [None]
    if args.use_sage:
        gnn = GraphSAGE(
            in_feats=x.size(-1),
            hidden_feats=layer_sizes,
            activation=activations,
            dropout=[0] + [args.dropout] * (depth - 1))
    else:
        gnn = GCN(
            in_feats=x.size(-1),
            hidden_feats=layer_sizes,
            activation=activations,
            residual=[False] * depth,
            batchnorm=[False] * depth,
            dropout=[args.dropout] * (depth - 1) + [0])
    model = gnn.to(device)

    predictor = HadamardLinkPredictor(in_feats=args.hidden_feats,
                                      hidden_feats=args.hidden_feats,
                                      num_layers=args.num_layers,
                                      n_tasks=1,
                                      dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        trainable = list(model.parameters()) + list(predictor.parameters())
        optimizer = torch.optim.Adam(trainable, lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, x, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps != 0:
                continue

            results = test(model, predictor, data, x, splitted_edge,
                           evaluator, args.batch_size)
            for key, result in results.items():
                loggers[key].add_result(run, result)

            if epoch % args.log_steps != 0:
                continue

            for key, (train_hits, valid_hits, test_hits) in results.items():
                print(key)
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_hits:.2f}%, '
                      f'Valid: {100 * valid_hits:.2f}%, '
                      f'Test: {100 * test_hits:.2f}%')

        # Per-run summary.
        for key, logger in loggers.items():
            print(key)
            logger.print_statistics(run)

    # Summary over all runs.
    for key, logger in loggers.items():
        print(key)
        logger.print_statistics()
Esempio n. 10
0
def main():
    """Train and evaluate a GNN link predictor on ogbl-collab.

    Parses the command line, loads the dataset, builds the ``GNN`` encoder and
    ``LinkPredictor``, and trains them with Adam and a ``ReduceLROnPlateau``
    scheduler driven by the validation Hits@10.  Metrics are printed, stored
    in ``Logger`` objects and written to TensorBoard under
    ``logs/<config>/<timestamp>``.

    Only ``--runs 1`` is supported: model parameters are never reset between
    runs, which the assertion inside the run loop enforces.
    """
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=1)

    parser.add_argument('--gnn_type', type=str, default='gated-gcn')
    parser.add_argument('--num_layer', type=int, default=3)
    parser.add_argument('--emb_dim', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.0)

    parser.add_argument('--batch_size', type=int, default=32*1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)

    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    print(device)

    dataset = DglLinkPropPredDataset(name='ogbl-collab')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    print(data)

    model = GNN(gnn_type=args.gnn_type, emb_dim=args.emb_dim, num_layer=args.num_layer, dropout=args.dropout).to(device)
    print(model)
    # numel() counts elements directly and stays an int even for 0-dim
    # parameters (np.prod of an empty shape list would give the float 1.0).
    total_param = sum(param.numel() for param in model.parameters())
    print(f'Model parameters: {total_param}')

    predictor = LinkPredictor(emb_dim=args.emb_dim).to(device)
    print(predictor)
    total_param = sum(param.numel() for param in predictor.parameters())
    print(f'Predictor parameters: {total_param}')

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    tb_logger = SummaryWriter(
        os.path.join(
            "logs",
            f"{args.gnn_type}-L{args.num_layer}-h{args.emb_dim}-d{args.dropout}-LR{args.lr}",
            time.strftime("%Y%m%dT%H%M%S")
        )
    )

    # The writer is closed in ``finally`` so pending TensorBoard events are
    # flushed even when training stops early or raises.
    try:
        for run in range(args.runs):
            assert args.runs == 1
            # model.reset_parameters()

            optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, min_lr=1e-5, verbose=True)

            for epoch in range(1, 1 + args.epochs):

                loss = train(model, predictor, data, split_edge, optimizer, args.batch_size, device)

                if epoch % args.eval_steps == 0:
                    results = test(model, predictor, data, split_edge, evaluator, args.batch_size, device)

                    for key, result in results.items():
                        loggers[key].add_result(run, result)

                    if epoch % args.log_steps == 0:
                        tb_logger.add_scalar('loss', loss, epoch)
                        tb_logger.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)

                        for key, result in results.items():
                            train_hits, valid_hits, test_hits = result
                            print(key)
                            print(f'Run: {run + 1:02d}, '
                                  f'Epoch: {epoch:02d}, '
                                  f'Loss: {loss:.4f}, '
                                  f'Train: {100 * train_hits:.2f}%, '
                                  f'Valid: {100 * valid_hits:.2f}%, '
                                  f'Test: {100 * test_hits:.2f}%')

                            tb_logger.add_scalar(f'{key}/train_hits', 100 * train_hits, epoch)
                            tb_logger.add_scalar(f'{key}/valid_hits', 100 * valid_hits, epoch)
                            tb_logger.add_scalar(f'{key}/test_hits', 100 * test_hits, epoch)

                        print('---')

                    # Scheduler follows the validation Hits@10 (index 1 of the result).
                    scheduler.step(100 * results["Hits@10"][1])

                # NOTE(review): the scheduler's min_lr equals this threshold, so
                # this appears to trigger only when --lr itself is below 1e-5;
                # confirm whether `<=` was intended.
                if optimizer.param_groups[0]['lr'] < 1e-5:
                    break

            for key in loggers.keys():
                print(key)
                loggers[key].print_statistics(run)
    finally:
        tb_logger.close()