Example #1
def load_ogb_mag():
    name = 'ogbn-mag'
    from ogb.nodeproppred import DglNodePropPredDataset

    # Expose the shared download cache at /tmp/dataset/ through a local
    # './dataset' symlink so the OGB loader can find it (raises if the
    # link already exists).
    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))

    print('load', name)
    dataset = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx["train"]['paper']
    val_idx = split_idx["valid"]['paper']
    test_idx = split_idx["test"]['paper']
    hg_orig, labels = dataset[0]
    subgs = {}
    for etype in hg_orig.canonical_etypes:
        u, v = hg_orig.all_edges(etype=etype)
        subgs[etype] = (u, v)
        subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
    hg = dgl.heterograph(subgs)
    hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
    hg.nodes['paper'].data['labels'] = labels['paper'].squeeze()
    train_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    train_mask[train_idx] = True
    val_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    val_mask[val_idx] = True
    test_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    test_mask[test_idx] = True
    hg.nodes['paper'].data['train_mask'] = train_mask
    hg.nodes['paper'].data['val_mask'] = val_mask
    hg.nodes['paper'].data['test_mask'] = test_mask

    num_classes = dataset.num_classes
    return OGBDataset(hg, num_classes, 'paper')
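The OGBDataset wrapper returned above is project-specific and not shown in the snippet. A minimal sketch of such a container (names assumed, not the project's actual class) could be:

class OGBDataset:
    """Holds a graph, its number of classes, and an optional target node type."""

    def __init__(self, g, num_labels, predict_category=None):
        self._g = g
        self._num_labels = num_labels
        self._predict_category = predict_category

    @property
    def num_labels(self):
        return self._num_labels

    @property
    def predict_category(self):
        return self._predict_category

    def __getitem__(self, idx):
        return self._g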
Example #2
def load_ogb(name):
    from ogb.nodeproppred import DglNodePropPredDataset

    data = DglNodePropPredDataset(name=name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]

    graph.ndata['features'] = graph.ndata['feat']
    graph.ndata['labels'] = labels
    in_feats = graph.ndata['features'].shape[1]
    num_labels = len(th.unique(labels))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    train_mask[train_nid] = 1
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    val_mask[val_nid] = 1
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    test_mask[test_nid] = 1
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    return graph, num_labels
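A hypothetical call site for this loader (assuming the dataset has already been downloaded):

g, num_classes = load_ogb('ogbn-products')
train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]

Note that this example stores int64 masks; most of the other loaders in this collection use torch.bool masks instead.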
Example #3
    def __init__(self, dataset_name='ogbn-arxiv', k=5):
        super(Ogbn, self).__init__()
        print("Loading dataset {}".format(dataset_name))
        self.dataset_name = dataset_name
        self.dataset_path = os.path.join('./data',
                                         dataset_name.replace('-', '_'))
        ogbn_dataset = DglNodePropPredDataset(dataset_name, root='./data')
        self.graph, self.label = ogbn_dataset[0]
        self.graph = self.graph.add_self_loop()
        self.label = self.label.flatten()

        self.split = ogbn_dataset.get_idx_split()

        self.length = self.graph.num_nodes()
        self.nodes = self.graph.nodes()
        self.edges = self.graph.edges()
        self.k = k
        print("Generate Context...")
        time1 = time.time()
        self.context = np.squeeze(
            dgl.sampling.random_walk(self.graph, self.nodes, length=self.k)[0])
        time2 = time.time()
        print("Context shape: {}, time: {}s".format(self.context.shape,
                                                    time2 - time1))
        print("Loading WL...")
        self.path_WL = os.path.join(self.dataset_path, 'WL.pkl')
        self.WL = self.load_WL(self.path_WL)
        max_wl = max(self.WL.values(), default=0)
        print("Max WL id: ", max_wl)
Example #4
def load_data(name, ogb_root, seed, device):
    if name == 'ogbn-arxiv':
        data = DglNodePropPredDataset('ogbn-arxiv', ogb_root)
        g, labels = data[0]
        split = data.get_idx_split()
        return g.to(device), labels.squeeze(dim=-1).to(device), data.num_classes, \
               split['train'].to(device), split['valid'].to(device), split['test'].to(device)
    elif name in ('cora', 'citeseer', 'pubmed'):
        data = load_citation_dataset(name)
    elif name == 'cora_full':
        data = gnn_benckmark.CoraFullDataset()
    elif name in ('cs', 'physics'):
        data = gnn_benckmark.Coauthor(name)
    elif name in ('photo', 'computers'):
        data = gnn_benckmark.AmazonCoBuy(name)
    else:
        raise ValueError('Unknown dataset:', name)

    g = data[0].to(device)
    # https://github.com/dmlc/dgl/issues/2479
    num_classes = data.num_classes
    if name in ('photo', 'computers'):
        num_classes = g.ndata['label'].max().item() + 1
    if 'train_mask' in g.ndata:
        train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
        val_idx = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
        test_idx = g.ndata['test_mask'].nonzero(as_tuple=True)[0]
    else:
        train_idx, val_idx, test_idx = split_idx(torch.arange(g.num_nodes()),
                                                 0.2, 0.3, seed)
    return g, g.ndata['label'], num_classes, train_idx.to(device), val_idx.to(
        device), test_idx.to(device)
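The split_idx helper used in the fallback branch is defined elsewhere in that project. A plausible sketch, assuming its two float arguments are the train and validation fractions (the remainder becoming the test set):

import torch

def split_idx(samples, train_size, val_size, seed):
    # Shuffle the IDs with a fixed seed, then cut into train/val/test.
    gen = torch.Generator().manual_seed(seed)
    perm = samples[torch.randperm(len(samples), generator=gen)]
    n_train = int(len(samples) * train_size)
    n_val = int(len(samples) * val_size)
    return perm[:n_train], perm[n_train:n_train + n_val], perm[n_train + n_val:]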
Example #5
    def process_DglNodeDataset_hetero(self, dataset: DglNodePropPredDataset):
        graph, labels = dataset[0]
        self._name = dataset.name

        if self.node_types is None:
            self.node_types = graph.ntypes

        self.num_nodes_dict = {
            ntype: graph.num_nodes(ntype)
            for ntype in self.node_types
        }
        self.y_dict = labels

        self.x_dict = graph.ndata["feat"]

        for ntype, labels in self.y_dict.items():
            if labels.dim() == 2 and labels.shape[1] == 1:
                labels = labels.squeeze(1)
            graph.nodes[ntype].data["labels"] = labels

        if self.head_node_type is None:
            if self.y_dict is not None:
                self.head_node_type = list(self.y_dict.keys())[0]
            else:
                self.head_node_type = self.node_types[0]

        self.metapaths = graph.canonical_etypes

        split_idx = dataset.get_idx_split()
        self.training_idx, self.validation_idx, self.testing_idx = split_idx["train"][self.head_node_type], \
                                                                   split_idx["valid"][self.head_node_type], \
                                                                   split_idx["test"][self.head_node_type]

        self.G = graph
Example #6
def load_ogb_product(name):
    from ogb.nodeproppred import DglNodePropPredDataset

    os.symlink('/tmp/dataset/', os.path.join(os.getcwd(), 'dataset'))

    print('load', name)
    data = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]

    graph.ndata['label'] = labels
    in_feats = graph.ndata['feat'].shape[1]
    num_labels = len(
        torch.unique(labels[torch.logical_not(torch.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx[
        'valid'], splitted_idx['test']
    train_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    train_mask[train_nid] = True
    val_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    val_mask[val_nid] = True
    test_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask

    return OGBDataset(graph, num_labels)
Example #7
def load_ogb(name):
    from ogb.nodeproppred import DglNodePropPredDataset

    print('load', name)
    data = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]

    graph.ndata['features'] = graph.ndata['feat']
    del graph.ndata['feat']
    graph.ndata['labels'] = labels
    in_feats = graph.ndata['features'].shape[1]
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    print('finish constructing', name)
    return graph, num_labels
Example #8
def load_ogb(name):
    from ogb.nodeproppred import DglNodePropPredDataset

    data = DglNodePropPredDataset(name=name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]

    graph.ndata["features"] = graph.ndata["feat"]
    graph.ndata["labels"] = labels
    in_feats = graph.ndata["features"].shape[1]
    num_labels = len(th.unique(labels))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = (
        splitted_idx["train"],
        splitted_idx["valid"],
        splitted_idx["test"],
    )
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata["train_mask"] = train_mask
    graph.ndata["val_mask"] = val_mask
    graph.ndata["test_mask"] = test_mask
    return graph, num_labels
Example #9
    def __init__(self, batch_size: int):
        super().__init__()
        dataset = DglNodePropPredDataset(name='ogbn-arxiv')
        self.split_idx = dataset.get_idx_split()
        self.g, labels = dataset[0]
        self.g.ndata["label"] = labels.squeeze()
        self.g = add_self_loop(self.g)
        self.batch_size = batch_size
Example #10
def load_dataset():
    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    # there is only one graph in Node Property Prediction datasets
    g, labels = dataset[0]
    g = dgl.add_self_loop(g)
    
    return g, labels, split_idx
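A downstream script might unpack it like this (variable names hypothetical):

g, labels, split_idx = load_dataset()
train_idx = split_idx['train']
train_labels = labels[train_idx].squeeze(1)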
Example #11
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--wd', type=float, default=0)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--model', type=str, default='AFFN')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')

    split_idx = dataset.get_idx_split()
    graph, label = dataset[0]

    # Convert the directed graph to an undirected one by adding reverse edges
    g = dgl.DGLGraph((graph.edges()[0], graph.edges()[1]))
    g.add_edges(graph.edges()[1], graph.edges()[0])

    x = graph.ndata['feat'].to(device)
    y_true = label.to(device)

    train_idx = split_idx['train'].to(device)

    model = Net(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.model, args.dropout).to(device)
    print(model)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.wd)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, g, x, y_true, train_idx, optimizer)
            result = test(model, g, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            train_acc, valid_acc, test_acc = result
            print(f'Run: {run + 1:02d}, '
                  f'Epoch: {epoch:02d}, '
                  f'Loss: {loss:.4f}, '
                  f'Train: {100 * train_acc:.2f}%, '
                  f'Valid: {100 * valid_acc:.2f}% '
                  f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
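The train and test helpers come from the surrounding project. A minimal full-batch train step consistent with the call above, assuming the model takes (g, x) and returns per-node logits:

import torch.nn.functional as F

def train(model, g, x, y_true, train_idx, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(g, x)
    # y_true has shape (num_nodes, 1); drop the task dimension for the loss.
    loss = F.cross_entropy(out[train_idx], y_true.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()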
Example #12
def load_data(path, device):
    data = DglNodePropPredDataset('ogbn-mag', path)
    g, labels = data[0]
    g = add_reverse_edges(g)
    labels = labels['paper'].to(device)
    split_idx = data.get_idx_split()
    train_idx = split_idx['train']['paper'].to(device)
    val_idx = split_idx['valid']['paper'].to(device)
    test_idx = split_idx['test']['paper'].to(device)
    return g, labels, data.num_classes, train_idx, val_idx, test_idx, Evaluator(data.name)
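The add_reverse_edges helper is not shown here; Examples #1 and #21 build the same reversed heterograph inline, and a standalone helper following that pattern might look like:

import dgl

def add_reverse_edges(g):
    # For every canonical edge type add a 'rev-' edge type with the
    # endpoints swapped, then copy node features to the new graph.
    edges = {}
    for stype, etype, dtype in g.canonical_etypes:
        u, v = g.edges(etype=(stype, etype, dtype))
        edges[(stype, etype, dtype)] = (u, v)
        edges[(dtype, 'rev-' + etype, stype)] = (v, u)
    new_g = dgl.heterograph(
        edges, num_nodes_dict={nt: g.num_nodes(nt) for nt in g.ntypes})
    for ntype in g.ntypes:
        for key, feat in g.nodes[ntype].data.items():
            new_g.nodes[ntype].data[key] = feat
    return new_g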
Example #13
def load_data(dataset):
    data = DglNodePropPredDataset(name=dataset)
    evaluator = Evaluator(name=dataset)

    splitted_idx = data.get_idx_split()
    train_idx, val_idx, test_idx = splitted_idx["train"], splitted_idx["valid"], splitted_idx["test"]
    graph, labels = data[0]
    graph.ndata["labels"] = labels

    return graph, labels, train_idx, val_idx, test_idx, evaluator
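The returned evaluator follows OGB's dict interface. For the accuracy-metric ogbn datasets, given predictions preds of shape (num_nodes, 1) (hypothetical), validation accuracy is:

val_acc = evaluator.eval({
    'y_true': labels[val_idx],
    'y_pred': preds[val_idx],
})['acc']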
Example #14
    def __init__(self, dataset_name):
        super(NodeClassificationDataset, self).__init__()
        if dataset_name == 'ogb-mag':
            dataset = DglNodePropPredDataset(name='ogbn-mag')
        else:
            raise ValueError
        split_idx = dataset.get_idx_split()
        self.num_classes = dataset.num_classes
        self.train_idx, self.valid_idx, self.test_idx = split_idx[
            "train"], split_idx["valid"], split_idx["test"]
        # g: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
        self.g, self.label = dataset[0]
        self.category = 'paper'
Example #15
def load_dataset(dataset_type, **kwargs):
    """
    Load dataset.
    Args:
        dataset_type: str, support 'proteins', 'cora', 'citeseer', 'pubmed', 'amazon', 'reddit'.
    """
    if dataset_type == 'proteins':
        data = DglNodePropPredDataset(name='ogbn-proteins',
                                      root=kwargs['root'])
        evaluator = Evaluator(name='ogbn-proteins')

        splitted_idx = data.get_idx_split()
        train_idx, val_idx, test_idx = splitted_idx["train"], splitted_idx[
            "valid"], splitted_idx["test"]
        graph, labels = data[0]
        species = graph.ndata['species']
        features = one_hot_encoder(species)
        graph.ndata['feat'] = features
        graph.ndata['label'] = labels

        return graph, labels, train_idx, val_idx, test_idx, evaluator

    if dataset_type == 'cora':
        dataset = dgl.data.CoraGraphDataset()
    elif dataset_type == 'citeseer':
        dataset = dgl.data.CiteseerGraphDataset()
    elif dataset_type == 'pubmed':
        dataset = dgl.data.PubmedGraphDataset()
    elif dataset_type == 'amazon':
        dataset = dgl.data.AmazonCoBuyComputerDataset()
    elif dataset_type == 'reddit':
        dataset = dgl.data.RedditDataset()
    else:
        raise KeyError(
            'Dataset type {} not recognized.'.format(dataset_type))

    num_classes = dataset.num_classes
    graph = dataset[0]
    features = th.FloatTensor(graph.ndata['feat'])
    labels = th.LongTensor(graph.ndata['label'])
    if dataset_type == 'amazon':
        # AmazonCoBuyComputerDataset ships without split masks, so return
        # None and let the caller build its own split.
        train_mask = val_mask = test_mask = None
    else:
        train_mask = th.BoolTensor(graph.ndata['train_mask'])
        val_mask = th.BoolTensor(graph.ndata['val_mask'])
        test_mask = th.BoolTensor(graph.ndata['test_mask'])

    return graph, features, labels, num_classes, train_mask, val_mask, test_mask
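The one_hot_encoder used in the proteins branch is not included. A sketch that remaps the integer species IDs to contiguous indices and one-hot encodes them, assuming species has shape (num_nodes, 1):

import torch as th
import torch.nn.functional as F

def one_hot_encoder(species):
    # Map arbitrary species IDs to contiguous indices, then one-hot encode.
    uniq, inverse = th.unique(species, return_inverse=True)
    return F.one_hot(inverse.squeeze(-1), num_classes=len(uniq)).float()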
Example #16
def load_ogb(name):

    tic_step = time.time()
    get_memory('before loading ' + name)
    print('load', name)
    data = DglNodePropPredDataset(name=name)
    t1 = ttt(tic_step, 'data = DglNodePropPredDataset(name=name)')
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    t2 = ttt(t1, 'splitted_idx = data.get_idx_split()')
    graph, labels = data[0]
    t3 = ttt(t2, 'graph, labels = data[0]')
    print(labels)
    print(graph)
    labels = labels[:, 0]
    t4 = ttt(t3, 'labels = labels[:, 0]')

    graph.ndata['features'] = graph.ndata['feat']
    t5 = ttt(t4, "graph.ndata['features'] = graph.ndata['feat']")
    graph.ndata['labels'] = labels
    t6 = ttt(t5, "graph.ndata['labels'] = labels")
    in_feats = graph.ndata['features'].shape[1]
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    t7 = ttt(t6, 'train/val/test indices extracted')
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    t8 = ttt(t7, 'end of load_ogb')

    print('finish constructing', name)
    print('load ogb-products time total: ' + str(time.time() - tic_step))
    return graph, num_labels
Example #17
def load_data(dataset):
    global n_node_feats, n_classes

    data = DglNodePropPredDataset(name=dataset)
    evaluator = Evaluator(name=dataset)

    splitted_idx = data.get_idx_split()
    train_idx, val_idx, test_idx = splitted_idx["train"], splitted_idx["valid"], splitted_idx["test"]
    graph, labels = data[0]

    n_node_feats = graph.ndata["feat"].shape[1]
    n_classes = (labels.max() + 1).item()

    return graph, labels, train_idx, val_idx, test_idx, evaluator
Example #18
    def __init__(self, dataset_name):
        super(NodeClassificationDataset, self).__init__()
        if dataset_name in ['aifb', 'mutag', 'bgs', 'am']:
            self.g, self.category, self.num_classes = self.load_RDF_dgl(
                dataset_name)
        elif dataset_name in ['acm', 'imdb', 'acm1', 'academic']:
            self.g, self.category, self.num_classes = self.load_HIN(
                dataset_name)
        elif dataset_name == 'ogbn-mag':
            dataset = DglNodePropPredDataset(name='ogbn-mag')
            split_idx = dataset.get_idx_split()
            self.num_classes = dataset.num_classes
            self.train_idx, self.valid_idx, self.test_idx = split_idx[
                "train"], split_idx["valid"], split_idx["test"]
            # g: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
            self.g, self.label = dataset[0]
            self.category = 'paper'
Example #19
def load_ogb(dataset):
    if dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=dataset)
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        paper_labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

        # get target category id
        category_id = len(hg.ntypes)
        for i, ntype in enumerate(hg.ntypes):
            if ntype == category:
                category_id = i

        train_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        train_mask[train_idx] = True
        val_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        val_mask[val_idx] = True
        test_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        test_mask[test_idx] = True
        hg.nodes['paper'].data['train_mask'] = train_mask
        hg.nodes['paper'].data['val_mask'] = val_mask
        hg.nodes['paper'].data['test_mask'] = test_mask

        hg.nodes['paper'].data['labels'] = paper_labels
        return hg
    else:
        raise ("Do not support other ogbn datasets.")
Example #20
def load_ogb_data(dataset, device):
    from ogb.nodeproppred import DglNodePropPredDataset
    data = DglNodePropPredDataset(name="ogbn-" + dataset, root='data')
    splitted_idx = data.get_idx_split()
    idx_train, idx_val, idx_test = splitted_idx["train"], splitted_idx[
        "valid"], splitted_idx["test"]
    graph, labels = data[0]
    labels = labels.squeeze()
    srcs, dsts = graph.all_edges()
    graph.add_edges(dsts, srcs)
    graph = graph.remove_self_loop().add_self_loop()
    features = graph.ndata['feat']
    graph = graph.to(device)
    features = features.to(device)
    labels = labels.to(device)
    idx_train = idx_train.to(device)
    idx_val = idx_val.to(device)
    idx_test = idx_test.to(device)
    return graph, features, labels, idx_train, idx_val, idx_test
Example #21
File: utils.py Project: yifeim/dgl
def load_ogbn_mag(root: str = None) -> OGBDataset:
    dataset = DglNodePropPredDataset(name='ogbn-mag', root=root)

    split_idx = dataset.get_idx_split()

    train_idx = split_idx['train']['paper']
    valid_idx = split_idx['valid']['paper']
    test_idx = split_idx['test']['paper']

    hg_original, labels = dataset[0]

    labels = labels['paper'].squeeze()
    num_labels = dataset.num_classes

    subgraphs = {}

    for etype in hg_original.canonical_etypes:
        src, dst = hg_original.all_edges(etype=etype)

        subgraphs[etype] = (src, dst)
        subgraphs[(etype[2], f'rev-{etype[1]}', etype[0])] = (dst, src)

    hg = dgl.heterograph(subgraphs)

    hg.nodes['paper'].data['feat'] = hg_original.nodes['paper'].data['feat']
    hg.nodes['paper'].data['labels'] = labels

    train_mask = torch.zeros((hg.num_nodes('paper'), ), dtype=torch.bool)
    train_mask[train_idx] = True
    valid_mask = torch.zeros((hg.num_nodes('paper'), ), dtype=torch.bool)
    valid_mask[valid_idx] = True
    test_mask = torch.zeros((hg.num_nodes('paper'), ), dtype=torch.bool)
    test_mask[test_idx] = True

    hg.nodes['paper'].data['train_mask'] = train_mask
    hg.nodes['paper'].data['valid_mask'] = valid_mask
    hg.nodes['paper'].data['test_mask'] = test_mask

    ogb_dataset = OGBDataset(hg, num_labels, 'paper')

    return ogb_dataset
Example #22
def load_ogbn_mag(device, add_reverse_edge, reverse_self):
    """加载ogbn-mag数据集

    :param device: torch.device 将图和数据移动到指定的设备上,默认为CPU
    :param add_reverse_edge: bool 是否添加反向边
    :param reverse_self: bool 起点和终点类型相同时是否添加反向边
    :return: dataset, g, features, labels, predict_ntype, train_mask, val_mask, test_mask, evaluator
    """
    data = DglNodePropPredDataset('ogbn-mag', DATA_DIR)
    g, labels = data[0]
    if add_reverse_edge:
        g = add_reverse_edges(g, reverse_self)
    g = g.to(device)
    features = g.nodes['paper'].data['feat']
    labels = labels['paper'].squeeze(dim=1).to(device)
    split_idx = data.get_idx_split()
    train_idx = split_idx['train']['paper'].to(device)
    val_idx = split_idx['valid']['paper'].to(device)
    test_idx = split_idx['test']['paper'].to(device)
    evaluator = Evaluator(data.name)
    return data, g, features, labels, 'paper', train_idx, val_idx, test_idx, evaluator
Example #23
def load_dataset(name, device):
    """
    Load dataset and move graph and features to device
    """
    if name not in ["ogbn-products", "ogbn-arxiv", "ogbn-mag"]:
        raise RuntimeError("Dataset {} is not supported".format(name))
    dataset = DglNodePropPredDataset(name=name)
    splitted_idx = dataset.get_idx_split()
    train_nid = splitted_idx["train"]
    val_nid = splitted_idx["valid"]
    test_nid = splitted_idx["test"]
    g, labels = dataset[0]
    g = g.to(device)
    if name == "ogbn-arxiv":
        g = dgl.add_reverse_edges(g, copy_ndata=True)
        g = dgl.add_self_loop(g)
        g.ndata['feat'] = g.ndata['feat'].float()
    elif name == "ogbn-mag":
        # MAG is a heterogeneous graph. The task is to make prediction for
        # paper nodes
        labels = labels["paper"]
        train_nid = train_nid["paper"]
        val_nid = val_nid["paper"]
        test_nid = test_nid["paper"]
        g = convert_mag_to_homograph(g, device)
    else:
        g.ndata['feat'] = g.ndata['feat'].float()
    n_classes = dataset.num_classes
    labels = labels.squeeze()
    evaluator = get_ogb_evaluator(name)

    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")

    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
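get_ogb_evaluator and convert_mag_to_homograph are project helpers. A thin wrapper around the official Evaluator that returns a plain accuracy function might be (sketch, assuming an accuracy-metric dataset):

from ogb.nodeproppred import Evaluator

def get_ogb_evaluator(dataset):
    evaluator = Evaluator(name=dataset)
    return lambda preds, labels: evaluator.eval({
        'y_true': labels.view(-1, 1),
        'y_pred': preds.view(-1, 1),
    })['acc']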
Example #24
def load_mag(device, args):
    from ogb.nodeproppred import DglNodePropPredDataset
    path = args.use_emb
    home_dir = os.getenv("HOME")
    dataset = DglNodePropPredDataset(name="ogbn-mag",
                                     root=os.path.join(home_dir, ".ogb",
                                                       "dataset"))
    g, labels = dataset[0]
    splitted_idx = dataset.get_idx_split()
    train_nid = splitted_idx["train"]['paper']
    val_nid = splitted_idx["valid"]['paper']
    test_nid = splitted_idx["test"]['paper']
    features = g.nodes['paper'].data['feat']
    author_emb = torch.load(os.path.join(path, "author.pt")).float()
    topic_emb = torch.load(os.path.join(path, "field_of_study.pt")).float()
    institution_emb = torch.load(os.path.join(path, "institution.pt")).float()

    g.nodes["author"].data["feat"] = author_emb.to(device)
    g.nodes["institution"].data["feat"] = institution_emb.to(device)
    g.nodes["field_of_study"].data["feat"] = topic_emb.to(device)
    g.nodes["paper"].data["feat"] = features.to(device)
    paper_dim = g.nodes["paper"].data["feat"].shape[1]
    author_dim = g.nodes["author"].data["feat"].shape[1]
    if paper_dim != author_dim:
        paper_feat = g.nodes["paper"].data.pop("feat")
        rand_weight = torch.Tensor(paper_dim, author_dim).uniform_(-0.5, 0.5)
        g.nodes["paper"].data["feat"] = torch.matmul(paper_feat,
                                                     rand_weight.to(device))
        print(
            f"Randomly project paper feature from dimension {paper_dim} to {author_dim}"
        )

    labels = labels['paper'].to(device).squeeze()
    n_classes = int(labels.max() - labels.min()) + 1
    train_nid, val_nid, test_nid = np.array(train_nid), np.array(
        val_nid), np.array(test_nid)
    return g, labels, n_classes, train_nid, val_nid, test_nid
Example #25
def load_data(name, ogb_root, device):
    if name in ('ogbn-products', 'ogbn-arxiv'):
        data = DglNodePropPredDataset(name, ogb_root)
        g, labels = data[0]
        if name == 'ogbn-arxiv':
            g = dgl.to_bidirected(g, copy_ndata=True)
            feat = g.ndata['feat']
            feat = (feat - feat.mean(dim=0)) / feat.std(dim=0)
            g.ndata['feat'] = feat
        g = g.to(device)
        labels = labels.squeeze(dim=1).to(device)
        split_idx = data.get_idx_split()
        train_idx = split_idx['train'].to(device)
        val_idx = split_idx['valid'].to(device)
        test_idx = split_idx['test'].to(device)
        return g, labels, data.num_classes, train_idx, val_idx, test_idx
    else:
        data = load_citation_dataset(name)
        g = data[0].to(device)
        train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
        val_idx = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
        test_idx = g.ndata['test_mask'].nonzero(as_tuple=True)[0]
        return g, g.ndata[
            'label'], data.num_classes, train_idx, val_idx, test_idx
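load_citation_dataset is the project's own helper; assuming it simply dispatches to DGL's built-in citation datasets, it could be:

import dgl

def load_citation_dataset(name):
    return {
        'cora': dgl.data.CoraGraphDataset,
        'citeseer': dgl.data.CiteseerGraphDataset,
        'pubmed': dgl.data.PubmedGraphDataset,
    }[name]()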
Example #26
    argparser.add_argument('--lr', type=float, default=0.001)
    argparser.add_argument('--num-workers', type=int, default=8,
        help="Number of sampling processes. Use 0 for no extra process.")
    argparser.add_argument('--save-pred', type=str, default='')
    argparser.add_argument('--head', type=int, default=4)
    argparser.add_argument('--wd', type=float, default=0)
    args = argparser.parse_args()
    
    if args.gpu >= 0:
        device = th.device('cuda:%d' % args.gpu)
    else:
        device = th.device('cpu')

    # load data
    data = DglNodePropPredDataset(name='ogbn-products')
    splitted_idx = data.get_idx_split()
    train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    graph, labels = data[0]
    nfeat = graph.ndata.pop('feat').to(device)
    labels = labels[:, 0].to(device)

    print('Total edges before adding self-loop {}'.format(graph.num_edges()))
    graph = graph.remove_self_loop().add_self_loop()
    print('Total edges after adding self-loop {}'.format(graph.num_edges()))

    in_feats = nfeat.shape[1]
    n_classes = (labels.max() + 1).item()

    # Create csr/coo/csc formats before launching sampling processes
    # This avoids creating certain formats in each data loader process, which saves memory and CPU.
    graph.create_formats_()
Example #27
        'field_of_study']

    # add types of edges, not used in this work
    for src_type, etype, dst_type in original_graph.canonical_etypes:
        graph.edges[(src_type, etype, dst_type)].data['reltype'] = \
        original_graph.edges[(src_type, etype, dst_type)].data['reltype']
        graph.edges[(dst_type, f'rev_{etype}', src_type)].data['reltype'] = \
        original_graph.edges[(src_type, etype, dst_type)].data['reltype'] + len(original_graph.etypes)

    graph_output_path = '../dataset/OGB_MAG/OGB_MAG.pkl'

    save_graphs(graph_output_path, graph, labels)

    print(f"{graph_output_path} writes successfully.")

    split_idx = dataset.get_idx_split()

    split_idx = {
        'train': {
            'paper': split_idx['train']['paper']
        },
        'valid': {
            'paper': split_idx['valid']['paper']
        },
        'test': {
            'paper': split_idx['test']['paper']
        }
    }
    split_idx_output_path = '../dataset/OGB_MAG/OGB_MAG_split_idx.pkl'
    torch.save(split_idx, split_idx_output_path)
    print(f"{split_idx_output_path} writes successfully.")
Example #28
def main():
    parser = argparse.ArgumentParser(
        description='OGBN-Arxiv (GraphSAGE Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument("--eval",
                        action='store_true',
                        help='If not set, we will only do the training part.')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')
    split_idx = dataset.get_idx_split()

    g, labels = dataset[0]
    feats = g.ndata['feat']
    g = dgl.to_bidirected(g)
    g = g.int().to(device)
    feats, labels = feats.to(device), labels.to(device)
    train_idx = split_idx['train'].to(device)

    model = GraphSAGE(in_feats=feats.size(-1),
                      hidden_feats=args.hidden_channels,
                      out_feats=dataset.num_classes,
                      num_layers=args.num_layers,
                      dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            t0 = time.time()
            loss = train(model, g, feats, labels, train_idx, optimizer)
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))
            if not args.eval:
                continue

            result = test(model, g, feats, labels, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
Example #29
def main():
    # check cuda
    device = f'cuda:{args.gpu}' if torch.cuda.is_available() and args.gpu >= 0 else 'cpu'
    # load data
    dataset = DglNodePropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    split_idx = dataset.get_idx_split()
    g, labels = dataset[0] # graph: DGLGraph object, label: torch tensor of shape (num_nodes, num_tasks)
    
    if args.dataset == 'ogbn-arxiv':
        g = dgl.to_bidirected(g, copy_ndata=True)
        
        feat = g.ndata['feat']
        feat = (feat - feat.mean(0)) / feat.std(0)
        g.ndata['feat'] = feat

    g = g.to(device)
    feats = g.ndata['feat']
    labels = labels.to(device)

    # load masks for train / validation / test
    train_idx = split_idx["train"].to(device)
    valid_idx = split_idx["valid"].to(device)
    test_idx = split_idx["test"].to(device)

    n_features = feats.size()[-1]
    n_classes = dataset.num_classes
    
    # load model
    if args.model == 'mlp':
        model = MLP(n_features, args.hid_dim, n_classes, args.num_layers, args.dropout)
    elif args.model == 'linear':
        model = MLPLinear(n_features, n_classes)
    else:
        raise NotImplementedError(f'Model {args.model} is not supported.')

    model = model.to(device)
    print(f'Model parameters: {sum(p.numel() for p in model.parameters())}')

    if args.pretrain:
        print('---------- Before ----------')
        model.load_state_dict(torch.load(f'base/{args.dataset}-{args.model}.pt'))
        model.eval()

        y_soft = model(feats).exp()

        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')

        print('---------- Correct & Smoothing ----------')
        cs = CorrectAndSmooth(num_correction_layers=args.num_correction_layers,
                              correction_alpha=args.correction_alpha,
                              correction_adj=args.correction_adj,
                              num_smoothing_layers=args.num_smoothing_layers,
                              smoothing_alpha=args.smoothing_alpha,
                              smoothing_adj=args.smoothing_adj,
                              autoscale=args.autoscale,
                              scale=args.scale)
        
        mask_idx = torch.cat([train_idx, valid_idx])
        y_soft = cs.correct(g, y_soft, labels[mask_idx], mask_idx)
        y_soft = cs.smooth(g, y_soft, labels[mask_idx], mask_idx)
        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')
    else:
        opt = optim.Adam(model.parameters(), lr=args.lr)

        best_acc = 0
        best_model = copy.deepcopy(model)

        # training
        print('---------- Training ----------')
        for i in range(args.epochs):

            model.train()
            opt.zero_grad()

            logits = model(feats)
            
            train_loss = F.nll_loss(logits[train_idx], labels.squeeze(1)[train_idx])
            train_loss.backward()

            opt.step()
            
            model.eval()
            with torch.no_grad():
                logits = model(feats)
                
                y_pred = logits.argmax(dim=-1, keepdim=True)

                train_acc = evaluate(y_pred, labels, train_idx, evaluator)
                valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)

                print(f'Epoch {i} | Train loss: {train_loss.item():.4f} | Train acc: {train_acc:.4f} | Valid acc {valid_acc:.4f}')

                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model = copy.deepcopy(model)
        
        # testing & saving model
        print('---------- Testing ----------')
        best_model.eval()
        
        logits = best_model(feats)
        
        y_pred = logits.argmax(dim=-1, keepdim=True)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Test acc: {test_acc:.4f}')

        if not os.path.exists('base'):
            os.makedirs('base')

        torch.save(best_model.state_dict(), f'base/{args.dataset}-{args.model}.pt')
Example #30
def main():
    global device, in_feats, n_classes, epsilon

    argparser = argparse.ArgumentParser("GAT on OGBN-Arxiv", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    argparser.add_argument("--cpu", action="store_true", help="CPU mode. This option overrides --gpu.")
    argparser.add_argument("--gpu", type=int, default=0, help="GPU device ID.")
    argparser.add_argument("--n-runs", type=int, default=10)
    argparser.add_argument("--n-epochs", type=int, default=2000)
    argparser.add_argument(
        "--use-labels", action="store_true", help="Use labels in the training set as input features."
    )
    argparser.add_argument("--use-norm", action="store_true", help="Use symmetrically normalized adjacency matrix.")
    argparser.add_argument("--lr", type=float, default=0.002)
    argparser.add_argument("--n-layers", type=int, default=3)
    argparser.add_argument("--n-heads", type=int, default=3)
    argparser.add_argument("--n-hidden", type=int, default=256)
    argparser.add_argument("--dropout", type=float, default=0.75)
    argparser.add_argument("--attn_drop", type=float, default=0.05)
    argparser.add_argument("--wd", type=float, default=0)
    argparser.add_argument("--log-every", type=int, default=20)
    argparser.add_argument("--plot-curves", action="store_true")
    argparser.add_argument("--competition", action="store_true")
    args = argparser.parse_args()

    if args.cpu:
        device = th.device("cpu")
    else:
        device = th.device("cuda:%d" % args.gpu)

    # load data
    if not args.competition:
        data = DglNodePropPredDataset(name="ogbn-arxiv")
        evaluator = Evaluator(name="ogbn-arxiv")

        splitted_idx = data.get_idx_split()
        train_idx, val_idx, test_idx = splitted_idx["train"], splitted_idx["valid"], splitted_idx["test"]
        graph, labels = data[0]
    else:
        evaluator = Evaluator(name="ogbn-arxiv")

        edges = pd.read_csv("dataset/ogbn_arxiv/pgl/edges.csv", header=None, names=["src", "dst"]).values
        graph = dgl.graph((edges[:, 0], edges[:, 1]))

        node_feat = np.load("dataset/ogbn_arxiv/pgl/feat.npy")
        graph.ndata['feat'] = th.from_numpy(node_feat)

        df = pd.read_csv("dataset/ogbn_arxiv/pgl/train.csv")
        node_index = df["nid"].values
        labels = np.zeros(node_feat.shape[0], dtype=int)
        for k, v in enumerate(df["nid"]):
            labels[v] = df["label"][k]

        labels = th.from_numpy(labels).reshape((len(labels), 1))

        train_part = int(len(node_index) * 0.8)
        #train_idx = th.from_numpy(node_index[:train_part])
        train_idx = th.from_numpy(node_index)
        val_idx = th.from_numpy(node_index[train_part:])
        
        test_idx = val_idx
        # test_idx = th.from_numpy(pd.read_csv("dataset/ogbn_arxiv/pgl/test.csv")["nid"].values)

    # add reverse edges
    srcs, dsts = graph.all_edges()
    graph.add_edges(dsts, srcs)

    # add self-loop
    print(f"Total edges before adding self-loop {graph.number_of_edges()}")
    graph = graph.remove_self_loop().add_self_loop()
    print(f"Total edges after adding self-loop {graph.number_of_edges()}")

    in_feats = graph.ndata["feat"].shape[1]
    n_classes = (labels.max() + 1).item()
    # graph.create_format_()

    train_idx = train_idx.to(device)
    val_idx = val_idx.to(device)
    test_idx = test_idx.to(device)
    labels = labels.to(device)
    graph = graph.to(device)

    # run
    val_accs = []
    test_accs = []
    model_dir = f'../models/arxiv_gat'
       
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.makedirs(model_dir)
    with open(f'{model_dir}/metadata', 'w') as f:
        f.write(f'# of params: {sum(p.numel() for p in gen_model(args).parameters())}\n')

    for i in range(1, args.n_runs + 1):
        val_acc, test_acc, out = run(args, graph, labels, train_idx, val_idx, test_idx, evaluator, i)
        val_accs.append(val_acc)
        test_accs.append(test_acc)
        th.save(F.softmax(out, dim=1), f'{model_dir}/{i-1}.pt')

    print(f"Runned {args.n_runs} times")
    print("Val Accs:", val_accs)
    print("Test Accs:", test_accs)
    print(f"Average val accuracy: {np.mean(val_accs)} ± {np.std(val_accs)}")
    print(f"Average test accuracy: {np.mean(test_accs)} ± {np.std(test_accs)}")
    print(f"Number of params: {count_parameters(args)}")