Example #1
    def __init__(self, root, name):
        super(OGBNDataset, self).__init__(root)
        dataset = NodePropPredDataset(name, root)
        graph, y = dataset[0]
        x = torch.tensor(graph["node_feat"])
        y = torch.tensor(y.squeeze())
        # coalesce (sort and deduplicate) the edges, then drop self-loops
        row, col, edge_attr = coalesce(graph["edge_index"][0], graph["edge_index"][1], graph["edge_feat"])
        edge_index = torch.stack([row, col], dim=0)
        edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
        # make the graph undirected by adding reverse edges (and duplicating their features)
        row = torch.cat([edge_index[0], edge_index[1]])
        col = torch.cat([edge_index[1], edge_index[0]])
        edge_index = torch.stack([row, col], dim=0)
        if edge_attr is not None:
            edge_attr = torch.cat([edge_attr, edge_attr], dim=0)

        self.data = Graph(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
        self.data.num_nodes = graph["num_nodes"]
        assert self.data.num_nodes == self.data.x.shape[0]

        # build boolean train/val/test masks from the official OGB split
        split_index = dataset.get_idx_split()
        self.data.train_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.test_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.val_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.train_mask[split_index["train"]] = True
        self.data.test_mask[split_index["test"]] = True
        self.data.val_mask[split_index["valid"]] = True

        self.transform = None
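A hedged usage sketch for the wrapper above ("ogbn-arxiv" and the root path are illustrative placeholders; construction details depend on the dataset base class this __init__ belongs to):

dataset = OGBNDataset(root="data", name="ogbn-arxiv")
data = dataset.data
# one boolean mask per split, indexed over all nodes
print(data.num_nodes, int(data.train_mask.sum()), int(data.val_mask.sum()), int(data.test_mask.sum()))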
Example #2
    def __init__(self, root, name):
        self.name = name
        from ogb.nodeproppred import NodePropPredDataset
        dataset = NodePropPredDataset(name=name, root=root)
        split_idx = dataset.get_idx_split()
        data = dataset[0]
        num_nodes = data[1].shape[0]
        edge = data[0]["edge_index"]
        if name == "ogbn-arxiv":
            # convert ogbn-arxiv to an undirected graph by appending reverse edges
            edge = np.concatenate([edge, edge[[1, 0]]], axis=1)
        self.graph = _C.Graph(
            edge_index=edge,
            num_nodes=num_nodes
        )
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

        self.x = data[0]["node_feat"]
        self.y = data[1].squeeze()
        # integer split mask: 1 = train, 2 = test (valid_idx is computed above but not stored)
        self.train_mask = np.zeros(num_nodes, np.int32)
        self.train_mask[train_idx] = 1
        self.train_mask[test_idx] = 2
        self.num_classes = dataset.num_classes
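A hedged follow-up sketch for recovering per-split node indices from the integer mask built above (`ds` is a placeholder name for an instance of the class this __init__ belongs to; numpy is assumed imported as np, as in the snippet):

train_nodes = np.where(ds.train_mask == 1)[0]  # nodes encoded as 1 (train)
test_nodes = np.where(ds.train_mask == 2)[0]   # nodes encoded as 2 (test)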
Example #3
 def ogbn_dataset_to_general_static_graph(
     cls,
     ogbn_dataset: NodePropPredDataset,
     nodes_label_key: str,
     nodes_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...,
     edges_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...,
     graph_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...
 ) -> GeneralStaticGraph:
     split_idx = ogbn_dataset.get_idx_split()
     return cls.ogbn_data_to_general_static_graph(
         ogbn_dataset[0][0], ogbn_dataset[0][1], nodes_label_key,
         split_idx["train"], split_idx["valid"], split_idx["test"],
         nodes_data_key_mapping, edges_data_key_mapping,
         graph_data_key_mapping)
Example #4
    # log_out.write(args)
    print(args, file=log_out, flush=True)

    epochs = args.epoch
    node_dim = args.node_dim
    num_channels = args.num_channels
    lr = args.lr
    weight_decay = args.weight_decay
    num_layers = args.num_layers
    norm = args.norm
    adaptive_lr = args.adaptive_lr

    if args.ogb_mag:
        print("Using OGB MAG", flush=True)
        dataset = NodePropPredDataset(name="ogbn-mag")
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        graph, label = dataset[0]  # graph: library-agnostic graph object

        # heterogeneous edge lists, keyed by (source type, relation, target type)
        AvsI = graph['edge_index_dict'][('author', 'affiliated_with',
                                         'institution')]
        AvsP = graph['edge_index_dict'][('author', 'writes', 'paper')]
        PvsP = graph['edge_index_dict'][('paper', 'cites', 'paper')]
        PvsS = graph['edge_index_dict'][('paper', 'has_topic',
                                         'field_of_study')]

        # empty_lists = [ [] for _ in range(len(AvsI[0])) ]
        # AvsIdict = dict(zip(AvsI[0],empty_lists))
        # institution id -> list of author ids (filled in later; see the sketch below)
        empty_lists = [[] for _ in range(len(AvsI[1]))]
        IvsAdict = dict(zip(AvsI[1], empty_lists))
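A hedged continuation sketch (not part of the original snippet): populate the institution -> authors dict from the (author, institution) pairs in AvsI.

for author, institution in zip(AvsI[0], AvsI[1]):
    IvsAdict[institution].append(author)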
Example #5
def get_graph_data(d_name="ogbn-proteins", mini_data=False):
    """
        Param:
            d_name: name of dataset
            mini_data: if mini_data==True, only use a small dataset (for test)
    """
    # import ogb data
    dataset = NodePropPredDataset(name=d_name)
    num_tasks = dataset.num_tasks  # obtaining the number of prediction tasks in a dataset

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
        "valid"], split_idx["test"]
    graph, label = dataset[0]

    # reshape
    graph["edge_index"] = graph["edge_index"].T

    # mini dataset
    if mini_data:
        graph['num_nodes'] = 500
        mask = (graph['edge_index'][:, 0] < 500) * (graph['edge_index'][:, 1] <
                                                    500)
        graph["edge_index"] = graph["edge_index"][mask]
        graph["edge_feat"] = graph["edge_feat"][mask]
        label = label[:500]
        train_idx = np.arange(0, 400)
        valid_idx = np.arange(400, 450)
        test_idx = np.arange(450, 500)

    # read/compute node feature
    if mini_data:
        node_feat_path = './dataset/ogbn_proteins_node_feat_small.npy'
    else:
        node_feat_path = './dataset/ogbn_proteins_node_feat.npy'

    new_node_feat = None
    if os.path.exists(node_feat_path):
        print("Begin: read node feature".center(50, '='))
        new_node_feat = np.load(node_feat_path)
        print("End: read node feature".center(50, '='))
    else:
        print("Begin: compute node feature".center(50, '='))
        start = time.perf_counter()
        for i in range(graph['num_nodes']):
            if i % 100 == 0:
                dur = time.perf_counter() - start
                print("{}/{}({}%), times: {:.2f}s".format(
                    i, graph['num_nodes'], i / graph['num_nodes'] * 100, dur))
            mask = (graph['edge_index'][:, 0] == i)

            current_node_feat = np.mean(np.compress(mask,
                                                    graph['edge_feat'],
                                                    axis=0),
                                        axis=0,
                                        keepdims=True)
            if i == 0:
                new_node_feat = [current_node_feat]
            else:
                new_node_feat.append(current_node_feat)

        new_node_feat = np.concatenate(new_node_feat, axis=0)
        print("End: compute node feature".center(50, '='))

        print("Saving node feature in " + node_feat_path.center(50, '='))
        np.save(node_feat_path, new_node_feat)
        print("Saving finish".center(50, '='))

    print(new_node_feat)

    # create graph
    g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                        edges=graph["edge_index"],
                        node_feat={'node_feat': new_node_feat},
                        edge_feat=None)
    print("Create graph")
    print(g)
    return g, label, train_idx, valid_idx, test_idx, Evaluator(d_name)
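The per-node loop above scans every edge once per node, which is O(num_nodes * num_edges). A hedged vectorized alternative (a sketch, not part of the original code) scatter-adds edge features onto their source nodes and divides by each node's out-degree:

def mean_edge_feat_per_node(edge_index, edge_feat, num_nodes):
    # edge_index: (num_edges, 2) after the transpose above; edge_feat: (num_edges, d)
    feat_sum = np.zeros((num_nodes, edge_feat.shape[1]), dtype=np.float64)
    np.add.at(feat_sum, edge_index[:, 0], edge_feat)  # unbuffered scatter-add per source node
    degree = np.bincount(edge_index[:, 0], minlength=num_nodes)
    # isolated nodes get a zero vector instead of NaN
    return feat_sum / np.clip(degree, 1, None)[:, None]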
Example #6
dataset = OGB(ogb_dataset, transforms=[GCNFilter(), AdjToSpTensor()])
graph = dataset[0]
x, adj, y = graph.x, graph.a, graph.y

# Parameters
channels = 256  # Number of channels for GCN layers
dropout = 0.5  # Dropout rate for the features
learning_rate = 1e-2  # Learning rate
epochs = 200  # Number of training epochs

N = dataset.n_nodes  # Number of nodes in the graph
F = dataset.n_node_features  # Original size of node features
n_out = ogb_dataset.num_classes  # OGB labels are sparse indices

# Data splits
idx = ogb_dataset.get_idx_split()
idx_tr, idx_va, idx_te = idx["train"], idx["valid"], idx["test"]
mask_tr = np.zeros(N, dtype=bool)
mask_va = np.zeros(N, dtype=bool)
mask_te = np.zeros(N, dtype=bool)
mask_tr[idx_tr] = True
mask_va[idx_va] = True
mask_te[idx_te] = True
masks = [mask_tr, mask_va, mask_te]

# Model definition
x_in = Input(shape=(F, ))
a_in = Input((N, ), sparse=True)
x_1 = GCNConv(channels, activation="relu")([x_in, a_in])
x_1 = BatchNormalization()(x_1)
x_1 = Dropout(dropout)(x_1)
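A hedged continuation sketch (not from the original snippet): add the output GCN layer and compile the model. Model and Adam are the standard tf.keras classes and are assumed to be imported alongside the Spektral layers used above.

x_2 = GCNConv(n_out, activation="softmax")([x_1, a_in])
model = Model(inputs=[x_in, a_in], outputs=x_2)
model.compile(
    optimizer=Adam(learning_rate),
    loss="sparse_categorical_crossentropy",  # OGB labels are sparse indices
    weighted_metrics=["acc"],
)
model.summary()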
Example #7
        'y_true': y[te_mask],
        'y_pred': p[te_mask]
    })['rocauc']
    return tr_auc, va_auc, te_auc


# Load data
dataset_name = 'ogbn-proteins'
dataset = NodePropPredDataset(dataset_name)
evaluator = Evaluator(dataset_name)
graph, y = dataset[0]
X, A, _ = ogb.graph_to_numpy(graph)
N = A.shape[0]

# Data splits
idxs = dataset.get_idx_split()
tr_idx, va_idx, te_idx = idxs["train"], idxs["valid"], idxs["test"]
tr_mask = np.zeros(N, dtype=bool)
tr_mask[tr_idx] = True
va_mask = np.zeros(N, dtype=bool)
va_mask[va_idx] = True
te_mask = np.zeros(N, dtype=bool)
te_mask[te_idx] = True
masks = [tr_mask, va_mask, te_mask]

# Parameters
channels = 256
learning_rate = 1e-2
epochs = 200
es_patience = 200
F = X.shape[1]
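A hedged sketch of how the masks and evaluator above combine, mirroring the fragment at the top of this example (y holds the ground-truth labels and p the model predictions; both names are assumptions):

def masked_auc(y, p):
    scores = []
    for mask in masks:  # [tr_mask, va_mask, te_mask]
        scores.append(evaluator.eval({'y_true': y[mask], 'y_pred': p[mask]})['rocauc'])
    return scores  # train, validation and test ROC-AUC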
Example #8
    def process(self):
        dataset = NodePropPredDataset(name=self.name, root="./data")
        node_type_dict = {"paper": 0, "author": 1, "field_of_study": 2, "institution": 3}
        edge_type_dict = {
            ("paper", "cites", "paper"): 0,
            ("author", "affiliated_with", "institution"): 1,
            ("author", "writes", "paper"): 2,
            ("paper", "has_topic", "field_of_study"): 3,
        }
        num_nodes_dict = dataset[0][0]["num_nodes_dict"]
        # per-type node counts, prefixed with 0 so the cumulative sum gives each type's offset
        num_nodes = torch.as_tensor(
            [0]
            + [
                num_nodes_dict["paper"],
                num_nodes_dict["author"],
                num_nodes_dict["field_of_study"],
                num_nodes_dict["institution"],
            ]
        )
        cum_num_nodes = torch.cumsum(num_nodes, dim=-1)
        node_types = torch.repeat_interleave(torch.arange(0, 4), num_nodes[1:])

        edge_index_dict = dataset[0][0]["edge_index_dict"]

        edge_index = [None] * len(edge_type_dict)
        edge_attr = [None] * len(edge_type_dict)

        for i, (k, v) in enumerate(edge_index_dict.items()):
            head, edge_type, tail = k
            # shift local node indices by the offset of their node type
            head_offset = cum_num_nodes[node_type_dict[head]].item()
            tail_offset = cum_num_nodes[node_type_dict[tail]].item()
            src = v[0] + head_offset
            tgt = v[1] + tail_offset
            edge_tps = np.full(src.shape, edge_type_dict[k])

            if edge_type == "cites":
                # "cites" edges are symmetrized and keep their original edge type
                _edges = torch.as_tensor([src, tgt])
                _src, _tgt = to_undirected(_edges).numpy()
                edge_tps = np.full(_src.shape, edge_type_dict[k])
                edge_idx = np.vstack([_src, _tgt])
            else:
                # other relations get explicit reverse edges with a new reverse edge type
                _src = np.concatenate([src, tgt])
                _tgt = np.concatenate([tgt, src])
                re_tps = np.full(src.shape, len(edge_type_dict))

                re_k = (tail, "to", head)
                edge_type_dict[re_k] = len(edge_type_dict)
                edge_tps = np.concatenate([edge_tps, re_tps])
                edge_idx = np.vstack([_src, _tgt])

            edge_index[i] = edge_idx
            edge_attr[i] = edge_tps
            assert edge_index[i].shape[1] == edge_attr[i].shape[0]
        edge_index = np.concatenate(edge_index, axis=-1)
        edge_index = torch.from_numpy(edge_index)
        edge_attr = torch.from_numpy(np.concatenate(edge_attr))

        assert edge_index.shape[1] == edge_attr.shape[0]

        split_index = dataset.get_idx_split()
        train_index = torch.from_numpy(split_index["train"]["paper"])
        val_index = torch.from_numpy(split_index["valid"]["paper"])
        test_index = torch.from_numpy(split_index["test"]["paper"])
        y = torch.as_tensor(dataset[0][1]["paper"]).view(-1)

        paper_feat = dataset[0][0]["node_feat_dict"]["paper"]
        data = Graph(
            y=y,
            edge_index=edge_index,
            edge_types=edge_attr,
            train_mask=train_index,
            val_mask=val_index,
            test_mask=test_index,
            node_types=node_types,
        )
        # self.save_edges(data)
        torch.save((data, node_type_dict, edge_type_dict, num_nodes_dict), self.processed_paths[0])
        np.save(self.processed_paths[1], paper_feat)
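An illustration of the offset scheme used above (a sketch, not part of the original code): a node of type t with local index j maps to index cum_num_nodes[node_type_dict[t]] + j in the merged homogeneous graph.

def to_global_index(local_index, node_type):
    # local heterogeneous node id -> id in the merged homogeneous graph
    return cum_num_nodes[node_type_dict[node_type]].item() + local_index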
Example #9
def load_data(data_dir, dataset_str, knn_size=None, epsilon=None, knn_metric='cosine', prob_del_edge=None, prob_add_edge=None, seed=1234, sparse_init_adj=False):
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name ('cora', 'citeseer', 'pubmed', or an OGB node dataset such as 'ogbn-arxiv')
    :return: All data input files loaded (as well the training/test data).
    """
    assert (knn_size is None) or (epsilon is None)

    if dataset_str.startswith('ogbn'): # Open Graph Benchmark datasets
        from ogb.nodeproppred import NodePropPredDataset

        dataset = NodePropPredDataset(name=dataset_str)

        split_idx = dataset.get_idx_split()
        idx_train, idx_val, idx_test = torch.LongTensor(split_idx["train"]), torch.LongTensor(split_idx["valid"]), torch.LongTensor(split_idx["test"])

        data = dataset[0] # This dataset has only one graph
        features = torch.Tensor(data[0]['node_feat'])
        labels = torch.LongTensor(data[1]).squeeze(-1)

        edge_index = data[0]['edge_index']
        adj = to_undirected(edge_index, num_nodes=data[0]['num_nodes'])
        assert adj.diagonal().sum() == 0 and adj.max() <= 1 and (adj != adj.transpose()).sum() == 0


    else: # datasets: Cora, Citeseer, PubMed

        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open(os.path.join(data_dir, 'ind.{}.{}'.format(dataset_str, names[i])), 'rb') as f:
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))

        x, y, tx, ty, allx, ally, graph = tuple(objects)
        test_idx_reorder = parse_index_file(os.path.join(data_dir, 'ind.{}.test.index'.format(dataset_str)))
        test_idx_range = np.sort(test_idx_reorder)

        if dataset_str == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
            tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
            tx_extended[test_idx_range-min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
            ty_extended[test_idx_range-min(test_idx_range), :] = ty
            ty = ty_extended

        raw_features = sp.vstack((allx, tx)).tolil()
        raw_features[test_idx_reorder, :] = raw_features[test_idx_range, :]
        features = normalize_features(raw_features)
        raw_features = torch.Tensor(raw_features.todense())
        features = torch.Tensor(features.todense())

        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))


        labels = np.vstack((ally, ty))
        labels[test_idx_reorder, :] = labels[test_idx_range, :]
        # labels = torch.LongTensor(np.where(labels)[1])
        labels = torch.LongTensor(np.argmax(labels, axis=1))

        idx_train = torch.LongTensor(range(len(y)))
        idx_val = torch.LongTensor(range(len(y), len(y) + 500))
        idx_test = torch.LongTensor(test_idx_range.tolist())



    if knn_size is not None:
        print('[ Using KNN-graph as input graph: {} ]'.format(knn_size))
        adj = kneighbors_graph(features, knn_size, metric=knn_metric, include_self=True)
        adj_norm = normalize_sparse_adj(adj)
        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())

    elif epsilon is not None:
        print('[ Using Epsilon-graph as input graph: {} ]'.format(epsilon))
        feature_norm = features.div(torch.norm(features, p=2, dim=-1, keepdim=True))
        attention = torch.mm(feature_norm, feature_norm.transpose(-1, -2))
        mask = (attention > epsilon).float()
        adj = attention * mask
        adj = (adj > 0).float()
        adj = sp.csr_matrix(adj)
        adj_norm = normalize_sparse_adj(adj)
        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())

    else:
        print('[ Using ground-truth input graph ]')

        if prob_del_edge is not None:
            adj = graph_delete_connections(prob_del_edge, seed, adj.toarray(), enforce_connected=False)
            adj = adj + np.eye(adj.shape[0])
            adj_norm = normalize_adj(torch.Tensor(adj))
            adj_norm = sp.csr_matrix(adj_norm)


        elif prob_add_edge is not None:
            adj = graph_add_connections(prob_add_edge, seed, adj.toarray(), enforce_connected=False)
            adj = adj + np.eye(adj.shape[0])
            adj_norm = normalize_adj(torch.Tensor(adj))
            adj_norm = sp.csr_matrix(adj_norm)

        else:
            adj = adj + sp.eye(adj.shape[0])
            adj_norm = normalize_sparse_adj(adj)


        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())

    return adj_norm, features, labels, idx_train, idx_val, idx_test
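A hedged usage sketch for load_data (the data directory and dataset name are illustrative placeholders):

adj_norm, features, labels, idx_train, idx_val, idx_test = load_data(
    data_dir='data/', dataset_str='cora', knn_size=None, epsilon=None)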
Example #10
def get_graph_data(d_name="ogbn-proteins", mini_data=False):
    """
        Param:
            d_name: name of dataset
            mini_data: if mini_data==True, only use a small dataset (for test)
    """
    # import the ogb dataset
    dataset = NodePropPredDataset(name=d_name)
    num_tasks = dataset.num_tasks  # obtaining the number of prediction tasks in a dataset

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
        "valid"], split_idx["test"]
    graph, label = dataset[0]

    # transpose edge_index to the (num_edges, 2) layout required by PGL's Graph
    graph["edge_index"] = graph["edge_index"].T

    # use a small subset of the data: the first 500 nodes
    if mini_data:
        graph['num_nodes'] = 500
        mask = (graph['edge_index'][:, 0] < 500) * (graph['edge_index'][:, 1] <
                                                    500)
        graph["edge_index"] = graph["edge_index"][mask]
        graph["edge_feat"] = graph["edge_feat"][mask]
        label = label[:500]
        train_idx = np.arange(0, 400)
        valid_idx = np.arange(400, 450)
        test_idx = np.arange(450, 500)

    # print dataset information
    print(graph.keys())
    print("number of nodes ", graph["num_nodes"])
    print("smallest node index ", graph['edge_index'].min())
    print("number of edges ", graph["edge_index"].shape[0])
    print("edge_index shape ", graph["edge_index"].shape)
    print("edge_feat shape ", graph["edge_feat"].shape)
    print("node features ", graph["node_feat"])
    print("species shape ", graph['species'].shape)
    print("label shape ", label.shape)

    # read / compute node features
    # decide which cached file to load
    if mini_data:
        node_feat_path = './dataset/ogbn_proteins_node_feat_small.npy'
    else:
        node_feat_path = './dataset/ogbn_proteins_node_feat.npy'

    new_node_feat = None
    if os.path.exists(node_feat_path):
        # if the cached file exists, load it directly
        print("Begin: read node feature".center(50, '='))
        new_node_feat = np.load(node_feat_path)
        print("End: read node feature".center(50, '='))
    else:
        # otherwise compute it: the feature of node i is the mean of its incident edge features
        print("Begin: compute node feature".center(50, '='))
        start = time.perf_counter()
        for i in range(graph['num_nodes']):
            if i % 100 == 0:
                dur = time.perf_counter() - start
                print("{}/{}({}%), times: {:.2f}s".format(
                    i, graph['num_nodes'], i / graph['num_nodes'] * 100, dur))
            mask = (graph['edge_index'][:, 0] == i)  # 选择 i 的所有邻边
            # 计算均值
            current_node_feat = np.mean(np.compress(mask,
                                                    graph['edge_feat'],
                                                    axis=0),
                                        axis=0,
                                        keepdims=True)
            if i == 0:
                new_node_feat = [current_node_feat]
            else:
                new_node_feat.append(current_node_feat)

        new_node_feat = np.concatenate(new_node_feat, axis=0)
        print("计算 node feature 结束".center(50, '='))

        print("存储 node feature 中,在" + node_feat_path.center(50, '='))
        np.save(node_feat_path, new_node_feat)
        print("存储 node feature 结束".center(50, '='))

    print(new_node_feat)

    # build the PGL Graph object
    g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                        edges=graph["edge_index"],
                        node_feat={'node_feat': new_node_feat},
                        edge_feat=None)
    print("创建 Graph 对象成功")
    print(g)
    return g, label, train_idx, valid_idx, test_idx, Evaluator(d_name)
Example #11
def ogbn_generate_split(job: signac.Project.Job, splitJob: signac.Project.Job,
                        feature_graph_name, feature_graph_files):
    import constraint
    with utils.chdir(splitJob.sp.ogbn_path):
        from ogb.nodeproppred import NodePropPredDataset
        d_name = splitJob.sp.ogbn_name

        lock = ogbnLockDict.setdefault(splitJob.sp.ogbn_path, threading.Lock())
        if not os.path.exists("dataset"):  # In case dataset is not downloaded
            lock.acquire()
            ogbnDataset = NodePropPredDataset(name=d_name)
            lock.release()
        else:
            ogbnDataset = NodePropPredDataset(name=d_name)

        split_idx = ogbnDataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        graph, label = ogbnDataset[0]

    with job:
        splitJobSrc = utils.signac_tools.access_proj_job(
            job, splitJob.sp.feature_source, splitJob.sp.split_source)
        splitSrcName = splitJobSrc.doc["split_name"]
        # Copy not changing files
        for source_file, dest_file in [
            (splitJobSrc.fn(f"{splitSrcName}.{ext}"),
             splitJob.fn(f"{feature_graph_name}.{ext}"))
                for ext in ('y', 'ty', 'ally', 'graph', 'test.index')
        ]:
            shutil.copy2(source_file, dest_file)

        with splitJobSrc:
            datasetSrc = utils.PlanetoidData(splitJobSrc.doc.split_name,
                                             ".",
                                             val_size=None)

        ogbnLabelCount = np.zeros((3, ogbnDataset.num_classes))
        ogbnLabelCount[0, :] = (label[train_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[1, :] = (label[valid_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[2, :] = (label[test_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)

        srcLabelCount = np.zeros((3, job.sp.numClass))
        srcLabelCount[0, :] = datasetSrc.y_all[datasetSrc.train_mask, :].sum(0)
        srcLabelCount[1, :] = datasetSrc.y_all[datasetSrc.val_mask, :].sum(0)
        srcLabelCount[2, :] = datasetSrc.y_all[datasetSrc.test_mask, :].sum(0)

        problem = constraint.Problem()
        problem.addVariables(range(job.sp.numClass),
                             range(ogbnDataset.num_classes))
        problem.addConstraint(constraint.AllDifferentConstraint())
        for i in range(job.sp.numClass):
            # bind i as a default argument so each constraint captures its own class index
            problem.addConstraint(
                lambda x, i=i: np.all(ogbnLabelCount[:, x] >= srcLabelCount[:, i]),
                (i, ))
        solution = problem.getSolution()

        for srcClass, dstClass in solution.items():
            assert np.all(
                ogbnLabelCount[:, dstClass] >= srcLabelCount[:, srcClass])

        newFeatures = np.zeros(
            (datasetSrc.num_samples, graph["node_feat"].shape[1]))
        for scope, idx in (("train", train_idx), ("val", valid_idx),
                           ("test", test_idx)):
            scope_mask = getattr(datasetSrc, f"{scope}_mask")
            for srcClass, dstClass in solution.items():
                srcOpMask = np.logical_and(scope_mask,
                                           datasetSrc.labels == srcClass)
                dstSampleSet = list(
                    set(idx).intersection(np.where(label == dstClass)[0]))
                sampleInds = random_state.choice(dstSampleSet,
                                                 srcOpMask.sum(),
                                                 replace=False)
                newFeatures[srcOpMask, :] = graph["node_feat"][sampleInds, :]

        x_mask = datasetSrc.train_mask
        allx_mask = (datasetSrc.train_mask + datasetSrc.val_mask)
        test_mask = datasetSrc.test_mask

        x = newFeatures[x_mask]
        allx = newFeatures[allx_mask]
        tx = newFeatures[test_mask]

        # .x; .tx; .allx
        with open(splitJob.fn(f"{feature_graph_name}.x"), "wb") as f:
            pickle.dump(scipy.sparse.csr_matrix(x), f)
        with open(splitJob.fn(f"{feature_graph_name}.allx"), "wb") as f:
            pickle.dump(scipy.sparse.csr_matrix(allx), f)
        with open(splitJob.fn(f"{feature_graph_name}.tx"), "wb") as f:
            pickle.dump(scipy.sparse.csr_matrix(tx), f)

        assert all(map(splitJob.isfile, feature_graph_files))
        splitJob.doc["succeeded"] = True
        splitJob.doc["split_name"] = feature_graph_name
        splitJob.doc.val_size = splitJobSrc.doc.val_size