Example #1
    def __init__(self, root, name):
        super(OGBNDataset, self).__init__(root)
        dataset = NodePropPredDataset(name, root)
        graph, y = dataset[0]
        x = torch.tensor(graph["node_feat"])
        y = torch.tensor(y.squeeze())
        row, col, edge_attr = coalesce(graph["edge_index"][0], graph["edge_index"][1], graph["edge_feat"])
        edge_index = torch.stack([row, col], dim=0)
        edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
        row = torch.cat([edge_index[0], edge_index[1]])
        col = torch.cat([edge_index[1], edge_index[0]])
        edge_index = torch.stack([row, col], dim=0)
        if edge_attr is not None:
            edge_attr = torch.cat([edge_attr, edge_attr], dim=0)

        self.data = Graph(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
        self.data.num_nodes = graph["num_nodes"]
        assert self.data.num_nodes == self.data.x.shape[0]

        # split
        split_index = dataset.get_idx_split()
        self.data.train_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.test_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.val_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
        self.data.train_mask[split_index["train"]] = True
        self.data.test_mask[split_index["test"]] = True
        self.data.val_mask[split_index["valid"]] = True

        self.transform = None
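Note: the torch.cat/torch.stack pair above symmetrizes the edge list (every directed edge is duplicated in the reverse direction). A minimal standalone sketch of that step on a toy edge_index, using only torch:

import torch

edge_index = torch.tensor([[0, 1, 2],
                           [1, 2, 0]])  # three directed edges
row = torch.cat([edge_index[0], edge_index[1]])
col = torch.cat([edge_index[1], edge_index[0]])
undirected = torch.stack([row, col], dim=0)
print(undirected)
# tensor([[0, 1, 2, 1, 2, 0],
#         [1, 2, 0, 0, 1, 2]])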
Example #2
 def __init__(self, path: str):
     ogbn_dataset = NodePropPredDataset("ogbn-products", path)
     if _backend.DependentBackend.is_dgl():
         super(OGBNProductsDataset, self).__init__([
             _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                 ogbn_dataset, "label", {"node_feat": "feat"},
                 {"edge_feat": "edge_feat"})
         ])
     elif _backend.DependentBackend.is_pyg():
         super(OGBNProductsDataset, self).__init__([
             _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                 ogbn_dataset, "y", {"node_feat": "x"})
         ])
Example #3
def load_arxiv_year_dataset(nclass=5):
    filename = 'arxiv-year'
    dataset = NCDataset(filename)
    ogb_dataset = NodePropPredDataset(name='ogbn-arxiv')
    dataset.graph = ogb_dataset.graph
    dataset.graph['edge_index'] = torch.as_tensor(dataset.graph['edge_index'])
    dataset.graph['node_feat'] = torch.as_tensor(dataset.graph['node_feat'])

    label = even_quantile_labels(dataset.graph['node_year'].flatten(),
                                 nclass,
                                 verbose=False)
    dataset.label = torch.as_tensor(label).reshape(-1, 1)
    return dataset
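even_quantile_labels is defined elsewhere in that repository; as a rough illustration only (a hypothetical reimplementation, not the repo's code), equal-frequency binning of node_year into nclass labels can be sketched with NumPy:

import numpy as np

def quantile_bin_labels(values, nclass):
    # hypothetical helper: assign each value a class in 0..nclass-1 by equal-frequency binning
    edges = np.quantile(values, np.linspace(0, 1, nclass + 1)[1:-1])
    return np.digitize(values, edges)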
Example #4
def get_data(feature_address,
             edges_address,
             encoding_config=None,
             directed=False):
    if feature_address == 'arxiv':
        d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')
        graph, labels = d[0]
        labels = np.ravel(labels)
        edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
        G = nx.DiGraph(edges)
        adj = nx.adjacency_matrix(G)
    else:
        features = pd.read_csv(feature_address, sep='\t', header=None)
        edges = pd.read_csv(edges_address, sep='\t', header=None)

        #adjacency matrix
        adj = get_adj(edges, directed)

        # encoding
        encoded_labels = features if encoding_config is None else encode(
            features, encoding_config)

    # put numpy arrays into tensors
    if feature_address == 'arxiv':
        # fall back to CPU when CUDA is unavailable instead of dropping into the
        # non-arxiv branch below, which expects `features`/`encoded_labels`
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        features = torch.FloatTensor(graph["node_feat"]).to(device)
        labels = torch.LongTensor(labels).to(device)

        # add identity matrix to adjacency matrix
        adj_added = coo_matrix(adj + identity(adj.shape[0]))

        values = adj_added.data
        indices = np.vstack((adj_added.row, adj_added.col))

        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = adj_added.shape

        A = torch.sparse.FloatTensor(i, v, torch.Size(shape))
    else:
        features = np.array(features.iloc[:, 1:features.shape[1] - 1])
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(np.where(encoded_labels)[1])

        #add identity matrix to adjacency matrix
        adj_added = adj + np.eye(adj.shape[0])

        A = torch.from_numpy(adj_added).float()

    return features, labels, A
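For reference, a self-contained sketch of the scipy-COO-to-torch-sparse conversion used in the arxiv branch above, on a toy 2×2 adjacency (torch.sparse_coo_tensor is the current constructor; the snippet above keeps the older torch.sparse.FloatTensor):

import numpy as np
import torch
from scipy.sparse import coo_matrix, identity

adj = coo_matrix(np.array([[0, 1], [1, 0]], dtype=np.float32))
adj_added = coo_matrix(adj + identity(adj.shape[0]))  # add self-loops

indices = torch.LongTensor(np.vstack((adj_added.row, adj_added.col)))
values = torch.FloatTensor(adj_added.data)
A = torch.sparse_coo_tensor(indices, values, torch.Size(adj_added.shape))
print(A.to_dense())  # [[1., 1.], [1., 1.]]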
Example #5
    def __init__(self, root, name):
        self.name = name
        from ogb.nodeproppred import NodePropPredDataset
        dataset = NodePropPredDataset(name=name, root=root)
        split_idx = dataset.get_idx_split()
        data = dataset[0]
        num_nodes = data[1].shape[0]
        edge = data[0]["edge_index"]
        if name == "ogbn-arxiv":
            #convert ogbn-arxiv to undirected graph
            edge = np.concatenate([edge, edge[[1, 0]]], axis=1)
        self.graph = _C.Graph(
            edge_index=edge,
            num_nodes=num_nodes
        )
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

        self.x = data[0]["node_feat"]
        self.y = data[1].squeeze()
        # train_mask doubles as a split indicator: 1 for train nodes, 2 for test nodes,
        # and validation nodes stay 0
        self.train_mask = np.zeros(num_nodes, np.int32)
        self.train_mask[train_idx] = 1
        self.train_mask[test_idx] = 2
        self.num_classes = dataset.num_classes
Example #6
 def ogbn_dataset_to_general_static_graph(
     cls,
     ogbn_dataset: NodePropPredDataset,
     nodes_label_key: str,
     nodes_data_key_mapping: _typing.Optional[_typing.Mapping[str,
                                                              str]] = ...,
     edges_data_key_mapping: _typing.Optional[_typing.Mapping[str,
                                                              str]] = ...,
     graph_data_key_mapping: _typing.Optional[_typing.Mapping[str,
                                                              str]] = ...
 ) -> GeneralStaticGraph:
     split_idx = ogbn_dataset.get_idx_split()
     return cls.ogbn_data_to_general_static_graph(
         ogbn_dataset[0][0], ogbn_dataset[0][1], nodes_label_key,
         split_idx["train"], split_idx["valid"], split_idx["test"],
         nodes_data_key_mapping, edges_data_key_mapping,
         graph_data_key_mapping)
Example #7
 def __init__(self, path: str):
     ogbn_dataset = NodePropPredDataset("ogbn-papers100M", path)
     if _backend.DependentBackend.is_dgl():
         super(OGBNPapers100MDataset, self).__init__([
             _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                 ogbn_dataset, "label", {
                     "node_feat": "feat",
                     "node_year": "year"
                 })
         ])
     elif _backend.DependentBackend.is_pyg():
         super(OGBNPapers100MDataset, self).__init__([
             _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                 ogbn_dataset, "y", {
                     "node_feat": "x",
                     "node_year": "year"
                 })
         ])
Example #8
def load_ogb_dataset(name):
    dataset = NCDataset(name)
    ogb_dataset = NodePropPredDataset(name=name)
    dataset.graph = ogb_dataset.graph
    dataset.graph['edge_index'] = torch.as_tensor(dataset.graph['edge_index'])
    dataset.graph['node_feat'] = torch.as_tensor(dataset.graph['node_feat'])

    def ogb_idx_to_tensor():
        split_idx = ogb_dataset.get_idx_split()
        tensor_split_idx = {
            key: torch.as_tensor(split_idx[key])
            for key in split_idx
        }
        return tensor_split_idx

    dataset.get_idx_split = ogb_idx_to_tensor  # ogb_dataset.get_idx_split
    dataset.label = torch.as_tensor(ogb_dataset.labels).reshape(-1, 1)
    return dataset
Example #9
def load_proteins_dataset():
    ogb_dataset = NodePropPredDataset(name='ogbn-proteins')
    dataset = NCDataset('ogbn-proteins')

    def protein_orig_split(**kwargs):
        split_idx = ogb_dataset.get_idx_split()
        return {
            'train': torch.as_tensor(split_idx['train']),
            'valid': torch.as_tensor(split_idx['valid']),
            'test': torch.as_tensor(split_idx['test'])
        }

    dataset.get_idx_split = protein_orig_split
    dataset.graph, dataset.label = ogb_dataset.graph, ogb_dataset.labels

    dataset.graph['edge_index'] = torch.as_tensor(dataset.graph['edge_index'])
    dataset.graph['edge_feat'] = torch.as_tensor(dataset.graph['edge_feat'])
    dataset.label = torch.as_tensor(dataset.label)
    return dataset
Example #10
def load_data(name):
    dir_name = "_".join(name.split("-"))
    root = os.path.join("dataset", dir_name)
    dir_gnn_bs = os.path.join(root, "gnn_bs")

    if not os.path.exists(root):
        os.mkdir(root)

    if not os.path.exists(dir_gnn_bs):
        dataset = NodePropPredDataset(name)
        print("data preprocess...")
        preprocess(name, root, dataset)

    # np.bool was removed in recent NumPy releases; use the builtin bool
    adj_full = sp.load_npz('{}/adj_full.npz'.format(dir_gnn_bs)).astype(bool)
    splitted_idx = np.load('{}/splitted_idx.npy'.format(dir_gnn_bs),
                           allow_pickle=True).item()
    feats = np.load('{}/feats.npy'.format(dir_gnn_bs))
    labels = np.load('{}/labels.npy'.format(dir_gnn_bs))

    ## ---- normalize feats ----
    #if dataset == "reddit" or dataset == "flickr" or dataset == "yelp" or dataset == "amazon" or dataset == "ppi-large":
    #    train_nodes = np.array(list(set(adj_train.nonzero()[0])))
    #    train_feats = feats[train_nodes]
    #    scaler = StandardScaler()
    #    scaler.fit(train_feats)
    #    feats = scaler.transform(feats)

    adj_full = adj_full.tolil()
    train_nodes = splitted_idx['train']
    y_train = labels[train_nodes]
    valid_nodes = splitted_idx['valid']
    y_valid = labels[valid_nodes]
    test_nodes = splitted_idx['test']
    y_test = labels[test_nodes]

    return adj_full, feats, train_nodes, y_train, \
           valid_nodes, y_valid, test_nodes, y_test
Example #11
def arxiv_ingestion():
    from ogb.nodeproppred import NodePropPredDataset

    d = NodePropPredDataset('ogbn-arxiv')

    edge_list = pd.DataFrame(d[0][0]['edge_index'].T)
    feature_list = pd.DataFrame(d[0][0]['node_feat'])  # [:3000]
    labels = d.labels

    # Used Sample []
    # sample_labels = d.labels[:3000]

    idx = np.array(list(range(feature_list.shape[0])), dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}

    edges = np.array(edge_list.values.tolist())
    edges = np.array(list(map(idx_map.get, edges.flatten()))).reshape(edges.shape)

    features = sparse.csr_matrix(d.graph['node_feat'], dtype=np.float32)
    features = torch.FloatTensor(np.array(features.todense()))

    labels = np.array(labels.flatten())

    labels_lpa = (pd.get_dummies(labels)).to_numpy()

    adj = to_adj_list(edges, labels)
    labels = torch.LongTensor(labels)

    # split by publication year: train < 2017, validation = 2018, test = 2019
    train_mask = d[0][0]['node_year'].flatten() < 2017
    val_mask = d[0][0]['node_year'].flatten() == 2018
    test_mask = d[0][0]['node_year'].flatten() == 2019

    # convert the boolean masks to index tensors (a LongTensor built from a boolean
    # array would be a 0/1 tensor, which is not usable for indexing)
    idx_train = torch.LongTensor(np.where(train_mask)[0])
    idx_val = torch.LongTensor(np.where(val_mask)[0])
    idx_test = torch.LongTensor(np.where(test_mask)[0])

    return adj, features, labels, idx_train, idx_val, idx_test, labels_lpa
Example #12
    # log_out.write(args)
    print(args, file=log_out, flush=True)

    epochs = args.epoch
    node_dim = args.node_dim
    num_channels = args.num_channels
    lr = args.lr
    weight_decay = args.weight_decay
    num_layers = args.num_layers
    norm = args.norm
    adaptive_lr = args.adaptive_lr

    if args.ogb_mag:
        print("Using OGB MAG", flush=True)
        dataset = NodePropPredDataset(name="ogbn-mag")
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        graph, label = dataset[0]  # graph: library-agnostic graph object

        AvsI = graph['edge_index_dict'][('author', 'affiliated_with',
                                         'institution')]
        AvsP = graph['edge_index_dict'][('author', 'writes', 'paper')]
        PvsP = graph['edge_index_dict'][('paper', 'cites', 'paper')]
        PvsS = graph['edge_index_dict'][('paper', 'has_topic',
                                         'field_of_study')]

        # empty_lists = [ [] for _ in range(len(AvsI[0])) ]
        # AvsIdict = dict(zip(AvsI[0],empty_lists))
        empty_lists = [[] for _ in range(len(AvsI[1]))]
Example #13
def get_graph_data(d_name="ogbn-proteins", mini_data=False):
    """
        Param:
            d_name: name of dataset
            mini_data: if mini_data==True, only use a small dataset (for test)
    """
    # import ogb data
    dataset = NodePropPredDataset(name=d_name)
    num_tasks = dataset.num_tasks  # obtaining the number of prediction tasks in a dataset

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
        "valid"], split_idx["test"]
    graph, label = dataset[0]

    # reshape
    graph["edge_index"] = graph["edge_index"].T

    # mini dataset
    if mini_data:
        graph['num_nodes'] = 500
        mask = (graph['edge_index'][:, 0] < 500) * (graph['edge_index'][:, 1] <
                                                    500)
        graph["edge_index"] = graph["edge_index"][mask]
        graph["edge_feat"] = graph["edge_feat"][mask]
        label = label[:500]
        train_idx = np.arange(0, 400)
        valid_idx = np.arange(400, 450)
        test_idx = np.arange(450, 500)

    # read/compute node feature
    if mini_data:
        node_feat_path = './dataset/ogbn_proteins_node_feat_small.npy'
    else:
        node_feat_path = './dataset/ogbn_proteins_node_feat.npy'

    new_node_feat = None
    if os.path.exists(node_feat_path):
        print("Begin: read node feature".center(50, '='))
        new_node_feat = np.load(node_feat_path)
        print("End: read node feature".center(50, '='))
    else:
        print("Begin: compute node feature".center(50, '='))
        start = time.perf_counter()
        for i in range(graph['num_nodes']):
            if i % 100 == 0:
                dur = time.perf_counter() - start
                print("{}/{}({}%), times: {:.2f}s".format(
                    i, graph['num_nodes'], i / graph['num_nodes'] * 100, dur))
            mask = (graph['edge_index'][:, 0] == i)

            current_node_feat = np.mean(np.compress(mask,
                                                    graph['edge_feat'],
                                                    axis=0),
                                        axis=0,
                                        keepdims=True)
            if i == 0:
                new_node_feat = [current_node_feat]
            else:
                new_node_feat.append(current_node_feat)

        new_node_feat = np.concatenate(new_node_feat, axis=0)
        print("End: compute node feature".center(50, '='))

        print("Saving node feature in " + node_feat_path.center(50, '='))
        np.save(node_feat_path, new_node_feat)
        print("Saving finish".center(50, '='))

    print(new_node_feat)

    # create graph
    g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                        edges=graph["edge_index"],
                        node_feat={'node_feat': new_node_feat},
                        edge_feat=None)
    print("Create graph")
    print(g)
    return g, label, train_idx, valid_idx, test_idx, Evaluator(d_name)
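The per-node loop above scans the full edge list once per node; a vectorized alternative (a sketch under the same assumptions about the graph dict, not taken from the original repository) accumulates incident-edge features with np.add.at:

import numpy as np

def mean_incident_edge_feat(edge_index, edge_feat, num_nodes):
    # edge_index: (num_edges, 2) after the transpose above; edge_feat: (num_edges, F)
    sums = np.zeros((num_nodes, edge_feat.shape[1]), dtype=np.float64)
    counts = np.zeros(num_nodes, dtype=np.int64)
    np.add.at(sums, edge_index[:, 0], edge_feat)
    np.add.at(counts, edge_index[:, 0], 1)
    counts = np.maximum(counts, 1)  # avoid division by zero for isolated nodes
    return sums / counts[:, None]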
Example #14
train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
    "valid"], split_idx["test"]

G = nx.Graph()
os.makedirs(f"{d_name}/dataset/{dataset_str}/", exist_ok=True)

pbar = tqdm.tqdm(total=edgelist.shape[1])
pbar.set_description('Adding edges to graph')
for i in range(edgelist.shape[1]):
    G.add_edge(int(edgelist[0][i]), int(edgelist[1][i]))
    pbar.update(1)

if args.save_label or args.save_feature:
    if dataset_str == "arxiv":
        graph, label = NodePropPredDataset(
            name=d_name, root=f"{d_name}/dataset")[
                0]  # graph: library-agnostic graph object
    else:
        raise NotImplementedError
    os.makedirs(f"{d_name}/dataset/{dataset_str}/", exist_ok=True)

if args.save_label:
    with open(
            "{}/dataset/{}/{}-class_map.json".format(d_name, dataset_str,
                                                     dataset_str), 'w') as f:
        f.write('{')
        for i in range(num_nodes):
            if i > 0:
                f.write(', ')
            f.write('\"' + str(i) + "\": ")
            f.write(str(label[i][0]))
Example #15
"""
import numpy as np
import tensorflow as tf
from ogb.nodeproppred import Evaluator, NodePropPredDataset
from tensorflow.keras.layers import BatchNormalization, Dropout, Input
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from spektral.datasets.ogb import OGB
from spektral.layers import GCNConv
from spektral.transforms import AdjToSpTensor, GCNFilter

# Load data
dataset_name = "ogbn-arxiv"
ogb_dataset = NodePropPredDataset(dataset_name)
dataset = OGB(ogb_dataset, transforms=[GCNFilter(), AdjToSpTensor()])
graph = dataset[0]
x, adj, y = graph.x, graph.a, graph.y

# Parameters
channels = 256  # Number of channels for GCN layers
dropout = 0.5  # Dropout rate for the features
learning_rate = 1e-2  # Learning rate
epochs = 200  # Number of training epochs

N = dataset.n_nodes  # Number of nodes in the graph
F = dataset.n_node_features  # Original size of node features
n_out = ogb_dataset.num_classes  # OGB labels are sparse indices

# Data splits
Example #16
 def __init__(self):
     d_name = "ogbn-arxiv"
     dataset = NodePropPredDataset(name=d_name)
     graph, label = dataset[0]
     self.num_nodes = graph["num_nodes"]
     self.ogb_evaluator = Evaluator(name="ogbn-arxiv")
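Most of these snippets share the same access pattern: load NodePropPredDataset, unpack the (graph_dict, labels) pair, call get_idx_split, and score with Evaluator. A minimal end-to-end sketch of that pattern (the dummy predictions exist only to make the eval call runnable):

from ogb.nodeproppred import NodePropPredDataset, Evaluator

dataset = NodePropPredDataset(name="ogbn-arxiv")
graph, labels = dataset[0]           # graph: dict with "edge_index", "node_feat", "num_nodes"; labels: (N, 1)
split_idx = dataset.get_idx_split()  # {"train": ..., "valid": ..., "test": ...}

evaluator = Evaluator(name="ogbn-arxiv")
result = evaluator.eval({
    "y_true": labels[split_idx["test"]],
    "y_pred": labels[split_idx["test"]],  # dummy predictions (ground truth), so accuracy is 1.0
})
print(result)  # {"acc": 1.0}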
Example #17
def test_datasetsaver():
    # test on graph classification
    # ogbg-molhiv

    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()

    print(meta_dict)

    print('Now testing.')

    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())

    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
Example #18
    def process(self):
        dataset = NodePropPredDataset(name=self.name, root="./data")
        node_type_dict = {"paper": 0, "author": 1, "field_of_study": 2, "institution": 3}
        edge_type_dict = {
            ("paper", "cites", "paper"): 0,
            ("author", "affiliated_with", "institution"): 1,
            ("author", "writes", "paper"): 2,
            ("paper", "has_topic", "field_of_study"): 3,
        }
        num_nodes_dict = dataset[0][0]["num_nodes_dict"]
        num_nodes = torch.as_tensor(
            [0]
            + [
                num_nodes_dict["paper"],
                num_nodes_dict["author"],
                num_nodes_dict["field_of_study"],
                num_nodes_dict["institution"],
            ]
        )
        cum_num_nodes = torch.cumsum(num_nodes, dim=-1)
        node_types = torch.repeat_interleave(torch.arange(0, 4), num_nodes[1:])

        edge_index_dict = dataset[0][0]["edge_index_dict"]

        edge_index = [None] * len(edge_type_dict)
        edge_attr = [None] * len(edge_type_dict)

        i = 0
        for k, v in edge_index_dict.items():
            head, edge_type, tail = k
            head_offset = cum_num_nodes[node_type_dict[head]].item()
            tail_offset = cum_num_nodes[node_type_dict[tail]].item()
            src = v[0] + head_offset
            tgt = v[1] + tail_offset
            edge_tps = np.full(src.shape, edge_type_dict[k])

            if edge_type == "cites":
                _edges = torch.as_tensor([src, tgt])
                _src, _tgt = to_undirected(_edges).numpy()
                edge_tps = np.full(_src.shape, edge_type_dict[k])
                edge_idx = np.vstack([_src, _tgt])
            else:
                _src = np.concatenate([src, tgt])
                _tgt = np.concatenate([tgt, src])
                re_tps = np.full(src.shape, len(edge_type_dict))

                re_k = (tail, "to", head)
                edge_type_dict[re_k] = len(edge_type_dict)
                edge_tps = np.concatenate([edge_tps, re_tps])
                edge_idx = np.vstack([_src, _tgt])

            edge_index[i] = edge_idx
            edge_attr[i] = edge_tps
            assert edge_index[i].shape[1] == edge_attr[i].shape[0]
            i += 1
        edge_index = np.concatenate(edge_index, axis=-1)
        edge_index = torch.from_numpy(edge_index)
        edge_attr = torch.from_numpy(np.concatenate(edge_attr))

        assert edge_index.shape[1] == edge_attr.shape[0]

        split_index = dataset.get_idx_split()
        train_index = torch.from_numpy(split_index["train"]["paper"])
        val_index = torch.from_numpy(split_index["valid"]["paper"])
        test_index = torch.from_numpy(split_index["test"]["paper"])
        y = torch.as_tensor(dataset[0][1]["paper"]).view(-1)

        paper_feat = dataset[0][0]["node_feat_dict"]["paper"]
        data = Graph(
            y=y,
            edge_index=edge_index,
            edge_types=edge_attr,
            train_mask=train_index,
            val_mask=val_index,
            test_mask=test_index,
            node_types=node_types,
        )
        # self.save_edges(data)
        torch.save((data, node_type_dict, edge_type_dict, num_nodes_dict), self.processed_paths[0])
        np.save(self.processed_paths[1], paper_feat)
Example #19
def ogbn_generate_split(job: signac.Project.Job, splitJob: signac.Project.Job,
                        feature_graph_name, feature_graph_files):
    import constraint
    with utils.chdir(splitJob.sp.ogbn_path):
        from ogb.nodeproppred import NodePropPredDataset
        d_name = splitJob.sp.ogbn_name

        lock = ogbnLockDict.setdefault(splitJob.sp.ogbn_path, threading.Lock())
        if not os.path.exists("dataset"):  # In case dataset is not downloaded
            lock.acquire()
            ogbnDataset = NodePropPredDataset(name=d_name)
            lock.release()
        else:
            ogbnDataset = NodePropPredDataset(name=d_name)

        split_idx = ogbnDataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        graph, label = ogbnDataset[0]

    with job:
        splitJobSrc = utils.signac_tools.access_proj_job(
            job, splitJob.sp.feature_source, splitJob.sp.split_source)
        splitSrcName = splitJobSrc.doc["split_name"]
        # Copy not changing files
        for source_file, dest_file in [
            (splitJobSrc.fn(f"{splitSrcName}.{ext}"),
             splitJob.fn(f"{feature_graph_name}.{ext}"))
                for ext in ('y', 'ty', 'ally', 'graph', 'test.index')
        ]:
            shutil.copy2(source_file, dest_file)

        with splitJobSrc:
            datasetSrc = utils.PlanetoidData(splitJobSrc.doc.split_name,
                                             ".",
                                             val_size=None)

        ogbnLabelCount = np.zeros((3, ogbnDataset.num_classes))
        ogbnLabelCount[0, :] = (label[train_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[1, :] = (label[valid_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[2, :] = (label[test_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)

        srcLabelCount = np.zeros((3, job.sp.numClass))
        srcLabelCount[0, :] = datasetSrc.y_all[datasetSrc.train_mask, :].sum(0)
        srcLabelCount[1, :] = datasetSrc.y_all[datasetSrc.val_mask, :].sum(0)
        srcLabelCount[2, :] = datasetSrc.y_all[datasetSrc.test_mask, :].sum(0)

        problem = constraint.Problem()
        problem.addVariables(range(job.sp.numClass),
                             range(ogbnDataset.num_classes))
        problem.addConstraint(constraint.AllDifferentConstraint())
        for i in range(job.sp.numClass):
            problem.addConstraint(
                lambda x: np.all(ogbnLabelCount[:, x] >= srcLabelCount[:, i]),
                (i, ))
        solution = problem.getSolution()

        for srcClass, dstClass in solution.items():
            assert np.all(
                ogbnLabelCount[:, dstClass] >= srcLabelCount[:, srcClass])

        newFeatures = np.zeros(
            (datasetSrc.num_samples, graph["node_feat"].shape[1]))
        for scope, idx in (("train", train_idx), ("val", valid_idx),
                           ("test", test_idx)):
            scope_mask = getattr(datasetSrc, f"{scope}_mask")
            for srcClass, dstClass in solution.items():
                srcOpMask = np.logical_and(scope_mask,
                                           datasetSrc.labels == srcClass)
                dstSampleSet = list(
                    set(idx).intersection(np.where(label == dstClass)[0]))
                sampleInds = random_state.choice(dstSampleSet,
                                                 srcOpMask.sum(),
                                                 replace=False)
                newFeatures[srcOpMask, :] = graph["node_feat"][sampleInds, :]

        x_mask = datasetSrc.train_mask
        allx_mask = (datasetSrc.train_mask + datasetSrc.val_mask)
        test_mask = datasetSrc.test_mask

        x = newFeatures[x_mask]
        allx = newFeatures[allx_mask]
        tx = newFeatures[test_mask]

        # .x; .tx; .allx
        pickle.dump(scipy.sparse.csr_matrix(x),
                    open(splitJob.fn(f"{feature_graph_name}.x"), "wb"))
        pickle.dump(scipy.sparse.csr_matrix(allx),
                    open(splitJob.fn(f"{feature_graph_name}.allx"), "wb"))
        pickle.dump(scipy.sparse.csr_matrix(tx),
                    open(splitJob.fn(f"{feature_graph_name}.tx"), "wb"))

        assert all(map(splitJob.isfile, feature_graph_files))
        splitJob.doc["succeeded"] = True
        splitJob.doc["split_name"] = feature_graph_name
        splitJob.doc.val_size = splitJobSrc.doc.val_size
Example #20
def load_data(data_dir, dataset_str, knn_size=None, epsilon=None, knn_metric='cosine', prob_del_edge=None, prob_add_edge=None, seed=1234, sparse_init_adj=False):
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name ('cora', 'citeseer', 'pubmed')
    :return: All data input files loaded (as well the training/test data).
    """
    assert (knn_size is None) or (epsilon is None)

    if dataset_str.startswith('ogbn'): # Open Graph Benchmark datasets
        from ogb.nodeproppred import NodePropPredDataset

        dataset = NodePropPredDataset(name=dataset_str)

        split_idx = dataset.get_idx_split()
        idx_train, idx_val, idx_test = torch.LongTensor(split_idx["train"]), torch.LongTensor(split_idx["valid"]), torch.LongTensor(split_idx["test"])

        data = dataset[0] # This dataset has only one graph
        features = torch.Tensor(data[0]['node_feat'])
        labels = torch.LongTensor(data[1]).squeeze(-1)

        edge_index = data[0]['edge_index']
        adj = to_undirected(edge_index, num_nodes=data[0]['num_nodes'])
        assert adj.diagonal().sum() == 0 and adj.max() <= 1 and (adj != adj.transpose()).sum() == 0


    else: # datasets: Cora, Citeseer, PubMed

        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open(os.path.join(data_dir, 'ind.{}.{}'.format(dataset_str, names[i])), 'rb') as f:
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))

        x, y, tx, ty, allx, ally, graph = tuple(objects)
        test_idx_reorder = parse_index_file(os.path.join(data_dir, 'ind.{}.test.index'.format(dataset_str)))
        test_idx_range = np.sort(test_idx_reorder)

        if dataset_str == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
            tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
            tx_extended[test_idx_range-min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
            ty_extended[test_idx_range-min(test_idx_range), :] = ty
            ty = ty_extended

        raw_features = sp.vstack((allx, tx)).tolil()
        raw_features[test_idx_reorder, :] = raw_features[test_idx_range, :]
        features = normalize_features(raw_features)
        raw_features = torch.Tensor(raw_features.todense())
        features = torch.Tensor(features.todense())

        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))


        labels = np.vstack((ally, ty))
        labels[test_idx_reorder, :] = labels[test_idx_range, :]
        # labels = torch.LongTensor(np.where(labels)[1])
        labels = torch.LongTensor(np.argmax(labels, axis=1))

        idx_train = torch.LongTensor(range(len(y)))
        idx_val = torch.LongTensor(range(len(y), len(y) + 500))
        idx_test = torch.LongTensor(test_idx_range.tolist())



    if knn_size is not None:
        print('[ Using KNN-graph as input graph: {} ]'.format(knn_size))
        adj = kneighbors_graph(features, knn_size, metric=knn_metric, include_self=True)
        adj_norm = normalize_sparse_adj(adj)
        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())

    elif epsilon is not None:
        print('[ Using Epsilon-graph as input graph: {} ]'.format(epsilon))
        feature_norm = features.div(torch.norm(features, p=2, dim=-1, keepdim=True))
        attention = torch.mm(feature_norm, feature_norm.transpose(-1, -2))
        mask = (attention > epsilon).float()
        adj = attention * mask
        adj = (adj > 0).float()
        adj = sp.csr_matrix(adj)
        adj_norm = normalize_sparse_adj(adj)
        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())

    else:
        print('[ Using ground-truth input graph ]')

        if prob_del_edge is not None:
            adj = graph_delete_connections(prob_del_edge, seed, adj.toarray(), enforce_connected=False)
            adj = adj + np.eye(adj.shape[0])
            adj_norm = normalize_adj(torch.Tensor(adj))
            adj_norm = sp.csr_matrix(adj_norm)


        elif prob_add_edge is not None:
            adj = graph_add_connections(prob_add_edge, seed, adj.toarray(), enforce_connected=False)
            adj = adj + np.eye(adj.shape[0])
            adj_norm = normalize_adj(torch.Tensor(adj))
            adj_norm = sp.csr_matrix(adj_norm)

        else:
            adj = adj + sp.eye(adj.shape[0])
            adj_norm = normalize_sparse_adj(adj)


        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())

    return adj_norm, features, labels, idx_train, idx_val, idx_test
Example #21
def make_dataset(targets):
    try:
        if targets[0] == 'cora':
            pass
    except IndexError:
        print('No data target found, using test')
        targets[0] = 'test'

    if targets[0] == 'cora':
        try:
            d1 = pd.read_csv('teams/DSC180A_FA20_A00/b01graphdataanalysis/cora.content', sep='\t', header=None)
            d2 = pd.read_csv('teams/DSC180A_FA20_A00/b01graphdataanalysis/cora.cites', sep='\t', header=None)

        except:
            d1 = pd.read_csv('data/cora.content', sep='\t', header=None)
            d2 = pd.read_csv('data/cora.cites', sep='\t', header=None)

        testfile = open('test/coradata.txt', 'w')
        testfile.write('')
        testfile.close()
        # New Output
        testfile = open('test/coradata.txt', 'a')

    elif targets[0] == 'test':
        d1 = pd.DataFrame(data=np.arange(0, 10))
        d1 = pd.concat([d1, pd.DataFrame(np.random.randint(2, size=(10, 1433)))], axis=1)
        d1 = pd.concat([d1, pd.DataFrame(np.array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'A', 'B', 'C']))], axis=1)
        d1.columns = np.arange(0, 1435)
        d2 = pd.DataFrame(np.random.randint(10, size=[50, 2]))

        # Clear Text File
        testfile = open('test/testdata.txt', 'w')
        testfile.write('')
        testfile.close()
        # New Output
        testfile = open('test/testdata.txt', 'a')

    elif targets[0] == 'ogb':
        from ogb.nodeproppred import NodePropPredDataset

        d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')
        d2 = pd.DataFrame(d[0][0]['edge_index'].T)
        d1 = pd.DataFrame(d[0][0]['node_feat'])
        labels = d[0][1]

        # Clear Text File
        testfile = open('test/ogbdata.txt', 'w')
        testfile.write('')
        testfile.close()
        # New Output
        testfile = open('test/ogbdata.txt', 'a')

    elif targets[0] == 'ogbsample':
        from ogb.nodeproppred import NodePropPredDataset

        d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')
        d2 = pd.DataFrame(d[0][0]['edge_index'].T)
        d1 = pd.DataFrame(d[0][0]['node_feat'])
        labels = d[0][1]

        # Sample of OGB dataset
        d2 = d2.sort_values(0).iloc[:2000]
        partial_idx = list(set(d2[0].unique()) | set(d2[1].unique()))
        d1 = d1.iloc[partial_idx]

        # Clear Text File
        testfile = open('test/ogbdata.txt', 'w')
        testfile.write('')
        testfile.close()
        # New Output
        testfile = open('test/ogbdata.txt', 'a')


    le = prep.LabelEncoder()
    le.fit(d1.iloc[:, -1])
    d1.iloc[:, -1] = le.transform(d1.iloc[:, -1])

    if targets[0] == 'test' or targets[0] == 'cora':
        d1 = d1.set_index(0)
        d1 = d1.sort_index()
        d1 = d1.reset_index()
        labels = d1.iloc[:, -1]

    elif targets[0] == 'ogbsample':
        labels = d[0][1].flatten()[partial_idx]

    else:
        labels = d[0][1].flatten()

    labels = torch.Tensor(labels).long()

    labels_distr = np.zeros([len(labels), len(le.classes_)])
    for row in range(len(labels)):
        labels_distr[row][labels[row]] = 1

    return testfile, d1, d2, labels, targets, labels_distr, le
Example #22
def get_graph_data(d_name="ogbn-proteins", mini_data=False):
    """
        Param:
            d_name: name of dataset
            mini_data: if mini_data==True, only use a small dataset (for test)
    """
    # import the ogb dataset
    dataset = NodePropPredDataset(name=d_name)
    num_tasks = dataset.num_tasks  # obtaining the number of prediction tasks in a dataset

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
        "valid"], split_idx["test"]
    graph, label = dataset[0]

    # reshape edge_index to match what PGL's Graph expects
    graph["edge_index"] = graph["edge_index"].T

    # use a small subset of the data: 500 nodes
    if mini_data:
        graph['num_nodes'] = 500
        mask = (graph['edge_index'][:, 0] < 500) * (graph['edge_index'][:, 1] <
                                                    500)
        graph["edge_index"] = graph["edge_index"][mask]
        graph["edge_feat"] = graph["edge_feat"][mask]
        label = label[:500]
        train_idx = np.arange(0, 400)
        valid_idx = np.arange(400, 450)
        test_idx = np.arange(450, 500)

    # print dataset info
    print(graph.keys())
    print("num nodes ", graph["num_nodes"])
    print("min node id ", graph['edge_index'][0].min())
    print("num edges ", graph["edge_index"].shape[1])
    print("edge_index shape ", graph["edge_index"].shape)
    print("edge_feat shape ", graph["edge_feat"].shape)
    print("node_feat ", graph["node_feat"])
    print("species shape ", graph['species'].shape)
    print("label shape ", label.shape)

    # read/compute node feature
    # choose the cache file path
    if mini_data:
        node_feat_path = './dataset/ogbn_proteins_node_feat_small.npy'
    else:
        node_feat_path = './dataset/ogbn_proteins_node_feat.npy'

    new_node_feat = None
    if os.path.exists(node_feat_path):
        # if the cached file exists, load it directly
        print("Begin: read node feature".center(50, '='))
        new_node_feat = np.load(node_feat_path)
        print("End: read node feature".center(50, '='))
    else:
        # otherwise compute it:
        # the feature of node i is the mean of its incident edge features
        print("Begin: compute node feature".center(50, '='))
        start = time.perf_counter()
        for i in range(graph['num_nodes']):
            if i % 100 == 0:
                dur = time.perf_counter() - start
                print("{}/{}({}%), times: {:.2f}s".format(
                    i, graph['num_nodes'], i / graph['num_nodes'] * 100, dur))
            mask = (graph['edge_index'][:, 0] == i)  # select all edges incident to node i
            # average their features
            current_node_feat = np.mean(np.compress(mask,
                                                    graph['edge_feat'],
                                                    axis=0),
                                        axis=0,
                                        keepdims=True)
            if i == 0:
                new_node_feat = [current_node_feat]
            else:
                new_node_feat.append(current_node_feat)

        new_node_feat = np.concatenate(new_node_feat, axis=0)
        print("计算 node feature 结束".center(50, '='))

        print("存储 node feature 中,在" + node_feat_path.center(50, '='))
        np.save(node_feat_path, new_node_feat)
        print("存储 node feature 结束".center(50, '='))

    print(new_node_feat)

    # build the PGL Graph object
    g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                        edges=graph["edge_index"],
                        node_feat={'node_feat': new_node_feat},
                        edge_feat=None)
    print("创建 Graph 对象成功")
    print(g)
    return g, label, train_idx, valid_idx, test_idx, Evaluator(d_name)