def __init__(self, root, name):
    super(OGBNDataset, self).__init__(root)
    dataset = NodePropPredDataset(name, root)
    graph, y = dataset[0]
    x = torch.tensor(graph["node_feat"])
    y = torch.tensor(y.squeeze())
    row, col, edge_attr = coalesce(graph["edge_index"][0], graph["edge_index"][1], graph["edge_feat"])
    edge_index = torch.stack([row, col], dim=0)
    edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
    # Symmetrize: append the reverse of every edge so the graph is undirected.
    row = torch.cat([edge_index[0], edge_index[1]])
    col = torch.cat([edge_index[1], edge_index[0]])
    edge_index = torch.stack([row, col], dim=0)
    if edge_attr is not None:
        edge_attr = torch.cat([edge_attr, edge_attr], dim=0)
    self.data = Graph(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    self.data.num_nodes = graph["num_nodes"]
    assert self.data.num_nodes == self.data.x.shape[0]

    # split: convert OGB's index arrays into boolean node masks
    split_index = dataset.get_idx_split()
    self.data.train_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.test_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.val_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.train_mask[split_index["train"]] = True
    self.data.test_mask[split_index["test"]] = True
    self.data.val_mask[split_index["valid"]] = True
    self.transform = None
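# A minimal usage sketch for the class above (the surrounding module path is an
# assumption; adjust the import to the actual package layout):
if __name__ == "__main__":
    dataset = OGBNDataset(root="data", name="ogbn-arxiv")
    data = dataset.data
    # boolean masks let you index features/labels per split directly
    print("nodes:", data.num_nodes)
    print("train/val/test sizes:",
          int(data.train_mask.sum()), int(data.val_mask.sum()), int(data.test_mask.sum()))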
def __init__(self, path: str):
    ogbn_dataset = NodePropPredDataset("ogbn-products", path)
    if _backend.DependentBackend.is_dgl():
        super(OGBNProductsDataset, self).__init__([
            _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                ogbn_dataset, "label",
                {"node_feat": "feat"},
                {"edge_feat": "edge_feat"})
        ])
    elif _backend.DependentBackend.is_pyg():
        super(OGBNProductsDataset, self).__init__([
            _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                ogbn_dataset, "y",
                {"node_feat": "x"})
        ])
def load_arxiv_year_dataset(nclass=5):
    filename = 'arxiv-year'
    dataset = NCDataset(filename)
    ogb_dataset = NodePropPredDataset(name='ogbn-arxiv')
    dataset.graph = ogb_dataset.graph
    dataset.graph['edge_index'] = torch.as_tensor(dataset.graph['edge_index'])
    dataset.graph['node_feat'] = torch.as_tensor(dataset.graph['node_feat'])
    # derive labels by binning node publication years into nclass equal-mass quantiles
    label = even_quantile_labels(dataset.graph['node_year'].flatten(), nclass, verbose=False)
    dataset.label = torch.as_tensor(label).reshape(-1, 1)
    return dataset
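# Hedged usage sketch for load_arxiv_year_dataset (NCDataset and
# even_quantile_labels are assumed importable from the surrounding repo).
# Because the labels are equal-mass year quantiles, the class counts
# should come out roughly balanced:
dataset = load_arxiv_year_dataset(nclass=5)
print(dataset.graph['node_feat'].shape)        # (num_nodes, 128) for ogbn-arxiv
print(torch.bincount(dataset.label.view(-1)))  # roughly equal counts per class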
def get_data(feature_address, edges_address, encoding_config=None, directed=False):
    if feature_address == 'arxiv':
        d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')
        graph, labels = d[0]
        labels = np.ravel(labels)
        edges = list(zip(graph["edge_index"][0], graph["edge_index"][1]))
        G = nx.DiGraph(edges)
        adj = nx.adjacency_matrix(G)
    else:
        features = pd.read_csv(feature_address, sep='\t', header=None)
        edges = pd.read_csv(edges_address, sep='\t', header=None)
        # adjacency matrix
        adj = get_adj(edges, directed)
        # encoding (only the file-based datasets have a feature table to encode)
        encoded_labels = features if encoding_config is None else encode(features, encoding_config)

    # put numpy arrays into tensors
    if feature_address == 'arxiv':
        # fall back to CPU when CUDA is unavailable instead of taking the
        # DataFrame branch below, which has no features for arxiv
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        features = torch.FloatTensor(graph["node_feat"]).to(device)
        labels = torch.LongTensor(labels).to(device)
        # add identity matrix to adjacency matrix (self-loops), kept sparse
        adj_added = coo_matrix(adj + identity(adj.shape[0]))
        values = adj_added.data
        indices = np.vstack((adj_added.row, adj_added.col))
        i = torch.LongTensor(indices)
        v = torch.FloatTensor(values)
        shape = adj_added.shape
        A = torch.sparse.FloatTensor(i, v, torch.Size(shape))
    else:
        features = np.array(features.iloc[:, 1:features.shape[1] - 1])
        features = torch.FloatTensor(features)
        labels = torch.LongTensor(np.where(encoded_labels)[1])
        # add identity matrix to adjacency matrix (self-loops)
        adj_added = adj + np.eye(adj.shape[0])
        A = torch.from_numpy(adj_added).float()
    return features, labels, A
def __init__(self, root, name):
    self.name = name
    from ogb.nodeproppred import NodePropPredDataset

    dataset = NodePropPredDataset(name=name, root=root)
    data = dataset[0]
    num_nodes = data[1].shape[0]
    edge = data[0]["edge_index"]
    if name == "ogbn-arxiv":
        # convert ogbn-arxiv to an undirected graph by appending reversed edges
        edge = np.concatenate([edge, edge[[1, 0]]], axis=1)
    self.graph = _C.Graph(edge_index=edge, num_nodes=num_nodes)
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    self.x = data[0]["node_feat"]
    self.y = data[1].squeeze()
    # integer split mask over nodes: 1 = train, 2 = test, 0 = everything else
    self.train_mask = np.zeros(num_nodes, np.int32)
    self.train_mask[train_idx] = 1
    self.train_mask[test_idx] = 2
    self.num_classes = dataset.num_classes
def ogbn_dataset_to_general_static_graph(
        cls,
        ogbn_dataset: NodePropPredDataset,
        nodes_label_key: str,
        nodes_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...,
        edges_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...,
        graph_data_key_mapping: _typing.Optional[_typing.Mapping[str, str]] = ...
) -> GeneralStaticGraph:
    split_idx = ogbn_dataset.get_idx_split()
    return cls.ogbn_data_to_general_static_graph(
        ogbn_dataset[0][0], ogbn_dataset[0][1], nodes_label_key,
        split_idx["train"], split_idx["valid"], split_idx["test"],
        nodes_data_key_mapping, edges_data_key_mapping, graph_data_key_mapping)
def __init__(self, path: str):
    ogbn_dataset = NodePropPredDataset("ogbn-papers100M", path)
    if _backend.DependentBackend.is_dgl():
        super(OGBNPapers100MDataset, self).__init__([
            _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                ogbn_dataset, "label",
                {"node_feat": "feat", "node_year": "year"})
        ])
    elif _backend.DependentBackend.is_pyg():
        super(OGBNPapers100MDataset, self).__init__([
            _OGBNDatasetUtil.ogbn_dataset_to_general_static_graph(
                ogbn_dataset, "y",
                {"node_feat": "x", "node_year": "year"})
        ])
def load_ogb_dataset(name):
    dataset = NCDataset(name)
    ogb_dataset = NodePropPredDataset(name=name)
    dataset.graph = ogb_dataset.graph
    dataset.graph['edge_index'] = torch.as_tensor(dataset.graph['edge_index'])
    dataset.graph['node_feat'] = torch.as_tensor(dataset.graph['node_feat'])

    def ogb_idx_to_tensor():
        split_idx = ogb_dataset.get_idx_split()
        tensor_split_idx = {key: torch.as_tensor(split_idx[key]) for key in split_idx}
        return tensor_split_idx

    # wrap OGB's numpy split indices so callers get torch tensors
    dataset.get_idx_split = ogb_idx_to_tensor
    dataset.label = torch.as_tensor(ogb_dataset.labels).reshape(-1, 1)
    return dataset
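# Hedged usage sketch for load_ogb_dataset (NCDataset is assumed from the
# surrounding repo): the wrapped get_idx_split now returns torch tensors.
dataset = load_ogb_dataset('ogbn-arxiv')
split = dataset.get_idx_split()
print(type(split['train']))  # <class 'torch.Tensor'>
print(dataset.label.shape)   # (num_nodes, 1)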
def load_proteins_dataset():
    ogb_dataset = NodePropPredDataset(name='ogbn-proteins')
    dataset = NCDataset('ogbn-proteins')

    def protein_orig_split(**kwargs):
        split_idx = ogb_dataset.get_idx_split()
        return {
            'train': torch.as_tensor(split_idx['train']),
            'valid': torch.as_tensor(split_idx['valid']),
            'test': torch.as_tensor(split_idx['test']),
        }

    dataset.get_idx_split = protein_orig_split
    dataset.graph, dataset.label = ogb_dataset.graph, ogb_dataset.labels
    dataset.graph['edge_index'] = torch.as_tensor(dataset.graph['edge_index'])
    dataset.graph['edge_feat'] = torch.as_tensor(dataset.graph['edge_feat'])
    dataset.label = torch.as_tensor(dataset.label)
    return dataset
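# Hedged usage sketch for load_proteins_dataset: ogbn-proteins ships no raw
# node features, only 8-dimensional edge features and 112 binary targets.
dataset = load_proteins_dataset()
print(dataset.graph['edge_feat'].shape)  # (num_edges, 8)
print(dataset.label.shape)               # (num_nodes, 112)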
def load_data(name):
    dir_name = "_".join(name.split("-"))
    root = os.path.join("dataset", dir_name)
    dir_gnn_bs = os.path.join(root, "gnn_bs")
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(dir_gnn_bs):
        dataset = NodePropPredDataset(name)
        print("data preprocess...")
        preprocess(name, root, dataset)
    adj_full = sp.load_npz('{}/adj_full.npz'.format(dir_gnn_bs)).astype(bool)
    splitted_idx = np.load('{}/splitted_idx.npy'.format(dir_gnn_bs), allow_pickle=True).item()
    feats = np.load('{}/feats.npy'.format(dir_gnn_bs))
    labels = np.load('{}/labels.npy'.format(dir_gnn_bs))

    ## ---- normalize feats ----
    # if dataset in ("reddit", "flickr", "yelp", "amazon", "ppi-large"):
    #     train_nodes = np.array(list(set(adj_train.nonzero()[0])))
    #     train_feats = feats[train_nodes]
    #     scaler = StandardScaler()
    #     scaler.fit(train_feats)
    #     feats = scaler.transform(feats)

    adj_full = adj_full.tolil()
    train_nodes = splitted_idx['train']
    y_train = labels[train_nodes]
    valid_nodes = splitted_idx['valid']
    y_valid = labels[valid_nodes]
    test_nodes = splitted_idx['test']
    y_test = labels[test_nodes]
    return adj_full, feats, train_nodes, y_train, \
        valid_nodes, y_valid, test_nodes, y_test
def arxiv_ingestion():
    from ogb.nodeproppred import NodePropPredDataset

    d = NodePropPredDataset('ogbn-arxiv')
    edge_list = pd.DataFrame(d[0][0]['edge_index'].T)
    feature_list = pd.DataFrame(d[0][0]['node_feat'])
    labels = d.labels
    # relabel nodes to consecutive indices
    idx = np.array(list(range(feature_list.shape[0])), dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges = np.array(edge_list.values.tolist())
    edges = np.array(list(map(idx_map.get, edges.flatten()))).reshape(edges.shape)
    features = sparse.csr_matrix(d.graph['node_feat'], dtype=np.float32)
    features = torch.FloatTensor(np.array(features.todense()))
    labels = np.array(labels.flatten())
    labels_lpa = pd.get_dummies(labels).to_numpy()
    adj = to_adj_list(edges, labels)
    labels = torch.LongTensor(labels)
    # temporal split on publication year
    idx_train = d[0][0]['node_year'].flatten() < 2017
    idx_val = d[0][0]['node_year'].flatten() == 2018
    idx_test = d[0][0]['node_year'].flatten() == 2019
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    return adj, features, labels, idx_train, idx_val, idx_test, labels_lpa
# log_out.write(args)
print(args, file=log_out, flush=True)
epochs = args.epoch
node_dim = args.node_dim
num_channels = args.num_channels
lr = args.lr
weight_decay = args.weight_decay
num_layers = args.num_layers
norm = args.norm
adaptive_lr = args.adaptive_lr
if args.ogb_mag:
    print("Using OGB MAG", flush=True)
    dataset = NodePropPredDataset(name="ogbn-mag")
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    graph, label = dataset[0]  # graph: library-agnostic graph object
    # the four heterogeneous edge types of ogbn-mag
    AvsI = graph['edge_index_dict'][('author', 'affiliated_with', 'institution')]
    AvsP = graph['edge_index_dict'][('author', 'writes', 'paper')]
    PvsP = graph['edge_index_dict'][('paper', 'cites', 'paper')]
    PvsS = graph['edge_index_dict'][('paper', 'has_topic', 'field_of_study')]
    empty_lists = [[] for _ in range(len(AvsI[1]))]
def get_graph_data(d_name="ogbn-proteins", mini_data=False):
    """Load an OGB node-property-prediction dataset as a PGL graph.

    Param:
        d_name: name of dataset
        mini_data: if mini_data==True, only use a small dataset (for test)
    """
    # import ogb data
    dataset = NodePropPredDataset(name=d_name)
    num_tasks = dataset.num_tasks  # obtaining the number of prediction tasks in a dataset
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    graph, label = dataset[0]

    # reshape to (num_edges, 2), the layout PGL expects
    graph["edge_index"] = graph["edge_index"].T

    # mini dataset: keep the first 500 nodes and their induced edges
    if mini_data:
        graph['num_nodes'] = 500
        mask = (graph['edge_index'][:, 0] < 500) * (graph['edge_index'][:, 1] < 500)
        graph["edge_index"] = graph["edge_index"][mask]
        graph["edge_feat"] = graph["edge_feat"][mask]
        label = label[:500]
        train_idx = np.arange(0, 400)
        valid_idx = np.arange(400, 450)
        test_idx = np.arange(450, 500)

    # read/compute node feature
    if mini_data:
        node_feat_path = './dataset/ogbn_proteins_node_feat_small.npy'
    else:
        node_feat_path = './dataset/ogbn_proteins_node_feat.npy'

    if os.path.exists(node_feat_path):
        print("Begin: read node feature".center(50, '='))
        new_node_feat = np.load(node_feat_path)
        print("End: read node feature".center(50, '='))
    else:
        # node i's feature is the mean of its incident edge features
        print("Begin: compute node feature".center(50, '='))
        start = time.perf_counter()
        new_node_feat = []
        for i in range(graph['num_nodes']):
            if i % 100 == 0:
                dur = time.perf_counter() - start
                print("{}/{}({:.1f}%), times: {:.2f}s".format(
                    i, graph['num_nodes'], i / graph['num_nodes'] * 100, dur))
            mask = (graph['edge_index'][:, 0] == i)
            current_node_feat = np.mean(np.compress(mask, graph['edge_feat'], axis=0),
                                        axis=0, keepdims=True)
            new_node_feat.append(current_node_feat)
        new_node_feat = np.concatenate(new_node_feat, axis=0)
        print("End: compute node feature".center(50, '='))
        print("Saving node feature in " + node_feat_path)
        np.save(node_feat_path, new_node_feat)
        print("Saving finish".center(50, '='))
    print(new_node_feat)

    # create graph
    g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                        edges=graph["edge_index"],
                        node_feat={'node_feat': new_node_feat},
                        edge_feat=None)
    print("Create graph")
    print(g)
    return g, label, train_idx, valid_idx, test_idx, Evaluator(d_name)
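# Hedged usage sketch for get_graph_data: mini_data=True subsets the graph to
# its first 500 nodes for a quick smoke test (the full download still happens).
g, label, train_idx, valid_idx, test_idx, evaluator = get_graph_data(
    d_name="ogbn-proteins", mini_data=True)
print(g.num_nodes, label.shape, len(train_idx))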
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
G = nx.Graph()
os.makedirs(f"{d_name}/dataset/{dataset_str}/", exist_ok=True)
pbar = tqdm.tqdm(total=edgelist.shape[1])
pbar.set_description('Adding edges to graph')
for i in range(edgelist.shape[1]):
    G.add_edge(int(edgelist[0][i]), int(edgelist[1][i]))
    pbar.update(1)
if args.save_label or args.save_feature:
    if dataset_str == "arxiv":
        # graph: library-agnostic graph object
        graph, label = NodePropPredDataset(name=d_name, root=f"{d_name}/dataset")[0]
    else:
        raise NotImplementedError
if args.save_label:
    # write labels as a JSON mapping {"node_id": class}
    with open("{}/dataset/{}/{}-class_map.json".format(d_name, dataset_str, dataset_str), 'w') as f:
        f.write('{')
        for i in range(num_nodes):
            if i > 0:
                f.write(', ')
            f.write('\"' + str(i) + "\": ")
            f.write(str(label[i][0]))
""" import numpy as np import tensorflow as tf from ogb.nodeproppred import Evaluator, NodePropPredDataset from tensorflow.keras.layers import BatchNormalization, Dropout, Input from tensorflow.keras.losses import SparseCategoricalCrossentropy from tensorflow.keras.models import Model from tensorflow.keras.optimizers import Adam from spektral.datasets.ogb import OGB from spektral.layers import GCNConv from spektral.transforms import AdjToSpTensor, GCNFilter # Load data dataset_name = "ogbn-arxiv" ogb_dataset = NodePropPredDataset(dataset_name) dataset = OGB(ogb_dataset, transforms=[GCNFilter(), AdjToSpTensor()]) graph = dataset[0] x, adj, y = graph.x, graph.a, graph.y # Parameters channels = 256 # Number of channels for GCN layers dropout = 0.5 # Dropout rate for the features learning_rate = 1e-2 # Learning rate epochs = 200 # Number of training epochs N = dataset.n_nodes # Number of nodes in the graph F = dataset.n_node_features # Original size of node features n_out = ogb_dataset.num_classes # OGB labels are sparse indices # Data splits
def __init__(self):
    d_name = "ogbn-arxiv"
    dataset = NodePropPredDataset(name=d_name)
    graph, label = dataset[0]
    self.num_nodes = graph["num_nodes"]
    self.ogb_evaluator = Evaluator(name="ogbn-arxiv")
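# Hedged standalone sketch of how the evaluator held above is used: ogbn-arxiv
# evaluation expects integer class ids of shape (num_nodes, 1) and returns
# accuracy under the "acc" key. The random predictions are a placeholder for
# real model output; ogbn-arxiv has 40 classes.
evaluator = Evaluator(name="ogbn-arxiv")
y_pred = np.random.randint(0, 40, size=label.shape)  # stand-in for model output
print(evaluator.eval({"y_true": label, "y_pred": y_pred})["acc"])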
def test_datasetsaver():
    # test on graph classification
    # ogbg-molhiv
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(dataset_name.split('-'))))
    saver.save_task_info(dataset.task_type, dataset.eval_metric,
                         dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()
    print(meta_dict)

    print('Now testing.')
    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')
    saver.cleanup()
def process(self):
    dataset = NodePropPredDataset(name=self.name, root="./data")
    node_type_dict = {"paper": 0, "author": 1, "field_of_study": 2, "institution": 3}
    edge_type_dict = {
        ("paper", "cites", "paper"): 0,
        ("author", "affiliated_with", "institution"): 1,
        ("author", "writes", "paper"): 2,
        ("paper", "has_topic", "field_of_study"): 3,
    }
    num_nodes_dict = dataset[0][0]["num_nodes_dict"]
    num_nodes = torch.as_tensor(
        [0]
        + [
            num_nodes_dict["paper"],
            num_nodes_dict["author"],
            num_nodes_dict["field_of_study"],
            num_nodes_dict["institution"],
        ]
    )
    cum_num_nodes = torch.cumsum(num_nodes, dim=-1)
    # flatten the heterogeneous graph: offset each node type into one id space
    node_types = torch.repeat_interleave(torch.arange(0, 4), num_nodes[1:])
    edge_index_dict = dataset[0][0]["edge_index_dict"]
    edge_index = [None] * len(edge_type_dict)
    edge_attr = [None] * len(edge_type_dict)
    i = 0
    for k, v in edge_index_dict.items():
        head, edge_type, tail = k
        head_offset = cum_num_nodes[node_type_dict[head]].item()
        tail_offset = cum_num_nodes[node_type_dict[tail]].item()
        src = v[0] + head_offset
        tgt = v[1] + tail_offset
        edge_tps = np.full(src.shape, edge_type_dict[k])
        if edge_type == "cites":
            # paper-paper citations are symmetrized under the same relation type
            _edges = torch.as_tensor([src, tgt])
            _src, _tgt = to_undirected(_edges).numpy()
            edge_tps = np.full(_src.shape, edge_type_dict[k])
            edge_idx = np.vstack([_src, _tgt])
        else:
            # other relations get reverse edges under a new relation type
            _src = np.concatenate([src, tgt])
            _tgt = np.concatenate([tgt, src])
            re_tps = np.full(src.shape, len(edge_type_dict))
            re_k = (tail, "to", head)
            edge_type_dict[re_k] = len(edge_type_dict)
            edge_tps = np.concatenate([edge_tps, re_tps])
            edge_idx = np.vstack([_src, _tgt])
        edge_index[i] = edge_idx
        edge_attr[i] = edge_tps
        assert edge_index[i].shape[1] == edge_attr[i].shape[0]
        i += 1
    edge_index = np.concatenate(edge_index, axis=-1)
    edge_index = torch.from_numpy(edge_index)
    edge_attr = torch.from_numpy(np.concatenate(edge_attr))
    assert edge_index.shape[1] == edge_attr.shape[0]

    split_index = dataset.get_idx_split()
    train_index = torch.from_numpy(split_index["train"]["paper"])
    val_index = torch.from_numpy(split_index["valid"]["paper"])
    test_index = torch.from_numpy(split_index["test"]["paper"])
    y = torch.as_tensor(dataset[0][1]["paper"]).view(-1)

    paper_feat = dataset[0][0]["node_feat_dict"]["paper"]
    data = Graph(
        y=y,
        edge_index=edge_index,
        edge_types=edge_attr,
        train_mask=train_index,
        val_mask=val_index,
        test_mask=test_index,
        node_types=node_types,
    )
    torch.save((data, node_type_dict, edge_type_dict, num_nodes_dict), self.processed_paths[0])
    np.save(self.processed_paths[1], paper_feat)
def ogbn_generate_split(job: signac.Project.Job, splitJob: signac.Project.Job,
                        feature_graph_name, feature_graph_files):
    import constraint
    with utils.chdir(splitJob.sp.ogbn_path):
        from ogb.nodeproppred import NodePropPredDataset
        d_name = splitJob.sp.ogbn_name
        lock = ogbnLockDict.setdefault(splitJob.sp.ogbn_path, threading.Lock())
        if not os.path.exists("dataset"):
            # dataset not downloaded yet; serialize the download across threads
            with lock:
                ogbnDataset = NodePropPredDataset(name=d_name)
        else:
            ogbnDataset = NodePropPredDataset(name=d_name)
        split_idx = ogbnDataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
        graph, label = ogbnDataset[0]

    with job:
        splitJobSrc = utils.signac_tools.access_proj_job(
            job, splitJob.sp.feature_source, splitJob.sp.split_source)
        splitSrcName = splitJobSrc.doc["split_name"]
        # Copy unchanged files
        for source_file, dest_file in [
                (splitJobSrc.fn(f"{splitSrcName}.{ext}"),
                 splitJob.fn(f"{feature_graph_name}.{ext}"))
                for ext in ('y', 'ty', 'ally', 'graph', 'test.index')]:
            shutil.copy2(source_file, dest_file)
        with splitJobSrc:
            datasetSrc = utils.PlanetoidData(splitJobSrc.doc.split_name, ".", val_size=None)

        # Per-split class counts for the OGB dataset and the source dataset
        ogbnLabelCount = np.zeros((3, ogbnDataset.num_classes))
        ogbnLabelCount[0, :] = (label[train_idx] == np.arange(ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[1, :] = (label[valid_idx] == np.arange(ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[2, :] = (label[test_idx] == np.arange(ogbnDataset.num_classes)).sum(0)

        srcLabelCount = np.zeros((3, job.sp.numClass))
        srcLabelCount[0, :] = datasetSrc.y_all[datasetSrc.train_mask, :].sum(0)
        srcLabelCount[1, :] = datasetSrc.y_all[datasetSrc.val_mask, :].sum(0)
        srcLabelCount[2, :] = datasetSrc.y_all[datasetSrc.test_mask, :].sum(0)

        # Find an injective mapping from source classes to OGB classes such that
        # every split has enough OGB samples of the mapped class
        problem = constraint.Problem()
        problem.addVariables(range(job.sp.numClass), range(ogbnDataset.num_classes))
        problem.addConstraint(constraint.AllDifferentConstraint())
        for i in range(job.sp.numClass):
            # bind i at definition time; a bare closure would late-bind to the final i
            problem.addConstraint(
                lambda x, i=i: np.all(ogbnLabelCount[:, x] >= srcLabelCount[:, i]), (i,))
        solution = problem.getSolution()
        for srcClass, dstClass in solution.items():
            assert np.all(ogbnLabelCount[:, dstClass] >= srcLabelCount[:, srcClass])

        # Replace source features with features sampled from the mapped OGB class
        newFeatures = np.zeros((datasetSrc.num_samples, graph["node_feat"].shape[1]))
        for scope, idx in (("train", train_idx), ("val", valid_idx), ("test", test_idx)):
            scope_mask = getattr(datasetSrc, f"{scope}_mask")
            for srcClass, dstClass in solution.items():
                srcOpMask = np.logical_and(scope_mask, datasetSrc.labels == srcClass)
                dstSampleSet = list(set(idx).intersection(np.where(label == dstClass)[0]))
                sampleInds = random_state.choice(dstSampleSet, srcOpMask.sum(), replace=False)
                newFeatures[srcOpMask, :] = graph["node_feat"][sampleInds, :]

        x_mask = datasetSrc.train_mask
        allx_mask = (datasetSrc.train_mask + datasetSrc.val_mask)
        test_mask = datasetSrc.test_mask
        x = newFeatures[x_mask]
        allx = newFeatures[allx_mask]
        tx = newFeatures[test_mask]

        # .x; .allx; .tx
        pickle.dump(scipy.sparse.csr_matrix(x),
                    open(splitJob.fn(f"{feature_graph_name}.x"), "wb"))
        pickle.dump(scipy.sparse.csr_matrix(allx),
                    open(splitJob.fn(f"{feature_graph_name}.allx"), "wb"))
        pickle.dump(scipy.sparse.csr_matrix(tx),
                    open(splitJob.fn(f"{feature_graph_name}.tx"), "wb"))

        assert all(map(splitJob.isfile, feature_graph_files))
        splitJob.doc["succeeded"] = True
        splitJob.doc["split_name"] = feature_graph_name
        splitJob.doc.val_size = splitJobSrc.doc.val_size
def load_data(data_dir, dataset_str, knn_size=None, epsilon=None, knn_metric='cosine',
              prob_del_edge=None, prob_add_edge=None, seed=1234, sparse_init_adj=False):
    """Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name ('cora', 'citeseer', 'pubmed', or an 'ogbn-*' OGB dataset)
    :return: All data input files loaded (as well the training/test data).
    """
    assert (knn_size is None) or (epsilon is None)

    if dataset_str.startswith('ogbn'):
        # Open Graph Benchmark datasets
        from ogb.nodeproppred import NodePropPredDataset
        dataset = NodePropPredDataset(name=dataset_str)
        split_idx = dataset.get_idx_split()
        idx_train = torch.LongTensor(split_idx["train"])
        idx_val = torch.LongTensor(split_idx["valid"])
        idx_test = torch.LongTensor(split_idx["test"])
        data = dataset[0]  # This dataset has only one graph
        features = torch.Tensor(data[0]['node_feat'])
        labels = torch.LongTensor(data[1]).squeeze(-1)
        edge_index = data[0]['edge_index']
        adj = to_undirected(edge_index, num_nodes=data[0]['num_nodes'])
        assert adj.diagonal().sum() == 0 and adj.max() <= 1 and (adj != adj.transpose()).sum() == 0
    else:
        # datasets: Cora, Citeseer, PubMed
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open(os.path.join(data_dir, 'ind.{}.{}'.format(dataset_str, names[i])), 'rb') as f:
                if sys.version_info > (3, 0):
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))
        x, y, tx, ty, allx, ally, graph = tuple(objects)
        test_idx_reorder = parse_index_file(os.path.join(data_dir, 'ind.{}.test.index'.format(dataset_str)))
        test_idx_range = np.sort(test_idx_reorder)

        if dataset_str == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
            tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
            tx_extended[test_idx_range - min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
            ty_extended[test_idx_range - min(test_idx_range), :] = ty
            ty = ty_extended

        raw_features = sp.vstack((allx, tx)).tolil()
        raw_features[test_idx_reorder, :] = raw_features[test_idx_range, :]
        features = normalize_features(raw_features)
        raw_features = torch.Tensor(raw_features.todense())
        features = torch.Tensor(features.todense())

        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((ally, ty))
        labels[test_idx_reorder, :] = labels[test_idx_range, :]
        labels = torch.LongTensor(np.argmax(labels, axis=1))

        idx_train = torch.LongTensor(range(len(y)))
        idx_val = torch.LongTensor(range(len(y), len(y) + 500))
        idx_test = torch.LongTensor(test_idx_range.tolist())

    if knn_size is not None:
        print('[ Using KNN-graph as input graph: {} ]'.format(knn_size))
        adj = kneighbors_graph(features, knn_size, metric=knn_metric, include_self=True)
        adj_norm = normalize_sparse_adj(adj)
        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())
    elif epsilon is not None:
        print('[ Using Epsilon-graph as input graph: {} ]'.format(epsilon))
        feature_norm = features.div(torch.norm(features, p=2, dim=-1, keepdim=True))
        attention = torch.mm(feature_norm, feature_norm.transpose(-1, -2))
        mask = (attention > epsilon).float()
        adj = attention * mask
        adj = (adj > 0).float()
        adj = sp.csr_matrix(adj)
        adj_norm = normalize_sparse_adj(adj)
        if sparse_init_adj:
            adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
        else:
            adj_norm = torch.Tensor(adj_norm.todense())
    else:
        print('[ Using ground-truth input graph ]')
        if prob_del_edge is not None:
            adj = graph_delete_connections(prob_del_edge, seed, adj.toarray(), enforce_connected=False)
            adj = adj + np.eye(adj.shape[0])
            adj_norm = normalize_adj(torch.Tensor(adj))
            adj_norm = sp.csr_matrix(adj_norm)
        elif prob_add_edge is not None:
            adj = graph_add_connections(prob_add_edge, seed, adj.toarray(), enforce_connected=False)
            adj = adj + np.eye(adj.shape[0])
            adj_norm = normalize_adj(torch.Tensor(adj))
            adj_norm = sp.csr_matrix(adj_norm)
        else:
            adj = adj + sp.eye(adj.shape[0])
            adj_norm = normalize_sparse_adj(adj)
            if sparse_init_adj:
                adj_norm = sparse_mx_to_torch_sparse_tensor(adj_norm)
            else:
                adj_norm = torch.Tensor(adj_norm.todense())

    return adj_norm, features, labels, idx_train, idx_val, idx_test
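# Hedged usage sketch for load_data with an OGB dataset (helper names such as
# normalize_sparse_adj and to_undirected come from the surrounding repo):
adj_norm, features, labels, idx_train, idx_val, idx_test = load_data(
    data_dir='data', dataset_str='ogbn-arxiv')
print(features.shape, labels.shape, idx_train.shape)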
def make_dataset(targets):
    if not targets:
        # the original indexed targets[0] inside a try/except IndexError, but
        # assigning targets[0] on an empty list would itself raise IndexError
        print('No data target found, using test')
        targets = ['test']

    if targets[0] == 'cora':
        try:
            d1 = pd.read_csv('teams/DSC180A_FA20_A00/b01graphdataanalysis/cora.content', sep='\t', header=None)
            d2 = pd.read_csv('teams/DSC180A_FA20_A00/b01graphdataanalysis/cora.cites', sep='\t', header=None)
        except FileNotFoundError:
            d1 = pd.read_csv('data/cora.content', sep='\t', header=None)
            d2 = pd.read_csv('data/cora.cites', sep='\t', header=None)
        # Clear text file, then reopen for appending
        open('test/coradata.txt', 'w').close()
        testfile = open('test/coradata.txt', 'a')
    elif targets[0] == 'test':
        # Synthetic 10-node dataset with random binary features and labels A-G
        d1 = pd.DataFrame(data=np.arange(0, 10))
        d1 = pd.concat([d1, pd.DataFrame(np.random.randint(2, size=(10, 1433)))], axis=1)
        d1 = pd.concat([d1, pd.DataFrame(np.array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'A', 'B', 'C']))], axis=1)
        d1.columns = np.arange(0, 1435)
        d2 = pd.DataFrame(np.random.randint(10, size=[50, 2]))
        # Clear text file, then reopen for appending
        open('test/testdata.txt', 'w').close()
        testfile = open('test/testdata.txt', 'a')
    elif targets[0] == 'ogb':
        from ogb.nodeproppred import NodePropPredDataset
        d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')
        d2 = pd.DataFrame(d[0][0]['edge_index'].T)
        d1 = pd.DataFrame(d[0][0]['node_feat'])
        labels = d[0][1]
        # Clear text file, then reopen for appending
        open('test/ogbdata.txt', 'w').close()
        testfile = open('test/ogbdata.txt', 'a')
    elif targets[0] == 'ogbsample':
        from ogb.nodeproppred import NodePropPredDataset
        d = NodePropPredDataset('ogbn-arxiv', root='/datasets/ogb/ogbn-arxiv')
        d2 = pd.DataFrame(d[0][0]['edge_index'].T)
        d1 = pd.DataFrame(d[0][0]['node_feat'])
        labels = d[0][1]
        # Sample of the OGB dataset: first 2000 edges and the nodes they touch
        d2 = d2.sort_values(0).iloc[:2000]
        partial_idx = list(set(d2[0].unique()) | set(d2[1].unique()))
        d1 = d1.iloc[partial_idx]
        # Clear text file, then reopen for appending
        open('test/ogbdata.txt', 'w').close()
        testfile = open('test/ogbdata.txt', 'a')

    le = prep.LabelEncoder()
    le.fit(d1.iloc[:, -1])
    d1.iloc[:, -1] = le.transform(d1.iloc[:, -1])

    if targets[0] == 'test' or targets[0] == 'cora':
        d1 = d1.set_index(0)
        d1 = d1.sort_index()
        d1 = d1.reset_index()
        labels = d1.iloc[:, -1]
    elif targets[0] == 'ogbsample':
        labels = d[0][1].flatten()[partial_idx]
    else:
        labels = d[0][1].flatten()
    labels = torch.Tensor(labels).long()

    # one-hot label distribution matrix
    labels_distr = np.zeros([len(labels), len(le.classes_)])
    for row in range(len(labels)):
        labels_distr[row][labels[row]] = 1

    return testfile, d1, d2, labels, targets, labels_distr, le
def get_graph_data(d_name="ogbn-proteins", mini_data=False):
    """Load an OGB dataset and build a PGL graph.

    Param:
        d_name: name of dataset
        mini_data: if mini_data==True, only use a small dataset (for test)
    """
    # import ogb data
    dataset = NodePropPredDataset(name=d_name)
    num_tasks = dataset.num_tasks  # obtaining the number of prediction tasks in a dataset
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
    graph, label = dataset[0]

    # transpose to the (num_edges, 2) layout PGL's Graph requires
    graph["edge_index"] = graph["edge_index"].T

    # use a small dataset of 500 nodes
    if mini_data:
        graph['num_nodes'] = 500
        mask = (graph['edge_index'][:, 0] < 500) * (graph['edge_index'][:, 1] < 500)
        graph["edge_index"] = graph["edge_index"][mask]
        graph["edge_feat"] = graph["edge_feat"][mask]
        label = label[:500]
        train_idx = np.arange(0, 400)
        valid_idx = np.arange(400, 450)
        test_idx = np.arange(450, 500)

    # print dataset information
    print(graph.keys())
    print("number of nodes  ", graph["num_nodes"])
    print("smallest node id ", graph['edge_index'][0].min())
    print("number of edges  ", graph["edge_index"].shape[1])
    print("edge index shape ", graph["edge_index"].shape)
    print("edge feat shape  ", graph["edge_feat"].shape)
    print("node features    ", graph["node_feat"])
    print("species shape    ", graph['species'].shape)
    print("label shape      ", label.shape)

    # read/compute node features
    # decide which cache file to use
    if mini_data:
        node_feat_path = './dataset/ogbn_proteins_node_feat_small.npy'
    else:
        node_feat_path = './dataset/ogbn_proteins_node_feat.npy'

    if os.path.exists(node_feat_path):
        # the cache file exists: read it directly
        print("Begin: read node feature".center(50, '='))
        new_node_feat = np.load(node_feat_path)
        print("End: read node feature".center(50, '='))
    else:
        # the cache file does not exist: compute the features, where node i's
        # feature is the mean of its incident edge features
        print("Begin: compute node feature".center(50, '='))
        start = time.perf_counter()
        new_node_feat = []
        for i in range(graph['num_nodes']):
            if i % 100 == 0:
                dur = time.perf_counter() - start
                print("{}/{}({:.1f}%), times: {:.2f}s".format(
                    i, graph['num_nodes'], i / graph['num_nodes'] * 100, dur))
            mask = (graph['edge_index'][:, 0] == i)  # select all edges incident to i
            # take the mean over those edges' features
            current_node_feat = np.mean(np.compress(mask, graph['edge_feat'], axis=0),
                                        axis=0, keepdims=True)
            new_node_feat.append(current_node_feat)
        new_node_feat = np.concatenate(new_node_feat, axis=0)
        print("End: compute node feature".center(50, '='))
        print("Saving node feature to " + node_feat_path)
        np.save(node_feat_path, new_node_feat)
        print("End: save node feature".center(50, '='))
    print(new_node_feat)

    # build the Graph object
    g = pgl.graph.Graph(num_nodes=graph["num_nodes"],
                        edges=graph["edge_index"],
                        node_feat={'node_feat': new_node_feat},
                        edge_feat=None)
    print("Graph object created")
    print(g)

    return g, label, train_idx, valid_idx, test_idx, Evaluator(d_name)