from torch_geometric.datasets import Planetoid, CoraFull, Amazon, Coauthor


def load_dataset(name):
    """Load a PyTorch Geometric benchmark dataset by name."""
    if name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root='./data/' + name, name=name)
    elif name == "CoraFull":
        dataset = CoraFull(root='./data/' + name)
    elif name in ["Computers", "Photo"]:
        dataset = Amazon(root='./data/' + name, name=name)
    elif name in ["CS", "Physics"]:
        dataset = Coauthor(root='./data/' + name, name=name)
    else:
        # Raise instead of exit() so callers can handle the error.
        raise ValueError(f"wrong dataset: {name}")
    return dataset
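
# A minimal usage sketch for load_dataset above; dataset[0], num_features,
# and num_classes are standard PyTorch Geometric dataset attributes, not part
# of the snippet itself.
dataset = load_dataset("Cora")
data = dataset[0]  # the single graph held by the dataset
print(dataset.num_features, dataset.num_classes, data.num_nodes)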
def __init__(self, path: str):
    pyg_dataset = Coauthor(os.path.join(path, '_pyg'), "CS")
    # Clear PyG's cached list of separated Data objects (covers both the old
    # and new attribute names) so the collated tensors are read directly.
    if hasattr(pyg_dataset, "__data_list__"):
        delattr(pyg_dataset, "__data_list__")
    if hasattr(pyg_dataset, "_data_list"):
        delattr(pyg_dataset, "_data_list")
    pyg_data = pyg_dataset[0]
    static_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph(
        {'x': pyg_data.x, 'y': pyg_data.y},
        pyg_data.edge_index,
    )
    super(CoauthorCSDataset, self).__init__([static_graph])
def load_dataset(dataset, transform=None):
    if dataset.lower() in ["cora", "citeseer", "pubmed"]:
        path = os.path.join(".datasets", "Planetoid")
        dataset = Planetoid(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["cs", "physics"]:
        path = os.path.join(".datasets", "Coauthor", dataset.lower())
        dataset = Coauthor(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["computers", "photo"]:
        path = os.path.join(".datasets", "Amazon", dataset.lower())
        dataset = Amazon(path, dataset.lower(), transform=transform)
    else:
        # Raise instead of print-and-assert so the failure carries a message.
        raise ValueError("Dataset not supported!")
    return dataset
def load_data(dataset="Cora"): path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "data", dataset) if dataset in ["Cora", "Citeseer", "Pubmed"]: data = Planetoid(path, dataset, split="public", transform=T.NormalizeFeatures())[0] num_nodes = data.x.size(0) edge_index, _ = remove_self_loops(data.edge_index) edge_index = add_self_loops(edge_index, num_nodes=num_nodes) if isinstance(edge_index, tuple): data.edge_index = edge_index[0] # !!! 2*N 新版可能有改变 else: data.edge_index = edge_index return data elif dataset in ["CoauthorCS"]: data = Coauthor(path, "cs", transform=T.NormalizeFeatures())[0] num_nodes = data.x.size(0) edge_index, _ = remove_self_loops(data.edge_index) edge_index = add_self_loops(edge_index, num_nodes=num_nodes) if isinstance(edge_index, tuple): data.edge_index = edge_index[0] else: data.edge_index = edge_index # devide training validation and testing set train_mask = torch.zeros((num_nodes, ), dtype=torch.bool) val_mask = torch.zeros((num_nodes, ), dtype=torch.bool) test_mask = torch.zeros((num_nodes, ), dtype=torch.bool) train_num = 40 val_num = 150 for i in range(15): # number of labels index = (data.y == i).nonzero()[:, 0] perm = torch.randperm(index.size(0)) train_mask[index[perm[:train_num]]] = 1 val_mask[index[perm[train_num:(train_num + val_num)]]] = 1 test_mask[index[perm[(train_num + val_num):]]] = 1 data.train_mask = train_mask data.val_mask = val_mask data.test_mask = test_mask return data else: raise Exception(f"the dataset of {dataset} has not been implemented")
def load_data(self):
    data_name = self._params['data_name']
    if self._params['net'] in {'combined', 'symmetric', 'asymmetric', 'combined_gcn'}:
        self._data_path = './data/{}'.format(data_name)
        gnx = nx.read_gpickle("./data/{}/gnx.pkl".format(data_name))
        bow = pickle.load(open("./data/{}/content.pkl".format(data_name), "rb"))
        nodes = sorted(gnx.nodes)
        # Map node IDs to contiguous indices (renamed from `dict`, which
        # shadowed the builtin).
        node_idx = {node: i for i, node in enumerate(nodes)}
        x = torch.Tensor(np.vstack([bow[node] for node in nodes])).to(self._device)
        y = torch.LongTensor([gnx.nodes[node]['label'] for node in nodes]).to(self._device)
        edges = torch.LongTensor(np.vstack([[node_idx[e[0]] for e in gnx.edges],
                                            [node_idx[e[1]] for e in gnx.edges]])).to(self._device)
        self._data = Data(x=x, edge_index=edges, y=y)
        self._num_features = x.shape[1]
        self._num_classes = len(gnx.graph['node_labels'])
        # Adjacency matrices
        adj = nx.adjacency_matrix(gnx, nodelist=nodes).astype(np.float32)
        if self._params['net'] == 'symmetric':
            self._adj = handle_matrix_symmetric(adj)
        else:
            self._adj = handle_matrix_concat(adj, should_normalize=True)
        self._adj = sparse_mx_to_torch_sparse_tensor(self._adj).to_dense().to(self._device)
        return self._data

    data_transform = T.NormalizeFeatures() if self._params['norm'] else None
    self._data_path = './DataSets/{}'.format(data_name)
    if data_name == "CoraFull":
        self._data_set = CoraFull(self._data_path)
    elif data_name in {"CS", "Physics"}:
        self._data_set = Coauthor(self._data_path, data_name)
    else:
        self._data_set = Planetoid(self._data_path, data_name, data_transform)
    self._data_set.data.to(self._device)
    self._data = self._data_set[0]
    self._num_features = self._data_set.num_features
    self._num_classes = self._data_set.num_classes
    return self._data
def fetch_dataset(root, name):
    """
    Fetches a dataset from the PyTorch Geometric library.

    :param root: Path to the root directory where the dataset will be placed
    :param name: Name of the dataset. Currently supported (case-insensitive):
        'cora', 'citeseer', 'pubmed', 'computers', 'photo', 'cs', 'physics',
        'wiki', 'actor'
    :return: A PyTorch Geometric dataset
    """
    if name.lower() in {'cora', 'citeseer', 'pubmed'}:
        return Planetoid(root=root, name=name)
    elif name.lower() in {'computers', 'photo'}:
        return Amazon(root=root, name=name)
    elif name.lower() in {'cs', 'physics'}:
        return Coauthor(root=root, name=name)
    elif name.lower() == "wiki":
        return WikiCS(osp.join(root, "WikiCS"))
    elif name.lower() == "actor":
        return Actor(osp.join(root, name))
    # Raise instead of silently returning None for unknown names.
    raise ValueError(f"Unsupported dataset: {name}")
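
# Example call for fetch_dataset above (a sketch; './datasets' is an
# arbitrary root directory):
dataset = fetch_dataset('./datasets', 'cs')
print(dataset, dataset.num_classes)  # Coauthor CS; the graph itself is dataset[0]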
def load_data(dataset_name):
    """
    Loads the requested dataset and normalizes features.

    Implemented datasets are CoraFull, Coauthor (Physics), Reddit, Amazon,
    and any dataset of type Planetoid.

    :param dataset_name: Name of the dataset
    :return: Tuple of dataset and extracted graph
    """
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset_name)
    if dataset_name.lower() == 'cora_full':
        dataset = CoraFull(path, T.NormalizeFeatures())
    elif dataset_name.lower() == 'coauthor':
        dataset = Coauthor(path, 'Physics', T.NormalizeFeatures())
    elif dataset_name.lower() == 'reddit':
        dataset = Reddit(path, T.NormalizeFeatures())
    elif dataset_name.lower() == 'amazon':
        # Amazon requires a product category; 'Computers' is used here as the
        # default (the original call omitted the name, which raises a TypeError).
        dataset = Amazon(path, 'Computers', T.NormalizeFeatures())
    else:
        dataset = Planetoid(path, dataset_name, T.NormalizeFeatures())
    print(f"Loading data set {dataset_name} from: ", path)
    data = dataset[0]  # Extract graph
    return dataset, data
def load_data(dataset="Cora", supervised=False, full_data=True, args=None): ''' support semi-supervised and supervised :param dataset: :param supervised: :return: ''' path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset) dataset_name = dataset if dataset in ["CS", "Physics"]: dataset = Coauthor(path, dataset, T.NormalizeFeatures()) elif dataset in ["Computers", "Photo"]: dataset = Amazon(path, dataset, T.NormalizeFeatures()) elif dataset in ["Cora", "Citeseer", "Pubmed"]: dataset = Planetoid(path, dataset, T.NormalizeFeatures()) # path = path + '/processed/data.pt' # dataset = torch.load(path) data = dataset[0] data['adj'] = load_citation(dataset_name, args.normalization) if supervised: if full_data: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.train_mask[:-1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.val_mask[-1000:-500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.test_mask[-500:] = 1 else: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.train_mask[:1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.val_mask[1000:1500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.test_mask[1500:2000] = 1 return data
if pr.net == 1:
    print("Data Cora")
    _data = Planetoid(root="./pcora", name="Cora")
elif pr.net == 2:
    print("Data CiteSeer")
    _data = Planetoid(root="./pciteseer", name="Citeseer")
elif pr.net == 3:
    print("Data Pubmed")
    _data = Planetoid(root="./ppubmed", name="Pubmed")
elif pr.net == 4:
    print("Data CoraFull")
    _data = CoraFull("./Corafull")
elif pr.net == 5:
    print("Data Coauthor CS")
    _data = Coauthor("./CS", "CS")
elif pr.net == 6:
    print("Data Coauthor Physics")
    _data = Coauthor("./Physics", "Physics")
elif pr.net == 7:
    print("Data Amazon Computer")
    _data = Amazon("./Computer", "Computers")
elif pr.net == 8:
    print("Data Amazon Photos")
    _data = Amazon("./Photo", "Photo")
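
# The numeric ladder above can also be written as a lookup table, which keeps
# the mapping from pr.net codes to datasets in one place. A sketch, assuming
# the same roots and names (pr is defined elsewhere in the script):
_LOADERS = {
    1: lambda: Planetoid(root="./pcora", name="Cora"),
    2: lambda: Planetoid(root="./pciteseer", name="Citeseer"),
    3: lambda: Planetoid(root="./ppubmed", name="Pubmed"),
    4: lambda: CoraFull("./Corafull"),
    5: lambda: Coauthor("./CS", "CS"),
    6: lambda: Coauthor("./Physics", "Physics"),
    7: lambda: Amazon("./Computer", "Computers"),
    8: lambda: Amazon("./Photo", "Photo"),
}
_data = _LOADERS[pr.net]()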
from torch_geometric.datasets import Coauthor
from torch_geometric.utils import to_networkx
import pickle
from GraphRicciCurvature.OllivierRicci import OllivierRicci
from GraphRicciCurvature.FormanRicci import FormanRicci
import numpy as np
import sklearn.preprocessing as pp
import torch
import os

datasets = ['CS']  # ,'Physics']
for dataset in datasets:
    coauth = Coauthor('data_coauthor_' + dataset, dataset)
    print("coauth done")
    # Load the raw (data, slices) tuple that PyG writes to disk; data[0] is
    # the collated Data object.
    data = torch.load('data_coauthor_' + dataset + '/' + dataset + '/processed/data.pt')
    print("assigned val to data")
    G = to_networkx(data[0], to_undirected=True, remove_self_loops=True)
    print("made G")
    frc = FormanRicci(G)
    frc.compute_ricci_curvature()
    orc = OllivierRicci(G, alpha=0.5, verbose="INFO")
    orc.compute_ricci_curvature()
    print("Ollivier comp done")
    X = np.array(data[0].x)
    y_arr = np.array(data[0].y)
    # `classes` is keyword-only in recent scikit-learn; sort for a stable order.
    Y = pp.label_binarize(y_arr, classes=sorted(set(y_arr)))
    counts = [0] * 15
    train_idx = []
    for i, yval in enumerate(y_arr):
def __init__(self, path): dataset = "CS" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) Coauthor(path, dataset) super(CoauthorCSDataset, self).__init__(path, dataset)
import os
import random
import shutil
from collections import defaultdict
from types import SimpleNamespace

import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.datasets import PPI, Amazon, Coauthor, CoraFull, Planetoid, Reddit
from torch_geometric.transforms import AddSelfLoops, Compose, NormalizeFeatures, RemoveIsolatedNodes
from torch_geometric.utils import to_undirected
from tqdm import tqdm

# LOG, NeighborhoodCache, and _get_train_val_test_masks are module-level
# helpers defined elsewhere in this file.


def get_small_dataset(dataset_name, normalize_attributes=False, add_self_loops=False,
                      remove_isolated_nodes=False, make_undirected=False,
                      graph_availability=None, seed=0, create_adjacency_lists=True):
    """
    Get the pytorch_geometric.data.Data object associated with the specified dataset name.

    :param dataset_name: str => One of the datasets mentioned below.
    :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1.
    :param add_self_loops: Add self loops to the input graph.
    :param remove_isolated_nodes: Remove isolated nodes.
    :param make_undirected: Make the graph undirected.
    :param graph_availability: Either 'inductive' or 'transductive'. If transductive, all graph
                               nodes are available during training; otherwise, only training-split
                               nodes are available.
    :param seed: The random seed to use while splitting into train/val/test splits.
    :param create_adjacency_lists: Whether to process and store adjacency lists that can be used
                                   for efficient r-radius neighborhood sampling.
    :return: A pytorch_geometric.data.Data object for that dataset.
    """
    assert dataset_name in {
        'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs', 'coauthor-physics',
        'cora', 'cora-full', 'ppi', 'pubmed', 'reddit'
    }
    assert graph_availability in {'inductive', 'transductive'}

    # Compose transforms that should be applied.
    transforms = []
    if normalize_attributes:
        transforms.append(NormalizeFeatures())
    if remove_isolated_nodes:
        transforms.append(RemoveIsolatedNodes())
    if add_self_loops:
        transforms.append(AddSelfLoops())
    transforms = Compose(transforms) if transforms else None

    # Load the specified dataset and apply transforms.
    root_dir = '/tmp/{dir}'.format(dir=dataset_name)
    processed_dir = os.path.join(root_dir, dataset_name, 'processed')
    # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
    if os.path.exists(processed_dir) and os.path.isdir(processed_dir):
        shutil.rmtree(processed_dir)

    data = None

    def split_function(y):
        return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)

    if dataset_name in ['citeseer', 'cora', 'pubmed']:
        data = Planetoid(root=root_dir, name=dataset_name, pre_transform=transforms, split='full').data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'cora-full':
        data = CoraFull(root=root_dir, pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-computers':
        data = Amazon(root=root_dir, name='Computers', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-photo':
        data = Amazon(root=root_dir, name='Photo', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-cs':
        data = Coauthor(root=root_dir, name='CS', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-physics':
        data = Coauthor(root=root_dir, name='Physics', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'reddit':
        data = Reddit(root=root_dir, pre_transform=transforms).data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'ppi':
        data = SimpleNamespace()
        data.graphs = []
        for split in ['train', 'val', 'test']:
            split_data = PPI(root=root_dir, split=split, pre_transform=transforms)
            x_idxs = split_data.slices['x'].numpy()
            edge_idxs = split_data.slices['edge_index'].numpy()
            split_data = split_data.data
            for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:], edge_idxs, edge_idxs[1:]):
                graph = Data(split_data.x[x_start:x_end],
                             split_data.edge_index[:, e_start:e_end],
                             y=split_data.y[x_start:x_end])
                graph.num_nodes = int(x_end - x_start)
                graph.split = split
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.train_mask = all_true if split == 'train' else all_false
                graph.val_mask = all_true if split == 'val' else all_false
                graph.test_mask = all_true if split == 'test' else all_false
                data.graphs.append(graph)
        if seed != 0:
            temp_random = random.Random(seed)
            val_graphs = temp_random.sample(range(len(data.graphs)), 2)
            test_candidates = [graph_idx for graph_idx in range(len(data.graphs))
                               if graph_idx not in val_graphs]
            test_graphs = temp_random.sample(test_candidates, 2)
            for graph_idx, graph in enumerate(data.graphs):
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.split = ('test' if graph_idx in test_graphs
                               else 'val' if graph_idx in val_graphs else 'train')
                graph.train_mask = all_true if graph.split == 'train' else all_false
                graph.val_mask = all_true if graph.split == 'val' else all_false
                graph.test_mask = all_true if graph.split == 'test' else all_false

    if make_undirected:
        for graph in data.graphs:
            graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)
    LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).')

    # Populate adjacency lists for efficient k-neighborhood sampling.
    # Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists.
    LOG.info('Processing adjacency lists and degree information.')
    for graph in data.graphs:
        train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        adjacency_lists = defaultdict(list)
        not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy()
        val_mask = graph.val_mask.numpy()
        test_mask = graph.test_mask.numpy()

        if create_adjacency_lists:
            num_edges = graph.edge_index[0].shape[0]
            sources, dests = graph.edge_index[0].numpy(), graph.edge_index[1].numpy()
            for source, dest in tqdm(zip(sources, dests), total=num_edges, leave=False):
                if not_val_test_mask[dest] and not_val_test_mask[source]:
                    train_in_degrees[dest] += 1
                    val_in_degrees[dest] += 1
                elif val_mask[dest] and not test_mask[source]:
                    val_in_degrees[dest] += 1
                # Every edge counts toward the full-graph (test) degrees and
                # the adjacency lists.
                test_in_degrees[dest] += 1
                adjacency_lists[dest].append(source)
            graph.adjacency_lists = dict(adjacency_lists)

        graph.train_in_degrees = torch.from_numpy(train_in_degrees).long()
        graph.val_in_degrees = torch.from_numpy(val_in_degrees).long()
        graph.test_in_degrees = torch.from_numpy(test_in_degrees).long()
        if graph_availability == 'transductive':
            # All nodes are visible during training, so train/val degrees equal
            # the full-graph degrees. (The original referenced data.test_in_degrees,
            # which breaks for multi-graph datasets such as PPI.)
            graph.train_in_degrees = graph.test_in_degrees
            graph.val_in_degrees = graph.test_in_degrees
        graph.graph_availability = graph_availability

        # To accumulate any neighborhood perturbations to the graph.
        graph.perturbed_neighborhoods = defaultdict(set)
        graph.added_nodes = defaultdict(set)
        graph.modified_degrees = {}

        # For small datasets, cache the neighborhoods of all nodes for at least
        # 3 different radii queries.
        graph.use_cache = True
        graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3)

        graph.train_mask_original = graph.train_mask
        graph.val_mask_original = graph.val_mask
        graph.test_mask_original = graph.test_mask
        graph.train_mask = torch.ones(graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask

    return data
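
# Example invocation of get_small_dataset above (a sketch; LOG,
# NeighborhoodCache, and _get_train_val_test_masks must exist in the module):
data = get_small_dataset('cora', normalize_attributes=True, make_undirected=True,
                         graph_availability='transductive', seed=42)
graph = data.graphs[0]
print(graph.num_nodes, int(graph.train_mask.sum()), int(graph.test_mask.sum()))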