def test_normalize_scale():
    assert NormalizeFeatures().__repr__() == 'NormalizeFeatures()'

    x = torch.Tensor([[1, 0, 1], [0, 1, 0], [0, 0, 0]])

    data = Data(x=x)
    data = NormalizeFeatures()(data)
    assert len(data) == 1
    assert data.x.tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]
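# For reference: NormalizeFeatures() row-normalizes node features so that each
# row sums to 1, leaving all-zero rows untouched. A manual sketch of the same
# computation, assuming the sum-based variant of the transform used above:
x = torch.Tensor([[1, 0, 1], [0, 1, 0], [0, 0, 0]])
row_sum = x.sum(dim=-1, keepdim=True).clamp(min=1)  # clamp avoids division by zero
assert (x / row_sum).tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]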
def get_graph_dataset(name, destination_dir="", min_num_nodes=None, max_num_nodes=None):
    """Get a dataset from the TUD library (https://chrsmrrs.github.io/datasets/docs/home/)."""
    if destination_dir is None:
        destination_dir = ""

    if name == "ENZYMES":
        num_node_features = 18
        train_val_test_ratio = [0.7, 0.1, 0.2]
    elif name == "PROTEINS":
        name = "PROTEINS_full"
        num_node_features = 29
        train_val_test_ratio = [0.7, 0.1, 0.2]
    elif name == "DHFR":
        num_node_features = 3
        train_val_test_ratio = [0.7, 0.1, 0.2]
    elif name == "COX2":
        num_node_features = 3
        train_val_test_ratio = [0.7, 0.2, 0.1]
    else:
        raise ValueError(f"Unsupported dataset name: {name}")

    dataset = TUDataset(
        root=destination_dir,
        name=name,
        use_node_attr=True,
        pre_filter=DataSizeFilter(min_num_nodes, max_num_nodes),
        pre_transform=SeparateNodeFeaturesAndLabels(num_node_features),
        transform=NormalizeFeatures())
    print_dataset_stats(dataset)
    return dataset, train_val_test_ratio
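# A minimal usage sketch for get_graph_dataset(); the destination directory
# and node-count bounds are illustrative assumptions, and DataSizeFilter,
# SeparateNodeFeaturesAndLabels, and print_dataset_stats must be defined
# elsewhere in the project.
dataset, train_val_test_ratio = get_graph_dataset(
    "ENZYMES", destination_dir="data/TUD", min_num_nodes=10, max_num_nodes=500)
print(len(dataset), train_val_test_ratio)  # number of graphs kept, [0.7, 0.1, 0.2]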
def get_node_dataset(name, norm_feat=False, root=None):
    r"""A pre-implemented function to retrieve node datasets from Planetoid.

    Args:
        name (string): The name of the dataset (:obj:`"Cora"`,
            :obj:`"CiteSeer"`, :obj:`"PubMed"`).
        norm_feat (bool, optional): Whether to normalize node features.
        root (string, optional): Root directory where the dataset should be
            saved. (default: :obj:`None`)

    :rtype: :class:`torch_geometric.data.Dataset`

    Example
    -------
    >>> dataset = get_node_dataset("Cora")
    >>> dataset
    Cora()
    """
    root = "." if root is None else root
    transform = NormalizeFeatures() if norm_feat else None
    full_dataset = Planetoid(root + "/node_dataset/", name, transform=transform)
    return full_dataset
def test_hetero_normalize_scale():
    x = torch.tensor([[1, 0, 1], [0, 1, 0], [0, 0, 0]], dtype=torch.float)

    data = HeteroData()
    data['v'].x = x
    data['w'].x = x
    data = NormalizeFeatures()(data)
    assert data['v'].x.tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]
    assert data['w'].x.tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]
def data_prepare(dataset_name='Cora'):
    '''
    Args:
        dataset_name: name of the Planetoid dataset to load.
    Returns:
        An object containing the properties and data of the graph.
    '''
    if dataset_name in ('Cora', 'CiteSeer', 'PubMed'):
        dataset = Planetoid(root='../../data/', name=dataset_name,
                            transform=NormalizeFeatures())
    data = dataset[0]  # Get the first graph object.
    hr = homo_ratio(data.edge_index.t(), data.y)
    print('homophily ratio:', hr)
    return data, dataset.num_features, dataset.num_classes
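# homo_ratio() is called above but not defined in this snippet. A minimal
# sketch, assuming it computes the edge homophily ratio (the fraction of edges
# whose endpoints share a label); the signature is taken from the call site,
# the body is an assumption.
def homo_ratio(edges, y):
    # edges: [num_edges, 2] tensor of (source, target) pairs; y: node labels.
    same_label = y[edges[:, 0]] == y[edges[:, 1]]
    return same_label.float().mean().item()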
def __init__(self, args):
    super(GCN, self).__init__()
    num_node_features = args.get("num_node_features")
    num_classes = args.get("num_classes")
    self.dropout = args.get("dropout", 0.0)

    self.norm = NormalizeFeatures()
    self.conv1 = GCNConv(num_node_features, 32)
    self.conv2 = GCNConv(32, 32)
    self.fc1 = Linear(32, 32)
    self.fc2 = Linear(32, num_classes)
    self.graph_embedding_function = args.get("graph_embedding_function", None)
    self.reset_parameters()
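# Only __init__ is shown above; a plausible forward() for this layer layout is
# sketched below, assuming global mean pooling when no graph_embedding_function
# is supplied. This is an assumption, not the original implementation, and it
# requires torch.nn.functional as F and torch_geometric.nn.global_mean_pool.
def forward(self, x, edge_index, batch):
    x = F.relu(self.conv1(x, edge_index))
    x = F.relu(self.conv2(x, edge_index))
    pool = self.graph_embedding_function or global_mean_pool
    x = pool(x, batch)  # one embedding vector per graph
    x = F.relu(self.fc1(x))
    x = F.dropout(x, p=self.dropout, training=self.training)
    return self.fc2(x)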
def visualize(h, color):
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())

    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    # plt.show()
    plt.pause(2)
    plt.close()


dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

print()
print(f'Dataset: {dataset}')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
        self.lin1.reset_parameters()
        self.lin2.reset_parameters()

    def forward(self, x, edge_index, dropout_ratio=0.6):
        x = F.dropout(x, p=dropout_ratio, training=self.training)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=dropout_ratio, training=self.training)
        x = self.lin2(x)
        x = self.prop(x, edge_index)
        return F.log_softmax(x, dim=1)


if __name__ == '__main__':
    from torch_geometric.datasets import Planetoid
    from torch_geometric.transforms import NormalizeFeatures

    dataset = Planetoid(root='../../data/', name='Cora', transform=NormalizeFeatures())
    model = APPNP_Net(hidden_channels=16)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    # forward() returns log-probabilities, so NLLLoss is the matching
    # criterion (CrossEntropyLoss would apply log-softmax a second time).
    criterion = torch.nn.NLLLoss()
    data = dataset[0]

    def train():
        model.train()
        optimizer.zero_grad()  # Clear gradients.
        out = model(data.x, data.edge_index)  # Perform a single forward pass.
        # Compute the loss solely based on the training nodes.
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
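    # A minimal driver loop, assuming the train() helper above; the epoch
    # count is an illustrative choice, not part of the original snippet.
    for epoch in range(1, 201):
        train()
        if epoch % 50 == 0:
            print(f'Epoch {epoch:03d} done')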
def test():
    model.eval()
    out = model(data.x, data.edge_index)  # One forward pass serves all splits.
    pred = out.argmax(dim=1)
    results = []
    for mask in ["train_mask", "val_mask", "test_mask"]:
        num_correct = (pred[data[mask]] == data.y[data[mask]])
        acc = int(num_correct.sum()) / int(data[mask].sum())
        results.append(acc)
    return results


dataset = Planetoid(root="data/Planetoid", name='Cora', transform=NormalizeFeatures())
print(f"Dataset: {dataset}")
print("=======================")
print(f"Number of graphs: {len(dataset)}")
print(f"Number of features: {dataset.num_features}")
print(f"Number of classes: {dataset.num_classes}")

data = dataset[0]
print(data)
print("=====================================================================")

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
def default_transforms() -> Optional[Dict[str, Callable]]:
    return {
        "pre_tensor_transform": NormalizeFeatures(),
        "collate": Batch.from_data_list,
    }
def get_small_dataset(dataset_name,
                      normalize_attributes=False,
                      add_self_loops=False,
                      remove_isolated_nodes=False,
                      make_undirected=False,
                      graph_availability=None,
                      seed=0,
                      create_adjacency_lists=True):
    """
    Get the pytorch_geometric.data.Data object associated with the specified dataset name.

    :param dataset_name: str => One of the datasets mentioned below.
    :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1.
    :param add_self_loops: Add self loops to the input Graph.
    :param remove_isolated_nodes: Remove isolated nodes.
    :param make_undirected: Make the Graph undirected.
    :param graph_availability: Either 'inductive' or 'transductive'. If transductive, all the graph
                               nodes are available during training. Otherwise, only training split
                               nodes are available.
    :param seed: The random seed to use while splitting into train/val/test splits.
    :param create_adjacency_lists: Whether to process and store adjacency lists that can be used
                                   for efficient r-radius neighborhood sampling.
    :return: A pytorch_geometric.data.Data object for that dataset.
    """
    assert dataset_name in {
        'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs',
        'coauthor-physics', 'cora', 'cora-full', 'ppi', 'pubmed', 'reddit'
    }
    assert graph_availability in {'inductive', 'transductive'}

    # Compose transforms that should be applied.
    transforms = []
    if normalize_attributes:
        transforms.append(NormalizeFeatures())
    if remove_isolated_nodes:
        transforms.append(RemoveIsolatedNodes())
    if add_self_loops:
        transforms.append(AddSelfLoops())
    transforms = Compose(transforms) if transforms else None

    # Load the specified dataset and apply transforms.
    root_dir = '/tmp/{dir}'.format(dir=dataset_name)
    processed_dir = os.path.join(root_dir, dataset_name, 'processed')

    # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
    if os.path.exists(processed_dir) and os.path.isdir(processed_dir):
        shutil.rmtree(processed_dir)

    data = None

    def split_function(y):
        return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)

    if dataset_name in ['citeseer', 'cora', 'pubmed']:
        data = Planetoid(root=root_dir, name=dataset_name,
                         pre_transform=transforms, split='full').data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'cora-full':
        data = CoraFull(root=root_dir, pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-computers':
        data = Amazon(root=root_dir, name='Computers', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-photo':
        data = Amazon(root=root_dir, name='Photo', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-cs':
        data = Coauthor(root=root_dir, name='CS', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-physics':
        data = Coauthor(root=root_dir, name='Physics', pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'reddit':
        data = Reddit(root=root_dir, pre_transform=transforms).data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'ppi':
        data = SimpleNamespace()
        data.graphs = []
        for split in ['train', 'val', 'test']:
            split_data = PPI(root=root_dir, split=split, pre_transform=transforms)
            x_idxs = split_data.slices['x'].numpy()
            edge_idxs = split_data.slices['edge_index'].numpy()
            split_data = split_data.data
            for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                      edge_idxs, edge_idxs[1:]):
                graph = Data(split_data.x[x_start:x_end],
                             split_data.edge_index[:, e_start:e_end],
                             y=split_data.y[x_start:x_end])
                graph.num_nodes = int(x_end - x_start)
                graph.split = split
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.train_mask = all_true if split == 'train' else all_false
                graph.val_mask = all_true if split == 'val' else all_false
                graph.test_mask = all_true if split == 'test' else all_false
                data.graphs.append(graph)
        if seed != 0:
            temp_random = random.Random(seed)
            val_graphs = temp_random.sample(range(len(data.graphs)), 2)
            test_candidates = [graph_idx for graph_idx in range(len(data.graphs))
                               if graph_idx not in val_graphs]
            test_graphs = temp_random.sample(test_candidates, 2)
            for graph_idx, graph in enumerate(data.graphs):
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.split = ('test' if graph_idx in test_graphs
                               else 'val' if graph_idx in val_graphs else 'train')
                graph.train_mask = all_true if graph.split == 'train' else all_false
                graph.val_mask = all_true if graph.split == 'val' else all_false
                graph.test_mask = all_true if graph.split == 'test' else all_false

    if make_undirected:
        for graph in data.graphs:
            graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)

    LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).')

    # Populate adjacency lists for efficient k-neighborhood sampling.
    # Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists.
    LOG.info('Processing adjacency lists and degree information.')
    for graph in data.graphs:
        train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        adjacency_lists = defaultdict(list)
        not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy()
        val_mask = graph.val_mask.numpy()
        test_mask = graph.test_mask.numpy()

        if create_adjacency_lists:
            num_edges = graph.edge_index[0].shape[0]
            sources, dests = graph.edge_index[0].numpy(), graph.edge_index[1].numpy()
            for source, dest in tqdm(zip(sources, dests), total=num_edges, leave=False):
                if not_val_test_mask[dest] and not_val_test_mask[source]:
                    train_in_degrees[dest] += 1
                    val_in_degrees[dest] += 1
                elif val_mask[dest] and not test_mask[source]:
                    val_in_degrees[dest] += 1
                test_in_degrees[dest] += 1
                adjacency_lists[dest].append(source)
            graph.adjacency_lists = dict(adjacency_lists)

        graph.train_in_degrees = torch.from_numpy(train_in_degrees).long()
        graph.val_in_degrees = torch.from_numpy(val_in_degrees).long()
        graph.test_in_degrees = torch.from_numpy(test_in_degrees).long()
        if graph_availability == 'transductive':
            graph.train_in_degrees = graph.test_in_degrees
            graph.val_in_degrees = graph.test_in_degrees
        graph.graph_availability = graph_availability

        # To accumulate any neighborhood perturbations to the graph.
        graph.perturbed_neighborhoods = defaultdict(set)
        graph.added_nodes = defaultdict(set)
        graph.modified_degrees = {}

        # For small datasets, cache the neighborhoods for all nodes for at least 3 different radii queries.
        graph.use_cache = True
        graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3)

        graph.train_mask_original = graph.train_mask
        graph.val_mask_original = graph.val_mask
        graph.test_mask_original = graph.test_mask
        graph.train_mask = torch.ones(graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask

    return data
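# A minimal usage sketch for get_small_dataset(); the dataset name and option
# values are illustrative assumptions.
data = get_small_dataset('cora',
                         normalize_attributes=True,
                         make_undirected=True,
                         graph_availability='transductive')
print(f'Loaded {len(data.graphs)} graph(s).')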
def per_sample_transform(self) -> Callable:
    return PyGTransformAdapter(NormalizeFeatures())
        return 'data.pt'

    def download(self):
        for name in self.raw_file_names:
            download_url('{}/{}'.format(self.url, name), self.raw_dir)

    def process(self):
        data = read_planetoid_data(self.raw_dir, 'pubmed')
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])

    def __repr__(self):
        return '{}()'.format(self.name)


dataset = PlanetoidPubMed(root='data/PlanetoidPubMed/', transform=NormalizeFeatures())
print('dataset.num_features:', dataset.num_features)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)


def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x, data.edge_index)  # Perform a single forward pass.
    # Compute the loss solely based on the training nodes.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss


def test():
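# The snippet above is truncated at `def test():`. Its train() helper also
# references model, optimizer, and criterion that are not defined here; a
# minimal sketch follows, assuming a two-layer GCN. The architecture and
# hyperparameters are illustrative assumptions, not the original code.
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        return self.conv2(x, edge_index)

model = GCN(dataset.num_features, 16, dataset.num_classes).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()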
seed = 0

import random
random.seed(seed)
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

from torch_geometric.transforms import NormalizeFeatures

from search_space import pruning_search_space_by_eda, precompute_cache
from data_prepare import load_data

# for name in ['cora', 'usa-airports', 'photo', 'wikics']:
for name in ['photo']:
    data = load_data(name, seed, transform=NormalizeFeatures())
    ss, (data_aug, fe, hpo, nas) = pruning_search_space_by_eda(data)
    precompute_cache(name, data, data_aug, fe)