Example #1
def test_normalize_scale():
    assert NormalizeFeatures().__repr__() == 'NormalizeFeatures()'

    x = torch.Tensor([[1, 0, 1], [0, 1, 0], [0, 0, 0]])

    data = Data(x=x)
    data = NormalizeFeatures()(data)
    assert len(data) == 1
    assert data.x.tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]
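The assertion above fixes the semantics: NormalizeFeatures row-normalizes x so that every non-zero row sums to 1 and all-zero rows are left untouched. A minimal stand-alone sketch of the same computation (manual re-implementation for illustration; the transform's internal code may differ in details):

import torch

x = torch.tensor([[1., 0., 1.], [0., 1., 0.], [0., 0., 0.]])
# Divide each row by its sum; clamping the sum to 1 keeps all-zero rows at zero.
x_norm = x / x.sum(dim=-1, keepdim=True).clamp(min=1.0)
print(x_norm.tolist())  # [[0.5, 0.0, 0.5], [0.0, 1.0, 0.0], [0.0, 0.0, 0.0]]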
Example #2
def get_graph_dataset(name,
                      destination_dir="",
                      min_num_nodes=None,
                      max_num_nodes=None):
    """ Get a dataset from the TUD library (https://chrsmrrs.github.io/datasets/docs/home/) """
    if destination_dir is None:
        destination_dir = ""

    if name == "ENZYMES":
        num_node_features = 18
        train_val_test_ratio = [0.7, 0.1, 0.2]
    elif name == "PROTEINS":
        name = "PROTEINS_full"
        num_node_features = 29
        train_val_test_ratio = [0.7, 0.1, 0.2]
    elif name == "DHFR":
        num_node_features = 3
        train_val_test_ratio = [0.7, 0.1, 0.2]
    elif name == "COX2":
        num_node_features = 3
        train_val_test_ratio = [0.7, 0.2, 0.1]

    dataset = TUDataset(
        root=destination_dir,
        name=name,
        use_node_attr=True,
        pre_filter=DataSizeFilter(min_num_nodes, max_num_nodes),
        pre_transform=SeparateNodeFeaturesAndLabels(num_node_features),
        transform=NormalizeFeatures())
    print_dataset_stats(dataset)
    return dataset, train_val_test_ratio
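A short usage sketch for the helper above (hypothetical call; assumes the TUDataset download succeeds and that DataSizeFilter and SeparateNodeFeaturesAndLabels are importable from this code base):

# Load ENZYMES, keep only graphs with 10-200 nodes, and row-normalize node features on access.
dataset, split_ratio = get_graph_dataset("ENZYMES",
                                         destination_dir="data/TUD",
                                         min_num_nodes=10,
                                         max_num_nodes=200)
print(len(dataset), split_ratio)  # number of retained graphs, [0.7, 0.1, 0.2]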
Example #3
def get_node_dataset(name, norm_feat=False, root=None):
    r"""A pre-implemented function to retrieve node datasets from Planetoid.

    Args:
        name (string): The name of the dataset (:obj:`"Cora"`,
            :obj:`"CiteSeer"`, :obj:`"PubMed"`).
        norm_feat (bool, optional): Whether to normalize node features.
        root (string, optional): Root directory where the dataset should be saved.
            (default: :obj:`None`)
        
    :rtype: :class:`torch_geometric.data.Dataset`
    
    Example
    -------
    >>> dataset = get_node_dataset("Cora")
    >>> dataset
    Cora()
    """
    root = "." if root is None else root
    transform = NormalizeFeatures() if norm_feat else None
    full_dataset = Planetoid(root + "/node_dataset/",
                             name,
                             transform=transform)

    return full_dataset
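A quick usage sketch with feature normalization switched on (assumes the Planetoid files can be downloaded to the given root):

dataset = get_node_dataset("Cora", norm_feat=True, root="data")
data = dataset[0]
# With NormalizeFeatures applied, every non-zero feature row sums to 1.
print(data.x.sum(dim=-1).max().item())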
Example #4
def test_hetero_normalize_scale():
    x = torch.tensor([[1, 0, 1], [0, 1, 0], [0, 0, 0]], dtype=torch.float)

    data = HeteroData()
    data['v'].x = x
    data['w'].x = x
    data = NormalizeFeatures()(data)
    assert data['v'].x.tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]
    assert data['w'].x.tolist() == [[0.5, 0, 0.5], [0, 1, 0], [0, 0, 0]]
Example #5
def data_prepare(dataset_name='Cora'):
    '''
        Args:
            dataset_name
        Returns:
            an object contains properties and data of the graph
    '''
    if dataset_name in ('Cora', 'CiteSeer', 'PubMed'):
        dataset = Planetoid(root='../../data/',
                            name=dataset_name,
                            transform=NormalizeFeatures())
        data = dataset[0]  # Get the first graph object.
        hr = homo_ratio(data.edge_index.t(), data.y)
        print('homo ratio:', hr)
    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")
    return data, dataset.num_features, dataset.num_classes
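A minimal call of the helper above (assumes the Planetoid download target ../../data/ is writable and that homo_ratio is defined elsewhere in this code base):

data, num_features, num_classes = data_prepare("Cora")
print(num_features, num_classes)  # 1433 features, 7 classes for Cora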
Example #6
    def __init__(self, args):
        super(GCN, self).__init__()

        num_node_features = args.get("num_node_features")
        num_classes = args.get("num_classes")
        self.dropout = args.get("dropout", 0.0)

        self.norm = NormalizeFeatures()

        self.conv1 = GCNConv(num_node_features, 32)
        self.conv2 = GCNConv(32, 32)

        self.fc1 = Linear(32, 32)
        self.fc2 = Linear(32, num_classes)

        self.graph_embedding_function = args.get("graph_embedding_function",
                                                 None)

        self.reset_parameters()
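A hedged construction sketch for the module above (the args keys mirror the .get() calls in __init__; `dataset` stands in for any PyG dataset, and reset_parameters and the forward pass are assumed to be defined in the full class):

args = {
    "num_node_features": dataset.num_features,  # e.g. 1433 for Cora
    "num_classes": dataset.num_classes,         # e.g. 7 for Cora
    "dropout": 0.5,
}
model = GCN(args)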
Example #7
def visualize(h, color):
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())

    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    # plt.show()
    plt.pause(2)
    plt.close()


dataset = Planetoid(root='data/Planetoid',
                    name='Cora',
                    transform=NormalizeFeatures())

print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)

# print('===========================================================================================================')
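A short usage sketch for visualize above (hypothetical: assumes a `model` that maps node features and edge_index to per-node outputs):

model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
visualize(out, color=data.y)  # 2D t-SNE projection, colored by class label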
Example #8
        self.lin1.reset_parameters()
        self.lin2.reset_parameters()
        
    def forward(self, x, edge_index, dropout_ratio=0.6):
        x = F.dropout(x, p=dropout_ratio, training=self.training)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=dropout_ratio, training=self.training)
        x = self.lin2(x)
        x = self.prop(x, edge_index)
        return F.log_softmax(x, dim=1)

if __name__ == '__main__':
    from torch_geometric.datasets import Planetoid
    from torch_geometric.transforms import NormalizeFeatures

    dataset = Planetoid(root='../../data/', name='Cora', transform=NormalizeFeatures())

    model = APPNP_Net(hidden_channels=16)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    data = dataset[0]
    
    def train():
        model.train()
        optimizer.zero_grad()  # Clear gradients.
        out = model(data.x, data.edge_index)  # Perform a single forward pass.
        loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        return loss
Example #9
def test():
    model.eval()
    results = []
    for mask in ["train_mask", "val_mask", "test_mask"]:
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        num_correct = (pred[data[mask]] == data.y[data[mask]])
        acc = int(num_correct.sum()) / int(data[mask].sum())
        results.append(acc)
    
    return results



dataset = Planetoid(root="data/Planetoid", name='Cora', transform=NormalizeFeatures())

print(f"Datasets: {dataset}")
print("=======================")
print(f"NUmber of graphs: {len(dataset)}")
print(f"NUmber of features: {dataset.num_features}")
print(f"NUmber of calsses: {dataset.num_classes}")

data = dataset[0]

print(data)
print("=====================================================================")

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
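A minimal training loop sketch tying together the train() helper from Example #8 and the test() helper above (assumes model, data, optimizer, and criterion are set up as in Example #8):

for epoch in range(1, 101):
    loss = train()
    train_acc, val_acc, test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')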
Example #10
def default_transforms() -> Optional[Dict[str, Callable]]:
    return {
        "pre_tensor_transform": NormalizeFeatures(),
        "collate": Batch.from_data_list
    }
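The "collate" entry above relies on PyG's Batch.from_data_list, which stacks a list of Data objects into one disconnected batch graph. A tiny illustration:

import torch
from torch_geometric.data import Batch, Data

batch = Batch.from_data_list([Data(x=torch.rand(3, 4)), Data(x=torch.rand(2, 4))])
print(batch.num_graphs)  # 2
print(batch.x.shape)     # torch.Size([5, 4])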
Example #11
def get_small_dataset(dataset_name,
                      normalize_attributes=False,
                      add_self_loops=False,
                      remove_isolated_nodes=False,
                      make_undirected=False,
                      graph_availability=None,
                      seed=0,
                      create_adjacency_lists=True):
    """
    Get the pytorch_geometric.data.Data object associated with the specified dataset name.
    :param dataset_name: str => One of the datasets mentioned below.
    :param normalize_attributes: Whether the attributes for each node should be normalized to sum to 1.
    :param add_self_loops: Add self loops to the input Graph.
    :param remove_isolated_nodes: Remove isolated nodes.
    :param make_undirected: Make the Graph undirected.
    :param graph_availability: Either inductive or transductive. If transductive, all the graph nodes are available
                               during training. Otherwise, only training split nodes are available.
    :param seed: The random seed to use while splitting into train/val/test splits.
    :param create_adjacency_lists: Whether to process and store adjacency lists that can be used for efficient
                                   r-radius neighborhood sampling.
    :return: A pytorch_geometric.data.Data object for that dataset.
    """
    assert dataset_name in {
        'amazon-computers', 'amazon-photo', 'citeseer', 'coauthor-cs',
        'coauthor-physics', 'cora', 'cora-full', 'ppi', 'pubmed', 'reddit'
    }
    assert graph_availability in {'inductive', 'transductive'}

    # Compose transforms that should be applied.
    transforms = []
    if normalize_attributes:
        transforms.append(NormalizeFeatures())
    if remove_isolated_nodes:
        transforms.append(RemoveIsolatedNodes())
    if add_self_loops:
        transforms.append(AddSelfLoops())
    transforms = Compose(transforms) if transforms else None

    # Load the specified dataset and apply transforms.
    root_dir = '/tmp/{dir}'.format(dir=dataset_name)
    processed_dir = os.path.join(root_dir, dataset_name, 'processed')
    # Remove any previously pre-processed data, so pytorch_geometric can pre-process it again.
    if os.path.exists(processed_dir) and os.path.isdir(processed_dir):
        shutil.rmtree(processed_dir)

    data = None

    def split_function(y):
        return _get_train_val_test_masks(y.shape[0], y, 0.2, 0.2, seed)

    if dataset_name in ['citeseer', 'cora', 'pubmed']:
        data = Planetoid(root=root_dir,
                         name=dataset_name,
                         pre_transform=transforms,
                         split='full').data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'cora-full':
        data = CoraFull(root=root_dir, pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-computers':
        data = Amazon(root=root_dir,
                      name='Computers',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'amazon-photo':
        data = Amazon(root=root_dir, name='Photo',
                      pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-cs':
        data = Coauthor(root=root_dir, name='CS',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'coauthor-physics':
        data = Coauthor(root=root_dir,
                        name='Physics',
                        pre_transform=transforms).data
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'reddit':
        data = Reddit(root=root_dir, pre_transform=transforms).data
        if seed != 0:
            data.train_mask, data.val_mask, data.test_mask = split_function(
                data.y.numpy())
        data.graphs = [data]
    elif dataset_name == 'ppi':
        data = SimpleNamespace()
        data.graphs = []
        for split in ['train', 'val', 'test']:
            split_data = PPI(root=root_dir,
                             split=split,
                             pre_transform=transforms)
            x_idxs = split_data.slices['x'].numpy()
            edge_idxs = split_data.slices['edge_index'].numpy()
            split_data = split_data.data
            for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                      edge_idxs,
                                                      edge_idxs[1:]):
                graph = Data(split_data.x[x_start:x_end],
                             split_data.edge_index[:, e_start:e_end],
                             y=split_data.y[x_start:x_end])
                graph.num_nodes = int(x_end - x_start)
                graph.split = split
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.train_mask = all_true if split == 'train' else all_false
                graph.val_mask = all_true if split == 'val' else all_false
                graph.test_mask = all_true if split == 'test' else all_false
                data.graphs.append(graph)
        if seed != 0:
            temp_random = random.Random(seed)
            val_graphs = temp_random.sample(range(len(data.graphs)), 2)
            test_candidates = [
                graph_idx for graph_idx in range(len(data.graphs))
                if graph_idx not in val_graphs
            ]
            test_graphs = temp_random.sample(test_candidates, 2)
            for graph_idx, graph in enumerate(data.graphs):
                all_true = torch.ones(graph.num_nodes).bool()
                all_false = torch.zeros(graph.num_nodes).bool()
                graph.split = 'test' if graph_idx in test_graphs else 'val' if graph_idx in val_graphs else 'train'
                graph.train_mask = all_true if graph.split == 'train' else all_false
                graph.val_mask = all_true if graph.split == 'val' else all_false
                graph.test_mask = all_true if graph.split == 'test' else all_false

    if make_undirected:
        for graph in data.graphs:
            graph.edge_index = to_undirected(graph.edge_index, graph.num_nodes)

    LOG.info(f'Downloaded and transformed {len(data.graphs)} graph(s).')

    # Populate adjacency lists for efficient k-neighborhood sampling.
    # Only retain edges coming into a node and reverse the edges for the purpose of adjacency lists.
    LOG.info('Processing adjacency lists and degree information.')

    for graph in data.graphs:
        train_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        val_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        test_in_degrees = np.zeros(graph.num_nodes, dtype=np.int64)
        adjacency_lists = defaultdict(list)
        not_val_test_mask = (~graph.val_mask & ~graph.test_mask).numpy()
        val_mask = graph.val_mask.numpy()
        test_mask = graph.test_mask.numpy()

        if create_adjacency_lists:
            num_edges = graph.edge_index[0].shape[0]
            sources, dests = graph.edge_index[0].numpy(
            ), graph.edge_index[1].numpy()
            for source, dest in tqdm(zip(sources, dests),
                                     total=num_edges,
                                     leave=False):
                if not_val_test_mask[dest] and not_val_test_mask[source]:
                    train_in_degrees[dest] += 1
                    val_in_degrees[dest] += 1
                elif val_mask[dest] and not test_mask[source]:
                    val_in_degrees[dest] += 1
                test_in_degrees[dest] += 1
                adjacency_lists[dest].append(source)

        graph.adjacency_lists = dict(adjacency_lists)
        graph.train_in_degrees = torch.from_numpy(train_in_degrees).long()
        graph.val_in_degrees = torch.from_numpy(val_in_degrees).long()
        graph.test_in_degrees = torch.from_numpy(test_in_degrees).long()
        if graph_availability == 'transductive':
            graph.train_in_degrees = graph.test_in_degrees
            graph.val_in_degrees = graph.test_in_degrees

        graph.graph_availability = graph_availability

        # To accumulate any neighborhood perturbations to the graph.
        graph.perturbed_neighborhoods = defaultdict(set)
        graph.added_nodes = defaultdict(set)
        graph.modified_degrees = {}

        # For small datasets, cache the neighborhoods for all nodes for at least 3 different radii queries.
        graph.use_cache = True
        graph.neighborhood_cache = NeighborhoodCache(graph.num_nodes * 3)

        graph.train_mask_original = graph.train_mask
        graph.val_mask_original = graph.val_mask
        graph.test_mask_original = graph.test_mask

        graph.train_mask = torch.ones(
            graph.num_nodes).bool() & ~graph.val_mask & ~graph.test_mask

    return data
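A hedged call sketch for the loader above (argument values must satisfy the asserts at the top of the function; data is downloaded to /tmp/<dataset_name> and re-processed on every call):

data = get_small_dataset('cora',
                         normalize_attributes=True,
                         make_undirected=True,
                         graph_availability='transductive')
print(len(data.graphs))  # 1 graph for cora; masks and adjacency lists are attached as attributes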
Example #12
def per_sample_transform(self) -> Callable:
    return PyGTransformAdapter(NormalizeFeatures())
Example #13
        return 'data.pt'

    def download(self):
        for name in self.raw_file_names:
            download_url('{}/{}'.format(self.url, name), self.raw_dir)

    def process(self):
        data = read_planetoid_data(self.raw_dir, 'pubmed')
        data = data if self.pre_transform is None else self.pre_transform(data)
        torch.save(self.collate([data]), self.processed_paths[0])

    def __repr__(self):
        return '{}()'.format(self.name)


dataset = PlanetoidPubMed(root='data/PlanetoidPubMed/', transform=NormalizeFeatures())
print('dataset.num_features:', dataset.num_features)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = dataset[0].to(device)

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x, data.edge_index)  # Perform a single forward pass.
    # Compute the loss solely based on the training nodes.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test():
Example #14
seed = 0

import random
random.seed(seed)
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

from torch_geometric.transforms import NormalizeFeatures
from search_space import pruning_search_space_by_eda, precompute_cache
from data_prepare import load_data

# for name in ['cora', 'usa-airports', 'photo', 'wikics']:
for name in ['photo']:
    data = load_data(name, seed, transform=NormalizeFeatures())
    ss, (data_aug, fe, hpo, nas) = pruning_search_space_by_eda(data)
    precompute_cache(name, data, data_aug, fe)