Example #1
import os

import numpy as np
import scipy.sparse as sp
import torch
import torch_geometric.datasets as geo_data

# DATA_ROOT and the helpers normalize_adj_row and row_l1_normalize are
# assumed to be defined elsewhere in the project.


def load_data(data_name='cora',
              normalize_feature=True,
              missing_rate=0,
              cuda=False):
    # Other Planetoid datasets can be used; some do not provide masks.
    data = geo_data.Planetoid(os.path.join(DATA_ROOT, data_name),
                              data_name).data
    # original split
    data.train_mask = data.train_mask.type(torch.bool)
    data.val_mask = data.val_mask.type(torch.bool)
    # data.test_mask = data.test_mask.type(torch.bool)
    # expand test_mask to all remaining nodes
    data.test_mask = ~(data.train_mask | data.val_mask)
    # get adjacency matrix
    n = len(data.x)
    adj = sp.csr_matrix((np.ones(data.edge_index.shape[1]), data.edge_index),
                        shape=(n, n))
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(
        adj.T > adj) + sp.eye(adj.shape[0])
    adj = normalize_adj_row(
        adj)  # symmetric normalization performed worse here; worth investigating
    #data.adj = to_torch_sparse(adj)
    data.adj = torch.FloatTensor(np.array(adj.todense()))
    # normalize feature
    if normalize_feature:
        data.x = row_l1_normalize(data.x)

    # generate missing feature setting
    indices_dir = os.path.join(DATA_ROOT, data_name, 'indices')
    if not os.path.isdir(indices_dir):
        os.mkdir(indices_dir)
    missing_indices_file = os.path.join(
        indices_dir, "indices_missing_rate={}.npy".format(missing_rate))
    if not os.path.exists(missing_indices_file):
        erasing_pool = torch.arange(n)[
            ~data.train_mask]  # training nodes always keep their full features
        size = int(len(erasing_pool) * (missing_rate / 100))
        idx_erased = np.random.choice(erasing_pool, size=size, replace=False)
        np.save(missing_indices_file, idx_erased)
    else:
        idx_erased = np.load(missing_indices_file)
    # erasing feature for random missing
    if missing_rate > 0:
        data.x[idx_erased] = 0

    if cuda:
        data.x = data.x.cuda()
        data.y = data.y.cuda()
        data.adj = data.adj.cuda()

    return data
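
A minimal usage sketch for the function above, assuming the project-level helpers it calls are importable; the dataset name and missing rate are illustrative values:

data = load_data(data_name='cora', missing_rate=50,
                 cuda=torch.cuda.is_available())
print(data.x.shape)                # node feature matrix
print(data.adj.shape)              # dense, row-normalized adjacency
print(int(data.train_mask.sum()))  # number of training nodes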
Example #2
import os.path as osp

import numpy as np
import scipy.sparse as sp
import torch_geometric.datasets as geo_data

# normalize_adj_row (here returning the normalized matrix and the degree
# vector) and to_torch_sparse are assumed project helpers.


def load_data(args):
    DATA_ROOT = 'datasets'
    path = osp.join(DATA_ROOT, args.data)
    data = geo_data.Planetoid(path, args.data)[0]

    # data.train_mask = data.train_mask.type(torch.bool)
    # data.val_mask = data.val_mask.type(torch.bool)
    # data.test_mask = data.test_mask.type(torch.bool)
    # expand test_mask to all rest nodes
    # data.test_mask = ~(data.train_mask + data.val_mask)
    # get adjacency matrix
    n = len(data.x)
    adj = sp.csr_matrix((np.ones(data.edge_index.shape[1]), data.edge_index),
                        shape=(n, n))
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(
        adj.T > adj) + sp.eye(adj.shape[0])
    adj, degree = normalize_adj_row(
        adj)  # symmetric normalization performed worse here; worth investigating
    data.adj = to_torch_sparse(adj)

    return data, degree
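
A hedged usage sketch: args only needs a .data attribute here, so a simple namespace is enough for a quick test (the dataset name is illustrative):

from types import SimpleNamespace

args = SimpleNamespace(data='Cora')
data, degree = load_data(args)
print(data.adj)  # sparse, row-normalized adjacency with self-loops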
Example #3
import torch

import torch_geometric
import torch_geometric.data as gdata
import torch_geometric.datasets as gdatasets
import torch_geometric.transforms as gtransforms

from config import batch_size

transform = gtransforms.AddSelfLoops()

# test if transform works
# cora = gdatasets.KarateClub(transform=transform)
# cora_loader = gdata.DataLoader(cora, batch_size=1, shuffle=True)

cora = gdatasets.Planetoid(root='./Planetoid/Cora', name='Cora', transform=transform)
cora_data = cora[0]

# Boolean masks are the idiomatic dtype for mask indexing in current PyTorch.
cora_data.train_mask = torch.zeros(cora_data.num_nodes, dtype=torch.bool)
cora_data.train_mask[:cora_data.num_nodes-1000] = True
cora_data.val_mask = None
cora_data.test_mask = torch.zeros(cora_data.num_nodes, dtype=torch.bool)
cora_data.test_mask[cora_data.num_nodes-500:] = True

# We only need the train part of the graph to train.

num_features = cora.num_features
num_classes = cora.num_classes

# information about the given dataset/batch
# if __name__ == '__main__':
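
A hedged sketch of what that commented-out inspection block might contain; the printed fields are standard torch_geometric Data attributes:

if __name__ == '__main__':
    print(cora_data)  # Data(x=..., edge_index=..., y=..., ...)
    print('features:', num_features, 'classes:', num_classes)
    print('train nodes:', int(cora_data.train_mask.sum()))
    print('test nodes:', int(cora_data.test_mask.sum()))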
Example #4
import os

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch_geometric import datasets

# preprocess_edges is assumed to be a project helper that turns edge_index
# into the cached edge array.


def load_data(dataset, trn_ratio, verbose=False, seed=0):
    """
    Read a dataset based on its name.
    """
    root = 'data'
    root_cached = os.path.join(root, 'cached', dataset)
    if not os.path.exists(root_cached):
        if dataset == 'cora':
            data = datasets.Planetoid(root, 'Cora')
        elif dataset == 'citeseer':
            data = datasets.Planetoid(root, 'CiteSeer')
        elif dataset == 'pubmed':
            data = datasets.Planetoid(root, 'PubMed')
        elif dataset == 'cora-ml':
            data = datasets.CitationFull(root, 'Cora_ML')
        elif dataset == 'dblp':
            data = datasets.CitationFull(root, 'DBLP')
        elif dataset == 'amazon':
            data = datasets.Amazon(os.path.join(root, 'Amazon'), 'Photo')
        else:
            raise ValueError(dataset)

        # Replace all-zero feature rows with ones, then row-normalize.
        node_x = data.data.x
        node_x[node_x.sum(dim=1) == 0] = 1
        node_x = node_x / node_x.sum(dim=1, keepdim=True)
        node_y = data.data.y
        edges = preprocess_edges(data.data.edge_index)

        os.makedirs(root_cached, exist_ok=True)
        np.save(os.path.join(root_cached, 'x'), node_x)
        np.save(os.path.join(root_cached, 'y'), node_y)
        np.save(os.path.join(root_cached, 'edges'), edges)

    edges = np.load(os.path.join(root_cached, 'edges.npy'))
    node_x = np.load(os.path.join(root_cached, 'x.npy'))
    node_y = np.load(os.path.join(root_cached, 'y.npy'))

    # Stratified 80/10/10 split: 10% test, then 0.1111 of the remaining 90%
    # (about 10% overall) for validation. The final split shrinks the 80%
    # training pool down to trn_ratio of all nodes, hence trn_ratio / 0.8.
    indices = np.arange(node_x.shape[0])
    trn_nodes, test_nodes = train_test_split(indices,
                                             test_size=0.1000,
                                             random_state=seed,
                                             stratify=node_y)
    trn_nodes, val_nodes = train_test_split(trn_nodes,
                                            test_size=0.1111,
                                            random_state=seed,
                                            stratify=node_y[trn_nodes])
    trn_nodes, _ = train_test_split(trn_nodes,
                                    train_size=trn_ratio / 0.8,
                                    random_state=seed,
                                    stratify=node_y[trn_nodes])

    edges = torch.from_numpy(edges)
    node_x = torch.from_numpy(node_x)
    node_y = torch.from_numpy(node_y)
    trn_nodes = torch.from_numpy(trn_nodes)
    val_nodes = torch.from_numpy(val_nodes)
    test_nodes = torch.from_numpy(test_nodes)

    if verbose:
        print('Number of nodes:', node_x.size(0))
        print('Number of features:', node_x.size(1))
        print('Number of edges:', edges.size(1) // 2)
        print('Number of classes:', node_y.max().item() + 1)
    return edges, node_x, node_y, trn_nodes, val_nodes, test_nodes
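
A hypothetical call, using 5% of all nodes for training and printing the dataset statistics:

edges, node_x, node_y, trn, val, test = load_data('cora', trn_ratio=0.05,
                                                  verbose=True)
print(trn.size(0), val.size(0), test.size(0))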
Example #5
'''Makes some PyTorch-Geometric datasets available in LynxKite.'''
import torch_geometric.datasets as ds
from . import util

op = util.Op()
name = op.params["name"]
print('loading dataset', name)
if name == 'Karate Club':
    data = ds.KarateClub().data
else:
    data = ds.Planetoid('/tmp/' + name, name).data

op.output_vs('vs', len(data.x))
op.output_es('es', data.edge_index)
op.output('x', data.x, type=util.DoubleVectorAttribute)
op.output('y', data.y, type=util.DoubleAttribute)
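
For reference, the same Planetoid loading can be checked outside LynxKite with plain torch_geometric (a minimal standalone sketch):

import torch_geometric.datasets as ds

data = ds.Planetoid('/tmp/Cora', 'Cora').data
print(len(data.x), 'nodes,', data.edge_index.shape[1], 'edges')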
Example #6
from torch_geometric import datasets


def load_pubmed():
    ple = datasets.Planetoid(root="./datasets/", name="PubMed")
    data = ple[0]
    return data.x, data.y, data.train_mask, data.val_mask, data.test_mask
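
Example call; PubMed is a single-graph dataset, so the masks refer to the standard Planetoid split:

x, y, train_mask, val_mask, test_mask = load_pubmed()
print(x.shape)  # torch.Size([19717, 500])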
Example #7
from torch_geometric import datasets


def get_cora():
    dataset = datasets.Planetoid(root='./dataset/Cora', name='Cora')
    # dataset[0] is the idiomatic accessor; .data is the raw internal storage.
    return dataset.data
Example #8
from torch_geometric import datasets


def get_citeseer():
    dataset = datasets.Planetoid(root='./dataset/Citeseer', name='CiteSeer')
    # As above, dataset[0] is the idiomatic accessor.
    return dataset.data
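
Example calls for the two accessors above; the node counts are the standard Planetoid statistics:

cora = get_cora()
citeseer = get_citeseer()
print(cora.num_nodes, citeseer.num_nodes)  # 2708 3327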