Example #1
def get_dataset(path, name):
    assert name in ['Cora', 'CiteSeer', 'PubMed', 'DBLP', 'Karate', 'WikiCS', 'Coauthor-CS', 'Coauthor-Phy',
                    'Amazon-Computers', 'Amazon-Photo', 'ogbn-arxiv', 'ogbg-code']
    name = 'dblp' if name == 'DBLP' else name
    root_path = osp.expanduser('~/datasets')

    if name == 'Coauthor-CS':
        return Coauthor(root=path, name='cs', transform=T.NormalizeFeatures())

    if name == 'Coauthor-Phy':
        return Coauthor(root=path, name='physics', transform=T.NormalizeFeatures())

    if name == 'WikiCS':
        return WikiCS(root=path, transform=T.NormalizeFeatures())

    if name == 'Amazon-Computers':
        return Amazon(root=path, name='computers', transform=T.NormalizeFeatures())

    if name == 'Amazon-Photo':
        return Amazon(root=path, name='photo', transform=T.NormalizeFeatures())

    if name.startswith('ogbn'):
        return PygNodePropPredDataset(root=osp.join(root_path, 'OGB'), name=name, transform=T.NormalizeFeatures())

    return (CitationFull if name == 'dblp' else Planetoid)(osp.join(root_path, 'Citation'), name, transform=T.NormalizeFeatures())
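
A hypothetical call to the loader above, just to show the expected calling convention; the dataset root below is an assumption, not part of the original snippet:

import os.path as osp

# Hypothetical usage of get_dataset; the path is illustrative only.
dataset = get_dataset(osp.expanduser('~/datasets/Amazon-Photo'), 'Amazon-Photo')
data = dataset[0]
print(data.num_nodes, dataset.num_features, dataset.num_classes)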
Example #2
def get_amazon_dataset(name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets',
                    'node_datasets', name)
    dataset = Amazon(path, name, transform=T.NormalizeFeatures())

    num_per_class = 20
    train_index = []
    val_index = []
    test_index = []
    for i in range(dataset.num_classes):
        index = (dataset[0].y.long() == i).nonzero().view(-1)
        index = index[torch.randperm(index.size(0))]
        if len(index) > num_per_class + 30:
            train_index.append(index[:num_per_class])
            val_index.append(index[num_per_class:num_per_class + 30])
            test_index.append(index[num_per_class + 30:])
        else:
            continue
    train_index = torch.cat(train_index)
    val_index = torch.cat(val_index)
    test_index = torch.cat(test_index)

    train_mask = index_to_mask(train_index, size=dataset[0].num_nodes)
    val_mask = index_to_mask(val_index, size=dataset[0].num_nodes)
    test_mask = index_to_mask(test_index, size=dataset[0].num_nodes)

    dataset.train_mask = train_mask
    dataset.val_mask = val_mask
    dataset.test_mask = test_mask

    return dataset
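
This example (and Example #12 below) relies on an index_to_mask helper that is not shown; a minimal sketch, assuming it simply scatters the given node indices into a boolean mask of the requested size:

import torch

def index_to_mask(index, size):
    # Assumed helper: boolean mask with True at the given node indices.
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = True
    return mask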
Example #3
def import_dataset(name='CORA'):
    root = f'BENCHMARK/{name.upper()}/'
    if name.upper() == 'CORA':
        dataset = Planetoid(root=root, name='CORA')
    elif name.upper() == 'CORA-F':
        dataset = CitationFull(root=root, name='cora')
    elif name.upper() == 'CITESEER':
        dataset = Planetoid(root=root, name='citeseer')
    elif name.upper() == 'PUBMED':
        dataset = Planetoid(root=root, name='PubMed')
    elif name.upper() == 'COAUTHOR-P':
        dataset = Coauthor(root=root, name='Physics')
    elif name.upper() == 'COAUTHOR-C':
        dataset = Coauthor(root=root, name='CS')
    elif name.upper() == 'AMAZON-C':
        dataset = Amazon(root=root, name='Computers')
    elif name.upper() == 'AMAZON-P':
        dataset = Amazon(root=root, name='Photo')

    elif name.lower() == 'all':
        Planetoid(root=root, name='CORA')
        Planetoid(root=root, name='citeseer')
        CitationFull(root=root, name='cora')
        Planetoid(root=root, name='PubMed')
        Coauthor(root=root, name='Physics')
        Coauthor(root=root, name='CS')
        Amazon(root=root, name='Computers')
        Amazon(root=root, name='Photo')
        exit()
    return dataset
Example #4
def get_amazon_dataset(name, normalize_features=False, transform=None):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    dataset = Amazon(path, name)

    if transform is not None and normalize_features:
        dataset.transform = T.Compose([T.NormalizeFeatures(), transform])
    elif normalize_features:
        dataset.transform = T.NormalizeFeatures()
    elif transform is not None:
        dataset.transform = transform

    return dataset
Example #5
def get_data():
    dataset = args.name
    path = '../data/geometric/Amazon-Computers'
    trainset = Amazon(path, "Computers")
    testset = Amazon(path, "Computers")
    lenTrain = len(trainset)
    lenTest = len(testset)

    print("Len Dataset:", lenTrain)
    trainLoader = DataLoader(trainset[:lenTrain], batch_size=1, shuffle=False)
    testloader = DataLoader(testset[:lenTest], batch_size=1, shuffle=False)
    print("Len TrainLoader:", len(trainLoader))

    return trainLoader, testloader
Example #6
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    print("Getting dataset...")
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name == 'github':
        dataset = GitHub(path)
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        dataset = Amazon(path, "Computers", T.NormalizeFeatures()
                         ) if dataset_name == 'amazon_comp' else Amazon(
                             path, "Photo", T.NormalizeFeatures())
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path,
                            name=dataset_name,
                            split="full",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError

    print("Dataset ready!")
    return dataset
Example #7
def load_data(dataset="Cora", supervised=False, full_data=True):
    '''
    Supports both semi-supervised and supervised splits.
    :param dataset:
    :param supervised:
    :return:
    '''
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
    if dataset in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path, dataset, T.NormalizeFeatures())
    data = dataset[0]
    if supervised:
        if full_data:
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.train_mask[:-1000] = 1
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.val_mask[data.num_nodes - 1000:data.num_nodes - 500] = 1
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.test_mask[data.num_nodes - 500:] = 1
        else:
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.train_mask[:1000] = 1
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.val_mask[data.num_nodes - 1000:data.num_nodes - 500] = 1
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            data.test_mask[data.num_nodes - 500:] = 1
    print('loaded data: ', '\n', data)
    return data
Example #8
def load_data(
    dataset="Cora",
    supervised=True,
):
    '''
    Supports both semi-supervised and supervised splits.
    :param dataset:
    :param supervised:
    :return:
    '''
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
    if dataset in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path, dataset, T.NormalizeFeatures())
    data = dataset[0]
    if supervised:

        data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.train_mask[:-1000] = 1
        data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.val_mask[-1000:-500] = 1
        data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.test_mask[-500:] = 1
    data.num_classes = data.y.max().item() + 1
    return dataset
Example #9
def load_data(args):
    dataset = args.input
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)

    if dataset in ['cora', 'citeseer', 'pubmed']:
        dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        return data, num_features, num_classes
    elif dataset == 'corafull':
        dataset = CoraFull(path)
    elif dataset in ['cs', 'physics']:
        dataset = Coauthor(path, name=dataset)
    elif dataset in ['computers', 'photo']:
        dataset = Amazon(path, name=dataset)
    elif dataset == 'reddit':
        dataset = Reddit(path)
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        return data, num_features, num_classes
    num_features = dataset.num_features
    num_classes = dataset.num_classes
    data = dataset[0]

    data.train_mask, data.val_mask, data.test_mask = generate_split(
        data, num_classes)

    return data, num_features, num_classes
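
The generate_split helper used above is not shown; a minimal sketch under the assumption that it draws a fixed number of training nodes per class and divides the remaining nodes between validation and test:

import torch

def generate_split(data, num_classes, num_per_class=20, num_val=500):
    # Assumed helper: class-balanced train set, fixed-size validation set,
    # remainder as test set, all returned as boolean masks.
    train_idx, rest_idx = [], []
    for c in range(num_classes):
        idx = (data.y == c).nonzero(as_tuple=False).view(-1)
        idx = idx[torch.randperm(idx.size(0))]
        train_idx.append(idx[:num_per_class])
        rest_idx.append(idx[num_per_class:])
    rest_idx = torch.cat(rest_idx)
    rest_idx = rest_idx[torch.randperm(rest_idx.size(0))]

    def to_mask(idx):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        return mask

    return (to_mask(torch.cat(train_idx)),
            to_mask(rest_idx[:num_val]),
            to_mask(rest_idx[num_val:]))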
Example #10
def load_data(dataset_name="Cora", seed=10, n_splits=5):
    # Path in which the data will be stored
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset_name)
    if dataset_name in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset_name, T.NormalizeFeatures())
    elif dataset_name in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset_name, T.NormalizeFeatures())
    elif dataset_name in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path,
                            dataset_name,
                            split='public',
                            transform=T.NormalizeFeatures())
    elif dataset_name in ["Arxiv", "Papers", "Products"]:
        dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name],
                                         root=path,
                                         transform=T.NormalizeFeatures())
    elif dataset_name == "MAG":
        dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name],
                                         root=path)
    else:
        raise Exception("[!] Dataset not found: ", str(dataset_name))
    if dataset_name in obg_datasets:
        data = split_ogb_data(dataset, dataset_name)
    else:
        data = dataset[0]  # pyg graph object
        data = split_data(data, seed, n_splits)
        data.num_classes = dataset.num_classes
    return data
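
Example #10 also depends on an ogb_data_name_conv mapping and a split_ogb_data helper that are not shown; a minimal sketch, assuming the helper just converts OGB's official index split into boolean masks (the same pattern appears in the ogbn-arxiv branch of Example #26 below):

import torch

# Assumed mapping from the short names used above to OGB dataset ids.
ogb_data_name_conv = {'Arxiv': 'ogbn-arxiv', 'Products': 'ogbn-products',
                      'Papers': 'ogbn-papers100M', 'MAG': 'ogbn-mag'}

def split_ogb_data(dataset, dataset_name):
    # Assumed helper for homogeneous node-prediction datasets; ogbn-mag
    # would need its per-node-type split instead.
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    for split, attr in (('train', 'train_mask'), ('valid', 'val_mask'), ('test', 'test_mask')):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[split_idx[split]] = True
        data[attr] = mask
    data.num_classes = dataset.num_classes
    return data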
Example #11
def get_dataset(name: str, use_lcc: bool = True) -> InMemoryDataset:
    path = os.path.join(DATA_PATH, name)
    if name in ['Cora', 'Citeseer', 'Pubmed']:
        dataset = Planetoid(path, name)
    elif name in ['Computers', 'Photo']:
        dataset = Amazon(path, name)
    elif name == 'CoauthorCS':
        dataset = Coauthor(path, 'CS')
    else:
        raise Exception('Unknown dataset.')

    if use_lcc:
        lcc = get_largest_connected_component(dataset)

        x_new = dataset.data.x[lcc]
        y_new = dataset.data.y[lcc]

        row, col = dataset.data.edge_index.numpy()
        edges = [[i, j] for i, j in zip(row, col) if i in lcc and j in lcc]
        edges = remap_edges(edges, get_node_mapper(lcc))

        data = Data(x=x_new,
                    edge_index=torch.LongTensor(edges),
                    y=y_new,
                    train_mask=torch.zeros(y_new.size()[0], dtype=torch.bool),
                    test_mask=torch.zeros(y_new.size()[0], dtype=torch.bool),
                    val_mask=torch.zeros(y_new.size()[0], dtype=torch.bool))
        dataset.data = data

    return dataset
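
The get_largest_connected_component, get_node_mapper and remap_edges helpers used above are not shown; a minimal sketch, assuming the component is found with scipy and that remap_edges returns [rows, cols] so that torch.LongTensor(edges) has shape [2, num_edges]:

import numpy as np
import scipy.sparse as sp
from scipy.sparse.csgraph import connected_components

def get_largest_connected_component(dataset):
    # Assumed helper: indices of the nodes in the largest connected component.
    edge_index = dataset.data.edge_index.numpy()
    num_nodes = dataset.data.num_nodes
    adj = sp.coo_matrix((np.ones(edge_index.shape[1]),
                         (edge_index[0], edge_index[1])),
                        shape=(num_nodes, num_nodes))
    _, labels = connected_components(adj, directed=False)
    largest = np.bincount(labels).argmax()
    return np.where(labels == largest)[0]

def get_node_mapper(lcc):
    # Old node id -> new consecutive id within the component.
    return {old: new for new, old in enumerate(lcc)}

def remap_edges(edges, mapper):
    # Return [rows, cols] so torch.LongTensor(...) is a [2, num_edges] edge_index.
    return [[mapper[i] for i, _ in edges], [mapper[j] for _, j in edges]]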
Example #12
def load_amazon(dataset):
    data_name = ['Computers', 'Photo']
    assert dataset in data_name
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'Datasets',
                    'NodeData')
    # transforms = T.Compose([T.NormalizeFeatures()])
    dataset = Amazon(path, dataset)

    num_per_class = 20
    train_index = []
    test_index = []
    for i in range(dataset.num_classes):
        index = (dataset[0].y.long() == i).nonzero().view(-1)
        index = index[torch.randperm(index.size(0))]
        if len(index) > num_per_class + 30:
            train_index.append(index[:num_per_class])
            test_index.append(index[num_per_class:])
        else:
            continue
    train_index = torch.cat(train_index)
    test_index = torch.cat(test_index)

    train_mask = index_to_mask(train_index, size=dataset[0].num_nodes)
    test_mask = index_to_mask(test_index, size=dataset[0].num_nodes)

    data = Data(x=dataset[0].x,
                edge_index=dataset[0].edge_index,
                train_mask=train_mask,
                test_mask=test_mask,
                y=dataset[0].y)
    return dataset, data
Example #13
def DataLoader(name):
    # assert name in ['cSBM_data_Aug_19_2020-13:06',
    #                 'cSBM_data_Aug_18_2020-18:50',
    #                 'cSBM_data_Aug_21_2020-10:06',
    #                 'cSBM_data_Aug_19_2020-20:41',
    #                 'cSBM_data_Aug_21_2020-11:04',
    #                 'cSBM_data_Aug_21_2020-11:21',
    #                 'cSBM_data_Sep_01_2020-14:15',
    #                 'cSBM_data_Sep_01_2020-14:18',
    #                 'cSBM_data_Sep_01_2020-14:19',
    #                 'cSBM_data_Sep_01_2020-14:32',
    #                 'cSBM_data_Sep_01_2020-14:22',
    #                 'cSBM_data_Sep_01_2020-14:23',
    #                 'cSBM_data_Sep_01_2020-14:27',
    #                 'cSBM_data_Sep_01_2020-14:29',
    #                 'Cora', 'Citeseer', 'PubMed',
    #                 'Computers', 'Photo',
    #                 'chameleon', 'film', 'squirrel',
    #                 'Texas', 'Cornell']

    # if name in ['cSBM_data_Aug_19_2020-13:06',
    #             'cSBM_data_Aug_18_2020-18:50',
    #             'cSBM_data_Aug_21_2020-10:06',
    #             'cSBM_data_Aug_19_2020-20:41',
    #             'cSBM_data_Aug_21_2020-11:04',
    #             'cSBM_data_Aug_21_2020-11:21',
    #             'cSBM_data_Sep_01_2020-14:15',
    #             'cSBM_data_Sep_01_2020-14:18',
    #             'cSBM_data_Sep_01_2020-14:19',
    #             'cSBM_data_Sep_01_2020-14:32',
    #             'cSBM_data_Sep_01_2020-14:22',
    #             'cSBM_data_Sep_01_2020-14:23',
    #             'cSBM_data_Sep_01_2020-14:27',
    #             'cSBM_data_Sep_01_2020-14:29']:
    if 'cSBM_data' in name:
        path = '../data/'
        dataset = dataset_ContextualSBM(path, name=name)
        # Return early: a cSBM name would otherwise fall through to the
        # checks below and hit the ValueError.
        return dataset

    name = name.lower()

    if name in ['cora', 'citeseer', 'pubmed']:
        root_path = '../'
        path = osp.join(root_path, 'data', name)
        dataset = Planetoid(path, name, transform=T.NormalizeFeatures())
    elif name in ['computers', 'photo']:
        root_path = '../'
        path = osp.join(root_path, 'data', name)
        dataset = Amazon(path, name, T.NormalizeFeatures())
    elif name in ['chameleon', 'film', 'squirrel']:
        dataset = dataset_heterophily(root='../data/',
                                      name=name,
                                      transform=T.NormalizeFeatures())
    elif name in ['texas', 'cornell']:
        dataset = WebKB(root='../data/',
                        name=name,
                        transform=T.NormalizeFeatures())
    else:
        raise ValueError(f'dataset {name} not supported in dataloader')

    return dataset
Example #14
def get_dataset(name):
    if name in ['Cora', 'Citeseer', 'Pubmed']:
        dataset = Planetoid(path + name, name)
    elif name in ['Computers', 'Photo']:
        dataset = Amazon(path + name, name)
    else:
        raise Exception('Unknown dataset.')
    return dataset
Example #15
def load_pyg(name, dataset_dir):
    """
    Load PyG dataset objects. (More PyG datasets will be supported)

    Args:
        name (string): dataset name
        dataset_dir (string): data directory

    Returns: PyG dataset object

    """
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if name in ['Cora', 'CiteSeer', 'PubMed']:
        dataset = Planetoid(dataset_dir, name)
    elif name[:3] == 'TU_':
        # TU_IMDB doesn't have node features
        if name[3:] == 'IMDB':
            name = 'IMDB-MULTI'
            dataset = TUDataset(dataset_dir, name, transform=T.Constant())
        else:
            dataset = TUDataset(dataset_dir, name[3:])
    elif name == 'Karate':
        dataset = KarateClub()
    elif 'Coauthor' in name:
        if 'CS' in name:
            dataset = Coauthor(dataset_dir, name='CS')
        else:
            dataset = Coauthor(dataset_dir, name='Physics')
    elif 'Amazon' in name:
        if 'Computers' in name:
            dataset = Amazon(dataset_dir, name='Computers')
        else:
            dataset = Amazon(dataset_dir, name='Photo')
    elif name == 'MNIST':
        dataset = MNISTSuperpixels(dataset_dir)
    elif name == 'PPI':
        dataset = PPI(dataset_dir)
    elif name == 'QM7b':
        dataset = QM7b(dataset_dir)
    else:
        raise ValueError('{} is not supported'.format(name))

    return dataset
Example #16
def load_dataset(name):
    if name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root='./data/'+name, name=name)
    elif name == "CoraFull":
        dataset = CoraFull(root='./data/'+name)
    elif name in ["Computers", "Photo"]:
        dataset = Amazon(root='./data/'+name, name=name)
    elif name in ["CS", "Physics"]:
        dataset = Coauthor(root='./data/'+name, name=name)
    else:
        exit("wrong dataset")
    return dataset
Example #17
 def __init__(self, path: str):
     pyg_dataset = Amazon(os.path.join(path, '_pyg'), "Photo")
     if hasattr(pyg_dataset, "__data_list__"):
         delattr(pyg_dataset, "__data_list__")
     if hasattr(pyg_dataset, "_data_list"):
         delattr(pyg_dataset, "_data_list")
     pyg_data = pyg_dataset[0]
     static_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph(
         {
             'x': pyg_data.x,
             'y': pyg_data.y
         }, pyg_data.edge_index)
     super(AmazonPhotoDataset, self).__init__([static_graph])
Example #18
def load_dataset(dataset, transform=None):
    if dataset.lower() in ["cora", "citeseer", "pubmed"]:
        path = os.path.join(".datasets", "Plantoid")
        dataset = Planetoid(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["cs", "physics"]:
        path = os.path.join(".datasets", "Coauthor", dataset.lower())
        dataset = Coauthor(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["computers", "photo"]:
        path = os.path.join(".datasets", "Amazon", dataset.lower())
        dataset = Amazon(path, dataset.lower(), transform=transform)
    else:
        print("Dataset not supported!")
        assert False
    return dataset
Example #19
def prepare_data(dataset, seed):
    """
	:param dataset: name of the dataset used
	:return: data, in the correct format
	"""
    # Retrieve main path of project
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    # Download and store dataset at chosen location
    if dataset == 'Cora' or dataset == 'PubMed' or dataset == 'Citeseer':
        path = os.path.join(dirname, 'data')
        data = Planetoid(path, name=dataset, split='full')[0]
        # data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.num_classes = (max(data.y) + 1).item()
        # dataset = Planetoid(path, name=dataset, split='public', transform=T.NormalizeFeatures(), num_train_per_class=20, num_val=500, num_test=1000)
        # data = modify_train_mask(data)

    elif dataset == 'Amazon':
        path = os.path.join(dirname, 'data', 'Amazon')
        data = Amazon(path, 'photo')[0]
        data.num_classes = (max(data.y) + 1).item()
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())
        # Amazon: 4896 train, 1224 val, 1530 test

    elif dataset == 'Reddit':
        path = os.path.join(dirname, 'data', 'Reddit')
        data = Reddit(path)[0]
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy())

    elif dataset == 'PPI':
        path = os.path.join(dirname, 'data', 'PPI')
        data = ppi_prepoc(path, seed)
        data.x = data.graphs[0].x
        data.num_classes = data.graphs[0].y.size(1)
        for df in data.graphs:
            df.num_classes = data.num_classes

    #elif dataset = 'MUTAG'

    # Get it in right format
    if dataset != 'PPI':
        print('Train mask is of size: ',
              data.train_mask[data.train_mask == True].shape)

# data = add_noise_features(data, args.num_noise)

    return data
Example #20
def prepare_data(dataset, train_ratio=0.8, input_dim=None, seed=10):
    """Import, save and process dataset

    Args:
            dataset (str): name of the dataset used
            seed (int): seed number

    Returns:
            [torch_geometric.Data]: dataset in the correct format 
            with required attributes and train/test/val split
    """
    # Retrieve main path of project
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

    # Download and store dataset at chosen location
    if dataset == 'Cora' or dataset == 'PubMed' or dataset == 'Citeseer':
        path = os.path.join(dirname, 'data')
        data = Planetoid(path, name=dataset, split='full')[0]
        data.name = dataset
        data.num_classes = (max(data.y) + 1).item()
        # data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        # data = Planetoid(path, name=dataset, split='public', transform=T.NormalizeFeatures(), num_train_per_class=20, num_val=500, num_test=1000)

    elif dataset == 'Amazon':
        path = os.path.join(dirname, 'data', 'Amazon')
        data = Amazon(path, 'photo')[0]
        data.name = dataset
        data.num_classes = (max(data.y) + 1).item()
        data.train_mask, data.val_mask, data.test_mask = split_function(
            data.y.numpy(), seed=seed)
        # Amazon: 4896 train, 1224 val, 1530 test

    elif dataset in ['syn1', 'syn2', 'syn4', 'syn5']:
        data = synthetic_data(dataset, dirname, train_ratio, input_dim)

    elif dataset == 'syn6':
        data = gc_data(dataset, dirname, train_ratio)

    elif dataset == 'Mutagenicity':
        data = gc_data(dataset, dirname, train_ratio)

    return data
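
Examples #19 and #20 call a split_function helper that is not shown; a minimal sketch, assuming a stratified 64/16/20 node split returned as boolean masks (which matches the 4896/1224/1530 counts noted for Amazon Photo):

import numpy as np
import torch
from sklearn.model_selection import train_test_split

def split_function(y, seed=10):
    # Assumed helper: stratified 64% / 16% / 20% train/val/test split as boolean masks.
    idx = np.arange(len(y))
    idx_train, idx_test = train_test_split(idx, test_size=0.2, stratify=y,
                                           random_state=seed)
    idx_train, idx_val = train_test_split(idx_train, test_size=0.2,
                                          stratify=y[idx_train], random_state=seed)
    masks = []
    for split in (idx_train, idx_val, idx_test):
        mask = torch.zeros(len(y), dtype=torch.bool)
        mask[torch.as_tensor(split)] = True
        masks.append(mask)
    return masks[0], masks[1], masks[2]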
Example #21
def fetch_dataset(root, name):
    """
    Fetches a dataset from the PyTorch Geometric library.

    :param root: A path to the root directory in which the dataset will be placed
    :param name: Name of the dataset. Currently, the following names are supported:
                'cora', 'citeseer', 'pubmed', 'Computers', 'Photo', 'CS', 'Physics',
                'wiki', 'actor'
    :return: A PyTorch Geometric dataset
    """
    print(name.lower())
    if name.lower() in {'cora', 'citeseer', "pubmed"}:
        return Planetoid(root=root, name=name)
    elif name.lower() in {'computers', "photo"}:
        return Amazon(root=root, name=name)
    elif name.lower() in {'cs',  'physics'}:
        return Coauthor(root=root, name=name)
    elif name.lower() == "wiki":
        return WikiCS(osp.join(root, "WikiCS"))
    elif name.lower() == "actor":
        return Actor(osp.join(root, name))
Example #22
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    path = 'dataset'
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'amazon_comp':
        dataset = Amazon(path, name="Computers")
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4,
                                               random_state=42)
        idx_val, idx_test = train_test_split(idx_test,
                                             test_size=0.5,
                                             random_state=42)

        train_mask = torch.tensor([False] * data.x.shape[0])
        val_mask = torch.tensor([False] * data.x.shape[0])
        test_mask = torch.tensor([False] * data.x.shape[0])

        train_mask[idx_train] = True
        val_mask[idx_val] = True
        test_mask[idx_test] = True

        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(
            path,
            name=dataset_name,
            split="full",
        )
    else:
        raise NotImplementedError

    return dataset
Example #23
def load_data(dataset="Cora", supervised=False, full_data=True, args=None):
    '''
    Supports both semi-supervised and supervised splits.
    :param dataset:
    :param supervised:
    :return:
    '''
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
    dataset_name = dataset
    if dataset in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset, T.NormalizeFeatures())
    elif dataset in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path, dataset, T.NormalizeFeatures())

        # path = path + '/processed/data.pt'
        # dataset = torch.load(path)
    data = dataset[0]
    data['adj'] = load_citation(dataset_name, args.normalization)
    if supervised:
        if full_data:
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            data.train_mask[:-1000] = 1
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            data.val_mask[-1000:-500] = 1
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            data.test_mask[-500:] = 1
        else:
            data.train_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            data.train_mask[:1000] = 1
            data.val_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            data.val_mask[1000:1500] = 1
            data.test_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            data.test_mask[1500:2000] = 1
    return data
Example #24
def load_data_lp(dataset, use_feats, data_path):
    if dataset in ['cora', 'pubmed', 'citeseer']:

        # adj, features, labels = load_citation_data(dataset, use_feats, data_path)[:3]
        adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
            dataset, use_feats, data_path, split_seed=None)
    elif dataset in ['cora_planetoid', 'pubmed_planetoid']:
        from torch_geometric.datasets import Planetoid
        import torch_geometric as tg
        import scipy.sparse as sp
        if dataset == 'cora_planetoid':
            name = 'Cora'
        elif dataset == 'pubmed_planetoid':
            name = 'Pubmed'
        else:
            raise FileNotFoundError(
                'Dataset {} is not supported.'.format(dataset))
        loaded_dataset = Planetoid(root='/root/tmp/' + name, name=name)
        adj = tg.utils.to_scipy_sparse_matrix(loaded_dataset.data.edge_index)
        adj = sp.coo_matrix.asformat(adj, format='csr')
        features = sp.lil_matrix(loaded_dataset.data.x.numpy())
        labels = loaded_dataset.data.y.numpy()
    elif 'amazon' in dataset:
        from torch_geometric.datasets import Amazon
        import torch_geometric as tg
        import scipy.sparse as sp
        if dataset == 'amazon-photo':
            name = 'Photo'
        elif dataset == 'amazon-computers':
            name = 'Computers'
        else:
            raise FileNotFoundError(
                'Dataset {} is not supported.'.format(dataset))
        loaded_dataset = Amazon(root='/root/tmp/' + name, name=name)
        adj = tg.utils.to_scipy_sparse_matrix(loaded_dataset.data.edge_index)
        adj = sp.coo_matrix.asformat(adj, format='csr')
        features = sp.lil_matrix(loaded_dataset.data.x.numpy())
        labels = loaded_dataset.data.y.numpy()
    elif dataset == 'BlogCatalog':
        import scipy.io as sio
        import scipy.sparse as sp
        data = sio.loadmat('./data/BlogCatalog/BlogCatalog.mat')
        features = sp.lil_matrix(data['Attributes'])
        labels = np.squeeze(data['Label'])
        adj = sp.csr_matrix(data['Network'])
    elif dataset == 'wiki':
        import scipy.sparse as sp
        features = np.loadtxt('./data/wiki/wiki_feature.txt')
        features = sp.coo_matrix(
            (features[:,
                      2], (features[:,
                                    0].astype(int), features[:,
                                                             1].astype(int))))
        features = sp.lil_matrix(features)
        adj = np.loadtxt('./data/wiki/wiki_graph.txt')
        adj = np.ndarray.tolist(adj)
        adj = nx.from_edgelist(adj)
        adj = nx.adjacency_matrix(adj)
        labels = np.loadtxt('./data/wiki/wiki_group.txt')
        labels = labels[:, 1]
        labels = labels.astype(np.int64)
        labels = np.squeeze(np.reshape(labels, (2405, 1)) - 1)
    elif 'PICA' in dataset:

        if 'ImageNet10' in dataset:
            dataset_lower = 'imagenet10'
            dataset_name = 'PICA-ImageNet10'
        elif 'ImageNetDog' in dataset:
            dataset_lower = 'imagenetdog'
            dataset_name = 'PICA-ImageNetDog'

        if 'feat10' in dataset:
            name = 'picafeat10_{}'.format(dataset_lower)
        elif 'feat70' in dataset:
            name = 'picafeat70_{}'.format(dataset_lower)
        elif 'feat512' in dataset:
            name = 'picafeat512_{}'.format(dataset_lower)

        orig_dataset = dataset
        suffix = dataset.split(dataset_name)[-1]
        dataset = dataset_name

        print('name : {},  suffix : {}'.format(name, suffix))

        y_true = np.load('./data/{}/label.npy'.format(dataset))
        y_true = y_true.astype('int64')
        labels = y_true

        features = np.load('./data/{}/{}.npy'.format(dataset, name))
        import scipy.sparse as sp
        features = sp.lil_matrix(features)

        A = sp.load_npz('./data/{}/A{}.npz'.format(dataset, suffix))

        adj = A.astype('float64')

        labels = torch.LongTensor(labels)
        data = {'adj_train': adj, 'features': features, 'labels': labels}
        return data
    else:
        raise FileNotFoundError('Dataset {} is not supported.'.format(dataset))
    labels = torch.LongTensor(labels)
    data = {'adj_train': adj, 'features': features, 'labels': labels}
    return data
Example #25
    _data = Planetoid(root="./pciteseer", name="Citeseer")
elif pr.net == 3:
    print("Data Pubmed")
    _data = Planetoid(root="./ppubmed", name="Pubmed")
elif pr.net == 4:
    print("Data CoraFull")
    _data = CoraFull("./Corafull")
elif pr.net == 5:
    print("Data Coauthor CS")
    _data = Coauthor("./CS", "CS")
elif pr.net == 6:
    print("Data Coauthor Physics")
    _data = Coauthor("./Physics", "Physics")
elif pr.net == 7:
    print("Data Amazon Computer")
    _data = Amazon("./Computer", "Computers")
elif pr.net == 8:
    print("Data Amazon Photos")
    _data = Amazon("./Photo", "Photo")

#_data = Coauthor("./Physics","Physics")
#_data = Coauthor("./CS","CS")

#_data = CoraFull("./Corafull")

#_data = Planetoid(root="./pcora",name="Cora")
#_data = Planetoid(root="./pciteseer",name="Citeseer")
#_data = Planetoid(root="./ppubmed",name="Pubmed")

#_data = Amazon("./Computer","Computers")
#_data = Amazon("./Photo","Photo")
Example #26
def load_data(name, seed, transform=None):
    '''
    Load data from files and return a pytorch geometric `Data` object
    '''
    random.seed(seed) # make sure that the split of data is the same
    ROOT = osp.dirname(osp.abspath(__file__)) + '/..'

    if name in ['cora', 'citeseer', 'pubmed']:   # datasets for transductive node classifiction
        data = Planetoid(osp.join(ROOT, 'data'), name, transform=transform)[0]
        data.task = 'semi' # semi-supervised
        data.setting = 'transductive' # transductive
        return data
    
    elif name in ['wikics']:
        dataset = WikiCS(osp.join(ROOT, 'data', 'wikics'), transform=transform)
        data = dataset[0]
        data.task = 'semi'
        data.setting = 'transductive'
        data.train_mask = data.train_mask[:,0]
        data.val_mask = data.val_mask[:, 0]
        data.stopping_mask = data.stopping_mask[:, 0]
        return data

    elif name in ['ppi']: # datasets for inductive node classification
        train_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='train', transform=transform)
        val_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='val', transform=transform)
        test_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='test', transform=transform)
        return (train_dataset, val_dataset, test_dataset)
    elif name in ['usa-airports']:
        try:
            data = pickle.load(open(osp.join(ROOT, 'data', name, 'data.pkl'), 'rb'))
            return data
        except FileNotFoundError:
            print('Data not found. Re-generating...')
        nx_graph = nx.read_edgelist(osp.join(ROOT, 'data', name, 'edges.txt'))
        nx_graph = nx.convert_node_labels_to_integers(nx_graph, label_attribute='id2oid') # oid for original id
        oid2id = {int(v):k for k,v in nx.get_node_attributes(nx_graph, 'id2oid').items()}
        id2label = {}
        for line in open(osp.join(ROOT, 'data', name, 'labels.txt')):
            linesplit = line.strip().split()
            oid = int(linesplit[0])
            label = int(linesplit[1])
            id2label[oid2id[oid]] = {'y': label} # here we assume that label ids start from 0 and the labeling is consistent.
        nx.set_node_attributes(nx_graph, id2label)

        data = from_networkx(nx_graph)
        num_nodes = len(nx_graph.nodes)
        node_idxs = list(range(num_nodes))
        random.shuffle(node_idxs)
        # split data, train:val:test = 80%:10%:10%
        train_idxs = node_idxs[:int(0.8 * num_nodes)]
        val_idxs = node_idxs[int(0.8 * num_nodes):int(0.9 * num_nodes)]
        test_idxs = node_idxs[int(0.9 * num_nodes):]

        data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        data.train_mask[train_idxs] = True
        data.val_mask[val_idxs] = True
        data.test_mask[test_idxs] = True
        if data.x is not None and transform:
            data.x = transform(data.x)
        data.num_nodes = num_nodes
        data.task = 'sup' # supervised
        data.setting = 'transductive' # transductive
        pickle.dump(data, open(osp.join(ROOT, 'data', name, 'data.pkl'), 'wb'))
        return data

    elif name in ['ogbn-arxiv']:
        dataset = PygNodePropPredDataset(name, root=osp.join(ROOT, 'data'), transform=transform)
        split_idx = dataset.get_idx_split()
        data = dataset[0]
        split_idx['val'] = split_idx.pop('valid')
        for key, idx in split_idx.items():
            mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            mask[idx] = True
            data[f'{key}_mask'] = mask
        data.task = 'sup' # supervised
        data.setting = 'transductive' # transductive
        return data

    elif name in ['photo']:
        dataset = Amazon('data/photo', 'photo', transform=transform)
        data = dataset[0]
        data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.train_mask[:-1000] = True
        data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.val_mask[-1000: -500] = True
        data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.test_mask[-500:] = True

        data.train_edge_index, _ = subgraph(data.train_mask, data.edge_index, relabel_nodes=True)
        data.val_edge_index, _ = subgraph(data.val_mask, data.edge_index, relabel_nodes=True)
        data.test_edge_index, _ = subgraph(data.test_mask, data.edge_index, relabel_nodes=True)
        data.train_x = data.x[data.train_mask]
        data.train_y = data.y[data.train_mask]
        data.val_x = data.x[data.val_mask]
        data.val_y = data.y[data.val_mask]
        data.test_x = data.x[data.test_mask]
        data.test_y = data.y[data.test_mask]

        data.num_train_nodes = data.train_x.shape[0]
        data.task = 'sup' # supervised
        data.setting = 'inductive' # inductive
        return data

    else:
        raise NotImplementedError('Not supported dataset.')
Example #27
import os.path as osp
import torch
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch_geometric.datasets import Amazon
import torch_geometric.transforms as T
from torch_geometric.nn import GNNExplainer, ARMAConv
from torch_geometric.utils import k_hop_subgraph
from math import floor
from scipy import stats

dataset = 'Computers'
path = osp.join('.', 'data', 'Amazon')
dataset = Amazon(path, dataset, transform=T.NormalizeFeatures())
data = dataset[0]


# Define the model
class Net(torch.nn.Module):
    def __init__(self, k=1, x=16):
        super(Net, self).__init__()
        self.conv1 = ARMAConv(dataset.num_features, x)
        self.conv2 = ARMAConv(x, x)
        self.conv3 = ARMAConv(x, dataset.num_classes)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, training=self.training)
Example #28
 def __init__(self, path):
     dataset = "Photo"
     # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
     Amazon(path, dataset)
     super(AmazonPhotoDataset, self).__init__(path, dataset)
Example #29
def TrainingNet(dataset, modelName, params, num_pre_epochs, num_epochs,
                NumCutoff, optimizerName, MonteSize, savepath):
    Batch_size = int(params[0])
    root = '/git/data/GraphData/' + dataset
    TestAccs = []
    for Monte_iter in range(MonteSize):
        # Data
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        NewNetworkSizeAdjust = []
        WeightsDynamicsEvolution = []
        # model
        if dataset == 'Cora' or dataset == 'Citeseer' or dataset == 'Pubmed':
            datasetroot = Planetoid(root=root,
                                    name=dataset,
                                    transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)
            criterion = nn.CrossEntropyLoss()

        elif dataset == "CoraFull":
            datasetroot = CoraFull(root=root,
                                   transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == "Amazon":
            datasetroot = Amazon(root,
                                 "Photo",
                                 transform=None,
                                 pre_transform=None)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            datasetroot = TUDataset(root, name=dataset, use_node_attr=True)
            Num = len(datasetroot) // 10
            global train_dataset, test_dataset
            train_dataset = datasetroot[:Num]
            test_dataset = datasetroot[Num:]
            trainloader = DataLoader(train_dataset, batch_size=Batch_size)
            testloader = DataLoader(test_dataset, batch_size=60)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == "PPI":
            train_dataset = PPI(root, split='train')
            test_dataset = PPI(root, split='test')
            trainloader = DataLoader(train_dataset,
                                     batch_size=Batch_size,
                                     shuffle=True)
            testloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, train_dataset,
                                           params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset == "Reddit":
            datasetroot = Reddit(root)
            trainloader = DataListLoader(datasetroot,
                                         batch_size=1,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=2,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()

        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot,
                                         batch_size=Batch_size,
                                         shuffle=True)
            testloader = DataListLoader(datasetroot,
                                        batch_size=100,
                                        shuffle=False)
            [net,
             model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                           params, num_epochs)

        elif dataset == 'CIFAR10':
            pass
        else:
            raise Exception("Input wrong datatset!!")

        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, modelName, params[0], params[1], params[2], params[3],
            Monte_iter)

        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        global device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        optimizer = optim.Adam(net.parameters(),
                               lr=params[3],
                               betas=(0.9, 0.999),
                               eps=1e-08,
                               weight_decay=0,
                               amsgrad=False)
        criterion = nn.CrossEntropyLoss()
        net = net.to(device)

        #cudnn.benchmark = True
        logging(
            'dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'
            .format(dataset, params[0], params[1], params[2], params[3],
                    Monte_iter))
        mark = "{}/{}Convergence/DiagElement-{}".format(
            savepath, dataset, FileName)

        PreTrainConvergence, PreTestConvergence, PreTestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_pre_epochs, trainloader,
            testloader, net, optimizer, criterion, NumCutoff, mark, False,
            model_to_save)
        print(
            'dataset: {}, model name: {}, Number epoches: {},  Pre-train error is: {}, Pre-test error is: {}, test acc is {}'
            .format(dataset, modelName, num_pre_epochs,
                    PreTrainConvergence[-1], PreTestConvergence[-1],
                    PreTestAcc[-1]))

        NewNetworksize, NewNetworkWeight = RetainNetworkSize(net,
                                                             params[2])[0:2]
        NetworkInfo = [NewNetworksize[0:-1], NewNetworkWeight]
        OptimizedNet = ChooseModel(modelName, datasetroot, NetworkInfo)
        NewNetworksize.insert(0, datasetroot.num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)

        #OptimizedNet.apply(init_weights)
        #OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True
        criterionNew = nn.CrossEntropyLoss()
        if optimizerName == "SGD":
            optimizerNew = getattr(optim,
                                   optimizerName)(OptimizedNet.parameters(),
                                                  lr=params[3],
                                                  momentum=0.9,
                                                  weight_decay=5e-4)
        elif optimizerName == "Adam":
            optimizerNew = getattr(optim,
                                   optimizerName)(OptimizedNet.parameters(),
                                                  lr=params[3],
                                                  betas=(0.9, 0.999),
                                                  eps=1e-08,
                                                  weight_decay=5e-4,
                                                  amsgrad=False)

        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_epochs, trainloader,
            testloader, OptimizedNet, optimizerNew, criterionNew, NumCutoff,
            mark, True, model_to_save)
        np.save(
            "{}/{}Convergence/TrainConvergence-{}".format(
                savepath, dataset, FileName), TrainConvergence)
        np.save(
            "{}/{}Convergence/TestConvergence-{}".format(
                savepath, dataset, FileName), TestConvergence)

        np.save(
            "{}/{}Convergence/NewNetworkSizeAdjust-{}".format(
                savepath, dataset, FileName), NewNetworkSizeAdjust)

        #np.save(savepath+'TestConvergence-'+FileName,TestConvergence)
        #torch.cuda.empty_cache()

        print(
            'dataset: {}, model name:{}, resized network size is {},  Number epoches:{},  Train error is: {}, Test error is: {}, test acc is {}\n'
            .format(dataset, modelName, NewNetworksize[0:-1], num_epochs,
                    TrainConvergence[-1], TestConvergence[-1], TestAcc[-1]))
        TestAccs.append(TestAcc)
        np.save(
            "{}/{}Convergence/MeanTestAccs-{}".format(savepath, dataset,
                                                      FileName), TestAccs)
    print("The change of test error is:{}".format(TestAccs))
    print_nvidia_useage()
Example #30
           notes=full_description)

# ---------------------------------------------------------------
print("Done 1")
wandb.log({'action': 'Done 1'})

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
real_data = None
if DATASET == "Karate Club":
    real_data = KarateClub()
elif DATASET == "Cora" or DATASET == "Citeseer" or DATASET == "Pubmed":
    real_data = Planetoid(root=input_path, name=DATASET, split="public")
elif DATASET == "Reddit":
    real_data = Reddit(root=input_path)
elif DATASET == "Amazon Computers":
    real_data = Amazon(root=input_path, name="Computers")
elif DATASET == "Amazon Photos":
    real_data = Amazon(root=input_path, name="Photo")
elif DATASET == "CLUSTER":
    real_data = GNNBenchmarkDataset(root=input_path,
                                    name="CLUSTER",
                                    split="test")
elif DATASET == "PATTERN":
    real_data = GNNBenchmarkDataset(root=input_path,
                                    name="PATTERN",
                                    split="test")
elif DATASET == "Flickr":
    real_data = Flickr(root=input_path)
elif DATASET == "OGB Products":
    real_data = PygNodePropPredDataset(name='ogbn-products')
    split_idx = real_data.get_idx_split()