def get_dataset(path, name):
    assert name in ['Cora', 'CiteSeer', 'PubMed', 'DBLP', 'Karate', 'WikiCS', 'Coauthor-CS', 'Coauthor-Phy',
                    'Amazon-Computers', 'Amazon-Photo', 'ogbn-arxiv', 'ogbg-code']
    name = 'dblp' if name == 'DBLP' else name
    root_path = osp.expanduser('~/datasets')

    if name == 'Coauthor-CS':
        return Coauthor(root=path, name='cs', transform=T.NormalizeFeatures())
    if name == 'Coauthor-Phy':
        return Coauthor(root=path, name='physics', transform=T.NormalizeFeatures())
    if name == 'WikiCS':
        return WikiCS(root=path, transform=T.NormalizeFeatures())
    if name == 'Amazon-Computers':
        return Amazon(root=path, name='computers', transform=T.NormalizeFeatures())
    if name == 'Amazon-Photo':
        return Amazon(root=path, name='photo', transform=T.NormalizeFeatures())
    if name.startswith('ogbn'):
        return PygNodePropPredDataset(root=osp.join(root_path, 'OGB'), name=name,
                                      transform=T.NormalizeFeatures())
    return (CitationFull if name == 'dblp' else Planetoid)(osp.join(root_path, 'Citation'), name,
                                                           transform=T.NormalizeFeatures())
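# Usage sketch (an assumption, not part of the original file): note that the OGB and citation
# datasets ignore the `path` argument and are stored under ~/datasets, while the other datasets
# use `path` directly.
# dataset = get_dataset(osp.expanduser('~/datasets/Amazon-Photo'), 'Amazon-Photo')
# data = dataset[0]
# print(dataset.num_classes, data.num_nodes)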
def get_amazon_dataset(name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets', 'node_datasets', name)
    dataset = Amazon(path, name, transform=T.NormalizeFeatures())

    # Random split: 20 training and 30 validation nodes per class, the rest of each class to test.
    num_per_class = 20
    train_index = []
    val_index = []
    test_index = []
    for i in range(dataset.num_classes):
        index = (dataset[0].y.long() == i).nonzero().view(-1)
        index = index[torch.randperm(index.size(0))]
        if len(index) > num_per_class + 30:
            train_index.append(index[:num_per_class])
            val_index.append(index[num_per_class:num_per_class + 30])
            test_index.append(index[num_per_class + 30:])
        else:
            continue
    train_index = torch.cat(train_index)
    val_index = torch.cat(val_index)
    test_index = torch.cat(test_index)

    train_mask = index_to_mask(train_index, size=dataset[0].num_nodes)
    val_mask = index_to_mask(val_index, size=dataset[0].num_nodes)
    test_mask = index_to_mask(test_index, size=dataset[0].num_nodes)
    dataset.train_mask = train_mask
    dataset.val_mask = val_mask
    dataset.test_mask = test_mask
    return dataset
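# index_to_mask is used above (and again in load_amazon below) but is not defined in this file.
# A minimal sketch of what it is assumed to do: turn a 1-D tensor of node indices into a boolean
# mask over all nodes.
def index_to_mask(index, size):
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = True
    return mask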
def import_dataset(name='CORA'):
    root = f'BENCHMARK/{name.upper()}/'
    if name.upper() == 'CORA':
        dataset = Planetoid(root=root, name='CORA')
    elif name.upper() == 'CORA-F':
        dataset = CitationFull(root=root, name='cora')
    elif name.upper() == 'CITESEER':
        dataset = Planetoid(root=root, name='citeseer')
    elif name.upper() == 'PUBMED':
        dataset = Planetoid(root=root, name='PubMed')
    elif name.upper() == 'COAUTHOR-P':
        dataset = Coauthor(root=root, name='Physics')
    elif name.upper() == 'COAUTHOR-C':
        dataset = Coauthor(root=root, name='CS')
    elif name.upper() == 'AMAZON-C':
        dataset = Amazon(root=root, name='Computers')
    elif name.upper() == 'AMAZON-P':
        dataset = Amazon(root=root, name='Photo')
    elif name.lower() == 'all':
        # Pre-download every benchmark dataset, then stop.
        Planetoid(root=root, name='CORA')
        Planetoid(root=root, name='citeseer')
        CitationFull(root=root, name='cora')
        Planetoid(root=root, name='PubMed')
        Coauthor(root=root, name='Physics')
        Coauthor(root=root, name='CS')
        Amazon(root=root, name='Computers')
        Amazon(root=root, name='Photo')
        exit()
    return dataset
def get_amazon_dataset(name, normalize_features=False, transform=None):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    dataset = Amazon(path, name)
    if transform is not None and normalize_features:
        dataset.transform = T.Compose([T.NormalizeFeatures(), transform])
    elif normalize_features:
        dataset.transform = T.NormalizeFeatures()
    elif transform is not None:
        dataset.transform = transform
    return dataset
def get_data():
    dataset = args.name
    path = '../data/geometric/Amazon-Computers'
    trainset = Amazon(path, "Computers")
    testset = Amazon(path, "Computers")
    lenTrain = len(trainset)
    lenTest = len(testset)
    print("Len Dataset:", lenTrain)
    trainLoader = DataLoader(trainset[:lenTrain], batch_size=1, shuffle=False)
    # Bug fix: the test loader previously iterated over the training slice.
    testloader = DataLoader(testset[:lenTest], batch_size=1, shuffle=False)
    print("Len TrainLoader:", len(trainLoader))
    return trainLoader, testloader
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    print("Getting dataset...")
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name == 'github':
        dataset = GitHub(path)
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])), test_size=0.4, random_state=42)
        idx_val, idx_test = train_test_split(idx_test, test_size=0.5, random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        dataset = (Amazon(path, "Computers", T.NormalizeFeatures()) if dataset_name == 'amazon_comp'
                   else Amazon(path, "Photo", T.NormalizeFeatures()))
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])), test_size=0.4, random_state=42)
        idx_val, idx_test = train_test_split(idx_test, test_size=0.5, random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path, name=dataset_name, split="full", transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError
    print("Dataset ready!")
    return dataset
def load_data(dataset="Cora", supervised=False, full_data=True): ''' support semi-supervised and supervised :param dataset: :param supervised: :return: ''' path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset) if dataset in ["CS", "Physics"]: dataset = Coauthor(path, dataset, T.NormalizeFeatures()) elif dataset in ["Computers", "Photo"]: dataset = Amazon(path, dataset, T.NormalizeFeatures()) elif dataset in ["Cora", "Citeseer", "Pubmed"]: dataset = Planetoid(path, dataset, T.NormalizeFeatures()) data = dataset[0] if supervised: if full_data: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.train_mask[:-1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.val_mask[data.num_nodes - 1000:data.num_nodes - 500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.test_mask[data.num_nodes - 500:] = 1 else: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.train_mask[:1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.val_mask[data.num_nodes - 1000:data.num_nodes - 500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.test_mask[data.num_nodes - 500:] = 1 print('loaded data: ', '\n', data) return data
def load_data( dataset="Cora", supervised=True, ): ''' support semi-supervised and supervised :param dataset: :param supervised: :return: ''' path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset) if dataset in ["CS", "Physics"]: dataset = Coauthor(path, dataset, T.NormalizeFeatures()) elif dataset in ["Computers", "Photo"]: dataset = Amazon(path, dataset, T.NormalizeFeatures()) elif dataset in ["Cora", "Citeseer", "Pubmed"]: dataset = Planetoid(path, dataset, T.NormalizeFeatures()) data = dataset[0] if supervised: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.train_mask[:-1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.val_mask[-1000:-500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool) data.test_mask[-500:] = 1 data.num_classes = data.y.max().item() + 1 return dataset
def load_data(args):
    dataset = args.input
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
    if dataset in ['cora', 'citeseer', 'pubmed']:
        dataset = Planetoid(path, dataset, transform=T.NormalizeFeatures())
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        return data, num_features, num_classes
    elif dataset == 'corafull':
        dataset = CoraFull(path)
    elif dataset in ['cs', 'physics']:
        dataset = Coauthor(path, name=dataset)
    elif dataset in ['computers', 'photo']:
        dataset = Amazon(path, name=dataset)
    elif dataset == 'reddit':
        dataset = Reddit(path)
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        data = dataset[0]
        return data, num_features, num_classes
    num_features = dataset.num_features
    num_classes = dataset.num_classes
    data = dataset[0]
    data.train_mask, data.val_mask, data.test_mask = generate_split(data, num_classes)
    return data, num_features, num_classes
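# generate_split is referenced above but not defined in this file. A hypothetical sketch, assuming
# it returns boolean train/val/test masks from a random per-class split (20 train and 30 val nodes
# per class, the rest test, mirroring the splits built elsewhere in this file):
def generate_split(data, num_classes, num_train_per_class=20, num_val_per_class=30):
    train_idx, val_idx, test_idx = [], [], []
    for c in range(num_classes):
        idx = (data.y.long() == c).nonzero().view(-1)
        idx = idx[torch.randperm(idx.size(0))]
        train_idx.append(idx[:num_train_per_class])
        val_idx.append(idx[num_train_per_class:num_train_per_class + num_val_per_class])
        test_idx.append(idx[num_train_per_class + num_val_per_class:])
    masks = []
    for split in (train_idx, val_idx, test_idx):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[torch.cat(split)] = True
        masks.append(mask)
    return masks[0], masks[1], masks[2]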
def load_data(dataset_name="Cora", seed=10, n_splits=5): # Path in which the data will be stored path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset_name) if dataset_name in ["CS", "Physics"]: dataset = Coauthor(path, dataset_name, T.NormalizeFeatures()) elif dataset_name in ["Computers", "Photo"]: dataset = Amazon(path, dataset_name, T.NormalizeFeatures()) elif dataset_name in ["Cora", "Citeseer", "Pubmed"]: dataset = Planetoid(path, dataset_name, split='public', transform=T.NormalizeFeatures()) elif dataset_name in ["Arxiv", "Papers", "Products"]: dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name], root=path, transform=T.NormalizeFeatures()) elif dataset_name == "MAG": dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name], root=path) else: raise Exception("[!] Dataset not found: ", str(dataset_name)) if dataset_name in obg_datasets: data = split_ogb_data(dataset, dataset_name) else: data = dataset[0] # pyg graph object data = split_data(data, seed, n_splits) data.num_classes = dataset.num_classes return data
def get_dataset(name: str, use_lcc: bool = True) -> InMemoryDataset:
    path = os.path.join(DATA_PATH, name)
    if name in ['Cora', 'Citeseer', 'Pubmed']:
        dataset = Planetoid(path, name)
    elif name in ['Computers', 'Photo']:
        dataset = Amazon(path, name)
    elif name == 'CoauthorCS':
        dataset = Coauthor(path, 'CS')
    else:
        raise Exception('Unknown dataset.')

    if use_lcc:
        # Restrict the graph to its largest connected component and relabel nodes consecutively.
        lcc = get_largest_connected_component(dataset)
        x_new = dataset.data.x[lcc]
        y_new = dataset.data.y[lcc]
        row, col = dataset.data.edge_index.numpy()
        edges = [[i, j] for i, j in zip(row, col) if i in lcc and j in lcc]
        edges = remap_edges(edges, get_node_mapper(lcc))
        data = Data(x=x_new,
                    edge_index=torch.LongTensor(edges),
                    y=y_new,
                    train_mask=torch.zeros(y_new.size()[0], dtype=torch.bool),
                    test_mask=torch.zeros(y_new.size()[0], dtype=torch.bool),
                    val_mask=torch.zeros(y_new.size()[0], dtype=torch.bool))
        dataset.data = data
    return dataset
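# get_largest_connected_component, get_node_mapper and remap_edges are used above but not defined
# in this excerpt. A sketch of the latter two under the assumption that `lcc` is an ordered
# collection of kept node ids and that edge_index should end up with shape [2, num_edges]:
def get_node_mapper(lcc):
    # Map each original node id in the largest connected component to a consecutive new id.
    return {node: new_id for new_id, node in enumerate(lcc)}

def remap_edges(edges, mapper):
    # `edges` is a list of [src, dst] pairs over original ids; return relabeled [row, col] lists.
    row = [mapper[e[0]] for e in edges]
    col = [mapper[e[1]] for e in edges]
    return [row, col]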
def load_amazon(dataset):
    data_name = ['Computers', 'Photo']
    assert dataset in data_name
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'Datasets', 'NodeData')
    # transforms = T.Compose([T.NormalizeFeatures()])
    dataset = Amazon(path, dataset)

    # 20 training nodes per class; the remaining nodes of each class are used for testing.
    num_per_class = 20
    train_index = []
    test_index = []
    for i in range(dataset.num_classes):
        index = (dataset[0].y.long() == i).nonzero().view(-1)
        index = index[torch.randperm(index.size(0))]
        if len(index) > num_per_class + 30:
            train_index.append(index[:num_per_class])
            test_index.append(index[num_per_class:])
        else:
            continue
    train_index = torch.cat(train_index)
    test_index = torch.cat(test_index)
    train_mask = index_to_mask(train_index, size=dataset[0].num_nodes)
    test_mask = index_to_mask(test_index, size=dataset[0].num_nodes)
    data = Data(x=dataset[0].x, edge_index=dataset[0].edge_index,
                train_mask=train_mask, test_mask=test_mask, y=dataset[0].y)
    return dataset, data
def DataLoader(name):
    # assert name in ['cSBM_data_Aug_19_2020-13:06', 'cSBM_data_Aug_18_2020-18:50',
    #                 'cSBM_data_Aug_21_2020-10:06', 'cSBM_data_Aug_19_2020-20:41',
    #                 'cSBM_data_Aug_21_2020-11:04', 'cSBM_data_Aug_21_2020-11:21',
    #                 'cSBM_data_Sep_01_2020-14:15', 'cSBM_data_Sep_01_2020-14:18',
    #                 'cSBM_data_Sep_01_2020-14:19', 'cSBM_data_Sep_01_2020-14:32',
    #                 'cSBM_data_Sep_01_2020-14:22', 'cSBM_data_Sep_01_2020-14:23',
    #                 'cSBM_data_Sep_01_2020-14:27', 'cSBM_data_Sep_01_2020-14:29',
    #                 'Cora', 'Citeseer', 'PubMed', 'Computers', 'Photo',
    #                 'chameleon', 'film', 'squirrel', 'Texas', 'Cornell']
    # (an older variant matched `name` against the explicit cSBM_data_* list above)
    if 'cSBM_data' in name:
        path = '../data/'
        dataset = dataset_ContextualSBM(path, name=name)
    else:
        name = name.lower()
        if name in ['cora', 'citeseer', 'pubmed']:
            root_path = '../'
            path = osp.join(root_path, 'data', name)
            dataset = Planetoid(path, name, transform=T.NormalizeFeatures())
        elif name in ['computers', 'photo']:
            root_path = '../'
            path = osp.join(root_path, 'data', name)
            dataset = Amazon(path, name, T.NormalizeFeatures())
        elif name in ['chameleon', 'film', 'squirrel']:
            dataset = dataset_heterophily(root='../data/', name=name, transform=T.NormalizeFeatures())
        elif name in ['texas', 'cornell']:
            dataset = WebKB(root='../data/', name=name, transform=T.NormalizeFeatures())
        else:
            raise ValueError(f'dataset {name} not supported in dataloader')
    return dataset
def get_dataset(name):
    if name in ['Cora', 'Citeseer', 'Pubmed']:
        dataset = Planetoid(path + name, name)
    elif name in ['Computers', 'Photo']:
        dataset = Amazon(path + name, name)
    else:
        raise Exception('Unknown dataset.')
    return dataset
def load_pyg(name, dataset_dir):
    """
    Load PyG dataset objects. (More PyG datasets will be supported.)

    Args:
        name (string): dataset name
        dataset_dir (string): data directory

    Returns:
        PyG dataset object
    """
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if name in ['Cora', 'CiteSeer', 'PubMed']:
        dataset = Planetoid(dataset_dir, name)
    elif name[:3] == 'TU_':
        # TU_IMDB doesn't have node features
        if name[3:] == 'IMDB':
            name = 'IMDB-MULTI'
            dataset = TUDataset(dataset_dir, name, transform=T.Constant())
        else:
            dataset = TUDataset(dataset_dir, name[3:])
    elif name == 'Karate':
        dataset = KarateClub()
    elif 'Coauthor' in name:
        if 'CS' in name:
            dataset = Coauthor(dataset_dir, name='CS')
        else:
            dataset = Coauthor(dataset_dir, name='Physics')
    elif 'Amazon' in name:
        if 'Computers' in name:
            dataset = Amazon(dataset_dir, name='Computers')
        else:
            dataset = Amazon(dataset_dir, name='Photo')
    elif name == 'MNIST':
        dataset = MNISTSuperpixels(dataset_dir)
    elif name == 'PPI':
        dataset = PPI(dataset_dir)
    elif name == 'QM7b':
        dataset = QM7b(dataset_dir)
    else:
        raise ValueError('{} not supported'.format(name))
    return dataset
def load_dataset(name):
    if name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(root='./data/' + name, name=name)
    elif name == "CoraFull":
        dataset = CoraFull(root='./data/' + name)
    elif name in ["Computers", "Photo"]:
        dataset = Amazon(root='./data/' + name, name=name)
    elif name in ["CS", "Physics"]:
        dataset = Coauthor(root='./data/' + name, name=name)
    else:
        exit("wrong dataset")
    return dataset
def __init__(self, path: str):
    pyg_dataset = Amazon(os.path.join(path, '_pyg'), "Photo")
    if hasattr(pyg_dataset, "__data_list__"):
        delattr(pyg_dataset, "__data_list__")
    if hasattr(pyg_dataset, "_data_list"):
        delattr(pyg_dataset, "_data_list")
    pyg_data = pyg_dataset[0]
    static_graph = GeneralStaticGraphGenerator.create_homogeneous_static_graph(
        {'x': pyg_data.x, 'y': pyg_data.y}, pyg_data.edge_index)
    super(AmazonPhotoDataset, self).__init__([static_graph])
def load_dataset(dataset, transform=None):
    if dataset.lower() in ["cora", "citeseer", "pubmed"]:
        path = os.path.join(".datasets", "Plantoid")
        dataset = Planetoid(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["cs", "physics"]:
        path = os.path.join(".datasets", "Coauthor", dataset.lower())
        dataset = Coauthor(path, dataset.lower(), transform=transform)
    elif dataset.lower() in ["computers", "photo"]:
        path = os.path.join(".datasets", "Amazon", dataset.lower())
        dataset = Amazon(path, dataset.lower(), transform=transform)
    else:
        print("Dataset not supported!")
        assert False
    return dataset
def prepare_data(dataset, seed):
    """
    :param dataset: name of the dataset used
    :return: data, in the correct format
    """
    # Retrieve main path of project
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    # Download and store dataset at chosen location
    if dataset == 'Cora' or dataset == 'PubMed' or dataset == 'Citeseer':
        path = os.path.join(dirname, 'data')
        data = Planetoid(path, name=dataset, split='full')[0]
        # data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        data.num_classes = (max(data.y) + 1).item()
        # dataset = Planetoid(path, name=dataset, split='public', transform=T.NormalizeFeatures(),
        #                     num_train_per_class=20, num_val=500, num_test=1000)
        # data = modify_train_mask(data)
    elif dataset == 'Amazon':
        path = os.path.join(dirname, 'data', 'Amazon')
        data = Amazon(path, 'photo')[0]
        data.num_classes = (max(data.y) + 1).item()
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        # Amazon: 4896 train, 1224 val, 1530 test
    elif dataset == 'Reddit':
        path = os.path.join(dirname, 'data', 'Reddit')
        data = Reddit(path)[0]
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
    elif dataset == 'PPI':
        path = os.path.join(dirname, 'data', 'PPI')
        data = ppi_prepoc(path, seed)
        data.x = data.graphs[0].x
        data.num_classes = data.graphs[0].y.size(1)
        for df in data.graphs:
            df.num_classes = data.num_classes
    # elif dataset == 'MUTAG':

    # Get it in right format
    if dataset != 'PPI':
        print('Train mask is of size: ', data.train_mask[data.train_mask == True].shape)
    # data = add_noise_features(data, args.num_noise)
    return data
def prepare_data(dataset, train_ratio=0.8, input_dim=None, seed=10):
    """Import, save and process dataset

    Args:
        dataset (str): name of the dataset used
        seed (int): seed number

    Returns:
        [torch_geometric.Data]: dataset in the correct format,
            with required attributes and train/test/val split
    """
    # Retrieve main path of project
    dirname = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    # Download and store dataset at chosen location
    if dataset == 'Cora' or dataset == 'PubMed' or dataset == 'Citeseer':
        path = os.path.join(dirname, 'data')
        data = Planetoid(path, name=dataset, split='full')[0]
        data.name = dataset
        data.num_classes = (max(data.y) + 1).item()
        # data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy())
        # data = Planetoid(path, name=dataset, split='public', transform=T.NormalizeFeatures(),
        #                  num_train_per_class=20, num_val=500, num_test=1000)
    elif dataset == 'Amazon':
        path = os.path.join(dirname, 'data', 'Amazon')
        data = Amazon(path, 'photo')[0]
        data.name = dataset
        data.num_classes = (max(data.y) + 1).item()
        data.train_mask, data.val_mask, data.test_mask = split_function(data.y.numpy(), seed=seed)
        # Amazon: 4896 train, 1224 val, 1530 test
    elif dataset in ['syn1', 'syn2', 'syn4', 'syn5']:
        data = synthetic_data(dataset, dirname, train_ratio, input_dim)
    elif dataset == 'syn6':
        data = gc_data(dataset, dirname, train_ratio)
    elif dataset == 'Mutagenicity':
        data = gc_data(dataset, dirname, train_ratio)
    return data
def fetch_dataset(root, name):
    """
    Fetches datasets from the PyTorch Geometric library.

    :param root: A path to the root directory a dataset will be placed in
    :param name: Name of the dataset. Currently, the following names are supported:
        'cora', 'citeseer', 'pubmed', 'Computers', 'Photo', 'CS', 'Physics', 'wiki', 'actor'
    :return: A PyTorch Geometric dataset
    """
    print(name.lower())
    if name.lower() in {'cora', 'citeseer', 'pubmed'}:
        return Planetoid(root=root, name=name)
    elif name.lower() in {'computers', 'photo'}:
        return Amazon(root=root, name=name)
    elif name.lower() in {'cs', 'physics'}:
        return Coauthor(root=root, name=name)
    elif name.lower() == "wiki":
        return WikiCS(osp.join(root, "WikiCS"))
    elif name.lower() == "actor":
        return Actor(osp.join(root, name))
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    path = 'dataset'
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'amazon_comp':
        dataset = Amazon(path, name="Computers")
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])), test_size=0.4, random_state=42)
        idx_val, idx_test = train_test_split(idx_test, test_size=0.5, random_state=42)
        train_mask = torch.tensor([False] * data.x.shape[0])
        val_mask = torch.tensor([False] * data.x.shape[0])
        test_mask = torch.tensor([False] * data.x.shape[0])
        train_mask[idx_train] = True
        val_mask[idx_val] = True
        test_mask[idx_test] = True
        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path, name=dataset_name, split="full")
    else:
        raise NotImplementedError
    return dataset
def load_data(dataset="Cora", supervised=False, full_data=True, args=None): ''' support semi-supervised and supervised :param dataset: :param supervised: :return: ''' path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset) dataset_name = dataset if dataset in ["CS", "Physics"]: dataset = Coauthor(path, dataset, T.NormalizeFeatures()) elif dataset in ["Computers", "Photo"]: dataset = Amazon(path, dataset, T.NormalizeFeatures()) elif dataset in ["Cora", "Citeseer", "Pubmed"]: dataset = Planetoid(path, dataset, T.NormalizeFeatures()) # path = path + '/processed/data.pt' # dataset = torch.load(path) data = dataset[0] data['adj'] = load_citation(dataset_name, args.normalization) if supervised: if full_data: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.train_mask[:-1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.val_mask[-1000:-500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.test_mask[-500:] = 1 else: data.train_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.train_mask[:1000] = 1 data.val_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.val_mask[1000:1500] = 1 data.test_mask = torch.zeros(data.num_nodes, dtype=torch.uint8) data.test_mask[1500:2000] = 1 return data
def load_data_lp(dataset, use_feats, data_path):
    if dataset in ['cora', 'pubmed', 'citeseer']:
        # adj, features, labels = load_citation_data(dataset, use_feats, data_path)[:3]
        adj, features, labels, idx_train, idx_val, idx_test = load_citation_data(
            dataset, use_feats, data_path, split_seed=None)
    elif dataset in ['cora_planetoid', 'pubmed_planetoid']:
        from torch_geometric.datasets import Planetoid
        import torch_geometric as tg
        import scipy.sparse as sp
        if dataset == 'cora_planetoid':
            name = 'Cora'
        elif dataset == 'pubmed_planetoid':
            name = 'Pubmed'
        else:
            raise FileNotFoundError('Dataset {} is not supported.'.format(dataset))
        loaded_dataset = Planetoid(root='/root/tmp/' + name, name=name)
        adj = tg.utils.to_scipy_sparse_matrix(loaded_dataset.data.edge_index)
        adj = sp.coo_matrix.asformat(adj, format='csr')
        features = sp.lil_matrix(loaded_dataset.data.x.numpy())
        labels = loaded_dataset.data.y.numpy()
    elif 'amazon' in dataset:
        from torch_geometric.datasets import Amazon
        import torch_geometric as tg
        import scipy.sparse as sp
        if dataset == 'amazon-photo':
            name = 'Photo'
        elif dataset == 'amazon-computers':
            name = 'Computers'
        else:
            raise FileNotFoundError('Dataset {} is not supported.'.format(dataset))
        loaded_dataset = Amazon(root='/root/tmp/' + name, name=name)
        adj = tg.utils.to_scipy_sparse_matrix(loaded_dataset.data.edge_index)
        adj = sp.coo_matrix.asformat(adj, format='csr')
        features = sp.lil_matrix(loaded_dataset.data.x.numpy())
        labels = loaded_dataset.data.y.numpy()
    elif dataset == 'BlogCatalog':
        import scipy.io as sio
        import scipy.sparse as sp
        data = sio.loadmat('./data/BlogCatalog/BlogCatalog.mat')
        features = sp.lil_matrix(data['Attributes'])
        labels = np.squeeze(data['Label'])
        adj = sp.csr_matrix(data['Network'])
    elif dataset == 'wiki':
        import scipy.sparse as sp
        features = np.loadtxt('./data/wiki/wiki_feature.txt')
        features = sp.coo_matrix((features[:, 2], (features[:, 0].astype(int), features[:, 1].astype(int))))
        features = sp.lil_matrix(features)
        adj = np.loadtxt('./data/wiki/wiki_graph.txt')
        adj = np.ndarray.tolist(adj)
        adj = nx.from_edgelist(adj)
        adj = nx.adjacency_matrix(adj)
        labels = np.loadtxt('./data/wiki/wiki_group.txt')
        labels = labels[:, 1]
        labels = labels.astype(np.int64)
        labels = np.squeeze(np.reshape(labels, (2405, 1)) - 1)
    elif 'PICA' in dataset:
        if 'ImageNet10' in dataset:
            dataset_lower = 'imagenet10'
            dataset_name = 'PICA-ImageNet10'
        elif 'ImageNetDog' in dataset:
            dataset_lower = 'imagenetdog'
            dataset_name = 'PICA-ImageNetDog'
        if 'feat10' in dataset:
            name = 'picafeat10_{}'.format(dataset_lower)
        elif 'feat70' in dataset:
            name = 'picafeat70_{}'.format(dataset_lower)
        elif 'feat512' in dataset:
            name = 'picafeat512_{}'.format(dataset_lower)
        orig_dataset = dataset
        suffix = dataset.split(dataset_name)[-1]
        dataset = dataset_name
        print('name : {}, suffix : {}'.format(name, suffix))
        y_true = np.load('./data/{}/label.npy'.format(dataset))
        y_true = y_true.astype('int64')
        labels = y_true
        features = np.load('./data/{}/{}.npy'.format(dataset, name))
        import scipy.sparse as sp
        features = sp.lil_matrix(features)
        A = sp.load_npz('./data/{}/A{}.npz'.format(dataset, suffix))
        adj = A.astype('float64')
        labels = torch.LongTensor(labels)
        data = {'adj_train': adj, 'features': features, 'labels': labels}
        return data
    else:
        raise FileNotFoundError('Dataset {} is not supported.'.format(dataset))
    labels = torch.LongTensor(labels)
    data = {'adj_train': adj, 'features': features, 'labels': labels}
    return data
_data = Planetoid(root="./pciteseer", name="Citeseer") elif pr.net == 3: print("Data Pubmed") _data = Planetoid(root="./ppubmed", name="Pubmed") elif pr.net == 4: print("Data CoraFull") _data = CoraFull("./Corafull") elif pr.net == 5: print("Data Coauthor CS") _data = Coauthor("./CS", "CS") elif pr.net == 6: print("Data Coauthor Physics") _data = Coauthor("./Physics", "Physics") elif pr.net == 7: print("Data Amazon Computer") _data = Amazon("./Computer", "Computers") elif pr.net == 8: print("Data Amazon Photos") _data = Amazon("./Photo", "Photo") #_data = Coauthor("./Physics","Physics") #_data = Coauthor("./CS","CS") #_data = CoraFull("./Corafull") #_data = Planetoid(root="./pcora",name="Cora") #_data = Planetoid(root="./pciteseer",name="Citeseer") #_data = Planetoid(root="./ppubmed",name="Pubmed") #_data = Amazon("./Computer","Computers") #_data = Amazon("./Photo","Photo")
def load_data(name, seed, transform=None):
    '''
    Load data from files and return a PyTorch Geometric `Data` object.
    '''
    random.seed(seed)  # make sure that the split of the data is the same
    ROOT = osp.dirname(osp.abspath(__file__)) + '/..'
    if name in ['cora', 'citeseer', 'pubmed']:
        # datasets for transductive node classification
        data = Planetoid(osp.join(ROOT, 'data'), name, transform=transform)[0]
        data.task = 'semi'  # semi-supervised
        data.setting = 'transductive'
        return data
    elif name in ['wikics']:
        dataset = WikiCS(osp.join(ROOT, 'data', 'wikics'), transform=transform)
        data = dataset[0]
        data.task = 'semi'
        data.setting = 'transductive'
        data.train_mask = data.train_mask[:, 0]
        data.val_mask = data.val_mask[:, 0]
        data.stopping_mask = data.stopping_mask[:, 0]
        return data
    elif name in ['ppi']:
        # datasets for inductive node classification
        train_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='train', transform=transform)
        val_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='val', transform=transform)
        test_dataset = PPI(osp.join(ROOT, 'data', 'ppi'), split='test', transform=transform)
        return (train_dataset, val_dataset, test_dataset)
    elif name in ['usa-airports']:
        try:
            data = pickle.load(open(osp.join(ROOT, 'data', name, 'data.pkl'), 'rb'))
            return data
        except FileNotFoundError:
            print('Data not found. Re-generating...')
            nx_graph = nx.read_edgelist(osp.join(ROOT, 'data', name, 'edges.txt'))
            nx_graph = nx.convert_node_labels_to_integers(nx_graph, label_attribute='id2oid')  # oid: original id
            oid2id = {int(v): k for k, v in nx.get_node_attributes(nx_graph, 'id2oid').items()}
            id2label = {}
            for line in open(osp.join(ROOT, 'data', name, 'labels.txt')):
                linesplit = line.strip().split()
                oid = int(linesplit[0])
                label = int(linesplit[1])
                # here we assume that label ids start from 0 and the labeling is consistent
                id2label[oid2id[oid]] = {'y': label}
            nx.set_node_attributes(nx_graph, id2label)
            data = from_networkx(nx_graph)
            num_nodes = len(nx_graph.nodes)
            node_idxs = list(range(num_nodes))
            random.shuffle(node_idxs)
            # split data, train:val:test = 80%:10%:10%
            train_idxs = node_idxs[:int(0.8 * num_nodes)]
            val_idxs = node_idxs[int(0.8 * num_nodes):int(0.9 * num_nodes)]
            test_idxs = node_idxs[int(0.9 * num_nodes):]
            data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.train_mask[train_idxs] = True
            data.val_mask[val_idxs] = True
            data.test_mask[test_idxs] = True
            if data.x is not None and transform:
                data.x = transform(data.x)
            data.num_nodes = num_nodes
            data.task = 'sup'  # supervised
            data.setting = 'transductive'
            pickle.dump(data, open(osp.join(ROOT, 'data', name, 'data.pkl'), 'wb'))
            return data
    elif name in ['ogbn-arxiv']:
        dataset = PygNodePropPredDataset(name, root=osp.join(ROOT, 'data'), transform=transform)
        split_idx = dataset.get_idx_split()
        data = dataset[0]
        split_idx['val'] = split_idx.pop('valid')
        for key, idx in split_idx.items():
            mask = torch.zeros(data.num_nodes, dtype=torch.bool)
            mask[idx] = True
            data[f'{key}_mask'] = mask
        data.task = 'sup'  # supervised
        data.setting = 'transductive'
        return data
    elif name in ['photo']:
        dataset = Amazon('data/photo', 'photo', transform=transform)
        data = dataset[0]
        data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.train_mask[:-1000] = True
        data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.val_mask[-1000:-500] = True
        data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        data.test_mask[-500:] = True
        data.train_edge_index, _ = subgraph(data.train_mask, data.edge_index, relabel_nodes=True)
        data.val_edge_index, _ = subgraph(data.val_mask, data.edge_index, relabel_nodes=True)
        data.test_edge_index, _ = subgraph(data.test_mask, data.edge_index, relabel_nodes=True)
        data.train_x = data.x[data.train_mask]
        data.train_y = data.y[data.train_mask]
        data.val_x = data.x[data.val_mask]
        data.val_y = data.y[data.val_mask]
        data.test_x = data.x[data.test_mask]
        data.test_y = data.y[data.test_mask]
        data.num_train_nodes = data.train_x.shape[0]
        data.task = 'sup'  # supervised
        data.setting = 'inductive'
        return data
    else:
        raise NotImplementedError('Not supported dataset.')
import os.path as osp

import torch
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch_geometric.datasets import Amazon
import torch_geometric.transforms as T
from torch_geometric.nn import GNNExplainer, ARMAConv
from torch_geometric.utils import k_hop_subgraph
from math import floor
from scipy import stats

dataset = 'Computers'
path = osp.join('.', 'data', 'Amazon')
dataset = Amazon(path, dataset, transform=T.NormalizeFeatures())
data = dataset[0]


# Define the model
class Net(torch.nn.Module):
    def __init__(self, k=1, x=16):
        super(Net, self).__init__()
        self.conv1 = ARMAConv(dataset.num_features, x)
        self.conv2 = ARMAConv(x, x)
        self.conv3 = ARMAConv(x, dataset.num_classes)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, training=self.training)
        # final ARMA layer producing class logits
        x = self.conv3(x, edge_index)
        return F.log_softmax(x, dim=-1)
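# A hedged sketch of how this model might be trained and then inspected with the GNNExplainer
# imported above (an assumption, not part of the original snippet; Amazon-Computers ships without
# a canonical split, so a real script would first build train/val/test masks and restrict the loss).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(x=16).to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

model.train()
for epoch in range(1, 201):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.nll_loss(out, data.y)  # full-graph loss; use a train mask if one is defined
    loss.backward()
    optimizer.step()

# Explain the prediction for a single node.
explainer = GNNExplainer(model, epochs=200)
node_idx = 10
node_feat_mask, edge_mask = explainer.explain_node(node_idx, data.x, data.edge_index)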
def __init__(self, path):
    dataset = "Photo"
    # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    Amazon(path, dataset)  # download and process the raw data if necessary
    super(AmazonPhotoDataset, self).__init__(path, dataset)
def TrainingNet(dataset, modelName, params, num_pre_epochs, num_epochs, NumCutoff,
                optimizerName, MonteSize, savepath):
    Batch_size = int(params[0])
    root = '/git/data/GraphData/' + dataset
    TestAccs = []
    for Monte_iter in range(MonteSize):
        # Data
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        NewNetworkSizeAdjust = []
        WeightsDynamicsEvolution = []

        # Model
        if dataset == 'Cora' or dataset == 'Citeseer' or dataset == 'Pubmed':
            datasetroot = Planetoid(root=root, name=dataset, transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot, params, num_epochs)
            criterion = nn.CrossEntropyLoss()
        elif dataset == "CoraFull":
            datasetroot = CoraFull(root=root, transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot, params, num_epochs)
        elif dataset == "Amazon":
            datasetroot = Amazon(root, "Photo", transform=None, pre_transform=None)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot, params, num_epochs)
        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            datasetroot = TUDataset(root, name=dataset, use_node_attr=True)
            Num = len(datasetroot) // 10
            global train_dataset, test_dataset
            train_dataset = datasetroot[:Num]
            test_dataset = datasetroot[Num:]
            trainloader = DataLoader(train_dataset, batch_size=Batch_size)
            testloader = DataLoader(test_dataset, batch_size=60)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot, params, num_epochs)
        elif dataset == "PPI":
            train_dataset = PPI(root, split='train')
            test_dataset = PPI(root, split='test')
            trainloader = DataLoader(train_dataset, batch_size=Batch_size, shuffle=True)
            testloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, train_dataset, params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()
        elif dataset == "Reddit":
            datasetroot = Reddit(root)
            trainloader = DataListLoader(datasetroot, batch_size=1, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=2, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot, params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()
        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot, params, num_epochs)
        elif dataset == 'CIFAR10':
            pass
        else:
            raise Exception("Input wrong dataset!!")

        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, modelName, params[0], params[1], params[2], params[3], Monte_iter)
        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        global device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        optimizer = optim.Adam(net.parameters(), lr=params[3], betas=(0.9, 0.999),
                               eps=1e-08, weight_decay=0, amsgrad=False)
        criterion = nn.CrossEntropyLoss()
        net = net.to(device)
        # cudnn.benchmark = True
        logging('dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, LR:{}, MonteSize:{}'
                .format(dataset, params[0], params[1], params[2], params[3], Monte_iter))

        mark = "{}/{}Convergence/DiagElement-{}".format(savepath, dataset, FileName)
        PreTrainConvergence, PreTestConvergence, PreTestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_pre_epochs, trainloader, testloader,
            net, optimizer, criterion, NumCutoff, mark, False, model_to_save)
        print('dataset: {}, model name: {}, Number epoches: {}, Pre-train error is: {}, '
              'Pre-test error is: {}, test acc is {}'.format(
                  dataset, modelName, num_pre_epochs, PreTrainConvergence[-1],
                  PreTestConvergence[-1], PreTestAcc[-1]))

        NewNetworksize, NewNetworkWeight = RetainNetworkSize(net, params[2])[0:2]
        NetworkInfo = [NewNetworksize[0:-1], NewNetworkWeight]
        OptimizedNet = ChooseModel(modelName, datasetroot, NetworkInfo)
        NewNetworksize.insert(0, datasetroot.num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)
        # OptimizedNet.apply(init_weights)
        # OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True

        criterionNew = nn.CrossEntropyLoss()
        if optimizerName == "SGD":
            optimizerNew = getattr(optim, optimizerName)(OptimizedNet.parameters(), lr=params[3],
                                                         momentum=0.9, weight_decay=5e-4)
        elif optimizerName == "Adam":
            optimizerNew = getattr(optim, optimizerName)(OptimizedNet.parameters(), lr=params[3],
                                                         betas=(0.9, 0.999), eps=1e-08,
                                                         weight_decay=5e-4, amsgrad=False)

        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_epochs, trainloader, testloader,
            OptimizedNet, optimizerNew, criterionNew, NumCutoff, mark, True, model_to_save)
        np.save("{}/{}Convergence/TrainConvergence-{}".format(savepath, dataset, FileName),
                TrainConvergence)
        np.save("{}/{}Convergence/TestConvergence-{}".format(savepath, dataset, FileName),
                TestConvergence)
        np.save("{}/{}Convergence/NewNetworkSizeAdjust-{}".format(savepath, dataset, FileName),
                NewNetworkSizeAdjust)
        # np.save(savepath + 'TestConvergence-' + FileName, TestConvergence)
        # torch.cuda.empty_cache()
        print('dataset: {}, model name:{}, resized network size is {}, Number epoches:{}, '
              'Train error is: {}, Test error is: {}, test acc is {}\n'.format(
                  dataset, modelName, NewNetworksize[0:-1], num_epochs,
                  TrainConvergence[-1], TestConvergence[-1], TestAcc[-1]))
        TestAccs.append(TestAcc)
        np.save("{}/{}Convergence/MeanTestAccs-{}".format(savepath, dataset, FileName), TestAccs)
    print("The change of test error is:{}".format(TestAccs))
    print_nvidia_useage()
                notes=full_description)
# ---------------------------------------------------------------
print("Done 1")
wandb.log({'action': 'Done 1'})

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

real_data = None
if DATASET == "Karate Club":
    real_data = KarateClub()
elif DATASET == "Cora" or DATASET == "Citeseer" or DATASET == "Pubmed":
    real_data = Planetoid(root=input_path, name=DATASET, split="public")
elif DATASET == "Reddit":
    real_data = Reddit(root=input_path)
elif DATASET == "Amazon Computers":
    real_data = Amazon(root=input_path, name="Computers")
elif DATASET == "Amazon Photos":
    real_data = Amazon(root=input_path, name="Photo")
elif DATASET == "CLUSTER":
    real_data = GNNBenchmarkDataset(root=input_path, name="CLUSTER", split="test")
elif DATASET == "PATTERN":
    real_data = GNNBenchmarkDataset(root=input_path, name="PATTERN", split="test")
elif DATASET == "Flickr":
    real_data = Flickr(root=input_path)
elif DATASET == "OGB Products":
    real_data = PygNodePropPredDataset(name='ogbn-products')
    split_idx = real_data.get_idx_split()