def main(args):
    print('-----------dense gsdnef ppi alpha %s-----------' % args.alpha)

    # Load data
    dataset = args.input
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Net(in_channels=train_dataset.num_features,
                out_channels=train_dataset.num_classes,
                hidden_num=args.hidden_num,
                alpha=args.alpha,
                K=args.K).to(device)
    loss_op = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    t1 = time.time()
    for epoch in range(1, args.epochs + 1):
        loss = train(model, optimizer, train_loader, loss_op, device)
        val_f1 = test(model, val_loader, device)
        test_f1 = test(model, test_loader, device)
        print('Epoch: {:02d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'.format(
            epoch, loss, val_f1, test_f1))
        print('{:.4f}'.format(test_f1))
    t2 = time.time()
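# The train()/test() helpers called by main() above are not defined in this
# snippet. A minimal sketch of what they are assumed to look like, adapted
# from the main_ppi() pattern further below (BCE-with-logits training,
# micro-F1 over all graphs in a loader):
def train(model, optimizer, loader, loss_op, device):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        loss = loss_op(model(data.x, data.edge_index), data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)


def test(model, loader, device):
    from sklearn.metrics import f1_score
    model.eval()
    ys, preds = [], []
    for data in loader:
        ys.append(data.y)
        with torch.no_grad():
            out = model(data.x.to(device), data.edge_index.to(device))
        preds.append((out > 0).float().cpu())
    y, pred = torch.cat(ys, dim=0).numpy(), torch.cat(preds, dim=0).numpy()
    return f1_score(y, pred, average='micro') if pred.sum() > 0 else 0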
def load_data():
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return train_loader, val_loader, test_loader
def load_ppi_data():
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "data", "PPI")
    train_dataset = PPI(path, split="train", transform=T.NormalizeFeatures())
    val_dataset = PPI(path, split="val", transform=T.NormalizeFeatures())
    test_dataset = PPI(path, split="test", transform=T.NormalizeFeatures())
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return [train_loader, val_loader, test_loader]
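# Note: passing NormalizeFeatures() as `transform` (as above) re-normalizes
# features every time a graph is accessed; other snippets in this collection
# (e.g. the GCN2/PPI one) pass it as `pre_transform` instead, which is applied
# once at processing time and cached on disk. A minimal sketch of the cached
# variant:
train_dataset = PPI(path, split="train", pre_transform=T.NormalizeFeatures())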
def __init__(self, name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', name)
    self.path = path
    self.train_dataset = PPI(self.path, split='train', transform=T.NormalizeFeatures())
    self.test_dataset = PPI(self.path, split='test', transform=T.NormalizeFeatures())
    self.num_features = self.train_dataset.num_features
    self.reconstruction_loss = None
def get_ppi_dataset(name):
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'datasets', 'node_datasets', name)
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    dataset = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }
    return dataset
def loadPPI():
    import pickle as pkl
    import torch
    from torch_geometric.datasets import PPI

    # Concatenate the train, val and test splits (torch Datasets support `+`).
    ppi = PPI(root="data/ppi/")  # `split` defaults to 'train'
    ppitrain = ppi               # keep a handle on the train split
    ppi += PPI(root="data/ppi/", split="val")
    ppi += PPI(root="data/ppi/", split="test")

    Y = []
    for ppii in ppi:
        Y.append(ppii.y)
    Y = torch.cat(Y, dim=0)
    labels = Y
    goodclass = selectclass(labels)
    # print(labels.size())
    labels = labels[:, goodclass]
    print(labels[:10, :])
    print("=" * 20)
    selectclass(labels)  # check

    # Write one edgelist / feature / label file per graph.
    for i in range(len(ppi)):
        print("ppi{}".format(i))
        ppii = ppi[i]
        N = ppii.x.size(0)
        edgelist = ppii.edge_index.transpose(0, 1).numpy().tolist()
        with open("data/ppi/ppi" + str(i) + ".edgelist", 'w') as file:
            file.write(str(N) + "\n")
            for edge in edgelist:
                file.write(str(edge[0]) + " " + str(edge[1]) + "\n")
        labeli = ppii.y[:, goodclass].numpy()
        with open("data/ppi/ppi" + str(i) + ".y.pkl", 'wb') as file:
            pkl.dump(labeli, file)
        with open("data/ppi/ppi" + str(i) + ".x.pkl", 'wb') as file:
            xi = ppii.x.numpy()
            pkl.dump(xi, file)

    # Additionally dump the whole train split, collated as one big graph, as index 24.
    i = 24
    print("ppitrain {}".format(i))
    ppii = ppitrain.data  # the collated storage: all train graphs merged
    N = ppii.x.size(0)
    edgelist = ppii.edge_index.transpose(0, 1).numpy().tolist()
    with open("data/ppi/ppi" + str(i) + ".edgelist", 'w') as file:
        file.write(str(N) + "\n")
        for edge in edgelist:
            file.write(str(edge[0]) + " " + str(edge[1]) + "\n")
    labeli = ppii.y[:, goodclass].numpy()
    with open("data/ppi/ppi" + str(i) + ".y.pkl", 'wb') as file:
        pkl.dump(labeli, file)
    with open("data/ppi/ppi" + str(i) + ".x.pkl", 'wb') as file:
        xi = ppii.x.numpy()
        pkl.dump(xi, file)
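# selectclass() is not defined in this snippet. Judging from its use above
# (it returns indices used to slice label columns), it presumably keeps the
# informative label columns. A hypothetical sketch under that assumption:
def selectclass(labels):
    # keep columns that contain both positive and negative examples
    col_sums = labels.sum(dim=0)
    good = [j for j in range(labels.size(1))
            if 0 < col_sums[j].item() < labels.size(0)]
    print("kept {} of {} label columns".format(len(good), labels.size(1)))
    return good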
def load_PPI(dataset_folder):
    """
    Load the PPI dataset, wrap it as Data objects and split it into
    train, validation and test sets.
    :param dataset_folder: directory where the dataset is stored
    :return: train/val/test datasets and their loaders
    """
    path = os.path.join(os.path.dirname(dataset_folder), 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)
    return train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader
def load_ppi(dataset):
    data_name = ['PPI']
    assert dataset in data_name
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'Datasets', 'NodeData', 'PPI')
    # Note: PPI(path) with no `split` argument loads the train split only.
    dataset = PPI(path)
    return dataset
def main():
    if not os.path.exists("plots/cluster"):
        os.makedirs("plots/cluster")

    parser = argparse.ArgumentParser(description='Decoder arguments')
    parse_encoder(parser)
    parse_decoder(parser)
    args = parser.parse_args()
    args.dataset = "enzymes"

    print("Using dataset {}".format(args.dataset))
    if args.dataset == 'enzymes':
        dataset = TUDataset(root='/tmp/ENZYMES', name='ENZYMES')
        task = 'graph'
    elif args.dataset == 'cox2':
        dataset = TUDataset(root='/tmp/cox2', name='COX2')
        task = 'graph'
    elif args.dataset == 'reddit-binary':
        dataset = TUDataset(root='/tmp/REDDIT-BINARY', name='REDDIT-BINARY')
        task = 'graph'
    elif args.dataset == 'dblp':
        dataset = TUDataset(root='/tmp/dblp', name='DBLP_v1')
        task = 'graph-truncate'
    elif args.dataset == 'coil':
        dataset = TUDataset(root='/tmp/coil', name='COIL-DEL')
        task = 'graph'
    elif args.dataset.startswith('roadnet-'):
        graph = nx.Graph()
        with open("data/{}.txt".format(args.dataset), "r") as f:
            for row in f:
                if not row.startswith("#"):
                    a, b = row.split("\t")
                    graph.add_edge(int(a), int(b))
        dataset = [graph]
        task = 'graph'
    elif args.dataset == "ppi":
        dataset = PPI(root="/tmp/PPI")
        task = 'graph'
    elif args.dataset in ['diseasome', 'usroads', 'mn-roads', 'infect']:
        fn = {"diseasome": "bio-diseasome.mtx",
              "usroads": "road-usroads.mtx",
              "mn-roads": "mn-roads.mtx",
              "infect": "infect-dublin.edges"}
        graph = nx.Graph()
        with open("data/{}".format(fn[args.dataset]), "r") as f:
            for line in f:
                if not line.strip():
                    continue
                a, b = line.strip().split(" ")
                graph.add_edge(int(a), int(b))
        dataset = [graph]
        task = 'graph'
    elif args.dataset.startswith('plant-'):
        size = int(args.dataset.split("-")[-1])
        dataset = make_plant_dataset(size)
        task = 'graph'

    pattern_growth(dataset, task, args)
def load_data(path):
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, num_workers=0)
    return train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader
def test_PPI(dataset_folder):
    """
    Load the test split of the PPI dataset as Data objects.
    :param dataset_folder: directory where the dataset is stored
    :return: test dataset and its loader
    """
    path = os.path.join(os.path.dirname(dataset_folder), 'PPI')
    test_dataset = PPI(path, split='test')
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return test_dataset, test_loader
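# A minimal evaluation sketch for the loader returned above, following the
# micro-F1 protocol used elsewhere in this collection (assumes a trained
# `model` and a `device`):
ys, preds = [], []
for data in test_loader:
    ys.append(data.y)
    with torch.no_grad():
        out = model(data.x.to(device), data.edge_index.to(device))
    preds.append((out > 0).float().cpu())
y, pred = torch.cat(ys, dim=0).numpy(), torch.cat(preds, dim=0).numpy()
print(f1_score(y, pred, average='micro'))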
def _process(self):
    # Reload train, val and test .pt files
    self.train = PPI(root=self.raw_dir, split='train')
    self.validation = PPI(root=self.raw_dir, split='val')
    self.test = PPI(root=self.raw_dir, split='test')

    # Dynamically set the maximum number of nodes
    # (useful when using dense batching, e.g. DiffPool).
    max_num_nodes = max([
        g.x.shape[0] for data in [self.train, self.validation, self.test]
        for g in data
    ])
    setattr(self, 'max_num_nodes', max_num_nodes)

    # The 11th feature (index 10) is constant, so remove it.
    # PPI node features have 50 columns in total.
    idx_to_remove = 10
    mask = torch.LongTensor(
        list(range(idx_to_remove)) + list(range(idx_to_remove + 1, 50)))

    # Convert PyG Data objects into our augmented Data objects
    dataset = [
        Data(x=g.x.index_select(1, mask), y=g.y, edge_index=g.edge_index)
        for data in [self.train, self.validation, self.test] for g in data
    ]
    '''
    Used to debug feature filtering
    for g in dataset:
        print(g.x.shape)
        print(g.x.index_select(1, mask).shape)
    '''
    # TODO: how to do this for PPI when the Data objects already exist? (minor)
    # if self.precompute_kron_indices:
    #     laplacians, v_plus_list = self._precompute_kron_indices(G)
    #     G.laplacians = laplacians
    #     G.v_plus = v_plus_list
    #
    # if G.number_of_nodes() > 1 and G.number_of_edges() > 0:
    #     data = self._to_data(G)  # TODO: this already exists!
    #     dataset.append(data)

    torch.save(dataset, self.processed_dir / f"{self.name}.pt")
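# The saved list can be reloaded directly with torch.load. A minimal sketch,
# assuming the same processed_dir / name values as in _process above:
dataset = torch.load(processed_dir / f"{name}.pt")
print(len(dataset), dataset[0].x.shape)  # 24 graphs; each x has 49 columns after filtering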
def __init__(self, path: str):
    train_dataset = PPI(os.path.join(path, '_pyg'), 'train')
    if hasattr(train_dataset, "__data_list__"):
        delattr(train_dataset, "__data_list__")
    if hasattr(train_dataset, "_data_list"):
        delattr(train_dataset, "_data_list")
    val_dataset = PPI(os.path.join(path, '_pyg'), 'val')
    if hasattr(val_dataset, "__data_list__"):
        delattr(val_dataset, "__data_list__")
    if hasattr(val_dataset, "_data_list"):
        delattr(val_dataset, "_data_list")
    test_dataset = PPI(os.path.join(path, '_pyg'), 'test')
    if hasattr(test_dataset, "__data_list__"):
        delattr(test_dataset, "__data_list__")
    if hasattr(test_dataset, "_data_list"):
        delattr(test_dataset, "_data_list")

    train_index = range(len(train_dataset))
    val_index = range(len(train_dataset), len(train_dataset) + len(val_dataset))
    test_index = range(len(train_dataset) + len(val_dataset),
                       len(train_dataset) + len(val_dataset) + len(test_dataset))

    super(PPIDataset, self).__init__(
        [GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {'x': data.x, 'y': data.y}, data.edge_index)
         for data in train_dataset] +
        [GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {'x': data.x, 'y': data.y}, data.edge_index)
         for data in val_dataset] +
        [GeneralStaticGraphGenerator.create_homogeneous_static_graph(
            {'x': data.x, 'y': data.y}, data.edge_index)
         for data in test_dataset],
        train_index, val_index, test_index)
def load_dataset(name):
    name = name.lower()
    if name in ['cora', 'citeseer', 'pubmed']:
        return Planetoid(root=name, name=name, pre_transform=normalize_features)
    elif name == 'ppi':
        datasets = []
        for split in ['train', 'val', 'test']:
            dataset = PPI(root='PPI', split=split, pre_transform=normalize_features)
            datasets.append(dataset)
        return datasets
    else:
        raise ValueError('Unknown dataset: {}'.format(name))
def get_dataset(dataset_name):
    """
    Retrieves the dataset corresponding to the given name.
    """
    print("Getting dataset...")
    path = join('dataset', dataset_name)
    if dataset_name == 'reddit':
        dataset = Reddit(path)
    elif dataset_name == 'ppi':
        dataset = PPI(path)
    elif dataset_name == 'github':
        dataset = GitHub(path)
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4, random_state=42)
        idx_val, idx_test = train_test_split(idx_test, test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ['amazon_comp', 'amazon_photo']:
        dataset = (Amazon(path, "Computers", T.NormalizeFeatures())
                   if dataset_name == 'amazon_comp'
                   else Amazon(path, "Photo", T.NormalizeFeatures()))
        data = dataset.data
        idx_train, idx_test = train_test_split(list(range(data.x.shape[0])),
                                               test_size=0.4, random_state=42)
        idx_val, idx_test = train_test_split(idx_test, test_size=0.5,
                                             random_state=42)
        data.train_mask = torch.tensor(idx_train)
        data.val_mask = torch.tensor(idx_val)
        data.test_mask = torch.tensor(idx_test)
        dataset.data = data
    elif dataset_name in ["Cora", "CiteSeer", "PubMed"]:
        dataset = Planetoid(path, name=dataset_name, split="full",
                            transform=T.NormalizeFeatures())
    else:
        raise NotImplementedError
    print("Dataset ready!")
    return dataset
def load_pyg(name, dataset_dir):
    """
    Load PyG dataset objects. (More PyG datasets will be supported.)

    Args:
        name (string): dataset name
        dataset_dir (string): data directory

    Returns:
        PyG dataset object
    """
    dataset_dir = '{}/{}'.format(dataset_dir, name)
    if name in ['Cora', 'CiteSeer', 'PubMed']:
        dataset = Planetoid(dataset_dir, name)
    elif name[:3] == 'TU_':
        # TU_IMDB doesn't have node features
        if name[3:] == 'IMDB':
            name = 'IMDB-MULTI'
            dataset = TUDataset(dataset_dir, name, transform=T.Constant())
        else:
            dataset = TUDataset(dataset_dir, name[3:])
    elif name == 'Karate':
        dataset = KarateClub()
    elif 'Coauthor' in name:
        if 'CS' in name:
            dataset = Coauthor(dataset_dir, name='CS')
        else:
            dataset = Coauthor(dataset_dir, name='Physics')
    elif 'Amazon' in name:
        if 'Computers' in name:
            dataset = Amazon(dataset_dir, name='Computers')
        else:
            dataset = Amazon(dataset_dir, name='Photo')
    elif name == 'MNIST':
        dataset = MNISTSuperpixels(dataset_dir)
    elif name == 'PPI':
        dataset = PPI(dataset_dir)
    elif name == 'QM7b':
        dataset = QM7b(dataset_dir)
    else:
        raise ValueError('{} not supported'.format(name))
    return dataset
def load_dataset(name):
    """
    Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]

    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        dataset = list(dataset)
        random.shuffle(dataset)
        has_name = hasattr(dataset[0], "name")
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                if has_name:
                    del graph.name
                graph = pyg_utils.to_networkx(graph).to_undirected()
            if i < train_len:
                train.append(graph)
            else:
                test.append(graph)
    return train, test, task
def ppi_prepoc(dirname, seed):
    # 24 protein graphs (20 train, 2 val, 2 test).
    # Need to create the relevant masks for each graph.
    data = SimpleNamespace()
    data.graphs = []
    for split in ['train', 'val', 'test']:
        split_data = PPI(root=dirname, split=split,
                         pre_transform=T.NormalizeFeatures())
        x_idxs = split_data.slices['x'].numpy()
        edge_idxs = split_data.slices['edge_index'].numpy()
        split_data = split_data.data
        for x_start, x_end, e_start, e_end in zip(x_idxs, x_idxs[1:],
                                                  edge_idxs, edge_idxs[1:]):
            graph = Data(split_data.x[x_start:x_end],
                         split_data.edge_index[:, e_start:e_end],
                         y=split_data.y[x_start:x_end])
            graph.num_nodes = int(x_end - x_start)
            graph.split = split
            all_true = torch.ones(graph.num_nodes).bool()
            all_false = torch.zeros(graph.num_nodes).bool()
            graph.train_mask = all_true if split == 'train' else all_false
            graph.val_mask = all_true if split == 'val' else all_false
            graph.test_mask = all_true if split == 'test' else all_false
            data.graphs.append(graph)

    if seed != 0:
        # Re-sample which graphs act as val/test, deterministically per seed.
        temp_random = random.Random(seed)
        val_graphs = temp_random.sample(range(len(data.graphs)), 2)
        test_candidates = [graph_idx for graph_idx in range(len(data.graphs))
                           if graph_idx not in val_graphs]
        test_graphs = temp_random.sample(test_candidates, 2)
        for graph_idx, graph in enumerate(data.graphs):
            all_true = torch.ones(graph.num_nodes).bool()
            all_false = torch.zeros(graph.num_nodes).bool()
            graph.split = ('test' if graph_idx in test_graphs
                           else 'val' if graph_idx in val_graphs else 'train')
            graph.train_mask = all_true if graph.split == 'train' else all_false
            graph.val_mask = all_true if graph.split == 'val' else all_false
            graph.test_mask = all_true if graph.split == 'test' else all_false
    return data
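# A minimal usage sketch for the helper above (seed=0 keeps the original
# PPI split assignment; the root directory is an assumption):
data = ppi_prepoc('data/PPI', seed=0)
print(len(data.graphs))  # 24 graphs in total
g = data.graphs[0]
print(g.split, g.train_mask.sum().item(), g.num_nodes)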
import os.path as osp
import time

import torch
import torch.nn.functional as F
from torch_geometric.datasets import PPI
from torch_geometric.data import DataLoader
from torch_geometric.nn import GATConv
from sklearn.metrics import f1_score
import pandas as pd

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
train_dataset = PPI(path, split='train')
val_dataset = PPI(path, split='val')  # was split='test', which made val and test identical
test_dataset = PPI(path, split='test')
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GATConv(train_dataset.num_features, 256, heads=4)
        self.lin1 = torch.nn.Linear(train_dataset.num_features, 4 * 256)
        self.conv2 = GATConv(4 * 256, 256, heads=4)
        self.lin2 = torch.nn.Linear(4 * 256, 4 * 256)
        self.conv3 = GATConv(4 * 256, train_dataset.num_classes, heads=6,
                             concat=False)
        self.lin3 = torch.nn.Linear(4 * 256, train_dataset.num_classes)
        # self.lin3 = torch.nn.Linear(4 * 256, 200)
import numpy as np
import torch
import torch.nn as nn
import os.path as osp
import torch.nn.functional as F
from sklearn.metrics import f1_score
import torch_geometric.transforms as T
from torch_geometric.data import Batch
from torch_geometric.nn import ChebConv
from torch_geometric.datasets import PPI
from torch_geometric.datasets import Planetoid
from torch_geometric.data import ClusterData, ClusterLoader
from torch_geometric.data import DataLoader, Data

# Data
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPI')
train_dataset = PPI(path, split='train')  # 20 graphs
val_dataset = PPI(path, split='val')      # 2 graphs
test_dataset = PPI(path, split='test')    # 2 graphs
dataset = PPI(path)

# Data to Loader
train_data_list = [data for data in train_dataset]
for data in train_data_list:
    data.train_mask = torch.ones(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

val_data_list = [data for data in val_dataset]
for data in val_data_list:
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
def main_ppi(type):
    """
    :arg type: 'GCN' or 'GAT'
    """
    # Import PPI dataset
    path = osp.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
                    'data', 'PPI')
    train_dataset = PPI(path, split='train')
    val_dataset = PPI(path, split='val')
    test_dataset = PPI(path, split='test')
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    # Define training function
    def train():
        model.train()
        total_loss = 0
        for data in train_loader:
            num_graphs = data.num_graphs
            data.batch = None
            data = data.to(device)
            optimizer.zero_grad()
            loss = loss_op(model(data.x, data.edge_index), data.y)
            total_loss += loss.item() * num_graphs
            loss.backward()
            optimizer.step()
        return total_loss / len(train_loader.dataset)

    # Define testing pipeline
    def test(loader):
        model.eval()
        ys, preds = [], []
        for data in loader:
            ys.append(data.y)
            with torch.no_grad():
                out = model(data.x.to(device), data.edge_index.to(device))
            preds.append((out > 0).float().cpu())
        y, pred = torch.cat(ys, dim=0).numpy(), torch.cat(preds, dim=0).numpy()
        return f1_score(y, pred, average='micro') if pred.sum() > 0 else 0

    # Train GAT or GCN model on PPI dataset
    device = 'cpu'  # or: torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if type == 'GAT':
        model = Net(train_dataset).to(device)
        epochs = 20
    else:
        model = Net_GCN(train_dataset).to(device)
        epochs = 100
    loss_op = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    # Use the above functions to train and test the model over n epochs
    for epoch in range(1, epochs):
        loss = train()
        val_f1 = test(val_loader)
        test_f1 = test(test_loader)
        print('Epoch: {:02d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'.format(
            epoch, loss, val_f1, test_f1))

    # torch.save(model.state_dict(), model_path)
    # model = Net()
    # model.load_state_dict(torch.load(model_path))
    return model
def _download(self):
    # Downloads the PPI splits and stores them in raw_dir
    PPI(root=self.raw_dir, split='train')
    PPI(root=self.raw_dir, split='val')
    PPI(root=self.raw_dir, split='test')
def TrainingNet(dataset, modelName, params, num_pre_epochs, num_epochs,
                NumCutoff, optimizerName, MonteSize, savepath):
    Batch_size = int(params[0])
    root = '/git/data/GraphData/' + dataset
    TestAccs = []

    for Monte_iter in range(MonteSize):
        # Data
        start_epoch = 0  # start from epoch 0 or last checkpoint epoch
        NewNetworkSizeAdjust = []
        WeightsDynamicsEvolution = []
        # Default loss; overridden below for multilabel datasets (PPI, Reddit).
        # (The original re-assigned CrossEntropyLoss unconditionally after the
        # optimizer, silently clobbering the BCEWithLogitsLoss set for PPI/Reddit.)
        criterion = nn.CrossEntropyLoss()

        # Model
        if dataset == 'Cora' or dataset == 'Citeseer' or dataset == 'Pubmed':
            datasetroot = Planetoid(root=root, name=dataset,
                                    transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                                params, num_epochs)
        elif dataset == "CoraFull":
            datasetroot = CoraFull(root=root, transform=T.NormalizeFeatures()).shuffle()
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                                params, num_epochs)
        elif dataset == "Amazon":
            datasetroot = Amazon(root, "Photo", transform=None, pre_transform=None)
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                                params, num_epochs)
        elif dataset == 'ENZYMES' or dataset == 'MUTAG':
            datasetroot = TUDataset(root, name=dataset, use_node_attr=True)
            Num = len(datasetroot) // 10
            global train_dataset, test_dataset
            train_dataset = datasetroot[:Num]
            test_dataset = datasetroot[Num:]
            trainloader = DataLoader(train_dataset, batch_size=Batch_size)
            testloader = DataLoader(test_dataset, batch_size=60)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                                params, num_epochs)
        elif dataset == "PPI":
            train_dataset = PPI(root, split='train')
            test_dataset = PPI(root, split='test')
            trainloader = DataLoader(train_dataset, batch_size=Batch_size, shuffle=True)
            testloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, train_dataset,
                                                params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()
        elif dataset == "Reddit":
            datasetroot = Reddit(root)
            trainloader = DataListLoader(datasetroot, batch_size=1, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=2, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                                params, num_epochs)
            criterion = torch.nn.BCEWithLogitsLoss()
        elif dataset == 'MNIST':
            datasetroot = MNISTSuperpixels(root=root, transform=T.Cartesian())
            trainloader = DataListLoader(datasetroot, batch_size=Batch_size, shuffle=True)
            testloader = DataListLoader(datasetroot, batch_size=100, shuffle=False)
            [net, model_to_save] = ModelAndSave(dataset, modelName, datasetroot,
                                                params, num_epochs)
        elif dataset == 'CIFAR10':
            pass
        else:
            raise Exception("Input wrong dataset!!")

        FileName = "{}-{}-param_{}_{}_{}_{}-monte_{}".format(
            dataset, modelName, params[0], params[1], params[2], params[3], Monte_iter)

        print('Let\'s use', torch.cuda.device_count(), 'GPUs!')
        global device
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        optimizer = optim.Adam(net.parameters(), lr=params[3], betas=(0.9, 0.999),
                               eps=1e-08, weight_decay=0, amsgrad=False)
        net = net.to(device)
        # cudnn.benchmark = True
        logging('dataset:{}, Batch size: {}, Number of layers:{} ConCoeff: {}, '
                'LR:{}, MonteSize:{}'.format(dataset, params[0], params[1],
                                             params[2], params[3], Monte_iter))

        mark = "{}/{}Convergence/DiagElement-{}".format(savepath, dataset, FileName)
        PreTrainConvergence, PreTestConvergence, PreTestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_pre_epochs, trainloader,
            testloader, net, optimizer, criterion, NumCutoff, mark, False,
            model_to_save)
        print('dataset: {}, model name: {}, Number epoches: {}, '
              'Pre-train error is: {}, Pre-test error is: {}, test acc is {}'
              .format(dataset, modelName, num_pre_epochs,
                      PreTrainConvergence[-1], PreTestConvergence[-1],
                      PreTestAcc[-1]))

        NewNetworksize, NewNetworkWeight = RetainNetworkSize(net, params[2])[0:2]
        NetworkInfo = [NewNetworksize[0:-1], NewNetworkWeight]
        OptimizedNet = ChooseModel(modelName, datasetroot, NetworkInfo)
        NewNetworksize.insert(0, datasetroot.num_features)
        NewNetworkSizeAdjust.append(NewNetworksize[0:-1])
        print(NewNetworkSizeAdjust)
        # OptimizedNet.apply(init_weights)
        # OptimizedNet = DataParallel(OptimizedNet)
        OptimizedNet = OptimizedNet.to(device)
        cudnn.benchmark = True

        criterionNew = nn.CrossEntropyLoss()
        if optimizerName == "SGD":
            optimizerNew = getattr(optim, optimizerName)(
                OptimizedNet.parameters(), lr=params[3], momentum=0.9,
                weight_decay=5e-4)
        elif optimizerName == "Adam":
            optimizerNew = getattr(optim, optimizerName)(
                OptimizedNet.parameters(), lr=params[3], betas=(0.9, 0.999),
                eps=1e-08, weight_decay=5e-4, amsgrad=False)

        TrainConvergence, TestConvergence, TestAcc = TrainPart(
            modelName, datasetroot, start_epoch, num_epochs, trainloader,
            testloader, OptimizedNet, optimizerNew, criterionNew, NumCutoff,
            mark, True, model_to_save)
        np.save("{}/{}Convergence/TrainConvergence-{}".format(savepath, dataset, FileName),
                TrainConvergence)
        np.save("{}/{}Convergence/TestConvergence-{}".format(savepath, dataset, FileName),
                TestConvergence)
        np.save("{}/{}Convergence/NewNetworkSizeAdjust-{}".format(savepath, dataset, FileName),
                NewNetworkSizeAdjust)
        # np.save(savepath + 'TestConvergence-' + FileName, TestConvergence)
        # torch.cuda.empty_cache()
        print('dataset: {}, model name:{}, resized network size is {}, '
              'Number epoches:{}, Train error is: {}, Test error is: {}, '
              'test acc is {}\n'.format(dataset, modelName, NewNetworksize[0:-1],
                                        num_epochs, TrainConvergence[-1],
                                        TestConvergence[-1], TestAcc[-1]))
        TestAccs.append(TestAcc)
        np.save("{}/{}Convergence/MeanTestAccs-{}".format(savepath, dataset, FileName),
                TestAccs)

    print("The change of test error is:{}".format(TestAccs))
    print_nvidia_useage()
import os.path as osp

import torch
from torch.nn import Linear
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torch_geometric.datasets import PPI
import torch_geometric.transforms as T
from torch_geometric.nn import GCN2Conv
from torch_geometric.loader import DataLoader

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'GCN2_PPI')
pre_transform = T.Compose([T.GCNNorm(), T.ToSparseTensor()])
train_dataset = PPI(path, split='train', pre_transform=pre_transform)
val_dataset = PPI(path, split='val', pre_transform=pre_transform)
test_dataset = PPI(path, split='test', pre_transform=pre_transform)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


class Net(torch.nn.Module):
    def __init__(self, hidden_channels, num_layers, alpha, theta,
                 shared_weights=True, dropout=0.0):
        super(Net, self).__init__()
        self.lins = torch.nn.ModuleList()
        self.lins.append(Linear(train_dataset.num_features, hidden_channels))
        self.lins.append(Linear(hidden_channels, train_dataset.num_classes))
        self.convs = torch.nn.ModuleList()
def load_dataset(name):
    """
    Load real-world datasets, available in PyTorch Geometric.

    Used as a helper for DiskDataSource.
    """
    task = "graph"
    if name == "enzymes":
        dataset = TUDataset(root="/tmp/ENZYMES", name="ENZYMES")
    elif name == "proteins":
        dataset = TUDataset(root="/tmp/PROTEINS", name="PROTEINS")
    elif name == "cox2":
        dataset = TUDataset(root="/tmp/cox2", name="COX2")
    elif name == "aids":
        dataset = TUDataset(root="/tmp/AIDS", name="AIDS")
    elif name == "reddit-binary":
        dataset = TUDataset(root="/tmp/REDDIT-BINARY", name="REDDIT-BINARY")
    elif name == "imdb-binary":
        dataset = TUDataset(root="/tmp/IMDB-BINARY", name="IMDB-BINARY")
    elif name == "firstmm_db":
        dataset = TUDataset(root="/tmp/FIRSTMM_DB", name="FIRSTMM_DB")
    elif name == "dblp":
        dataset = TUDataset(root="/tmp/DBLP_v1", name="DBLP_v1")
    elif name == "ppi":
        dataset = PPI(root="/tmp/PPI")
    elif name == "qm9":
        dataset = QM9(root="/tmp/QM9")
    elif name == "atlas":
        dataset = [g for g in nx.graph_atlas_g()[1:] if nx.is_connected(g)]
    elif name == 'aifb':
        dataset = Entities(root="/tmp/aifb", name='AIFB')  # 90 edge types
    elif name == 'wn18':
        dataset = WordNet18(root="/tmp/wn18")
    elif name == 'fb15k237':
        dataset = [None]

    if task == "graph":
        train_len = int(0.8 * len(dataset))
        train, test = [], []
        if name not in ['aifb', 'wn18', 'fb15k237']:
            dataset = list(dataset)
            random.shuffle(dataset)
            has_name = hasattr(dataset[0], "name")
        else:
            has_name = True
        for i, graph in tqdm(enumerate(dataset)):
            if not type(graph) == nx.Graph:
                try:
                    if has_name:
                        del graph.name
                except:
                    pass
                if name == 'aifb':
                    graph = pyg_utils.to_networkx(graph, edge_attrs=['edge_type'])
                elif name == 'wn18':
                    graph = pyg_utils.to_networkx(graph, edge_attrs=['edge_type'])
                elif name == 'fb15k237':
                    data = FB15k_237()
                    (graph, _, _, _) = data.load()
                    graph = graph.to_networkx()
                    edge_type_dict = []
                    for j in graph.edges:
                        edge_type_dict.append(graph.edges[j]['label'])
                    edge_type_dict = {i: ind for ind, i in
                                      enumerate(sorted(set(edge_type_dict)))}
                    for j in graph.edges:
                        graph.edges[j]['edge_type'] = edge_type_dict[graph.edges[j]['label']]
                        del graph.edges[j]['label']
                        del graph.edges[j]['weight']
                else:
                    graph = pyg_utils.to_networkx(graph).to_undirected()
            if name in ('aifb', 'wn18', 'fb15k237'):
                # Knowledge-graph datasets: same graph on both sides.
                train.append(graph)
                test.append(deepcopy(graph))
            else:
                if i < train_len:
                    train.append(graph)
                else:
                    test.append(graph)
    return train, test, task
def process_inductive(dataset, gnn_type="GCNConv", K=None, random_init=False, runs=10):
    hyperparameters = get_hyperparameters()
    nb_epochs = hyperparameters["nb_epochs"]
    patience = hyperparameters["patience"]
    lr = hyperparameters["lr"]
    l2_coef = hyperparameters["l2_coef"]
    drop_prob = hyperparameters["drop_prob"]
    hid_units = hyperparameters["hid_units"]
    nonlinearity = hyperparameters["nonlinearity"]
    batch_size = hyperparameters["batch_size"]

    norm_features = torch_geometric.transforms.NormalizeFeatures()
    dataset_train = PPI("./geometric_datasets/" + dataset, split="train",
                        transform=norm_features)
    print(dataset_train)
    dataset_val = PPI("./geometric_datasets/" + dataset, split="val",
                      transform=norm_features)
    print(dataset_val)
    dataset_test = PPI("./geometric_datasets/" + dataset, split="test",
                       transform=norm_features)

    # DGI is trained unsupervised, on the train + val graphs.
    data = []
    for d in dataset_train:
        data.append(d)
    for d in dataset_val:
        data.append(d)

    ft_size = dataset_train[0].x.shape[1]
    nb_classes = dataset_train[0].y.shape[1]  # multilabel
    b_xent = nn.BCEWithLogitsLoss()

    loader_train = DataLoader(data, batch_size=hyperparameters["batch_size"],
                              shuffle=True)
    loader_test = DataLoader(dataset_test, batch_size=hyperparameters["batch_size"],
                             shuffle=False)

    all_accs = []
    for _ in range(runs):
        model = DGI(ft_size, hid_units, nonlinearity, update_rule=gnn_type,
                    batch_size=1, K=K)
        model_name = get_model_name(dataset, gnn_type, K, random_init=random_init)
        print(model)
        optimiser = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_coef)
        if torch.cuda.is_available():
            print('Using CUDA')
            model = model.cuda()
        model.train()
        torch.cuda.empty_cache()

        for epoch in range(20):
            if random_init:
                break
            total_loss = 0
            batch_id = 0
            model.train()
            loaded = list(loader_train)
            for batch in loaded:
                optimiser.zero_grad()
                if torch.cuda.is_available():  # was `torch.cuda.is_available` (missing call)
                    batch = batch.to('cuda')
                nb_nodes = batch.x.shape[0]
                features = batch.x
                labels = batch.y
                edge_index = batch.edge_index
                # Negative sample: dropout-corrupted features of a different batch.
                idx = np.random.randint(0, len(data))
                while idx == batch_id:
                    idx = np.random.randint(0, len(data))
                shuf_fts = torch.nn.functional.dropout(loaded[idx].x, drop_prob)
                edge_index2 = loaded[idx].edge_index
                lbl_1 = torch.ones(nb_nodes)
                lbl_2 = torch.zeros(shuf_fts.shape[0])
                lbl = torch.cat((lbl_1, lbl_2), 0)
                if torch.cuda.is_available():
                    shuf_fts = shuf_fts.cuda()
                    if edge_index2 is not None:
                        edge_index2 = edge_index2.cuda()
                    lbl = lbl.cuda()
                logits = model(features, shuf_fts, edge_index, batch=batch.batch,
                               edge_index_alt=edge_index2)
                loss = b_xent(logits, lbl)
                loss.backward()
                optimiser.step()
                batch_id += 1
                total_loss += loss.item()
            print(epoch, 'Train Loss:', total_loss / (len(dataset_train)))
            torch.save(model.state_dict(), './trained_models/' + model_name)

        torch.cuda.empty_cache()
        print('Loading last epoch')
        if not random_init:
            model.load_state_dict(torch.load('./trained_models/' + model_name))
        model.eval()

        b_xent_reg = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.25))
        train_embs, whole_train_data = preprocess_embeddings(model, dataset_train)
        val_embs, whole_val_data = preprocess_embeddings(model, dataset_val)
        test_embs, whole_test_data = preprocess_embeddings(model, dataset_test)

        for _ in range(50):
            log = LogReg(hid_units, nb_classes)
            opt = torch.optim.Adam(log.parameters(), lr=0.01, weight_decay=0.0)
            log.cuda()
            pat_steps = 0
            best = 1e9
            log.train()
            for _ in range(250):
                opt.zero_grad()
                logits = log(train_embs)
                loss = b_xent_reg(logits, whole_train_data.y)
                loss.backward()
                opt.step()

                log.eval()
                val_logits = log(val_embs)
                loss = b_xent_reg(val_logits, whole_val_data.y)
                if loss.item() < best:
                    best = loss.item()
                    pat_steps = 0
                if pat_steps >= 5:
                    break
                pat_steps += 1

            log.eval()
            logits = log(test_embs)
            preds = torch.sigmoid(logits) > 0.5
            f1 = sklearn.metrics.f1_score(whole_test_data.y.cpu(),
                                          preds.long().cpu(), average='micro')
            all_accs.append(float(f1))
            print()
            print('Micro-averaged f1:', f1)

    all_accs = torch.tensor(all_accs)
    with open("./results/" + model_name[:-4] + "_results.txt", "w") as f:
        f.writelines([str(all_accs.mean().item()) + '\n',
                      str(all_accs.std().item())])
    print(all_accs.mean())
    print(all_accs.std())
def prepare_data(self):
    path = osp.join(osp.dirname(osp.realpath(__file__)), "..", "..", "data", self.NAME)
    self.train_dataset = PPI(path, split="train")
    self.val_dataset = PPI(path, split="val")
    self.test_dataset = PPI(path, split="test")
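# prepare_data() above reads like a PyTorch Lightning DataModule hook; the
# matching loader hooks are not shown in this excerpt. A minimal sketch of
# what they might look like (batch sizes are assumptions):
from torch_geometric.loader import DataLoader

def train_dataloader(self):
    return DataLoader(self.train_dataset, batch_size=1, shuffle=True)

def val_dataloader(self):
    return DataLoader(self.val_dataset, batch_size=2, shuffle=False)

def test_dataloader(self):
    return DataLoader(self.test_dataset, batch_size=2, shuffle=False)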
def forward(self, x, edge_index):
    # Apply dropout to the input features as in the paper
    x = F.dropout(x, p=0., training=self.training)
    H1skip = self.Wskip(x)
    h_1_1 = self.prelu(self.gcn1(x, edge_index))
    h_1_2 = self.prelu(self.gcn2(h_1_1 + H1skip, edge_index))
    h_1_3 = self.prelu(self.gcn3(h_1_2 + H1skip, edge_index))
    x = self.prelu(h_1_3)
    return x


# Note: the original called PPI('train') / PPI('test'), which passes the split
# name as the *root directory*; since `split` defaults to 'train', both calls
# loaded the train split. Pass the split explicitly instead:
dataset = PPI('data/PPI', split='train')
test_set = PPI('data/PPI', split='test')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TODO: training on the first graph only!
# data = dataset[0].to(device)

# Used for other datasets
# def corruption(x, edge_index):
#     return x[torch.randperm(x.size(0))], edge_index

# Used for PPI
def corruption(x, edge_index):
    # Body missing in the original snippet; assumed to be the same
    # feature-shuffling corruption as the commented-out version above.
    return x[torch.randperm(x.size(0))], edge_index
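# A minimal sketch of how the encoder and corruption above are typically
# wired together with PyG's DeepGraphInfomax; `Encoder` and the hidden size
# are assumptions, not part of the original snippet:
from torch_geometric.nn import DeepGraphInfomax

dgi = DeepGraphInfomax(
    hidden_channels=512,
    encoder=Encoder(dataset.num_features, 512),  # hypothetical encoder class
    summary=lambda z, *args, **kwargs: torch.sigmoid(z.mean(dim=0)),
    corruption=corruption,
).to(device)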
def Load_Dataset(Name):
    # was osp.join(osp.realpath(__file__), ...), which joins onto the file
    # itself rather than its directory
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'PPIdataset')
    if Name == "PPI":
        Train_Dataset = PPI(path, split="train")
        Valid_Dataset = PPI(path, split="val")
        return Train_Dataset, Valid_Dataset