def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dataset = Planetoid(root='data', name='Cora',
                        transform=T.NormalizeFeatures())
    data = dataset[0]
    ground_truth_edge_index = data.edge_index.to(device)
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    data = data.to(device)

    model = Net(dataset.num_features, 64).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

    best_val_auc = test_auc = 0
    for epoch in range(1, 101):
        loss = train(data, model, optimizer)
        val_auc, tmp_test_auc = test(data, model)
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            test_auc = tmp_test_auc
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
              f'Test: {test_auc:.4f}')

    z = model.encode(data.x, data.train_pos_edge_index)
    final_edge_index = model.decode_all(z)
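# The `main` above relies on `Net`, `train` and `test` helpers defined
# elsewhere. A minimal sketch of the training step (an assumption, not the
# author's code): binary cross-entropy over the training positives plus an
# equal number of sampled negatives, as in the standard PyG link-prediction
# example. `model.decode` is assumed to return raw pair logits.
import torch
import torch.nn.functional as F
from torch_geometric.utils import negative_sampling

def train_sketch(data, model, optimizer):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.train_pos_edge_index)
    # Sample one negative edge per training positive.
    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index, num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))
    edge_index = torch.cat([data.train_pos_edge_index, neg_edge_index], dim=-1)
    logits = model.decode(z, edge_index)
    labels = torch.zeros(edge_index.size(1), device=logits.device)
    labels[:data.train_pos_edge_index.size(1)] = 1.
    loss = F.binary_cross_entropy_with_logits(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.item()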
def create_train_test_split_edges(self, stage=None):
    def get_link_labels(pos_edge_index, neg_edge_index):
        # Positive edges are labelled 1, sampled negative edges 0.
        link_labels = torch.zeros(
            pos_edge_index.size(1) + neg_edge_index.size(1)).float()
        link_labels[:pos_edge_index.size(1)] = 1.
        return link_labels

    def prepare_data(data, stage):
        x, pos_edge_index = data.x, getattr(data, f"{stage}_pos_edge_index")
        # Add self-loops before negative sampling so that (i, i) pairs can
        # never be drawn as negative examples.
        _edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index_with_self_loops, _ = add_self_loops(
            _edge_index, num_nodes=x.size(0))
        neg_edge_index = negative_sampling(
            edge_index=pos_edge_index_with_self_loops, num_nodes=x.size(0),
            num_neg_samples=pos_edge_index.size(1))
        link_labels = get_link_labels(pos_edge_index, neg_edge_index)
        Data = namedtuple(
            "Data", ["x", "pos_edge_index", "neg_edge_index", "link_labels"])
        return [Data(*[x, pos_edge_index, neg_edge_index, link_labels])]

    if not hasattr(self, "_loaded_dataset"):
        self.data.train_mask = self.data.val_mask = self.data.test_mask = self.data.y = None
        self._loaded_dataset = train_test_split_edges(self.data)
    data = prepare_data(self._loaded_dataset, stage)
    return THDataloader(data)
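# A minimal self-contained check (hypothetical tensors, not from the source)
# of the label layout produced by get_link_labels above: ones for the first
# pos_edge_index.size(1) entries, zeros for the sampled negatives, matching
# the concatenation order [positives, negatives].
import torch

pos = torch.tensor([[0, 1], [1, 2]])  # two positive edges
neg = torch.tensor([[0, 2], [2, 0]])  # two sampled negative edges
labels = torch.zeros(pos.size(1) + neg.size(1)).float()
labels[:pos.size(1)] = 1.
assert labels.tolist() == [1.0, 1.0, 0.0, 0.0]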
def test_gae():
    model = GAE(encoder=lambda x: x)
    model.reset_parameters()

    x = torch.Tensor([[1, -1], [1, 2], [2, 1]])
    z = model.encode(x)
    assert z.tolist() == x.tolist()

    # The inner-product decoder scores every pair via sigmoid(z @ z.T).
    adj = model.decoder.forward_all(z)
    assert adj.tolist() == torch.sigmoid(
        torch.Tensor([[+2, -1, +1], [-1, +5, +4], [+1, +4, +5]])).tolist()

    edge_index = torch.tensor([[0, 1], [1, 2]])
    value = model.decode(z, edge_index)
    assert value.tolist() == torch.sigmoid(torch.Tensor([-1, 4])).tolist()

    edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    data = Data(edge_index=edge_index)
    data.num_nodes = edge_index.max().item() + 1
    data = train_test_split_edges(data, val_ratio=0.2, test_ratio=0.3)

    z = torch.randn(11, 16)
    loss = model.recon_loss(z, data.train_pos_edge_index)
    assert loss.item() > 0

    auc, ap = model.test(z, data.val_pos_edge_index, data.val_neg_edge_index)
    assert auc >= 0 and auc <= 1 and ap >= 0 and ap <= 1
def train_val_test_split(self):
    x, edge_index = self.data.x, self.data.edge_index
    edge_index, _ = remove_self_loops(edge_index)
    edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
    data = deepcopy(self.data)
    data.edge_index = edge_index
    # train_pos_edge_index: [2, E * 0.85] (undirected)
    # val_neg/pos_edge_index: [2, E/2 * 0.05] (not undirected)
    # test_neg/pos_edge_index: [2, E/2 * 0.1] (not undirected)
    data = train_test_split_edges(data, *self.train_val_test_ratio[1:])
    data.__delattr__("train_neg_adj_mask")
    test_edge_index = torch.cat([
        to_undirected(data.test_pos_edge_index),
        to_undirected(data.test_neg_edge_index)
    ], dim=1)
    if data.val_pos_edge_index.size(1) > 0:
        val_edge_index = torch.cat([
            to_undirected(data.val_pos_edge_index),
            to_undirected(data.val_neg_edge_index)
        ], dim=1)
    else:
        # No validation edges were sampled; fall back to the test split.
        val_edge_index = test_edge_index
    return data.train_pos_edge_index, val_edge_index, test_edge_index
def __init__(self, feature):
    root = raw_path = 'data/{}'.format(feature)
    dataset = ScisciDataset(root=root, raw_path=raw_path,
                            transform=T.NormalizeFeatures())
    data = train_test_split_edges(dataset[0])
    self.feature = feature
    self.root = root
    self.dataset = dataset
    self.data = data
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    self.patience = 100
    self.num_epochs = 400
def test_train_test_split_edges():
    edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    data = Data(edge_index=edge_index)
    data.num_nodes = edge_index.max().item() + 1
    data = train_test_split_edges(data, val_ratio=0.2, test_ratio=0.3)

    assert data.val_pos_edge_index.size() == (2, 2)
    assert data.val_neg_edge_index.size() == (2, 2)
    assert data.test_pos_edge_index.size() == (2, 3)
    assert data.test_neg_edge_index.size() == (2, 3)
    assert data.train_pos_edge_index.size() == (2, 10)
    assert data.train_neg_adj_mask.size() == (11, 11)
    # Upper-triangular pairs: (11**2 - 11) / 2 = 55, minus val pos+neg
    # (2 + 2), test pos+neg (3 + 3), and the 5 undirected train positives.
    assert data.train_neg_adj_mask.sum().item() == (11**2 - 11) / 2 - 4 - 6 - 5
def construct_sparse(loader):
    """Construct sparse matrices in the graphs from the loader."""
    graphs = []
    for graph in loader:
        # Map each edge to a unique flat index (num_nodes * row + col)
        # so that its attribute can be recovered after splitting.
        graph.edge_id = dict(
            list(
                zip(
                    (graph.num_nodes * graph.edge_index[0] +
                     graph.edge_index[1]).numpy().squeeze(),
                    graph.edge_attr.squeeze().numpy(),
                )))
        graphs.append(
            data.make_sparse(
                train_test_split_edges(graph, val_ratio=0, test_ratio=0)))
    return graphs
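# A tiny self-contained illustration (an assumption for clarity, not from the
# source) of the flat edge-id scheme used in construct_sparse: edge (r, c) in
# a graph with num_nodes nodes maps to the unique integer num_nodes * r + c,
# which is reversible via divmod.
num_nodes = 4
r, c = 2, 3
flat_id = num_nodes * r + c  # 11
assert divmod(flat_id, num_nodes) == (r, c)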
def get_biograkn_data(edge_csv_file_path, nodes_labs_file_path, task='node',
                      features="ones", val_ratio=0.3, test_ratio=0.3):
    '''Returns a PyTorch Geometric `Data` object built from the provided file paths.

    Args:
        edge_csv_file_path: path to a csv file where nodes and hyperrelations
            are columns and traversed graph paths are rows
        nodes_labs_file_path: path to a two-column csv file with node ids and
            their labels
        task: 'node' for node classification or 'link' for link prediction
        features: 'ones' for constant features, 'labels' for one-hot encoded
            label features
        val_ratio: percentage ratio for the validation data mask
        test_ratio: percentage ratio for the test data mask
    '''
    edge_index_df = pd.read_csv(edge_csv_file_path)
    nodes_labels = pd.read_csv(nodes_labs_file_path)
    edge_index = torch.tensor(
        [edge_index_df["source"], edge_index_df["target"]], dtype=torch.long)
    y = torch.tensor(nodes_labels["label"], dtype=torch.long)
    num_nodes = len(y)

    if features == "ones":
        x = torch.ones(num_nodes, 10)
    elif features == 'labels':
        encoder = OneHotEncoder(categories='auto')
        features = nodes_labels["label"].values.reshape(-1, 1)
        features = encoder.fit_transform(features)
        x = torch.tensor(features.toarray(), dtype=torch.float32)

    if task == 'node':
        dataset_masks = create_masks(nodes_labels["node"], val_ratio,
                                     test_ratio)
        num_classes = torch.unique(y).size(0)
        data = Data(y=y, x=x, edge_index=edge_index, num_nodes=num_nodes,
                    test_mask=dataset_masks["test"],
                    train_mask=dataset_masks["train"],
                    val_mask=dataset_masks["validation"],
                    num_classes=num_classes)
    elif task == 'link':
        data = Data(y=y, x=x, edge_index=edge_index, num_nodes=num_nodes)
        data = train_test_split_edges(data)
    else:
        raise RuntimeError('Unknown task.')
    return data
def do_edge_split(dataset):
    data = dataset[0]
    data = train_test_split_edges(data)

    # Sample one negative training edge per positive one; self-loops are
    # added first so they cannot be drawn as negatives.
    edge_index, _ = add_self_loops(data.train_pos_edge_index)
    data.train_neg_edge_index = negative_sampling(
        edge_index, num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))

    split_edge = {'train': {}, 'valid': {}, 'test': {}}
    split_edge['train']['edge'] = data.train_pos_edge_index.t()
    split_edge['train']['edge_neg'] = data.train_neg_edge_index.t()
    split_edge['valid']['edge'] = data.val_pos_edge_index.t()
    split_edge['valid']['edge_neg'] = data.val_neg_edge_index.t()
    split_edge['test']['edge'] = data.test_pos_edge_index.t()
    split_edge['test']['edge_neg'] = data.test_neg_edge_index.t()
    return split_edge
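# A minimal usage sketch for the do_edge_split above (assumes the imports used
# by that function are in scope); the returned dict mirrors the OGB
# `split_edge` layout, with edges stored row-wise as [num_edges, 2] tensors.
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='data', name='Cora')
split_edge = do_edge_split(dataset)
print(split_edge['train']['edge'].shape)      # torch.Size([num_train_pos, 2])
print(split_edge['valid']['edge_neg'].shape)  # torch.Size([num_val_neg, 2])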
def do_edge_split(dataset, fast_split=False, val_ratio=0.05, test_ratio=0.1):
    data = dataset[0]
    random.seed(234)
    torch.manual_seed(234)

    if not fast_split:
        data = train_test_split_edges(data, val_ratio, test_ratio)
        edge_index, _ = add_self_loops(data.train_pos_edge_index)
        data.train_neg_edge_index = negative_sampling(
            edge_index, num_nodes=data.num_nodes,
            num_neg_samples=data.train_pos_edge_index.size(1))
    else:
        num_nodes = data.num_nodes
        row, col = data.edge_index
        # Return upper triangular portion.
        mask = row < col
        row, col = row[mask], col[mask]
        n_v = int(math.floor(val_ratio * row.size(0)))
        n_t = int(math.floor(test_ratio * row.size(0)))
        # Positive edges.
        perm = torch.randperm(row.size(0))
        row, col = row[perm], col[perm]
        r, c = row[:n_v], col[:n_v]
        data.val_pos_edge_index = torch.stack([r, c], dim=0)
        r, c = row[n_v:n_v + n_t], col[n_v:n_v + n_t]
        data.test_pos_edge_index = torch.stack([r, c], dim=0)
        r, c = row[n_v + n_t:], col[n_v + n_t:]
        data.train_pos_edge_index = torch.stack([r, c], dim=0)
        # Negative edges (cannot guarantee (i, j) and (j, i) won't both appear).
        neg_edge_index = negative_sampling(data.edge_index,
                                           num_nodes=num_nodes,
                                           num_neg_samples=row.size(0))
        data.val_neg_edge_index = neg_edge_index[:, :n_v]
        data.test_neg_edge_index = neg_edge_index[:, n_v:n_v + n_t]
        data.train_neg_edge_index = neg_edge_index[:, n_v + n_t:]

    split_edge = {'train': {}, 'valid': {}, 'test': {}}
    split_edge['train']['edge'] = data.train_pos_edge_index.t()
    split_edge['train']['edge_neg'] = data.train_neg_edge_index.t()
    split_edge['valid']['edge'] = data.val_pos_edge_index.t()
    split_edge['valid']['edge_neg'] = data.val_neg_edge_index.t()
    split_edge['test']['edge'] = data.test_pos_edge_index.t()
    split_edge['test']['edge_neg'] = data.test_neg_edge_index.t()
    return split_edge
def train_test_split(self, x, edge_index, ratio):
    edge_index, _ = remove_self_loops(edge_index)
    edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
    data = deepcopy(self.data)
    data.edge_index = edge_index
    data.x = x
    data = train_test_split_edges(data, 0, ratio)
    data.__delattr__("train_neg_adj_mask")
    test_edge_index = torch.cat([
        to_undirected(data.test_pos_edge_index),
        to_undirected(data.test_neg_edge_index)
    ], dim=1)
    return data.train_pos_edge_index, test_edge_index
def test_lp_trainer():
    dataset = build_dataset_from_name("cora")
    dataset = to_pyg_dataset(dataset)
    data = dataset[0]
    data = train_test_split_edges(data, 0.1, 0.1)
    dataset = [data]
    lp_trainer = LinkPredictionTrainer(model='gcn', init=False)
    lp_trainer.num_features = data.x.size(1)
    lp_trainer.initialize()
    print(lp_trainer.encoder.encoder)
    print(lp_trainer.decoder.decoder)
    lp_trainer.train(dataset, True)
    result = lp_trainer.evaluate(dataset, "test", "auc")
    print(result)
def main(training_method, dataset_name, dataset_dir, gpu_id, seed):
    torch.manual_seed(seed)
    device = get_device(gpu_id)
    dataset = get_data(dataset_name, dataset_dir)
    features_dimension = dataset.num_features
    data = dataset[0].to(device)
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data)
    x, train_pos_edge_index = data.x, data.train_pos_edge_index
    model, optimizer = get_model_and_optimizer(training_method, dataset_name,
                                               features_dimension, device)
    max_epoch = 201 if dataset_name == 'citeseer' else 401
    for epoch in range(1, max_epoch):
        train(model, optimizer, x, train_pos_edge_index)
        auc, ap = test(model, x, train_pos_edge_index,
                       data.test_pos_edge_index, data.test_neg_edge_index)
        print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))
def _train_test_split_edges(data, random_seed=12345):
    import pytorch_lightning as pl
    from torch_geometric.utils import train_test_split_edges
    import random
    import numpy as np
    import torch

    # Save the current RNG states, seed everything so the split is
    # reproducible, then restore the states so the caller's randomness
    # is left untouched.
    rand_state = random.getstate()
    np_state = np.random.get_state()
    torch_state = torch.random.get_rng_state()
    pl.seed_everything(random_seed)
    data = train_test_split_edges(data)
    random.setstate(rand_state)
    np.random.set_state(np_state)
    torch.random.set_rng_state(torch_state)
    return data
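# A small sanity check for the helper above (a sketch; assumes `data` is a
# PyG `Data` object with `edge_index` and `num_nodes` set): the same seed
# produces the same split, and the caller's RNG streams are left unchanged.
import torch

split_a = _train_test_split_edges(data.clone(), random_seed=12345)
split_b = _train_test_split_edges(data.clone(), random_seed=12345)
assert torch.equal(split_a.val_pos_edge_index, split_b.val_pos_edge_index)
assert torch.equal(split_a.test_neg_edge_index, split_b.test_neg_edge_index)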
def process(self):
    random.seed(12345)
    torch.manual_seed(12345)

    data = train_test_split_edges(self.data)
    edge_index, _ = add_self_loops(data.train_pos_edge_index)
    data.train_neg_edge_index = negative_sampling(
        edge_index, num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1))

    self.__max_z__ = 0

    # Collect a list of subgraphs for training, validation and test.
    train_pos_list = self.extract_enclosing_subgraphs(
        data.train_pos_edge_index, data.train_pos_edge_index, 1)
    train_neg_list = self.extract_enclosing_subgraphs(
        data.train_neg_edge_index, data.train_pos_edge_index, 0)
    val_pos_list = self.extract_enclosing_subgraphs(
        data.val_pos_edge_index, data.train_pos_edge_index, 1)
    val_neg_list = self.extract_enclosing_subgraphs(
        data.val_neg_edge_index, data.train_pos_edge_index, 0)
    test_pos_list = self.extract_enclosing_subgraphs(
        data.test_pos_edge_index, data.train_pos_edge_index, 1)
    test_neg_list = self.extract_enclosing_subgraphs(
        data.test_neg_edge_index, data.train_pos_edge_index, 0)

    # Convert labels to one-hot features.
    for data in chain(train_pos_list, train_neg_list, val_pos_list,
                      val_neg_list, test_pos_list, test_neg_list):
        data.x = F.one_hot(data.z, self.__max_z__ + 1).to(torch.float)

    torch.save(self.collate(train_pos_list + train_neg_list),
               self.processed_paths[0])
    torch.save(self.collate(val_pos_list + val_neg_list),
               self.processed_paths[1])
    torch.save(self.collate(test_pos_list + test_neg_list),
               self.processed_paths[2])
def test_train_test_split_edges():
    edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    edge_attr = torch.arange(edge_index.size(1))
    data = Data(edge_index=edge_index, edge_attr=edge_attr)
    data.num_nodes = edge_index.max().item() + 1

    with pytest.warns(UserWarning, match='deprecated'):
        data = train_test_split_edges(data, val_ratio=0.2, test_ratio=0.3)

    assert len(data) == 10
    assert data.val_pos_edge_index.size() == (2, 2)
    assert data.val_neg_edge_index.size() == (2, 2)
    assert data.test_pos_edge_index.size() == (2, 3)
    assert data.test_neg_edge_index.size() == (2, 3)
    assert data.train_pos_edge_index.size() == (2, 10)
    assert data.train_neg_adj_mask.size() == (11, 11)
    assert data.train_neg_adj_mask.sum().item() == (11**2 - 11) / 2 - 4 - 6 - 5
    assert data.train_pos_edge_attr.size() == (10, )
    assert data.val_pos_edge_attr.size() == (2, )
    assert data.test_pos_edge_attr.size() == (3, )
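# As the warning asserted above indicates, train_test_split_edges is
# deprecated in recent PyTorch Geometric releases in favor of the
# RandomLinkSplit transform. A minimal sketch of the replacement, which
# returns three Data objects instead of attaching *_pos/neg_edge_index
# attributes:
import torch
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit

edge_index = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                           [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
data = Data(edge_index=edge_index, num_nodes=11)
transform = RandomLinkSplit(num_val=0.2, num_test=0.3, is_undirected=False)
train_data, val_data, test_data = transform(data)
# Supervision edges and their 0/1 labels live in edge_label_index/edge_label.
print(val_data.edge_label_index.size(), val_data.edge_label.size())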
def __init__(self, model_type='GCN', text_encoding='bert'):
    """Class for training N times and computing the results.

    Args:
        model_type: model definition; one of {"GCN", "SAGE", "GIN", "GAT",
            "AGNN", "GraphUNet"} (default "GCN")
        text_encoding: text representation; one of {"bert", "tfidf", "d2v"}
            (default "bert")
    """
    root = 'data/{}'.format(text_encoding)
    raw_path = '../../data/torch/{}/'.format(text_encoding)
    dataset = ScisciDataset(root=root, raw_path=raw_path,
                            transform=T.NormalizeFeatures())
    data = train_test_split_edges(dataset[0])
    self.model_type = model_type
    self.text_encoding = text_encoding
    self.root = root
    self.dataset = dataset
    self.data = data
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    self.patience = 100
    self.num_epochs = 450
def run(file, data_name, model_name, lr):
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--use_nd', action='store_true')
    parser.add_argument('--use_lgae', action='store_true')
    parser.add_argument('--use_vgae', action='store_true')
    parser.add_argument('--model', type=str, default='')
    parser.add_argument('--dataset', type=str, default='Citeseer')
    args = parser.parse_args()

    if data_name is not None and model_name is not None and lr is not None:
        args.dataset = data_name
        args.model = model_name
        args.lr = lr
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = CitationFull(os.path.join('citation_data', args.dataset),
                           name=args.dataset, transform=T.ToSparseTensor())
    num_training = int(dataset.__len__() * 0.8)
    num_val = int(dataset.__len__() * 0.1)
    num_test = dataset.__len__() - (num_training + num_val)
    data = dataset[0]
    print('data:', vars(data))
    adj_t = data.adj_t.to(device)
    edge_index, edge_type = utils.dense_to_sparse(adj_t.to_dense())
    data.edge_index = edge_index
    data.x = data.x.to(device)
    split_edge = utils.train_test_split_edges(data)
    split_edge.edge_index = edge_index
    print(data)
    print(edge_index.shape)

    decoder_enable = args.model[-3:]
    if args.model[-3:] == '-nd':
        model_name = args.model[:-3]
    if model_name == 'lgae':
        model = LGAE(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'vgae':
        model = DeepVGAE(data.num_features, args.hidden_channels,
                         args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'gae':
        model = GraphAutoEncoder(data.num_features, args.hidden_channels,
                                 args.hidden_channels, args.num_layers,
                                 args.dropout)
    elif model_name == 'arga':
        model = AdversarialGAE(data.num_features, args.hidden_channels,
                               args.hidden_channels, args.num_layers,
                               args.dropout)
    elif model_name == 'arvga':
        model = AdversarialVGAE(data.num_features, args.hidden_channels,
                                args.hidden_channels, args.num_layers,
                                args.dropout)
    elif model_name == 'lrga':
        model = LRGA(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'sage':
        model = SAGEAutoEncoder(data.num_features, args.hidden_channels,
                                args.hidden_channels, args.num_layers,
                                args.dropout)
    if decoder_enable == '-nd':
        model.decoder = NeuralDecoder(args.hidden_channels,
                                      args.hidden_channels, 1,
                                      args.num_layers, args.dropout)

    evaluator = Evaluator(name='ogbl-ddi')
    model = model.to(device)
    loggers = {'metrics': Logger(args.runs, args)}

    for run in range(args.runs):
        torch.manual_seed(run)
        model.reset_parameters()
        if args.model in ['arga', 'arga-nd', 'arvga', 'arvga-nd']:
            args.lr = 0.005
        optimizer = torch.optim.Adam(list(model.parameters()), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x, adj_t, split_edge, optimizer,
                         args.batch_size)
            result = test(model, data.x, data, split_edge, evaluator,
                          args.batch_size)
            loggers['metrics'].add_result(run, result)

    for key in loggers.keys():
        print(key)
        toWrite = loggers[key].print_statistics()
        file.write(args.model + '\t' + '\t'.join(toWrite) + '\n')
        file.flush()
        os.fsync(file)
def data_processor(data, undirected=True, val_ratio=0.05, test_ratio=0.1):
    '''Prepares graph data for PyTorch Geometric.

    Parameters:
        data (torch_geometric.data.Data): the graph data.

    Returns:
        all_pos_edge_index (torch.Tensor[2, num_pos_edges]): all links before
            the train/test split.
        train_pos_edge_adj_t (torch.SparseTensor[2, num_pos_edges]): the
            training links.
        y_true (numpy.ndarray[num_nodes, num_nodes].flatten()): the flattened
            adjacency matrix of all links.
        y_train (numpy.ndarray[num_nodes, num_nodes].flatten()): the flattened
            adjacency matrix of the training links.
        mask (numpy.ndarray[num_nodes, num_nodes].flatten()): a flattened
            adjacency matrix that is False for links sampled as validation or
            test pos/neg edges and True everywhere else.
    '''
    # Keep a copy of the edge tensor before running train_test_split_edges.
    all_pos_edge_index = data.edge_index
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    data = train_test_split_edges(data, val_ratio=val_ratio,
                                  test_ratio=test_ratio)
    if (val_ratio + test_ratio == 1) and (data.train_pos_edge_index.size(1) > 0):
        data.test_pos_edge_index = torch.cat(
            [data.test_pos_edge_index, data.train_pos_edge_index], dim=-1)
        data.train_pos_edge_index = torch.LongTensor([[], []])
    print('train test split has been done.')
    print(data)
    print('')

    # Build the SparseTensor of edges passed to GCN2Conv.
    # Reference: https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/transforms/to_sparse_tensor.html#ToSparseTensor
    # Note that only the edges extracted into train_pos_edge_index are
    # converted, not the full edge_index.
    (row, col), N, E = (data.train_pos_edge_index, data.num_nodes,
                        data.train_pos_edge_index.size(1))
    perm = (col * N + row).argsort()
    row, col = row[perm], col[perm]
    value = None
    for key in ['edge_weight', 'edge_attr', 'edge_type']:
        if data[key] is not None:
            value = data[key][perm]
            break
    for key, item in data:
        if item.size(0) == E:
            data[key] = item[perm]
    train_pos_edge_adj_t = SparseTensor(row=col, col=row, value=value,
                                        sparse_sizes=(N, N), is_sorted=True)
    print('train_pos_edge_adj_t is completed.\n')

    # 1. All edges.
    edge = pd.DataFrame(all_pos_edge_index.cpu().numpy().T,
                        columns=['source', 'target'])
    G = nx.from_pandas_edgelist(edge, create_using=nx.Graph())
    # Build the adjacency matrix.
    df_adj = pd.DataFrame(
        np.zeros([len(G.nodes()), len(G.nodes())]), index=G.nodes(),
        columns=G.nodes()).sort_index(axis=0).sort_index(axis=1)
    for i, j in G.edges():
        df_adj.loc[i, j] = 1
    y_true = torch.tensor(df_adj.to_numpy().flatten(), dtype=torch.float)
    print('y_true is completed.\n')

    # 2. Edges used for training.
    edge = pd.DataFrame(data.train_pos_edge_index.cpu().numpy().T,
                        columns=['source', 'target'])
    G_train = nx.from_pandas_edgelist(edge, create_using=nx.Graph())
    # Build the adjacency matrix.
    df_adj_train = pd.DataFrame(
        np.zeros([len(G.nodes()), len(G.nodes())]), index=G.nodes(),
        columns=G.nodes()).sort_index(axis=0).sort_index(axis=1)
    for i, j in G_train.edges():
        df_adj_train.loc[i, j] = 1
    y_train = torch.tensor(df_adj_train.to_numpy().flatten(),
                           dtype=torch.float)
    print('y_train is completed.\n')

    # Zero entries of the adjacency matrix include the positive and negative
    # edges used for validation and testing. Build a mask that removes those
    # edges from the loss computation.
    val_test_edge = torch.cat([
        data.test_neg_edge_index, data.test_pos_edge_index,
        data.val_neg_edge_index, data.val_pos_edge_index
    ], dim=-1)
    mask = torch.ones([data.x.size(0), data.x.size(0)], dtype=torch.float)
    for i in range(val_test_edge.size(1)):
        mask[val_test_edge[0, i], val_test_edge[1, i]] = 0
        mask[val_test_edge[1, i], val_test_edge[0, i]] = 0
    mask = mask.flatten()

    return all_pos_edge_index, train_pos_edge_adj_t, y_true, y_train, mask
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = None
if args.dataset.lower() == 'Cora'.lower():
    dataset = Planetoid(root='tmp', name='Cora')
    print("use dataset: Cora")
elif args.dataset.lower() == 'CiteSeer'.lower():
    dataset = Planetoid(root='tmp', name='CiteSeer')
    print("use dataset: CiteSeer")
elif args.dataset.lower() == 'PubMed'.lower():
    dataset = Planetoid(root='tmp', name='PubMed')
    print("use dataset: PubMed")

data = dataset[0]
enhanced_data = train_test_split_edges(data.clone(), val_ratio=0.1,
                                       test_ratio=0.2)
train_data = Data(x=enhanced_data.x,
                  edge_index=enhanced_data['train_pos_edge_index']).to(DEVICE)
target_data = data.to(DEVICE)

if args.model == 'VGAE':
    model = VGAE(encoder=VEncoder(data['x'].shape[1])).to(DEVICE)
else:
    model = GAE(encoder=Encoder(data['x'].shape[1])).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                             weight_decay=5e-4)
device = args.device = "cpu" else: device = args.device = f"cuda:{args.device}" dataset = Planetoid(osp.expanduser('~/.cache-autogl'), args.dataset, transform=T.NormalizeFeatures()) res = [] begin_time = time.time() for seed in tqdm(range(1234, 1234 + args.repeat)): setup_seed(seed) data = dataset[0].to(device) # use train_test_split_edges to create neg and positive edges data.train_mask = data.val_mask = data.test_mask = data.y = None data = train_test_split_edges(data).to(device) model_hp, decoder_hp = get_encoder_decoder_hp(args.model) trainer = LinkPredictionTrainer( model=args.model, num_features=data.x.size(1), lr=1e-2, max_epoch=100, early_stopping_round=101, weight_decay=0.0, device=args.device, feval=[Auc], loss="binary_cross_entropy_with_logits", init=False).duplicate_from_hyper_parameter( {
def galTrainer(model, data: torch_geometric.data.Data):
    """Trains the model for the required number of epochs/patience.

    Parameters
    ----------
    model: Model
    data: torch_geometric.data.Data

    Returns
    -------
    model: Model
    model_log: str
    test_acc: torch.Tensor
    """
    # Lambda values according to the best results reported in the GAL paper.
    if model.dataset_name == "CITESEER":
        lambda_param = 0.75
        use_ws_loss = False
    elif model.dataset_name == "CORA":
        # Default parameter - not reported in the paper.
        lambda_param = 0.05
        use_ws_loss = True
    elif model.dataset_name == "PUBMED":
        lambda_param = 0.5
        use_ws_loss = False
    else:
        lambda_param = 0.05
        use_ws_loss = True

    # Train/validation/test split.
    data = train_test_split_edges(data)
    optimizer, optimizer_attack, optimizer_fine_tune = create_gal_optimizer(
        model=model, lambda_reg=lambda_param)
    train_epochs = 250
    fine_tune_epochs = 800
    switch = True
    for epoch in range(1, train_epochs + 1):
        train_acc = train(model=model, optimizer=optimizer,
                          optimizer_attack=optimizer_attack, data=data,
                          switch=switch, use_ws_loss=use_ws_loss)
        switch = not switch
        # start of changes XXXXX
        log_template = ('Regular Epoch: {:03d}, Train: {:.4f}, '
                        'Val: {:.4f}, Test: {:.4f}')
        val_acc, test_acc = test(model, data)
        print(log_template.format(epoch, train_acc, val_acc, test_acc),
              flush=True)
        print(flush=True)
        # end of changes XXXXX

    best_val_acc = test_acc = 0
    for epoch in range(1, fine_tune_epochs + 1):
        train_attr(model=model, optimizer_attr=optimizer_fine_tune, data=data)
        train_acc, val_acc, tmp_test_acc = test_attr(model=model, data=data)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            test_acc = tmp_test_acc
        log = 'Finetune Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
        print(log.format(epoch, train_acc, val_acc, tmp_test_acc))
        print(flush=True)

    model_log = 'Basic Model - Train: {:.4f}, Val: {:.4f}, Test: {:.4f}' \
        .format(train_acc, best_val_acc, test_acc)
    # `test_acc` holds the test accuracy recorded at the best validation epoch.
    return model, model_log, test_acc
def main(config_path):
    # Settings.
    conf = load_config(config_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    base_dir = conf['base_dir']
    strengths_path = base_dir + conf['strengths_path']

    # Load data.
    df = pd.read_csv(strengths_path)
    num_dict = make_dict(df)
    fm = make_feature_matrix(df, num_dict, **conf.feature_matrix)
    G = make_graph(df, num_dict, **conf.graph)

    preprocess_G = G.copy()
    for (u, v, d) in G.edges(data=True):
        if d["weight"] <= 6:  # Drop edges with weight 6 or lower.
            preprocess_G.remove_edge(u, v)
    G = preprocess_G

    # Without this mapping table we cannot convert between torch geometric
    # node ids and employee names.
    mapping = {num: num_dict['syain']['num_str'][i]
               for num, i in enumerate(G.nodes)}
    # Execution pauses here while the plot is displayed.
    plot_graph(G, mapping, conf.save_pos)
    mapping_swap = {v: k for k, v in mapping.items()}

    data = from_networkx(G)
    # Feature matrix.
    data.x = fm
    # Normalization.
    transform = T.NormalizeFeatures()
    data = transform(data)

    # GAE.
    model = kwargs[conf.gae.model](
        Encoder(data.x.shape[1], conf.gae.dim,
                model=conf.gae.model)).to(device)
    data.train_mask = data.val_mask = data.test_mask = data.y = None
    # Here GAE ends up being run on the full data.
    data = train_test_split_edges(data)
    x = data.x.to(device)
    train_pos_edge_index = data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train():
        """Train for one epoch."""
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        if conf.gae.model in ['VGAE']:
            loss = loss + (1 / data.num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()
        return loss.item()

    def test(pos_edge_index, neg_edge_index):
        """Evaluate AUC/AP on the given edges."""
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            return model.test(z, pos_edge_index, neg_edge_index)

    # Run training.
    print("start training")
    for epoch in range(1, conf.num_iter + 1):
        loss = train()
        auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
        if epoch % (conf.num_iter // 10) == 0:
            print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, '
                  f'AUC: {auc:.4f}, AP: {ap:.4f}')

    @torch.no_grad()
    def plot_points():
        """Compress the learned embeddings to 2D with TSNE and visualize them."""
        model.eval()
        res = model.encode(x, train_pos_edge_index)
        z = TSNE(n_components=2).fit_transform(res.cpu().numpy())
        plt.figure(figsize=(8, 8))
        plt.scatter(z[:, 0], z[:, 1], s=20)
        for num, [x_pos, y_pos] in enumerate(z):
            label = mapping[num]
            plt.annotate(label, (x_pos, y_pos), size=10, ha='center',
                         fontproperties=font_prop)
        plt.axis('off')
        plt.show()
        return res.cpu().numpy()

    # 2D compression and visualization.
    res = plot_points()
    if conf.save_res:
        res_vec_save_path = base_dir + conf['res_vec_path']
        df_res = pd.DataFrame(res)
        df_res.to_csv(res_vec_save_path)
    if conf.save_res:
        res_cos_save_path = base_dir + conf['res_cos_path']
        df_res = pd.DataFrame(cos_sim_matrix(res))
        df_res.to_csv(res_cos_save_path)
def perturb_edges(data, name, remove_pct, add_pct, hidden_channels=16,
                  epochs=400):
    if remove_pct == 0 and add_pct == 0:
        return
    try:
        cached = pickle.load(
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'rb'))
        print(f'Use cached edge augmentation for dataset {name}')
        if data.setting == 'inductive':
            data.train_edge_index = cached
        else:
            data.edge_index = cached
        return
    except FileNotFoundError:
        try:
            A_pred, adj_orig = pickle.load(
                open(f'{ROOT}/cache/edge/{name}.pt', 'rb'))
            A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
            data.edge_index, _ = from_scipy_sparse_matrix(A)
            pickle.dump(
                data.edge_index,
                open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt',
                     'wb'))
            return
        except FileNotFoundError:
            print(f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! '
                  f'Regenerating it now')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if data.setting == 'inductive':
        train_data = Data(x=data.train_x, ori_x=data.ori_x,
                          edge_index=data.train_edge_index, y=data.train_y)
    else:
        train_data = deepcopy(data)

    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data, val_ratio=0.1,
                                        test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels))
    model = model.to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
        auc, ap = model.test(z, train_data.val_pos_edge_index,
                             train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(
            epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # Note: best_z tracks the best-validation embedding, but the
    # final-epoch z is what gets used below.
    A_pred = torch.sigmoid(torch.mm(z, z.T)).cpu().numpy()
    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    if data.setting == 'inductive':
        data.train_edge_index, _ = from_scipy_sparse_matrix(adj_pred)
    else:
        data.edge_index, _ = from_scipy_sparse_matrix(adj_pred)

    pickle.dump((A_pred, adj_orig), open(f'{ROOT}/cache/edge/{name}.pt', 'wb'))
    if data.setting == 'inductive':
        pickle.dump(
            data.train_edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
    else:
        pickle.dump(
            data.edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
from sklearn.metrics.cluster import (v_measure_score, homogeneity_score,
                                     completeness_score)
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, ARGVA
from torch_geometric.utils import train_test_split_edges

dataset = 'Cora'
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
dataset = Planetoid(path, dataset, T.NormalizeFeatures())
data = dataset.get(0)
data.train_mask = data.val_mask = data.test_mask = None
data = train_test_split_edges(data)


class Encoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=True)
        self.conv_mu = GCNConv(hidden_channels, out_channels, cached=True)
        self.conv_logvar = GCNConv(hidden_channels, out_channels, cached=True)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)


class Discriminator(torch.nn.Module):
from model import DeepVGAE
from config.config import parse_args

torch.manual_seed(12345)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args = parse_args()

model = DeepVGAE(args).to(device)
optimizer = Adam(model.parameters(), lr=args.lr)

os.makedirs("datasets", exist_ok=True)
dataset = Planetoid("datasets", args.dataset, transform=T.NormalizeFeatures())
data = dataset[0].to(device)
all_edge_index = data.edge_index
data = train_test_split_edges(data, 0.05, 0.1)

for epoch in range(args.epoch):
    model.train()
    optimizer.zero_grad()
    loss = model.loss(data.x, data.train_pos_edge_index, all_edge_index)
    loss.backward()
    optimizer.step()
    if epoch % 2 == 0:
        model.eval()
        roc_auc, ap = model.single_test(data.x,
                                        data.train_pos_edge_index,
                                        data.test_pos_edge_index,
                                        data.test_neg_edge_index)
        print("Epoch {} - Loss: {} ROC_AUC: {} Precision: {}".format(
            epoch, loss.cpu().item(), roc_auc, ap))
def split_edges(dataset, train_ratio, val_ratio):
    datas = [data for data in dataset]
    for i in range(len(datas)):
        # The test ratio is whatever remains after train and validation.
        datas[i] = train_test_split_edges(datas[i], val_ratio,
                                          1 - train_ratio - val_ratio)
    dataset.data, dataset.slices = dataset.collate(datas)
def get_data_model(custom_dataset, task, random_seed, sparse_matrix,
                   feature_matrix, initialize_spectral, encoder_base,
                   n_hidden, n_layers, discriminator_layers, ae_type, bias,
                   attention_heads, decoder_type, use_mincut, K, Niter,
                   val_ratio, test_ratio, interpret=False,
                   prediction_column=-1):
    assert custom_dataset in ['lawyer', 'physician', 'none']
    assert task in ['link_prediction', 'generation', 'clustering',
                    'embedding', 'classification', 'regression']
    print("Random Seed:", random_seed)
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    random.seed(random_seed)

    if custom_dataset != 'none':
        sparse_matrix = DATA[custom_dataset]['A']
        feature_matrix = DATA[custom_dataset]['X']

    if (isinstance(sparse_matrix, str) and os.path.exists(sparse_matrix)
            and sparse_matrix.split('.')[-1] in ['npz', 'csv']):
        if sparse_matrix.endswith('.csv'):
            sparse_matrix = sps.csr_matrix(pd.read_csv(sparse_matrix).values)
        else:
            sparse_matrix = sps.load_npz(sparse_matrix)
    elif not sps.issparse(sparse_matrix):
        sparse_matrix = sps.csr_matrix(sparse_matrix)

    if (isinstance(feature_matrix, str) and os.path.exists(feature_matrix)
            and feature_matrix.split('.')[-1] in ['npy', 'csv']):
        if feature_matrix.endswith('.csv'):
            X = pd.read_csv(feature_matrix).values.astype(float)
        else:
            X = np.load(feature_matrix, allow_pickle=True).astype(float)
    elif isinstance(feature_matrix, type(None)):
        if initialize_spectral:
            from sklearn.manifold import SpectralEmbedding
            X = SpectralEmbedding(
                n_components=3, affinity="precomputed",
                random_state=42).fit_transform(sparse_matrix)
        else:
            X = np.ones(sparse_matrix.shape[0], dtype=float)[:, np.newaxis]
    else:
        X = feature_matrix

    y = None
    idx_df = None
    label_encoder = None
    n_classes = -1
    if task in ['classification', 'regression']:
        X = pd.DataFrame(X)
        assert prediction_column >= 0
        prediction_column = X.columns.values[prediction_column]
        y = X.pop(prediction_column).values.flatten()
        X = X.values
        idx_df = pd.DataFrame(dict(idx=np.arange(len(y)), y=y))
        idx_df_train, idx_df_test = train_test_split(
            idx_df, test_size=test_ratio,
            stratify=idx_df['y'] if task == 'classification' else None,
            random_state=random_seed)
        idx_df_train, idx_df_val = train_test_split(
            idx_df_train, test_size=val_ratio,
            stratify=idx_df_train['y'] if task == 'classification' else None,
            random_state=random_seed)
        idx_df_train['set'] = 'train'
        idx_df_val['set'] = 'val'
        idx_df_test['set'] = 'test'
        idx_df = pd.concat([idx_df_train, idx_df_val, idx_df_test])
        if task == 'classification':
            n_classes = idx_df['y'].nunique()
            label_encoder = LabelEncoder()
            y = torch.tensor(label_encoder.fit_transform(y)).long()
        else:
            n_classes = 1
            y = torch.FloatTensor(y)

    X = torch.FloatTensor(X)
    n_input = X.shape[1]
    edge_index, edge_attr = from_scipy_sparse_matrix(sparse_matrix)
    G = Data(X, edge_index, edge_attr, y=y, idx_df=idx_df)
    G.num_nodes = X.shape[0]
    model = get_model(encoder_base, n_input, n_hidden, n_layers,
                      discriminator_layers, ae_type, bias, attention_heads,
                      decoder_type, use_mincut, K, Niter, interpret,
                      n_classes)
    if task == 'link_prediction':
        G = train_test_split_edges(G, val_ratio=val_ratio,
                                   test_ratio=test_ratio)
    if torch.cuda.is_available():
        model = model.cuda()
    return G, model, X, edge_index, edge_attr
def run(file, data_name, model_name, lr):
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--use_nd', action='store_true')
    parser.add_argument('--use_lgae', action='store_true')
    parser.add_argument('--use_vgae', action='store_true')
    parser.add_argument('--model', type=str, default='')
    parser.add_argument('--dataset', type=str, default='Citeseer')
    args = parser.parse_args()

    if data_name is not None and model_name is not None and lr is not None:
        args.dataset = data_name
        args.model = model_name
        args.lr = lr
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = CitationFull(os.path.join('citation_data', args.dataset),
                           name=args.dataset, transform=T.ToSparseTensor())
    num_training = int(dataset.__len__() * 0.8)
    num_val = int(dataset.__len__() * 0.1)
    num_test = dataset.__len__() - (num_training + num_val)
    data = dataset[0]
    adj_t = data.adj_t.to(device)
    edge_index, edge_type = utils.dense_to_sparse(adj_t.to_dense())
    data.edge_index = edge_index
    data.x = data.x.to(device)
    num_nodes = data.x.shape[0]
    num_edges = data.edge_index.shape[1]
    print(data)

    decoder_enable = args.model[-3:]
    if args.model[-3:] == '-nd':
        model_name = args.model[:-3]
    if model_name == 'lgae':
        model = LGAE(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'vgae':
        model = DeepVGAE(data.num_features, args.hidden_channels,
                         args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'gae':
        model = GraphAutoEncoder(data.num_features, args.hidden_channels,
                                 args.hidden_channels, args.num_layers,
                                 args.dropout)
    elif model_name == 'arga':
        model = AdversarialGAE(data.num_features, args.hidden_channels,
                               args.hidden_channels, args.num_layers,
                               args.dropout)
    elif model_name == 'arvga':
        model = AdversarialVGAE(data.num_features, args.hidden_channels,
                                args.hidden_channels, args.num_layers,
                                args.dropout)
    elif model_name == 'lrga':
        model = LRGA(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'sage':
        model = SAGEAutoEncoder(data.num_features, args.hidden_channels,
                                args.hidden_channels, args.num_layers,
                                args.dropout)
    if decoder_enable == '-nd':
        model.decoder = NeuralDecoder(args.hidden_channels,
                                      args.hidden_channels, 1,
                                      args.num_layers, args.dropout)

    evaluator = Evaluator(name='ogbl-ddi')
    model = model.to(device)

    loggers = {}
    K_list = ['20', '50', '100']
    for k in K_list:
        loggers['Hits@' + k] = Logger(args.runs, args)

    for run in range(args.runs):
        torch.manual_seed(run)
        # Re-split the edges for every run.
        split_edge = utils.train_test_split_edges(data)
        split_edge.edge_index = edge_index
        model.reset_parameters()
        if args.model in ['arga', 'arga-nd', 'arvga', 'arvga-nd']:
            args.lr = 0.005
        optimizer = torch.optim.Adam(list(model.parameters()), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x, adj_t, split_edge, optimizer,
                         args.batch_size)
            if epoch % args.eval_steps == 0:
                results = test(model, data.x, adj_t, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)
                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        (train_hits, valid_hits, test_hits, test_auc,
                         test_ap, val_auc, val_ap) = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'auc: {100 * test_auc:.2f}%, '
                              f'ap: {100 * test_ap:.2f}%, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        toWrite = loggers[key].print_statistics()
        file.write(str(args.lr) + ' ' + key + ' ' + args.model + "'" +
                   str(toWrite) + '\n')
        file.flush()
def train(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if int(args.double_precision):
        torch.set_default_dtype(torch.float64)
    if int(args.cuda) >= 0:
        torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:' + str(args.cuda) if int(args.cuda) >= 0 else 'cpu'
    args.patience = args.epochs if not args.patience else int(args.patience)
    logging.getLogger().setLevel(logging.INFO)
    if args.save:
        if not args.save_dir:
            dt = datetime.datetime.now()
            date = f"{dt.year}_{dt.month}_{dt.day}"
            models_dir = os.path.join(os.environ['LOG_DIR'], args.task, date)
            save_dir = get_dir_name(models_dir)
        else:
            save_dir = args.save_dir
        logging.basicConfig(level=logging.INFO,
                            handlers=[
                                logging.FileHandler(
                                    os.path.join(save_dir, 'log.txt')),
                                logging.StreamHandler()
                            ])
    logging.info(f'Using: {args.device}')
    logging.info("Using seed {}.".format(args.seed))

    from torch_geometric.datasets import Planetoid
    import torch_geometric.transforms as T
    from torch_geometric.utils import train_test_split_edges
    dataset = Planetoid("data/", args.dataset, transform=T.NormalizeFeatures())
    data_pyg = dataset[0]
    all_edge_index = data_pyg.edge_index
    data_pyg = train_test_split_edges(data_pyg, 0.05, 0.1)

    # Temporarily force the task to 'nc' so node-classification data loads,
    # remembering whether we need to switch back afterwards.
    reserve_mark = 0
    if args.task == 'nc':
        reserve_mark = 0
    else:
        args.task = 'nc'
        reserve_mark = 1

    # Load data.
    data = load_data(args, os.path.join('data/', args.dataset))
    args.n_nodes, args.feat_dim = data['features'].shape
    if args.task == 'nc':
        # Model = ADVNCModel
        args.n_classes = int(data['labels'].max() + 1)
        logging.info(f'Num classes: {args.n_classes}')
    else:
        args.nb_false_edges = len(data['train_edges_false'])
        args.nb_edges = len(data['train_edges'])
        if args.task == 'lp':
            print(' ')
            # Model = ADVLPModel
        else:
            Model = RECModel
            # No validation for the reconstruction task.
            args.eval_freq = args.epochs + 1

    # Transfer loading: restore the original task.
    if reserve_mark == 1:
        args.task = 'lp'

    # Reset the reserve mark and repeat the same dance for the 'lp' task.
    reserve_mark = 0
    if args.task == 'lp':
        reserve_mark = 0
    else:
        args.task = 'lp'
        reserve_mark = 1

    data1 = load_data(args, os.path.join('data/', args.dataset))
    args.n_nodes, args.feat_dim = data1['features'].shape
    if args.task == 'nc':
        # Model = ADVNCModel
        args.n_classes = int(data1['labels'].max() + 1)
        logging.info(f'Num classes: {args.n_classes}')
    else:
        print('*****')
        args.nb_false_edges = len(data1['train_edges_false'])
        args.nb_edges = len(data1['train_edges'])
        if args.task == 'lp':
            print(' ')
            # Model = ADVLPModel
        else:
            Model = RECModel
            # No validation for the reconstruction task.
            args.eval_freq = args.epochs + 1

    if reserve_mark == 1:
        args.task = 'nc'

    print(data_pyg.x)
    print(data['features'])
    print((data_pyg.x == data['features']).all())