def loss(self, data: Graph, split="train"):
    if split == "train":
        mask = data.train_mask
    elif split == "val":
        mask = data.val_mask
    else:
        mask = data.test_mask
    edge_index, edge_types = data.edge_index[:, mask], data.edge_attr[mask]

    self.get_edge_set(edge_index, edge_types)
    batch_edges, batch_attr, samples, rels, labels = sampling_edge_uniform(
        edge_index,
        edge_types,
        self.edge_set,
        self.sampling_rate,
        self.num_rels,
        label_smoothing=self.lbl_smooth,
        num_entities=self.num_entities,
    )
    # run the forward pass on the sampled subgraph without mutating `data`
    with data.local_graph():
        data.edge_index = batch_edges
        data.edge_attr = batch_attr
        node_embed, rel_embed = self.forward(data)
    sampled_nodes, reindexed_edges = torch.unique(samples, sorted=True, return_inverse=True)
    assert (self.cache_index == sampled_nodes).any()
    loss_n = self._loss(node_embed[reindexed_edges[0]], node_embed[reindexed_edges[1]], rel_embed[rels], labels)
    loss_r = self.penalty * self._regularization([self.emb(sampled_nodes), rel_embed])
    return loss_n + loss_r
def read_gatne_data(folder):
    train_data = {}
    with open(osp.join(folder, "train.txt"), "r") as f:
        for line in f:
            items = line.strip().split()
            if items[0] not in train_data:
                train_data[items[0]] = []
            train_data[items[0]].append([int(items[1]), int(items[2])])

    valid_data = {}
    with open(osp.join(folder, "valid.txt"), "r") as f:
        for line in f:
            items = line.strip().split()
            if items[0] not in valid_data:
                valid_data[items[0]] = [[], []]
            # items[3] is the edge label: true edges (1) go to slot 0,
            # negative edges (0) go to slot 1
            valid_data[items[0]][1 - int(items[3])].append([int(items[1]), int(items[2])])

    test_data = {}
    with open(osp.join(folder, "test.txt"), "r") as f:
        for line in f:
            items = line.strip().split()
            if items[0] not in test_data:
                test_data[items[0]] = [[], []]
            test_data[items[0]][1 - int(items[3])].append([int(items[1]), int(items[2])])

    data = Graph()
    data.train_data = train_data
    data.valid_data = valid_data
    data.test_data = test_data
    return data
def loss(self, data: Graph, scoring):
    row, col = data.edge_index
    edge_types = data.edge_attr
    edge_index = torch.stack([row, col])

    self.get_edge_set(edge_index, edge_types)
    batch_edges, batch_attr, samples, rels, labels = sampling_edge_uniform(
        (row, col),
        edge_types,
        self.edge_set,
        self.sampling_rate,
        self.num_rels,
        label_smoothing=self.lbl_smooth,
        num_entities=self.num_entities,
    )
    with data.local_graph():
        data.edge_index = batch_edges
        data.edge_attr = batch_attr
        node_embed, rel_embed = self.forward(data)
    sampled_nodes, reindexed_edges = torch.unique(samples, sorted=True, return_inverse=True)
    assert (self.cache_index == sampled_nodes).any()
    loss_n = self._loss(
        node_embed[reindexed_edges[0]],
        node_embed[reindexed_edges[1]],
        rel_embed[rels],
        labels,
        scoring,
    )
    loss_r = self.penalty * self._regularization([self.emb(sampled_nodes), rel_embed])
    return loss_n + loss_r
def __call__(self, edge_index, x=None, edge_weight=None):
    if self.method_type == "emb":
        if isinstance(edge_index, np.ndarray):
            edge_index = torch.from_numpy(edge_index)
        edge_index = (edge_index[:, 0], edge_index[:, 1])
        data = Graph(edge_index=edge_index, edge_weight=edge_weight)
        self.model = build_model(self.args)
        embeddings = self.model(data)
    elif self.method_type == "gnn":
        num_nodes = edge_index.max().item() + 1
        if x is None:
            print("No input node features, use random features instead.")
            x = np.random.randn(num_nodes, self.num_features)
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x).float()
        if isinstance(edge_index, np.ndarray):
            edge_index = torch.from_numpy(edge_index)
        edge_index = (edge_index[:, 0], edge_index[:, 1])
        data = Graph(x=x, edge_index=edge_index, edge_weight=edge_weight)
        torch.save(data, self.data_path)
        dataset = NodeDataset(path=self.data_path, scale_feat=False, metric="accuracy")
        self.args.dataset = dataset
        model = train(self.args)
        embeddings = model.embed(data.to(model.device))
        embeddings = embeddings.detach().cpu().numpy()
    return embeddings
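# A minimal usage sketch for the __call__ above. `pipeline` stands in for an
# instance of the surrounding (unshown) wrapper class, so it is an assumption,
# not part of the source; only the edge layout is taken from the code.
import numpy as np

edges = np.array([[0, 1], [1, 2], [2, 0]])  # one edge per row, matching the edge_index[:, 0] / edge_index[:, 1] unpacking
# embeddings = pipeline(edges)  # -> np.ndarray of node embeddings, one row per node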
def forward(
    self,
    graph: Graph,
    x: torch.Tensor,
):
    graph.sym_norm()
    return self.encoder(graph, x)
def forward(self, graph: Graph) -> torch.Tensor: x = graph.x if self.improved and not hasattr(graph, "unet_improved"): row, col = graph.edge_index row = torch.cat( [row, torch.arange(0, x.shape[0], device=x.device)], dim=0) col = torch.cat( [col, torch.arange(0, x.shape[0], device=x.device)], dim=0) graph.edge_index = (row, col) graph["unet_improved"] = True graph.row_norm() with graph.local_graph(): if self.training and self.adj_dropout > 0: graph.edge_index, graph.edge_weight = dropout_adj( graph.edge_index, graph.edge_weight, self.adj_dropout) x = F.dropout(x, p=self.n_dropout, training=self.training) h = self.in_gcn(graph, x) h = self.act(h) h_list = self.unet(graph, h) h = h_list[-1] h = F.dropout(h, p=self.n_dropout, training=self.training) return self.out_gcn(graph, h)
def build_toy_data():
    x = torch.randn(100, 10)
    edge_index = torch.randint(0, 100, (2, 200))
    g = Graph(x=x, edge_index=edge_index)
    nedge = g.num_edges
    edge_attr = torch.randn(nedge, 10)
    g.edge_attr = edge_attr
    return g
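# A quick sanity check for build_toy_data (a sketch; assumes cogdl's Graph
# infers node/edge counts from `x` and `edge_index` as used above):
g = build_toy_data()
assert g.num_edges == 200
assert tuple(g.edge_attr.shape) == (200, 10)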
def forward(
    self,
    graph: Graph,
    x: torch.Tensor = None,
):
    if x is None:
        x = graph.x
    graph.sym_norm()
    return self.encoder(graph, x)
def read_planetoid_data(folder, prefix):
    prefix = prefix.lower()
    names = ["x", "tx", "allx", "y", "ty", "ally", "graph", "test.index"]
    objects = []
    for item in names[:-1]:
        with open(f"{folder}/ind.{prefix}.{item}", "rb") as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding="latin1"))
            else:
                objects.append(pkl.load(f))
    test_index = parse_index_file(f"{folder}/ind.{prefix}.{names[-1]}")
    test_index = torch.Tensor(test_index).long()
    test_index_reorder = test_index.sort()[0]

    x, tx, allx, y, ty, ally, graph = tuple(objects)
    x, tx, allx = tuple([torch.from_numpy(item.todense()).float() for item in [x, tx, allx]])
    y, ty, ally = tuple([torch.from_numpy(item).float() for item in [y, ty, ally]])

    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)

    if prefix == "citeseer":
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1
        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[test_index_reorder - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[test_index_reorder - test_index.min(), :] = ty
        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0).float()
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1].long()
    x[test_index] = x[test_index_reorder]
    y[test_index] = y[test_index_reorder]

    train_mask = index_to_mask(train_index, size=y.size(0))
    val_mask = index_to_mask(val_index, size=y.size(0))
    test_mask = index_to_mask(test_index, size=y.size(0))
    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Graph(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask
    return data
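# `index_to_mask` is used above but not defined here. A minimal sketch of the
# assumed behavior (turn an index tensor into a boolean mask of a given size):
import torch

def index_to_mask(index, size):
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = True  # mark the listed node ids
    return mask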
def prop(
    self,
    graph: Graph,
    x: torch.Tensor,
    drop_feature_rate: float = 0.0,
    drop_edge_rate: float = 0.0,
):
    x = dropout_features(x, drop_feature_rate)
    with graph.local_graph():
        graph.edge_index, graph.edge_weight = dropout_adj(graph.edge_index, graph.edge_weight, drop_edge_rate)
        return self.model.forward(graph, x)
def __init__(self, root, name):
    self.name = name
    super(GCCDataset, self).__init__(root)
    name1 = name.split("_")[0]
    name2 = name.split("_")[1]
    edge_index_1, dict_1, self.node2id_1 = self.preprocess(root, name1)
    edge_index_2, dict_2, self.node2id_2 = self.preprocess(root, name2)
    self.data = [
        Graph(x=None, edge_index=edge_index_1, name_dict=dict_1),
        Graph(x=None, edge_index=edge_index_2, name_dict=dict_2),
    ]
    self.transform = None
def drop_adj(self, graph: Graph, drop_rate: float = 0.5):
    if drop_rate < 0.0 or drop_rate > 1.0:
        raise ValueError("Dropout probability has to be between 0 and 1, but got {}".format(drop_rate))
    if not self.training:
        return graph

    num_edges = graph.num_edges
    mask = torch.full((num_edges,), 1 - drop_rate, dtype=torch.float)
    mask = torch.bernoulli(mask).to(torch.bool)

    edge_index = graph.edge_index
    edge_weight = graph.edge_weight
    graph.edge_index = edge_index[:, mask]
    graph.edge_weight = edge_weight[mask]
    return graph
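# The Bernoulli edge mask in drop_adj can be exercised standalone; a minimal
# sketch with made-up tensors (not part of the source):
import torch

drop_rate = 0.5
edge_index = torch.stack([torch.randint(0, 5, (10,)), torch.randint(0, 5, (10,))])
keep_prob = torch.full((10,), 1 - drop_rate)
mask = torch.bernoulli(keep_prob).to(torch.bool)
kept_edges = edge_index[:, mask]  # each edge survives independently with prob. 1 - drop_rate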
def _construct_propagation_matrix(self, sample_adj, sample_id, num_neighbors):
    row, col = sample_adj.edge_index
    value = sample_adj.edge_weight

    # add self-connections
    num_row = int(row.max()) + 1
    row = torch.cat([torch.arange(0, num_row).long(), row], dim=0)
    col = torch.cat([torch.arange(0, num_row).long(), col], dim=0)
    value = torch.cat([self.diag[sample_id[:num_row]], value], dim=0)

    value = value * self.degree[sample_id[row]] / num_neighbors

    new_graph = Graph()
    new_graph.edge_index = torch.stack([row, col])
    new_graph.edge_weight = value
    return new_graph
def _rwr_trace_to_cogdl_graph(g: Graph, seed: int, trace: torch.Tensor, positional_embedding_size: int, entire_graph: bool = False):
    subv = torch.unique(trace).tolist()
    try:
        subv.remove(seed)
    except ValueError:
        pass
    # place the seed node first so that subg.seed[0] can mark it below
    subv = [seed] + subv

    if entire_graph:
        subg = copy.deepcopy(g)
    else:
        subg = g.subgraph(subv)

    subg = _add_undirected_graph_positional_embedding(subg, positional_embedding_size)

    subg.seed = torch.zeros(subg.num_nodes, dtype=torch.long)
    if entire_graph:
        subg.seed[seed] = 1
    else:
        subg.seed[0] = 1
    return subg
def __init__(self, root, name):
    super(OGBNDataset, self).__init__(root)
    dataset = NodePropPredDataset(name, root)
    graph, y = dataset[0]
    x = torch.tensor(graph["node_feat"])
    y = torch.tensor(y.squeeze())

    # coalesce duplicate edges, drop self-loops, then make the graph undirected
    row, col, edge_attr = coalesce(graph["edge_index"][0], graph["edge_index"][1], graph["edge_feat"])
    edge_index = torch.stack([row, col], dim=0)
    edge_index, edge_attr = remove_self_loops(edge_index, edge_attr)
    row = torch.cat([edge_index[0], edge_index[1]])
    col = torch.cat([edge_index[1], edge_index[0]])
    edge_index = torch.stack([row, col], dim=0)
    if edge_attr is not None:
        edge_attr = torch.cat([edge_attr, edge_attr], dim=0)

    self.data = Graph(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    self.data.num_nodes = graph["num_nodes"]
    assert self.data.num_nodes == self.data.x.shape[0]

    # split
    split_index = dataset.get_idx_split()
    self.data.train_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.test_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.val_mask = torch.zeros(self.data.num_nodes, dtype=torch.bool)
    self.data.train_mask[split_index["train"]] = True
    self.data.test_mask[split_index["test"]] = True
    self.data.val_mask[split_index["valid"]] = True
    self.transform = None
def process(self):
    filenames = self.raw_paths
    with open(filenames[0], "r") as f:
        edge_index = f.read().strip().split("\n")
    edge_index = [[int(i) for i in x.split("\t")] for x in edge_index]
    edge_index = np.array(edge_index, dtype=np.int64).transpose()
    edge_index = torch.from_numpy(edge_index)
    # make the graph undirected, then remove self-loops
    rev_edge_index = torch.stack([edge_index[1], edge_index[0]])
    edge_index = torch.cat((edge_index, rev_edge_index), dim=1)
    self_loop_mask = edge_index[0] != edge_index[1]
    edge_index = edge_index[:, self_loop_mask]

    with open(filenames[1], "r") as f:
        cmty = f.read().strip().split("\n")
    cmty = [[int(i) for i in x.split("\t")] for x in cmty]

    num_classes = len(cmty)
    num_nodes = torch.max(edge_index).item() + 1
    # np.float was removed in NumPy 1.24; use the builtin float dtype instead
    labels = np.zeros((num_nodes, num_classes), dtype=float)
    for i, cls in enumerate(cmty):
        labels[cls, i] = 1.0
    labels = torch.from_numpy(labels)

    data = Graph(x=None, y=labels, edge_index=edge_index)
    torch.save(data, self.processed_paths[0])
def process(self):
    num_nodes = 100
    num_edges = 300
    feat_dim = 30

    # load or generate your dataset
    edge_index = torch.randint(0, num_nodes, (2, num_edges))
    x = torch.randn(num_nodes, feat_dim)
    y = torch.randint(0, 2, (num_nodes,))

    # set train/val/test masks for the node_classification task
    train_mask = torch.zeros(num_nodes).bool()
    train_mask[0 : int(0.3 * num_nodes)] = True
    val_mask = torch.zeros(num_nodes).bool()
    val_mask[int(0.3 * num_nodes) : int(0.7 * num_nodes)] = True
    test_mask = torch.zeros(num_nodes).bool()
    test_mask[int(0.7 * num_nodes) :] = True

    data = Graph(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
    torch.save(data, "mydata.pt")
    return data
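# Hypothetical follow-up, assuming the NodeDataset wrapper used in the __call__
# pipeline above can load a Graph saved with torch.save (an assumption, not
# shown in the source):
# dataset = NodeDataset(path="mydata.pt", scale_feat=False, metric="accuracy")
# graph = dataset[0]  # train_mask covers the first 30% of nodes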
def __init__(self, data_type="unsupervised", root="testchem", transform=None, pre_transform=None, pre_filter=None):
    super(TestChemDataset, self).__init__(root, transform, pre_transform, pre_filter)
    num_nodes = 10
    num_edges = 10
    num_graphs = 100

    def cycle_index(num, shift):
        # indices shifted by `shift` with wrap-around: each graph is a directed cycle
        arr = torch.arange(num) + shift
        arr[-shift:] = torch.arange(shift)
        return arr

    upp = torch.cat([torch.arange(0, num_nodes)] * num_graphs)
    dwn = torch.cat([cycle_index(num_nodes, 1)] * num_graphs)
    edge_index = torch.stack([upp, dwn])

    edge_attr = torch.zeros(num_edges * num_graphs, 2)
    x = torch.zeros(num_graphs * num_nodes, 2)
    for idx, val in enumerate(torch.randint(0, 6, size=(num_edges * num_graphs,))):
        edge_attr[idx][0] = val
    for idx, val in enumerate(torch.randint(0, 3, size=(num_edges * num_graphs,))):
        edge_attr[idx][1] = val
    for idx, val in enumerate(torch.randint(0, 120, size=(num_edges * num_graphs,))):
        x[idx][0] = val
    for idx, val in enumerate(torch.randint(0, 3, size=(num_edges * num_graphs,))):
        x[idx][1] = val

    self.data = Graph(
        x=x.to(torch.long),
        edge_index=edge_index.to(torch.long),
        edge_attr=edge_attr.to(torch.long),
    )
    self.slices = {
        "x": torch.arange(0, (num_graphs + 1) * num_nodes, num_nodes),
        "edge_index": torch.arange(0, (num_graphs + 1) * num_edges, num_edges),
        "edge_attr": torch.arange(0, (num_graphs + 1) * num_edges, num_edges),
    }

    if data_type == "supervised":
        pretrain_tasks = 10
        go_target_pretrain = torch.zeros(pretrain_tasks * num_graphs) - 1
        for i in range(num_graphs):
            val = np.random.randint(0, pretrain_tasks)
            go_target_pretrain[i * pretrain_tasks + val] = 1
        self.data.y = go_target_pretrain
        self.slices["y"] = torch.arange(0, (num_graphs + 1) * pretrain_tasks, pretrain_tasks)
def preprocess(self, graph):
    num_nodes = graph.num_nodes
    graph.add_remaining_self_loops()
    graph.sym_norm()
    adj = sp.coo_matrix(
        (graph.edge_weight.cpu().numpy(), (graph.edge_index[0].cpu().numpy(), graph.edge_index[1].cpu().numpy())),
        shape=(graph.num_nodes, graph.num_nodes),
    )
    diff = compute_ppr(adj.tocsr(), np.arange(num_nodes), self.alpha).tocoo()

    if self.cache is None:
        self.cache = dict()
    graphs = []
    for g in [adj, diff]:
        row = torch.from_numpy(g.row).long()
        col = torch.from_numpy(g.col).long()
        val = torch.from_numpy(g.data).float()
        edge_index = torch.stack([row, col])
        graphs.append(Graph(edge_index=edge_index, edge_weight=val))
    self.cache["diff"] = graphs[1]
    self.cache["adj"] = graphs[0]
    self.device = next(self.gcn1.parameters()).device
def test_will_return_computed_embeddings_for_simple_fully_connected_graph():
    args = get_args()
    model: DeepWalk = DeepWalk.build_model_from_args(args)
    graph = Graph(edge_index=(torch.LongTensor([0]), torch.LongTensor([1])))
    trained = model(graph, creator)
    assert len(trained) == 2
    np.testing.assert_array_equal(trained[0], embed_1)
    np.testing.assert_array_equal(trained[1], embed_2)
def top_k(self, graph, x: torch.Tensor, scores: torch.Tensor) -> Tuple[Graph, torch.Tensor]:
    org_n_nodes = x.shape[0]
    num = int(self.pooling_rate * x.shape[0])
    values, indices = torch.topk(scores, max(2, num))

    if self.aug_adj:
        # augment connectivity with 2-hop edges (A @ A) before taking the subgraph
        edge_index = graph.edge_index.cpu()
        edge_attr = torch.ones(edge_index.shape[1])
        edge_index, _ = spspmm(edge_index, edge_attr, edge_index, edge_attr, org_n_nodes, org_n_nodes, org_n_nodes)
        edge_index = edge_index.to(x.device)
        batch = Graph(x=x, edge_index=edge_index)
    else:
        batch = graph

    new_batch = batch.subgraph(indices)
    new_batch.row_norm()
    return new_batch, indices
def test_base_layer():
    layer = BaseLayer()
    x = torch.eye(4)
    edge_index = (torch.tensor([0, 0, 0, 1, 1, 2]), torch.tensor([1, 2, 3, 2, 3, 3]))
    graph = Graph(x=x, edge_index=edge_index)
    x = layer(graph, x)
    assert tuple(x.shape) == (4, 4)
def test_gine_layer():
    layer = GINELayer()
    x = torch.eye(4)
    edge_index = (torch.tensor([0, 0, 0, 1, 1, 2]), torch.tensor([1, 2, 3, 2, 3, 3]))
    graph = Graph(x=x, edge_index=edge_index, edge_attr=torch.randn(6, 4))
    x = layer(graph, x)
    assert tuple(x.shape) == (4, 4)
def process(self):
    graphs = []
    for i in range(200):
        edges = torch.randint(0, 1000, (2, 30))
        label = torch.randint(0, 7, (1,))
        graphs.append(Graph(edge_index=edges, y=label))
    torch.save(graphs, self.path)
    return graphs
def process(self):
    # Load and preprocess data.
    # Here we randomly generate several graphs as a simple example.
    graphs = []
    for i in range(10):
        edges = torch.randint(0, 20, (2, 30))
        label = torch.randint(0, 7, (1,))
        graphs.append(Graph(edge_index=edges, y=label))
    return graphs
def _add_undirected_graph_positional_embedding(g: Graph, hidden_size, retry=10):
    # We use eigenvectors of the normalized graph Laplacian as vertex features.
    # This can be viewed as a generalization of the positional embedding in the
    # "Attention Is All You Need" paper. Recall that the eigenvectors of the
    # normalized Laplacian of a line graph are cos/sin functions.
    # See section 2.4 of http://www.cs.yale.edu/homes/spielman/561/2009/lect02-09.pdf
    n = g.num_nodes
    with g.local_graph():
        g.sym_norm()
        adj = g.to_scipy_csr()
    laplacian = adj
    k = min(n - 2, hidden_size)
    x = eigen_decomposision(n, k, laplacian, hidden_size, retry)
    g.pos_undirected = x.float()
    return g
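# `eigen_decomposision` is referenced above but not shown. A minimal sketch of
# what it might do (an assumption, not the library's actual implementation):
# take the top-k eigenvectors of the normalized adjacency via ARPACK and
# zero-pad them to hidden_size, retrying on non-convergence.
import numpy as np
import torch
from scipy.sparse.linalg import eigsh, ArpackNoConvergence

def eigen_decomposision(n, k, laplacian, hidden_size, retry):
    if k <= 0:
        return torch.zeros(n, hidden_size)
    laplacian = laplacian.astype("float64")
    for _ in range(retry):
        try:
            # largest-algebraic eigenpairs of the normalized adjacency
            _, vectors = eigsh(laplacian, k=k, which="LA", tol=1e-5)
            break
        except ArpackNoConvergence:
            continue  # retry; a real implementation might perturb the matrix
    else:
        vectors = np.zeros((n, k))
    x = torch.from_numpy(vectors).float()
    return torch.nn.functional.pad(x, (0, hidden_size - k))  # zero-pad feature dim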
def __call__(self, graph, x, train_only=True):
    g1 = graph
    g2 = Graph(edge_index=g1.edge_index)
    g1.normalize(self.op_dict["correct_g"])
    g2.normalize(self.op_dict["smooth_g"])

    train_nid, valid_nid, _ = g1.train_nid, g1.val_nid, g1.test_nid
    y = g1.y
    if train_only:
        label_nid = train_nid
        residual_nid = train_nid
    else:
        label_nid = torch.cat((train_nid, valid_nid))
        residual_nid = train_nid

    # Correct
    y = pre_residual_correlation(x, y, residual_nid)
    if self.op_dict["autoscale"]:
        post_func = partial(autoscale_post, lower=-1.0, upper=1.0)
        scale_func = correlation_autoscale
    else:
        post_func = partial(fixed_post, y=y, nid=residual_nid)
        scale_func = correlation_fixed
    resid = outcome_correlation(
        g1,
        y,
        self.op_dict["correct_alpha"],
        nprop=self.op_dict["num_correct_prop"],
        post_step=post_func,
    )
    res_result = scale_func(x, y, resid, residual_nid, self.op_dict["scale"])

    # Smooth
    y = pre_outcome_correlation(res_result, g1.y, label_nid)
    result = outcome_correlation(
        g2,
        y,
        self.op_dict["smooth_alpha"],
        nprop=self.op_dict["num_smooth_prop"],
        post_step=partial(autoscale_post, lower=0, upper=1),
    )
    return result
def convert(data):
    if not hasattr(data, "_adj"):
        g = Graph()
        for key in data.keys:
            if "adj" in key:
                g["_" + key] = data[key]
            else:
                g[key] = data[key]
        return g
    else:
        return data
def prop(
    self,
    graph: Graph,
    x: torch.Tensor,
    drop_feature_rate: float = 0.0,
    drop_edge_rate: float = 0.0,
):
    x = self.drop_feature(x, drop_feature_rate)
    with graph.local_graph():
        graph = self.drop_adj(graph, drop_edge_rate)
        return self.forward(graph, x)
def forward(self, graph: Graph) -> torch.Tensor: x = graph.x if self.improved and not hasattr(graph, "unet_improved"): self_loop = torch.stack([torch.arange(0, x.shape[0])] * 2, dim=0).to(x.device) graph.edge_index = torch.cat([graph.edge_index, self_loop], dim=1) graph["unet_improved"] = True graph.row_norm() with graph.local_graph(): if self.training and self.adj_dropout > 0: graph.edge_index, graph.edge_weight = dropout_adj(graph.edge_index, graph.edge_weight, self.adj_dropout) x = F.dropout(x, p=self.n_dropout, training=self.training) h = self.in_gcn(graph, x) h = self.act(h) h_list = self.unet(graph, h) h = h_list[-1] h = F.dropout(h, p=self.n_dropout, training=self.training) return self.out_gcn(graph, h)