def test_edge_index_to_vector_and_vice_versa():
    # Create a fully-connected graph:
    N = 10
    row = torch.arange(N).view(-1, 1).repeat(1, N).view(-1)
    col = torch.arange(N).view(1, -1).repeat(N, 1).view(-1)
    edge_index = torch.stack([row, col], dim=0)

    idx, population = edge_index_to_vector(edge_index, (N, N), bipartite=True)
    assert population == N * N
    assert idx.tolist() == list(range(population))

    edge_index2 = vector_to_edge_index(idx, (N, N), bipartite=True)
    assert is_undirected(edge_index2)
    assert edge_index.tolist() == edge_index2.tolist()

    idx, population = edge_index_to_vector(edge_index, (N, N), bipartite=False)
    assert population == N * N - N
    assert idx.tolist() == list(range(population))

    mask = edge_index[0] != edge_index[1]  # Remove self-loops.
    edge_index2 = vector_to_edge_index(idx, (N, N), bipartite=False)
    assert is_undirected(edge_index2)
    assert edge_index[:, mask].tolist() == edge_index2.tolist()

    idx, population = edge_index_to_vector(edge_index, (N, N), bipartite=False,
                                           force_undirected=True)
    assert population == (N * (N + 1)) / 2 - N
    assert idx.tolist() == list(range(population))

    mask = edge_index[0] != edge_index[1]  # Remove self-loops.
    edge_index2 = vector_to_edge_index(idx, (N, N), bipartite=False,
                                       force_undirected=True)
    assert is_undirected(edge_index2)
    assert edge_index[:, mask].tolist() == to_undirected(edge_index2).tolist()
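# For reference, the three `population` values asserted above count the
# candidate edge sets being enumerated. A quick arithmetic check for N = 10:
N = 10
assert N * N == 100                 # bipartite: all ordered pairs
assert N * N - N == 90              # non-bipartite: ordered pairs minus self-loops
assert N * (N + 1) // 2 - N == 45   # force_undirected: unordered pairs minus self-loops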
def test_is_undirected():
    row = torch.tensor([0, 1, 0])
    col = torch.tensor([1, 0, 0])
    assert is_undirected(torch.stack([row, col], dim=0))

    row = torch.tensor([0, 1, 1])
    col = torch.tensor([1, 0, 2])
    assert not is_undirected(torch.stack([row, col], dim=0))
def diffusion_matrix_exact(self, edge_index, edge_weight, num_nodes, method,
                           **kwargs):
    r"""Calculate the (dense) diffusion on a given sparse graph.
    Note that these exact variants are not scalable. They densify the
    adjacency matrix and calculate either its inverse or its matrix
    exponential.

    Args:
        edge_index (LongTensor): The edge indices.
        edge_weight (Tensor): One-dimensional edge weights.
        num_nodes (int): Number of nodes.
        method (str): Diffusion method:

            1. :obj:`"ppr"`: Use personalized PageRank as diffusion.
               Additionally expects the parameter:

               - **alpha** (*float*) - Return probability in PPR.
                 Commonly lies in :obj:`[0.05, 0.2]`.

            2. :obj:`"heat"`: Use heat kernel diffusion.
               Additionally expects the parameter:

               - **t** (*float*) - Time of diffusion. Commonly lies in
                 :obj:`[2, 10]`.

            3. :obj:`"coeff"`: Freely choose diffusion coefficients.
               Additionally expects the parameter:

               - **coeffs** (*List[float]*) - List of coefficients
                 :obj:`theta_k` for each power of the transition matrix
                 (starting at :obj:`0`).

    :rtype: (:class:`Tensor`)
    """
    if method == 'ppr':
        # α (I_n + (α - 1) A)^-1
        edge_weight = (kwargs['alpha'] - 1) * edge_weight
        edge_index, edge_weight = add_self_loops(edge_index, edge_weight,
                                                 fill_value=1,
                                                 num_nodes=num_nodes)
        mat = to_dense_adj(edge_index, edge_attr=edge_weight).squeeze()
        diff_matrix = kwargs['alpha'] * torch.inverse(mat)

    elif method == 'heat':
        # exp(t (A - I_n))
        edge_index, edge_weight = add_self_loops(edge_index, edge_weight,
                                                 fill_value=-1,
                                                 num_nodes=num_nodes)
        edge_weight = kwargs['t'] * edge_weight
        mat = to_dense_adj(edge_index, edge_attr=edge_weight).squeeze()
        undirected = is_undirected(edge_index, edge_weight, num_nodes)
        diff_matrix = self.__expm__(mat, undirected)

    elif method == 'coeff':
        adj_matrix = to_dense_adj(edge_index, edge_attr=edge_weight).squeeze()
        mat = torch.eye(num_nodes, device=edge_index.device)

        diff_matrix = kwargs['coeffs'][0] * mat
        for coeff in kwargs['coeffs'][1:]:
            mat = mat @ adj_matrix
            diff_matrix += coeff * mat
    else:
        raise ValueError('Exact GDC diffusion {} unknown.'.format(method))

    return diff_matrix
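# A minimal, self-contained sketch of the "ppr" branch above, using a toy
# row-stochastic transition matrix T (values assumed for illustration):
# diffusion = alpha * (I_n + (alpha - 1) * T)^-1.
import torch

alpha = 0.15
T = torch.tensor([[0.0, 0.5, 0.5],
                  [0.5, 0.0, 0.5],
                  [0.5, 0.5, 0.0]])
ppr = alpha * torch.inverse(torch.eye(3) + (alpha - 1) * T)
print(ppr.sum(dim=1))  # each row sums to ~1, since T is row-stochastic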
def __getitem__(self, item) -> torch.Tensor:
    """
    :param item:
    :return: Draw negative samples, and return [2, E * 0.8 * 2] tensor
    """
    datum = super().__getitem__(item)

    # Sample negative training samples from the negative sample pool
    train_neg_edge_index = self._sample_train_neg_edge_index(
        is_undirected(self.test_edge_index))
    train_edge_index = torch.cat(
        [self.train_pos_edge_index, train_neg_edge_index], dim=1)
    train_edge_y = self.get_edge_y(
        train_edge_index.size(1),
        pos_num_or_ratio=self.train_pos_edge_index.size(1),
        device=train_edge_index.device)

    # Add attributes for edge prediction
    datum.__setitem__("train_edge_index", train_edge_index)
    datum.__setitem__("train_edge_y", train_edge_y)
    datum.__setitem__("val_edge_index", self.val_edge_index)
    datum.__setitem__("val_edge_y", self.val_edge_y)
    datum.__setitem__("test_edge_index", self.test_edge_index)
    datum.__setitem__("test_edge_y", self.test_edge_y)
    return datum
def norm(edge_index, num_nodes, edge_weight=None, improved=False, dtype=None):
    if edge_weight is None:
        edge_weight = torch.ones((edge_index.size(1), ), dtype=dtype,
                                 device=edge_index.device)

    fill_value = 1.0 if not improved else 2.0
    edge_index, edge_weight = add_remaining_self_loops(
        edge_index, edge_weight, fill_value, num_nodes)

    row, col = edge_index
    if is_undirected(edge_index, num_nodes=num_nodes):
        # Symmetric normalization: D^-1/2 * A * D^-1/2.
        deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        norm = deg_inv_sqrt[row] * edge_weight * deg_inv_sqrt[col]
    else:
        # Random-walk normalization: D^-1 * A, with the degree taken on the
        # side dictated by the message-passing flow.
        if cfg.gnn.flow == 'source_to_target':
            deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
        else:
            deg = scatter_add(edge_weight, col, dim=0, dim_size=num_nodes)
        deg_inv = deg.pow(-1.0)
        deg_inv[deg_inv == float('inf')] = 0
        norm = (deg_inv[row] if cfg.gnn.flow == 'source_to_target' else
                deg_inv[col]) * edge_weight

    return edge_index, norm
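# A minimal sketch of the symmetric branch above on a 3-node path graph
# (toy inputs; uses scatter_add just like the function itself):
import torch
from torch_scatter import scatter_add

edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])
edge_weight = torch.ones(edge_index.size(1))
row, col = edge_index
deg = scatter_add(edge_weight, row, dim=0, dim_size=3)  # degrees: [1, 2, 1]
norm = deg.pow(-0.5)[row] * edge_weight * deg.pow(-0.5)[col]
print(norm)  # every edge touches the degree-2 center, so each gets 1/sqrt(2)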
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    neg_edge_index = negative_sampling(edge_index)
    assert neg_edge_index.size(1) == edge_index.size(1)

    adj = torch.zeros(4, 4, dtype=torch.bool)
    adj[edge_index[0], edge_index[1]] = 1

    neg_adj = torch.zeros(4, 4, dtype=torch.bool)
    neg_adj[neg_edge_index[0], neg_edge_index[1]] = 1
    assert (adj & neg_adj).sum() == 0

    neg_edge_index = negative_sampling(edge_index, num_neg_samples=2)
    assert neg_edge_index.size(1) == 2

    undirected_edge_index = to_undirected(edge_index)
    undirected_neg_edge_index = negative_sampling(undirected_edge_index,
                                                  force_undirected=True)
    assert is_undirected(undirected_neg_edge_index)
    assert undirected_neg_edge_index.size(1) <= undirected_edge_index.size(1)

    undirected_adj = torch.zeros(4, 4, dtype=torch.bool)
    undirected_adj[undirected_edge_index[0], undirected_edge_index[1]] = 1

    undirected_neg_adj = torch.zeros(4, 4, dtype=torch.bool)
    undirected_neg_adj[undirected_neg_edge_index[0],
                       undirected_neg_edge_index[1]] = 1
    assert (undirected_adj & undirected_neg_adj).sum() == 0
def is_undirected(self) -> bool:
    if self.is_bipartite():  # TODO check for inverse storage.
        return False

    for value in self.values('adj', 'adj_t'):
        return value.is_symmetric()

    edge_index = self.edge_index
    edge_attr = self.edge_attr if 'edge_attr' in self else None
    return is_undirected(edge_index, edge_attr, num_nodes=self.size(0))
def forward(self, data, return_hidden_feature=False):
    if torch.cuda.is_available():
        data.x = data.x.cuda()
        data.edge_attr = data.edge_attr.cuda()
        data.edge_index = data.edge_index.cuda()
        data.batch = data.batch.cuda()

    # Make sure that we have an undirected graph
    if not is_undirected(data.edge_index):
        data.edge_index = to_undirected(data.edge_index)

    # Make sure that nodes can propagate messages to themselves
    if not contains_self_loops(data.edge_index):
        data.edge_index, data.edge_attr = add_self_loops(
            data.edge_index, data.edge_attr.view(-1))

    # Covalent propagation: add self loops to enable self propagation
    covalent_edge_index, covalent_edge_attr = self.covalent_neighbor_threshold(
        data.edge_index, data.edge_attr)
    (
        non_covalent_edge_index,
        non_covalent_edge_attr,
    ) = self.non_covalent_neighbor_threshold(data.edge_index, data.edge_attr)

    # covalent_propagation and non_covalent_propagation
    covalent_x = self.covalent_propagation(data.x, covalent_edge_index,
                                           covalent_edge_attr)
    non_covalent_x = self.non_covalent_propagation(
        covalent_x, non_covalent_edge_index, non_covalent_edge_attr)

    # Zero out the protein features, then do a ligand-only gather...
    # hacky, sure, but it gets the job done
    non_covalent_ligand_only_x = non_covalent_x
    non_covalent_ligand_only_x[data.x[:, 14] == -1] = 0
    pool_x = self.global_add_pool(non_covalent_ligand_only_x, data.batch)

    # Fully connected and output layers
    if return_hidden_feature or self.always_return_hidden_feature:
        # Return prediction and atomistic features
        # (covalent result, non-covalent result, pool result)
        avg_covalent_x, _ = avg_pool_x(data.batch, covalent_x, data.batch)
        avg_non_covalent_x, _ = avg_pool_x(data.batch, non_covalent_x,
                                           data.batch)
        fc0_x, fc1_x, output_x = self.output(pool_x,
                                             return_hidden_feature=True)
        return avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x
    else:
        return self.output(pool_x)
def _create_data(self):
    # Include rows that are explicitly provided
    df = pd.read_json(self.file_in, lines=True)

    counter = 0
    for (node_props_x, node_props_y, e_index_1, e_index_2, edge_props,
         n_scored, sn_val, sn_filter) in graph_props_x_(
             df, self.consider_loop_type):
        edge_index = torch.tensor([e_index_1, e_index_2], dtype=torch.long)
        if not is_undirected(edge_index):
            raise RuntimeError(
                'Directed graph created, should be impossible')

        # Ignore all data that does not meet the signal-to-noise threshold
        if self.filter_noise:
            if sn_filter != 1:
                continue

        # Add features of incoming edges to the node as node features
        if self.nonbond_nodefeature:
            node_nbond_feature = create_node_nbond(node_props_x, e_index_1,
                                                   e_index_2, edge_props)
            x = torch.tensor(node_nbond_feature, dtype=torch.float)
        else:
            x = torch.tensor(node_props_x, dtype=torch.float)

        edge_x = torch.tensor(edge_props, dtype=torch.float)

        # Add ground-truth node features
        if node_props_y is not None:
            y = torch.tensor(node_props_y, dtype=torch.float)
        else:
            y = None

        # Create a mask for the nodes that are to be predicted
        train_mask = torch.zeros(x.shape[0], dtype=torch.bool)
        train_mask[:n_scored] = True

        data = Data(x=x, edge_index=edge_index, y=y, edge_attr=edge_x,
                    train_mask=train_mask, sn_val=sn_val)
        torch.save(
            data,
            '{}/{}_{}.pt'.format(self.save_dir, ALL_DATA_NAME, counter))
        counter += 1

    self.total_processed = counter
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    neg_edge_index = negative_sampling(edge_index)
    assert neg_edge_index.size(1) == edge_index.size(1)
    assert is_negative(edge_index, neg_edge_index, (4, 4), bipartite=False)

    neg_edge_index = negative_sampling(edge_index, num_neg_samples=2)
    assert neg_edge_index.size(1) == 2
    assert is_negative(edge_index, neg_edge_index, (4, 4), bipartite=False)

    edge_index = to_undirected(edge_index)
    neg_edge_index = negative_sampling(edge_index, force_undirected=True)
    assert neg_edge_index.size(1) == edge_index.size(1) - 1
    assert is_undirected(neg_edge_index)
    assert is_negative(edge_index, neg_edge_index, (4, 4), bipartite=False)
def negative_sampling(self, edge_index: Tensor, num_nodes: int,
                      batch: OptTensor = None) -> Tensor:
    num_neg_samples = int(self.neg_sample_ratio * self.edge_sample_ratio *
                          edge_index.size(1))

    if not self.is_undirected and not is_undirected(
            edge_index, num_nodes=num_nodes):
        edge_index = to_undirected(edge_index, num_nodes=num_nodes)

    if batch is None:
        neg_edge_index = negative_sampling(edge_index, num_nodes,
                                           num_neg_samples=num_neg_samples)
    else:
        neg_edge_index = batched_negative_sampling(
            edge_index, batch, num_neg_samples=num_neg_samples)

    return neg_edge_index
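# Worked example of the sampling budget above, with assumed (not source)
# ratios: neg_sample_ratio = 0.5 and edge_sample_ratio = 0.8 on a graph
# with 100 edges yield int(0.5 * 0.8 * 100) = 40 negative samples.
neg_sample_ratio, edge_sample_ratio, num_edges = 0.5, 0.8, 100
assert int(neg_sample_ratio * edge_sample_ratio * num_edges) == 40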
def print_statistics(data):
    print('Original feature size:', data.x.shape[1])
    remove_unique_feature(data)
    print('Current feature size:', data.x.shape[1])

    edge_direction = 'Undirected' if is_undirected(
        data.edge_index) else 'Directed'
    print('{} graph'.format(edge_direction))
    print('{} graph'.format('Weighted' if is_weighted(data) else 'Unweighted'))
    print('Number of nodes: ', data.x.shape[0])

    num_edges = data.edge_index.shape[1]
    if edge_direction == 'Undirected':
        num_edges = num_edges // 2
    print('Number of edges: ', num_edges)
    print('Number of classes: {}'.format(int(max(data.y)) + 1))

    if hasattr(data, 'train_mask'):
        print('Number of training nodes:', data.train_mask.sum().item())
    if hasattr(data, 'val_mask'):
        print('Number of validation nodes:', data.val_mask.sum().item())
    if hasattr(data, 'test_mask'):
        print('Number of test nodes:', data.test_mask.sum().item())
def get_edge_info(self, df):
    """
    Get information about the edges: number of edges, whether the graph is
    weighted, whether it is directed, max/min edge weight, etc.
    """
    self.info['num_edges'] = df.shape[0]

    min_weight, max_weight = df['edge_weight'].min(), df['edge_weight'].max()
    self.info['weighted'] = min_weight != max_weight

    edge_index = df[['src_idx', 'dst_idx']].to_numpy()
    edge_index = sorted(edge_index, key=lambda d: d[0])
    edge_index = torch.tensor(edge_index, dtype=torch.long).transpose(0, 1)
    self.info['directed'] = not is_undirected(
        edge_index, num_nodes=self.info['num_nodes'])

    print('Number of Edges:', self.info['num_edges'])
    print('Is Directed Graph:', self.info['directed'])
    print('Is Weighted Graph:', self.info['weighted'])
    print('Max Weight:', max_weight, 'Min Weight:', min_weight)
def print_stats():
    for data in DATASETS:
        out = load_data(data)

        num_graphs = len(out)
        avg_nodes = out.data.x.size(0) / num_graphs
        avg_edges = out.data.edge_index.size(1) / num_graphs
        num_features = out.num_features
        num_classes = out.num_classes
        print(
            f'{data}\t{num_graphs}\t{avg_nodes}\t{avg_edges}\t'
            f'{num_features}\t{num_classes}', end='\t')

        undirected, self_loops, isolated_nodes, onehot = True, False, False, True
        for graph in out:
            if not is_undirected(graph.edge_index, num_nodes=graph.num_nodes):
                undirected = False
            if contains_self_loops(graph.edge_index):
                self_loops = True
            if contains_isolated_nodes(graph.edge_index,
                                       num_nodes=graph.num_nodes):
                isolated_nodes = True
            if ((graph.x > 0).sum(dim=1) != 1).sum() > 0:
                onehot = False
        print(f'{undirected}\t{self_loops}\t{isolated_nodes}\t{onehot}')
def generate_pyg_data(self, data):
    # Get the x feature table
    x = data['fea_table'].copy()
    df = data['edge_file']
    edges = df[['src_idx', 'dst_idx', 'edge_weight']]

    # Get indices first
    train_indices = data['train_indices']
    if self.config.use_valid:
        train_indices, valid_indices = train_test_split(train_indices,
                                                        test_size=0.2,
                                                        shuffle=False)

    try:
        if x.shape[1] == 1:  # 0-dimensional feature
            x = x.set_index(keys="node_index")
            x = feat_engineering(
                x,
                edges=edges,
                num_nodes=self.metadata["n_node"].iloc[0]
            )
        else:
            x_feat = x.drop('node_index', axis=1).to_numpy()
            conf_name = self.config.filename.split("/")[-1].split(".")[0]
            is_only_one_zero = not ((x_feat != 0) & (x_feat != 1)).any()
            logger.info("use {} config".format(conf_name))
            logger.info(
                "feature only contains zero: {}, only one and zero: {}".format(
                    (x_feat == 0).all(), is_only_one_zero))
            if conf_name in self.citation_configs:  # Judge whether it is a citation graph
                if is_only_one_zero:
                    logger.info("Normalize features")
                    normal_feat = feat_row_sum_inv_normalize(x_feat)
                    normal_df = pd.DataFrame(data=normal_feat)
                    normal_df["node_index"] = x["node_index"]
                    x = normal_df

                pre_feat = prepredict(data, train_indices=train_indices,
                                      use_valid=self.config.use_valid,
                                      use_ohe=False)
                x = x.set_index(keys="node_index")
                x_index = x.index.tolist()
                lpa_preds, lpa_train_acc = lpa_predict(
                    data, n_class=self._n_class, train_indices=train_indices,
                    use_valid=self.config.use_valid)
                if not np.isnan(lpa_train_acc) and lpa_train_acc > 0.8:
                    logger.info("Use LPA predicts")
                    x = pd.concat([x, pre_feat, lpa_preds],
                                  axis=1).values[x_index]
                else:
                    x = pd.concat([x, pre_feat], axis=1).values[x_index]
            else:
                x = x.set_index(keys="node_index")
                x = feat_engineering(
                    x,
                    edges=edges,
                    num_nodes=self.metadata["n_node"].iloc[0]
                )
    except Exception as e:
        logger.error(e)
        if x.shape[1] == 0:
            x = np.zeros((x.shape[0], 64), dtype=float)
        else:
            x = x.to_numpy()

    logger.info("x shape: {}".format(x.shape))

    node_index = torch.tensor(data['fea_table']['node_index'].to_numpy(),
                              dtype=torch.long)
    x = torch.tensor(x, dtype=torch.float)

    # Get edge_index, edge_weight
    edges = edges.to_numpy()
    edge_index = edges[:, :2].astype(int)
    # Transpose from [edge_num, 2] to [2, edge_num], as required by PyG
    edge_index = torch.tensor(edge_index, dtype=torch.long).transpose(0, 1)
    edge_weight = edges[:, 2]
    edge_weight = torch.tensor(edge_weight, dtype=torch.float32)

    undirected = gtils.is_undirected(edge_index)
    edge_index, edge_weight = gtils.sort_edge_index(edge_index, edge_weight)
    logger.info(f"is undirected? {undirected}")
    logger.info(f"edge index {edge_index.shape}, "
                f"edge weight {edge_weight.shape}")

    # Get train/test masks
    num_nodes = x.size(0)
    self._num_nodes = num_nodes
    y = torch.zeros(num_nodes, dtype=torch.long)
    inds = data['train_label'][['node_index']].to_numpy()
    train_y = data['train_label'][['label']].to_numpy()
    self.y_train = train_y
    y[inds] = torch.tensor(train_y, dtype=torch.long)

    self._origin_graph_data_indices = copy.deepcopy(data['train_indices'])
    if self.config.use_valid:
        self.y_train = data['train_label'].set_index(
            'node_index').loc[train_indices][['label']].to_numpy()
    test_indices = data['test_indices']

    data = Data(x=x, node_index=node_index, edge_index=edge_index, y=y,
                edge_weight=edge_weight)
    data.num_nodes = num_nodes

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[train_indices] = 1
    data.train_indices = np.asarray(train_indices)
    data.train_mask = train_mask
    self._train_indices = np.asarray(train_indices)
    self._train_mask = train_mask

    if self.config.use_valid:
        valid_mask = torch.zeros(num_nodes, dtype=torch.bool)
        valid_mask[valid_indices] = 1
        data.valid_indices = valid_indices
        data.valid_mask = valid_mask
        self._valid_indices = valid_indices
        self._valid_mask = valid_mask

    self._test_mask = np.zeros(num_nodes, dtype=bool)
    self._test_mask[test_indices] = True
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[test_indices] = 1
    data.test_mask = test_mask
    data.test_indices = np.asarray(test_indices)

    self._sampler = Sampler(data, self.metadata["n_edge"].iloc[0], self.device)
    return data
ac_label = np.array(hkl.load(ac_label_fp))

# Binary feature for each gene
ac_gene_list = []
for i in range(len(ac_gene_fp)):
    ac_gene_list.append(hkl.load(ac_gene_fp[i]))

# Load the PPI network
ppi_fp = './dataset/onecc_noiso_string_matrix_interactome_data_filtered.hkl'
ppi = hkl.load(ppi_fp)

# Generate graph topology and edge_attr
edge_index = torch.from_numpy(ppi[:, 0:2]).transpose(dim0=0, dim1=1)
assert is_undirected(edge_index), "ppi graph should be an undirected graph"
edge_attr = torch.from_numpy(ppi[:, 2]).to(torch.float32) / 1000.0

# Create data with a single feature
ac_data_list = list([] for i in range(ac_gene_list[0].shape[0]))
for i in range(len(ac_gene_list)):
    for idx, vector in enumerate(ac_gene_list[i]):
        ac_data_list[idx].append(torch.from_numpy(vector))

new_ac_data_list = []
for data in ac_data_list:
    new_ac_data_list.append(torch.stack(data, dim=0).t())
def forward(self, x, edge_index, size=None, batch=None, neg_edge_index=None,
            attention_edge_index=None):
    """
    :param x: [N, F]
    :param edge_index: [2, E]
    :param size:
    :param batch: None or [B]
    :param neg_edge_index: When using explicitly given negative edges.
    :param attention_edge_index: [2, E'], Use for link prediction
    :return:
    """
    if self.pretraining and self.pretraining_noise_ratio > 0.0:
        edge_index, _ = dropout_adj(
            edge_index, p=self.pretraining_noise_ratio,
            force_undirected=is_undirected(edge_index),
            num_nodes=x.size(0), training=self.training)

    if size is None and torch.is_tensor(x):
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

    # [N, F0] * [F0, heads * F] = [N, heads * F]
    if torch.is_tensor(x):
        x = torch.matmul(x, self.weight)
    else:
        x = (None if x[0] is None else torch.matmul(x[0], self.weight),
             None if x[1] is None else torch.matmul(x[1], self.weight))

    propagated = self.propagate(edge_index, size=size, x=x)

    if (self.is_super_gat and self.training) or \
            (attention_edge_index is not None) or \
            (neg_edge_index is not None):

        device = next(self.parameters()).device
        num_pos_samples = int(self.edge_sample_ratio * edge_index.size(1))
        num_neg_samples = int(self.neg_sample_ratio *
                              self.edge_sample_ratio * edge_index.size(1))

        if attention_edge_index is not None:
            neg_edge_index = None
        elif neg_edge_index is not None:
            pass
        elif batch is None:
            if self.to_undirected_at_neg:
                edge_index_for_ns = to_undirected(edge_index,
                                                  num_nodes=x.size(0))
            else:
                edge_index_for_ns = edge_index
            neg_edge_index = negative_sampling(
                edge_index=edge_index_for_ns,
                num_nodes=x.size(0),
                num_neg_samples=num_neg_samples,
            )
        else:
            neg_edge_index = batched_negative_sampling(
                edge_index=edge_index,
                batch=batch,
                num_neg_samples=num_neg_samples,
            )

        if self.edge_sample_ratio < 1.0:
            pos_indices = random.sample(range(edge_index.size(1)),
                                        num_pos_samples)
            pos_indices = torch.tensor(pos_indices).long().to(device)
            pos_edge_index = edge_index[:, pos_indices]
        else:
            pos_edge_index = edge_index

        att_with_negatives = self._get_attention_with_negatives(
            x=x,
            edge_index=pos_edge_index,
            neg_edge_index=neg_edge_index,
            total_edge_index=attention_edge_index,
        )  # [E + neg_E, heads]

        # Labels
        if self.training and (self.cache["att_label"] is None
                              or not self.cache_label):
            att_label = torch.zeros(
                att_with_negatives.size(0)).float().to(device)
            att_label[:pos_edge_index.size(1)] = 1.
        elif self.training and self.cache["att_label"] is not None:
            att_label = self.cache["att_label"]
        else:
            att_label = None
        self._update_cache("att_label", att_label)
        self._update_cache("att_with_negatives", att_with_negatives)

    return propagated
def main():
    parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT')
    parser.add_argument(
        '--raw-text-path', type=str, required=True,
        help="Path of raw text (.txt file, each row corresponds to a node)")
    parser.add_argument(
        '--vectorizer-config-path', type=str, required=True,
        help="Path to a json file that specifies the tfidf hyper-parameters")
    parser.add_argument('--data-root-dir', type=str, default="./dataset")
    parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt")
    parser.add_argument('--dataset', type=str, default="ogbn-arxiv")
    parser.add_argument('--max-deg', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Change args.save_data_dir to args.save_data_dir/args.dataset
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    dataset = PygNodePropPredDataset(name=args.dataset,
                                     root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected!!!
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)

    # Filter out nodes whose number of edges >= max_deg
    Degree = degree(edge_index[0])
    Filtered_idx = torch.where(Degree < args.max_deg)[0]
    print('Number of original nodes: {}'.format(data.x.shape[0]))
    print('Number of filtered nodes: {}'.format(len(Filtered_idx)))

    # Construct and save the label matrix (adjacency matrix) Y.
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering to the raw text
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list|={}".format(len(node_text_list)))
    count = 0
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            if Filtered_idx[count].item() == cur_idx:
                fout.writelines(line)
                count += 1
    assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format(
        count, len(Filtered_idx))
    print("Saved X.trn.txt")

    # Apply the same filtering to the tfidf features
    vectorizer_config = Vectorizer.load_config_from_args(
        args)  # uses args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list, vectorizer_config,
                                      dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.npz and X.all.npz")
def is_undirected(self):
    return is_undirected(self.edge_index, self.num_nodes)
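# For reference, a minimal check with the underlying utility. Note that in
# recent PyTorch Geometric the signature is
# is_undirected(edge_index, edge_attr=None, num_nodes=None), so num_nodes
# should be passed by keyword there; the positional call above reflects the
# older signature.
import torch
from torch_geometric.utils import is_undirected

edge_index = torch.tensor([[0, 1], [1, 0]])
assert is_undirected(edge_index)                 # (0, 1) and (1, 0) both present
assert not is_undirected(torch.tensor([[0], [1]]))  # (1, 0) missing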
def graph_data(
    edge_list_path,
    node_features_path,
    protein_ids_path,
    protein_id_col_node="Gene",
    protein_id_col_prot="ensembl.gene",
    sparse_tensor=True,
    cut=0,
):
    """Creates a data object from the given tsv files.

    Parameters
    ----------
    edge_list_path : str
        Path to edge list file. The first two columns are edges, the rest
        are edge attributes.
    node_features_path : str
        Path to a file with node features.
    protein_ids_path : str
        Protein ids filepath. The file should contain an "id" column and
        the column given by protein_id_col_prot.
    protein_id_col_node : str, optional
        Column with ids in the node features file, by default "Gene"
    protein_id_col_prot : str, optional
        Column with ids in the protein_ids file, by default "ensembl.gene"
    sparse_tensor : bool, optional
        If true, a sparse tensor will be constructed instead of edge_index
        and edge_weight, by default True
    cut : int, optional
        Edges with values below the cut will be dropped, by default 0

    Returns
    -------
    torch_geometric.data.Data
        Data object with a graph
    """
    a = pd.read_csv(edge_list_path).values
    edge_attr = a[:, 2:] / 1000.0

    # Cut the edges
    cut_mask = edge_attr[:, -1] > cut
    edge_ind = torch.tensor(a[:, :2][cut_mask], dtype=torch.long)
    edge_attr = torch.tensor(edge_attr[cut_mask], dtype=torch.float32)

    # Force undirected. Note: edge_ind is stored as [E, 2] here, while
    # is_undirected expects [2, E], hence the transpose.
    if not is_undirected(edge_ind.t()):
        edge_ind = torch.cat([edge_ind, edge_ind[:, [1, 0]]], 0)
        edge_attr = torch.cat([edge_attr, edge_attr], 0)

    # Features
    protein_ids = pd.read_csv(protein_ids_path,
                              sep="\t")[["id", protein_id_col_prot]]
    x = pd.read_csv(node_features_path, sep="\t")
    feature_columns = x.drop(protein_id_col_node, axis=1).columns
    x = pd.merge(
        protein_ids,
        x,
        how="left",
        left_on=protein_id_col_prot,
        right_on=protein_id_col_node,
    ).sort_values("id")[feature_columns]
    x.fillna(x.mean(), inplace=True)
    x = torch.tensor(((x - x.mean()) / x.std()).values, dtype=torch.float32)

    data = Data(x, edge_ind.T, edge_attr, id=torch.arange(x.shape[0]))
    if sparse_tensor:
        tsp = ToSparseTensor(False)
        data = tsp(data)

    return data
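# Minimal sketch (toy tensors) of the symmetrization step above: each edge
# stored as a row [src, dst] is duplicated with its endpoints swapped, and
# edge attributes are repeated alongside. Unlike to_undirected(), this does
# not coalesce duplicate edges.
import torch

edge_ind = torch.tensor([[0, 1], [1, 2]])   # shape [E, 2]
edge_attr = torch.tensor([[0.9], [0.4]])
edge_ind = torch.cat([edge_ind, edge_ind[:, [1, 0]]], 0)
edge_attr = torch.cat([edge_attr, edge_attr], 0)
print(edge_ind.tolist())  # [[0, 1], [1, 2], [1, 0], [2, 1]]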
def get_graph_property(graph_property_list, dataset_class, dataset_name,
                       data_root, verbose=True, **kwargs):
    _data_attr = get_dataset_or_loader(dataset_class, dataset_name, data_root,
                                       seed=42, **kwargs)
    train_d, val_d, test_d = _data_attr

    if dataset_name in ["PPI", "WebKB4Univ", "CLUSTER"]:
        cum_sum = 0
        y_list, edge_index_list = [], []
        for _data in chain(train_d, val_d, test_d):
            y_list.append(_data.y)
            edge_index_list.append(_data.edge_index + cum_sum)
            cum_sum += _data.y.size(0)
        y = torch.cat(y_list, dim=0)
        edge_index = torch.cat(edge_index_list, dim=1)
    else:
        data = train_d[0]
        y, edge_index = data.y, data.edge_index
        y_list, edge_index_list = [y], [edge_index]

    # to_undirected
    one_nxg = to_networkx(Data(edge_index=edge_index),
                          to_undirected=is_undirected(edge_index))
    nxg_list = [
        to_networkx(Data(edge_index=ei),
                    to_undirected=is_undirected(edge_index))
        for ei in edge_index_list
    ]
    ni_nxg_list = [deepcopy(nxg) for nxg in nxg_list]
    for ni_nxg in ni_nxg_list:
        ni_nxg.remove_nodes_from(list(nx.isolates(ni_nxg)))

    gp_dict = {}

    if graph_property_list is None or "diameter" in graph_property_list:
        diameter_list = []
        for ni_nxg in ni_nxg_list:
            ni_nxg = ni_nxg.to_undirected()  # important for computing cc.
            for cc in nx.connected_components(ni_nxg):
                ni_nxg_cc = ni_nxg.subgraph(cc).copy()
                diameter_list.append(
                    nx.algorithms.distance_measures.diameter(ni_nxg_cc))
        gp_dict["diameter_mean"] = float(np.mean(diameter_list))
        gp_dict["diameter_std"] = float(np.std(diameter_list))
        gp_dict["diameter_max"] = float(np.max(diameter_list))
        gp_dict["diameter_min"] = float(np.min(diameter_list))
        gp_dict["diameter_n"] = len(diameter_list)

    if graph_property_list is None or \
            "average_clustering_coefficient" in graph_property_list:
        gp_dict["average_clustering_coefficient"] = nx.average_clustering(
            one_nxg)

    if verbose:
        print(f"{dataset_class} / {dataset_name} / {data_root}")
        pprint(gp_dict)

    if graph_property_list is None or "centrality" in graph_property_list:
        dc = nx.degree_centrality(one_nxg)
        gp_dict["degree_centrality_mean"] = float(np.mean(list(dc.values())))
        gp_dict["degree_centrality_std"] = float(np.std(list(dc.values())))
        cc = nx.closeness_centrality(one_nxg)
        gp_dict["closeness_centrality_mean"] = float(np.mean(list(cc.values())))
        gp_dict["closeness_centrality_std"] = float(np.std(list(cc.values())))

    if graph_property_list is None or "assortativity" in graph_property_list:
        gp_dict["degree_assortativity_coefficient"] = \
            nx.degree_assortativity_coefficient(one_nxg)

    if verbose:
        print(f"{dataset_class} / {dataset_name} / {data_root}")
        pprint(gp_dict)

    return gp_dict
def generate_pyg_data(data, n_class, time_budget, use_dim_reduction=True,
                      use_feature_generation=True,
                      use_label_distribution=False, use_node_degree=False,
                      use_node_degree_binary=False, use_node_embed=True,
                      use_one_hot_label=False):
    other_needed = dict()

    x = data['fea_table']
    df = data['edge_file']
    edge_index = df[['src_idx', 'dst_idx']].to_numpy()
    edge_index = sorted(edge_index, key=lambda d: d[0])
    edge_index = torch.tensor(edge_index, dtype=torch.long).transpose(0, 1)
    edge_weight = df['edge_weight'].to_numpy()
    edge_weight = torch.tensor(edge_weight, dtype=torch.float32)

    num_nodes = x.shape[0]
    y = n_class * torch.ones(num_nodes, dtype=torch.long)
    inds = data['train_label'][['node_index']].to_numpy()
    train_y = data['train_label'][['label']].to_numpy()
    y[inds] = torch.tensor(train_y, dtype=torch.long)

    train_indices = data['train_indices']
    test_indices = data['test_indices']

    flag_directed_graph = not is_undirected(edge_index)

    ### feature engineering ###
    flag_none_feature = False
    if use_dim_reduction:
        x, flag_none_feature = dim_reduction(x)
    else:
        x = x.to_numpy()
        flag_none_feature = (x.shape[1] == 1)

    if use_feature_generation:
        other_needed["x"] = x
        other_needed["y"] = y
        other_needed["n_class"] = n_class
        other_needed["edge_index"] = edge_index
        other_needed["edge_weight"] = edge_weight
        other_needed["time_budget"] = time_budget
        other_needed["flag_none_feature"] = flag_none_feature
        other_needed["flag_directed_graph"] = flag_directed_graph
        other_needed["use_label_distribution"] = use_label_distribution
        other_needed["use_node_degree"] = use_node_degree
        other_needed["use_node_degree_binary"] = use_node_degree_binary
        other_needed["use_node_embed"] = use_node_embed
        other_needed["use_one_hot_label"] = use_one_hot_label
        added_features = feature_generation(
            x, y, n_class, edge_index, edge_weight, flag_none_feature,
            flag_directed_graph, time_budget,
            use_label_distribution=use_label_distribution,
            use_node_degree=use_node_degree,
            use_node_degree_binary=use_node_degree_binary,
            use_node_embed=use_node_embed,
            use_one_hot_label=use_one_hot_label)
        if added_features:
            x = np.concatenate([x] + added_features, axis=1)

    only_one_hot_id = False
    if x.shape[1] != 1:
        # Remove the raw node_index
        x = x[:, 1:]
    else:
        # One-hot encoding of node_index (backup plan)
        x = np.eye(num_nodes)
        only_one_hot_id = True
    logger.info('x.shape after feature engineering: {}'.format(x.shape))

    x = torch.tensor(x, dtype=torch.float)

    non_zero_index = torch.nonzero(edge_weight).reshape(-1)
    edge_weight = edge_weight[non_zero_index]
    edge_index = edge_index[:, non_zero_index]

    data = Data(x=x, edge_index=edge_index, y=y, edge_weight=edge_weight)
    data.num_nodes = num_nodes
    data.train_indices = train_indices
    data.test_indices = test_indices

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[train_indices] = 1
    data.train_mask = train_mask
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[test_indices] = 1
    data.test_mask = test_mask

    data["directed"] = flag_directed_graph  # used for directed DGL-GCN
    return data, other_needed, only_one_hot_id
def test_random_link_split_on_hetero_data():
    data = HeteroData()

    data['p'].x = torch.arange(100)
    data['a'].x = torch.arange(100, 300)

    data['p', 'p'].edge_index = get_edge_index(100, 100, 500)
    data['p', 'p'].edge_index = to_undirected(data['p', 'p'].edge_index)
    data['p', 'p'].edge_attr = torch.arange(data['p', 'p'].num_edges)
    data['p', 'a'].edge_index = get_edge_index(100, 200, 1000)
    data['p', 'a'].edge_attr = torch.arange(500, 1500)
    data['a', 'p'].edge_index = data['p', 'a'].edge_index.flip([0])
    data['a', 'p'].edge_attr = torch.arange(1500, 2500)

    transform = RandomLinkSplit(num_val=0.2, num_test=0.2, is_undirected=True,
                                edge_types=('p', 'p'))
    train_data, val_data, test_data = transform(data)

    assert len(train_data['p']) == 1
    assert len(train_data['a']) == 1
    assert len(train_data['p', 'p']) == 4
    assert len(train_data['p', 'a']) == 2
    assert len(train_data['a', 'p']) == 2

    assert is_undirected(train_data['p', 'p'].edge_index,
                         train_data['p', 'p'].edge_attr)
    assert is_undirected(val_data['p', 'p'].edge_index,
                         val_data['p', 'p'].edge_attr)
    assert is_undirected(test_data['p', 'p'].edge_index,
                         test_data['p', 'p'].edge_attr)

    transform = RandomLinkSplit(num_val=0.2, num_test=0.2,
                                edge_types=('p', 'a'),
                                rev_edge_types=('a', 'p'))
    train_data, val_data, test_data = transform(data)

    assert len(train_data['p']) == 1
    assert len(train_data['a']) == 1
    assert len(train_data['p', 'p']) == 2
    assert len(train_data['p', 'a']) == 4
    assert len(train_data['a', 'p']) == 2

    assert train_data['p', 'a'].edge_index.size() == (2, 600)
    assert train_data['p', 'a'].edge_attr.size() == (600, )
    assert train_data['p', 'a'].edge_attr.min() >= 500
    assert train_data['p', 'a'].edge_attr.max() <= 1500
    assert train_data['a', 'p'].edge_index.size() == (2, 600)
    assert train_data['a', 'p'].edge_attr.size() == (600, )
    assert train_data['a', 'p'].edge_attr.min() >= 500
    assert train_data['a', 'p'].edge_attr.max() <= 1500
    assert train_data['p', 'a'].edge_label_index.size() == (2, 1200)
    assert train_data['p', 'a'].edge_label.size() == (1200, )

    assert val_data['p', 'a'].edge_index.size() == (2, 600)
    assert val_data['p', 'a'].edge_attr.size() == (600, )
    assert val_data['p', 'a'].edge_attr.min() >= 500
    assert val_data['p', 'a'].edge_attr.max() <= 1500
    assert val_data['a', 'p'].edge_index.size() == (2, 600)
    assert val_data['a', 'p'].edge_attr.size() == (600, )
    assert val_data['a', 'p'].edge_attr.min() >= 500
    assert val_data['a', 'p'].edge_attr.max() <= 1500
    assert val_data['p', 'a'].edge_label_index.size() == (2, 400)
    assert val_data['p', 'a'].edge_label.size() == (400, )

    assert test_data['p', 'a'].edge_index.size() == (2, 800)
    assert test_data['p', 'a'].edge_attr.size() == (800, )
    assert test_data['p', 'a'].edge_attr.min() >= 500
    assert test_data['p', 'a'].edge_attr.max() <= 1500
    assert test_data['a', 'p'].edge_index.size() == (2, 800)
    assert test_data['a', 'p'].edge_attr.size() == (800, )
    assert test_data['a', 'p'].edge_attr.min() >= 500
    assert test_data['a', 'p'].edge_attr.max() <= 1500
    assert test_data['p', 'a'].edge_label_index.size() == (2, 400)
    assert test_data['p', 'a'].edge_label.size() == (400, )

    transform = RandomLinkSplit(num_val=0.2, num_test=0.2, is_undirected=True,
                                edge_types=[('p', 'p'), ('p', 'a')],
                                rev_edge_types=[None, ('a', 'p')])
    train_data, val_data, test_data = transform(data)

    assert len(train_data['p']) == 1
    assert len(train_data['a']) == 1
    assert len(train_data['p', 'p']) == 4
    assert len(train_data['p', 'a']) == 4
    assert len(train_data['a', 'p']) == 2

    assert is_undirected(train_data['p', 'p'].edge_index,
                         train_data['p', 'p'].edge_attr)
    assert train_data['p', 'a'].edge_index.size() == (2, 600)
    assert train_data['a', 'p'].edge_index.size() == (2, 600)
def is_undirected(self) -> bool:
    r"""Returns :obj:`True` if graph edges are undirected."""
    edge_index, _, _ = to_homogeneous_edge_index(self)
    return is_undirected(edge_index, num_nodes=self.num_nodes)
def is_undirected(self):
    r"""Returns :obj:`True` if graph edges are undirected."""
    return is_undirected(self.edge_index, self.num_nodes)
def forward(self, data, return_hidden_feature=False):
    data.x = data.x.cuda()
    data.edge_attr = data.edge_attr.cuda()
    data.edge_index = data.edge_index.cuda()
    data.batch = data.batch.cuda()

    # make sure that we have undirected graph
    if not is_undirected(data.edge_index):
        data.edge_index = to_undirected(data.edge_index)

    # make sure that nodes can propagate messages to themselves
    if not contains_self_loops(data.edge_index):
        data.edge_index, data.edge_attr = add_self_loops(
            data.edge_index, data.edge_attr.view(-1))

    """
    # now select the top 5 closest neighbors to each node
    dense_adj = sparse_to_dense(edge_index=data.edge_index,
                                edge_attr=data.edge_attr)
    #top_k_vals, top_k_idxs = torch.topk(dense_adj, dim=0, k=5, largest=False)
    #dense_adj = torch.zeros_like(dense_adj).scatter(1, top_k_idxs, top_k_vals)

    # insert artificially large values for 0-valued entries that will throw
    # off the NN calculation
    dense_adj[dense_adj == 0] = 10000
    top_k_vals, top_k_idxs = torch.topk(dense_adj, dim=1, k=15, largest=False)
    dense_adj = torch.zeros_like(dense_adj).scatter(1, top_k_idxs, top_k_vals)
    data.edge_index, data.edge_attr = dense_to_sparse(dense_adj)
    """

    # covalent_propagation
    # add self loops to enable self propagation
    covalent_edge_index, covalent_edge_attr = self.covalent_neighbor_threshold(
        data.edge_index, data.edge_attr)
    (
        non_covalent_edge_index,
        non_covalent_edge_attr,
    ) = self.non_covalent_neighbor_threshold(data.edge_index, data.edge_attr)

    # covalent_propagation and non_covalent_propagation
    covalent_x = self.covalent_propagation(data.x, covalent_edge_index,
                                           covalent_edge_attr)
    non_covalent_x = self.non_covalent_propagation(
        covalent_x, non_covalent_edge_index, non_covalent_edge_attr)

    # zero out the protein features then do ligand only gather...
    # hacky sure but it gets the job done
    non_covalent_ligand_only_x = non_covalent_x
    non_covalent_ligand_only_x[data.x[:, 14] == -1] = 0
    pool_x = self.global_add_pool(non_covalent_ligand_only_x, data.batch)

    # fully connected and output layers
    if return_hidden_feature:
        # return prediction and atomistic features
        # (covalent result, non-covalent result, pool result)
        avg_covalent_x, _ = avg_pool_x(data.batch, covalent_x, data.batch)
        avg_non_covalent_x, _ = avg_pool_x(data.batch, non_covalent_x,
                                           data.batch)
        fc0_x, fc1_x, output_x = self.output(pool_x,
                                             return_hidden_feature=True)
        return avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x
    else:
        return self.output(pool_x)