def build_pc_diag(A: spmatrix) -> spmatrix:
    """Build a diagonal preconditioner for ``A``.

    Returns a sparse diagonal matrix whose entries are the element-wise
    reciprocals of the diagonal of ``A``.
    """
    size = A.shape[0]
    inverse_diagonal = 1.0 / A.diagonal()
    return sp.spdiags(inverse_diagonal, 0, size, size)
def split_graph_edges(sparse_matrix: sp.spmatrix,
                      val_ratio: float = 0.02,
                      test_ratio: float = 0.02,
                      seed: int = None):
    """Split the edges of a graph into train/validation/test sets.

    Randomly partitions the edges into three groups according to the input
    ratios. The seed can be set manually for reproducibility. Diagonal
    (self-loop) elements are removed. The input is assumed to be symmetric,
    and symmetry is maintained through the split.

    Args:
        sparse_matrix: Input sparse adjacency matrix to be split.
        val_ratio: Validation split ratio.
        test_ratio: Test split ratio.
        seed: Seed for the random split.

    Returns:
        Tuple of (train adjacency matrix, train edges, val edges,
        fake val edges, test edges, fake test edges). The validation and
        test sets only return edge coordinates, but also include fake
        (negative) edges for model verification.
    """
    np.random.seed(seed)  # For reproducibility of data split, etc.
    assert (0 <= val_ratio < 1) and (0 <= test_ratio < 1) \
        and (0 <= val_ratio + test_ratio < 1)

    # Removing diagonal elements.
    sparse_matrix -= sp.diags(sparse_matrix.diagonal(),
                              shape=sparse_matrix.shape)
    sparse_matrix.eliminate_zeros()

    # Only the upper triangle is split; symmetry is restored at the end.
    upper_triangular = sp.triu(sparse_matrix)
    edge_coords = sparse_to_info(upper_triangular)[0]

    num_edges = edge_coords.shape[0]
    num_val_edges = int(num_edges * val_ratio)
    num_test_edges = int(num_edges * test_ratio)
    num_train_edges = num_edges - num_val_edges - num_test_edges

    edge_idx = np.random.permutation(num_edges)
    # BUGFIX: use explicit index boundaries instead of negative slices.
    # The original `edge_idx[-num_test_edges:]` returned ALL edges when
    # num_test_edges == 0 (because `[-0:]` is `[0:]`), and the matching
    # `[num_train_edges:-num_test_edges]` emptied the validation split.
    train_edges = edge_coords[edge_idx[:num_train_edges]]
    val_edges = edge_coords[
        edge_idx[num_train_edges:num_train_edges + num_val_edges]]
    test_edges = edge_coords[edge_idx[num_train_edges + num_val_edges:]]

    num_nodes = upper_triangular.shape[0]
    # Sets give O(1) membership tests and de-duplication. Numpy arrays
    # cannot be hashed, hence the conversion to tuples.
    edge_coordinates = set(tuple(coord) for coord in edge_coords)

    # Generate fake (negative) edges for testing later.
    fake_test_edges = edge_coordinates.copy()  # Seeded with real edges so fakes never collide with them.
    while len(fake_test_edges) < (num_edges + num_test_edges):
        # Sorted indices are always upper triangular. Sampling without
        # replacement excludes diagonal elements.
        indices = tuple(
            np.sort(np.random.choice(num_nodes, size=2, replace=False),
                    axis=-1))
        fake_test_edges.add(indices)

    # Generate fake edges for validation, disjoint from the fake test edges.
    fake_val_edges = fake_test_edges.copy()  # Copy prevents adding to the original set.
    while len(fake_val_edges) < (num_edges + num_test_edges + num_val_edges):
        indices = tuple(
            np.sort(np.random.choice(num_nodes, size=2, replace=False),
                    axis=-1))
        fake_val_edges.add(indices)

    # Turn each set into an (n, 2) array, removing the seed entries.
    # `.reshape(-1, 2)` keeps the shape correct even when a set is empty
    # (np.array of an empty list would otherwise have shape (0,)).
    fake_val_edges = np.array(
        list(fake_val_edges - fake_test_edges)).reshape(-1, 2)
    fake_test_edges = np.array(
        list(fake_test_edges - edge_coordinates)).reshape(-1, 2)

    # Rebuild the adjacency matrix. Connections are marked by 1.
    data = np.ones(shape=num_train_edges)
    # Coordinates must be given as separate row/column vectors.
    train_matrix = sp.csr_matrix(
        (data, (train_edges[:, 0], train_edges[:, 1])),
        shape=sparse_matrix.shape)
    # A symmetric matrix equals its transpose; restore the lower triangle.
    train_matrix += train_matrix.transpose()

    # Return the edge coordinates in symmetric (both-direction) form.
    train_edges = np.concatenate([train_edges, np.fliplr(train_edges)],
                                 axis=0)
    val_edges = np.concatenate([val_edges, np.fliplr(val_edges)], axis=0)
    test_edges = np.concatenate([test_edges, np.fliplr(test_edges)], axis=0)

    return (train_matrix, train_edges, val_edges, fake_val_edges,
            test_edges, fake_test_edges)