def grid_graph(m, corners=False):
    """Build the nearest-neighbour adjacency matrix of a grid graph.

    The neighbour count and the distance metric are not parameters: they are
    read from the names ``number_edges`` and ``metric`` of the enclosing
    scope.

    Args:
        m: Grid size handed to ``graph.grid``; the edge estimate printed at
            the end assumes ``m**2`` vertices.
        corners: When true, every weight below ``max / 1.5`` is zeroed and
            the resulting edge count is printed.

    Returns:
        The adjacency matrix from ``graph.adjacency``, re-wrapped as a CSR
        matrix when ``corners`` is set.
    """
    coords = graph.grid(m)

    # Pairwise distances to the nearest neighbours, then the adjacency matrix.
    knn_dist, knn_idx = graph.distance_sklearn_metrics(
        coords, k=number_edges, metric=metric)
    adjacency = graph.adjacency(knn_dist, knn_idx)

    if corners:
        # Connections are only vertical or horizontal on the grid.
        # Corner vertices are connected to 2 neighbors only.
        dense = adjacency.toarray()
        dense[dense < dense.max() / 1.5] = 0
        adjacency = scipy.sparse.csr_matrix(dense)
        print('{} edges'.format(adjacency.nnz))

    undirected_edges = adjacency.nnz // 2
    knn_estimate = number_edges * m**2 // 2
    print("{} > {} edges".format(undirected_edges, knn_estimate))
    return adjacency
def grid_graph(m, corners=False):
    """Build the 8-nearest-neighbour Euclidean adjacency matrix of a grid.

    Args:
        m: Grid size handed to ``graph.grid``; the edge estimate printed at
            the end assumes ``m**2`` vertices.
        corners: When true, every weight below ``max / 1.5`` is zeroed and
            the resulting edge count is printed.

    Returns:
        The adjacency matrix from ``graph.adjacency``, re-wrapped as a CSR
        matrix when ``corners`` is set.
    """
    # Single source of truth for the neighbour count: it drives both the kNN
    # search and the edge estimate printed below, so they cannot drift apart.
    k = 8

    z = graph.grid(m)
    dist, idx = graph.distance_sklearn_metrics(z, k=k, metric='euclidean')
    A = graph.adjacency(dist, idx)

    # Connections are only vertical or horizontal on the grid.
    # Corner vertices are connected to 2 neighbors only.
    if corners:
        import scipy.sparse
        A = A.toarray()
        A[A < A.max() / 1.5] = 0
        A = scipy.sparse.csr_matrix(A)
        print('{} edges'.format(A.nnz))

    print("{} > {} edges".format(A.nnz // 2, k * m**2 // 2))
    return A
def grid_graph(m, corners=False):
    """Return the connectivity (adjacency) matrix of the grid graph.

    Per the original author's note the matrix is symmetric and built from
    the k nearest neighbours; here k is fixed to 8 with the Euclidean metric.

    Args:
        m: Grid size handed to ``graph.grid``; the edge estimate printed at
            the end assumes ``m**2`` vertices.
        corners: When true, every weight below ``max / 1.5`` is zeroed and
            the resulting edge count is printed.

    Returns:
        The adjacency matrix, re-wrapped as CSR when ``corners`` is set.
    """
    num_edges = 8

    grid_points = graph.grid(m)
    knn_dist, knn_idx = graph.distance_sklearn_metrics(
        grid_points, k=num_edges, metric='euclidean')
    adjacency = graph.adjacency(knn_dist, knn_idx)
    print('A.nnz: {}'.format(adjacency.nnz))

    if corners:
        import scipy.sparse
        # Drop the weaker links by thresholding the dense weights.
        dense = adjacency.toarray()
        cutoff = dense.max() / 1.5
        dense[dense < cutoff] = 0
        adjacency = scipy.sparse.csr_matrix(dense)
        print('{} edges'.format(adjacency.nnz))

    knn_estimate = num_edges * (m**2) // 2
    print("{} > {} edges".format(adjacency.nnz // 2, knn_estimate))
    return adjacency
def grid_graph(m, r, corners=False):
    """Build a nearest-neighbour graph over the points of ``graph.grid_sphere``.

    The neighbour count and metric come from ``FLAGS.number_edges`` and
    ``FLAGS.metric``.

    Args:
        m: First argument of ``graph.grid_sphere``; the edge estimate printed
            at the end assumes ``m**2`` vertices.
        r: Second argument of ``graph.grid_sphere``.
        corners: When true, every weight below ``max / 1.5`` is zeroed and
            the resulting edge count is printed.

    Returns:
        ``(z, z_theta, A)``: both coordinate sets exactly as returned by
        ``graph.grid_sphere`` plus the adjacency matrix.
    """
    # z is in the rectangular coordinate system while z_theta is polar.
    z, z_theta = graph.grid_sphere(m, r)
    z_shape = np.array(z).shape
    theta_shape = np.array(z_theta).shape
    print('z shape is' + str(z_shape) + 'z_theta shape is' + str(theta_shape))

    knn_dist, knn_idx = graph.distance_sklearn_metrics(
        np.array(z), k=FLAGS.number_edges, metric=FLAGS.metric)
    adjacency = graph.adjacency(knn_dist, knn_idx)

    if corners:
        # Connections are only vertical or horizontal on the grid.
        # Corner vertices are connected to 2 neighbors only.
        import scipy.sparse
        dense = adjacency.toarray()
        dense[dense < dense.max() / 1.5] = 0
        adjacency = scipy.sparse.csr_matrix(dense)
        print('{} edges'.format(adjacency.nnz))

    knn_estimate = FLAGS.number_edges * m**2 // 2
    print("{} > {} edges".format(adjacency.nnz // 2, knn_estimate))
    return z, z_theta, adjacency
def build_graph(cls, args):
    """Build the coarsened feature graphs and cache them on ``cls``.

    Loads ``data/20news/embeddings.npy``, builds a nearest-neighbour graph
    over its rows, coarsens it and computes one Laplacian per coarsened graph.

    Args:
        args: Object exposing ``number_edges``, ``metric``,
            ``normalized_laplacian`` and ``coarsening_levels``.

    Side effects:
        Sets ``cls.perm``, ``cls.graphs`` and ``cls.laplacians`` and prints
        the edge count of the initial graph.
    """
    # Read every option up front so a missing attribute fails before any I/O.
    k, metric, normalized, levels = (
        args.number_edges,
        args.metric,
        args.normalized_laplacian,
        args.coarsening_levels,
    )

    embeddings = np.load(
        os.path.join('data/20news', 'embeddings.npy')).astype(np.float32)

    knn_dist, knn_idx = graph.distance_sklearn_metrics(
        embeddings, k=k, metric=metric)
    adjacency = graph.adjacency(knn_dist, knn_idx)
    knn_estimate = k * embeddings.shape[0] // 2
    print("{} > {} edges".format(adjacency.nnz // 2, knn_estimate))

    adjacency = graph.replace_random_edges(adjacency, 0)
    coarse_graphs, permutation = coarsening.coarsen(
        adjacency, levels=levels, self_connections=False)

    laplacian_list = []
    for coarse in coarse_graphs:
        laplacian_list.append(graph.laplacian(coarse, normalized=normalized))

    cls.perm = permutation
    cls.graphs = coarse_graphs
    cls.laplacians = laplacian_list
# The second thing we need is a **graph between features**, i.e. an adjacency matrix $A \in \mathbb{R}^{d_x \times d_x}$. # Structuring data with graphs is very flexible: it can accomodate both structured and unstructured data. # 1. **Structured data**. # 1. The data is structured by an Euclidean domain, e.g. $x_i$ represents an image, a sound or a video. We can use a classical ConvNet with 1D, 2D or 3D convolutions or a graph ConvNet with a line or grid graph (however losing the orientation). # 2. The data is structured by a graph, e.g. the data lies on a transportation, energy, brain or social network. # 2. **Unstructured data**. We could use a fully connected network, but the learning and computational complexities are gonna be large. An alternative is to construct a sparse similarity graph between features (or between samples) and use a graph ConvNet, effectively structuring the data and drastically reducing the number of parameters through weight sharing. As for classical ConvNets, the number of parameters are independent of the input size. # # There are many ways, supervised or unsupervised, to construct a graph given some data. And better the graph, better the performance ! For this example we'll define the adjacency matrix as a simple similarity measure between features. Below are the choices one has to make when constructing such a graph. # 1. The distance function. We'll use the Euclidean distance $d_{ij} = \|x_i - x_j\|_2$. # 2. The kernel. We'll use the Gaussian kernel $a_{ij} = \exp(d_{ij}^2 / \sigma^2)$. # 3. The type of graph. We'll use a $k$ nearest neigbors (kNN) graph. # In[ ]: dist, idx = graph.distance_scipy_spatial(X_train.T, k=10, metric='euclidean') A = graph.adjacency(dist, idx).astype(np.float32) assert A.shape == (d, d) print('d = |V| = {}, k|V| < |E| = {}'.format(d, A.nnz)) plt.spy(A, markersize=2, color='black') # To be able to pool graph signals, we need first to coarsen the graph, i.e. to find which vertices to group together. 
# At the end we'll have multiple graphs, like a pyramid, each at one level of resolution. The finest graph is where the input data lies; the coarsest graph is where the data at the output of the graph convolutional layers lies. That data, of reduced spatial dimensionality, can then be fed to a fully connected layer.
#
# The parameter here is the number of times to coarsen the graph. Each coarsening approximately reduces the size of the graph by a factor of two. Thus if you want a pooling of size 4 in the first layer followed by a pooling of size 2 in the second, you'll need to coarsen $\log_2(4 \times 2) = 3$ times.
#
# After coarsening we rearrange the vertices (and add fake vertices) such that pooling a graph signal is analogous to pooling a 1D signal. See the [paper] for details.
#
# [paper]: https://arxiv.org/abs/1606.09375

# In[ ]:
# Fill one training pair per row of all_combs: channel 0 receives the matrix
# of the first subject of the pair, channel 1 that of the second.
# NOTE(review): X_train, y_train and site_train are allocated outside this
# excerpt - presumably initialised to ones, so only mismatches are cleared
# here; confirm against the allocation.
for i in range(all_combs.shape[0]):
    X_train[i, :, :, 0] = X[all_combs[i, 0], :, :]
    X_train[i, :, :, 1] = X[all_combs[i, 1], :, :]
    # The pair label is cleared when the two subjects have different labels.
    if y[all_combs[i, 0]] != y[all_combs[i, 1]]:
        y_train[i] = 0  # -1
    # Same rule for the acquisition site of the two subjects.
    if site[all_combs[i, 0]] != site[all_combs[i, 1]]:
        site_train[i] = 0

print("Training samples shape")
print(X_train.shape)

# Get the graph structure: 10-nearest-neighbour Euclidean graph over coords.
dist, idx = graph.distance_scipy_spatial(coords, k=10, metric='euclidean')
A = graph.adjacency(dist, idx).astype(np.float32)
# The same adjacency matrix is reused three times (no coarsening is applied).
graphs = []
for i in range(3):
    graphs.append(A)

# Calculate Laplacians
L = [graph.laplacian(A, normalized=True) for A in graphs]

# Report the sum of the label vector.
print("Number of controls in the dataset: ")
print(y.sum())

# Prepare the test pairs from the held-out indices.
X_test, y_test, site_test = prepare_pairs(X, y, site, test_idx)
def gcn_run(fname, train_num, batch_size):
    """Train and evaluate the siamese graph ConvNet on a saved pair dataset.

    Args:
        fname: Path stem of the dataset archive; ``fname + '.npz'`` is read
            and must contain the arrays ``name1``..``name4`` (train/test data
            and labels) and ``name6``/``name7`` (train/test site labels).
        train_num: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(tr_error, test_error)`` as produced by ``model.fit``.
    """
    rs = 222
    print("Random state is %d" % rs)
    np.random.seed(seed=rs)

    # np.load keeps the archive open until the NpzFile is closed, so the
    # arrays are materialised inside a context manager.
    with np.load(fname + '.npz') as data:
        X_train = np.array(data['name1'], dtype=np.float32)
        y_train = np.array(data['name2'], dtype=np.float32)
        X_test = np.array(data['name3'], dtype=np.float32)
        y_test = np.array(data['name4'], dtype=np.float32)
        site_train = np.array(data['name6'], dtype=np.float32)
        site_test = np.array(data['name7'], dtype=np.float32)

    # NOTE(review): `coords` is neither a parameter nor loaded from the
    # archive; this only works if a module-level `coords` exists - confirm.
    dist, idx = graph.distance_scipy_spatial(coords, k=10, metric='euclidean')
    A = graph.adjacency(dist, idx).astype(np.float32)

    # One Laplacian per entry; the same (uncoarsened) graph is used three times.
    L = [graph.laplacian(A, normalized=True) for _ in range(3)]

    # X_train must be 4-D; its third axis is passed on as the input feature count.
    _, _, f, _ = X_train.shape

    # Graph Conv-net
    features = 64
    K = 3
    params = {
        'num_epochs': train_num,
        'batch_size': batch_size,
        'eval_frequency': 1,
        # Building blocks.
        'filter': 'chebyshev5',
        'brelu': 'b2relu',
        'pool': 'apool1',
        # Architecture.
        'F': [features, features],  # Number of graph convolutional filters.
        'K': [K, K],  # Polynomial orders.
        'p': [1, 1],  # Pooling sizes.
        'M': [1],  # Output dimensionality of fully connected layers.
        'input_features': f,
        'lamda': 0.35,
        'mu': 0.6,
        # Optimization.
        'regularization': 5e-3,
        'dropout': 0.8,
        'learning_rate': 1e-2,
        'decay_rate': 0.95,
        'momentum': 0,
        'decay_steps': X_train.shape[0] / batch_size,
        'dir_name': 'siamese_' + time.strftime("%Y_%m_%d_%H_%M") + '_feat' +
                    str(features) + '_' + str(features) + '_K' + str(K) +
                    '_state',
    }
    print(params)

    # Run model
    model = models_siamese.siamese_cgcnn_cor(L, **params)
    print("Constructor finished")
    _, _, _, tr_error, test_error = model.fit(X_train, y_train, site_train,
                                              X_test, y_test, site_test)

    # Evaluate on the training data; the call is kept for whatever the model
    # reports, its result is not used.
    model.evaluate(X_train, y_train, site_train)

    # Evaluate test data
    print("Test accuracy is:")
    res = model.evaluate(X_test, y_test, site_test)
    print(res[0])

    return tr_error, test_error
def constructingGraph(inFIDDic,
                      data_type,
                      mean_geometry=0,
                      std_geometry=1,
                      is_distance=True):
    """Convert building groups into padded features and graph Laplacians.

    Each sample becomes a Delaunay graph over its node coordinates, weighted
    by the Euclidean distance between connected nodes. Node features are
    standardised and every sample is zero-padded to 128 vertices.

    Args:
        inFIDDic: Mapping ``id -> [label, Node_coords, Node_features]``. Only
            the first two coordinate columns are used; ids must be
            convertible to ``int64``. The input is not modified.
        data_type: ``1`` computes mean/std from this data and writes them to
            ``./lib/data/_config_22.txt``; ``-1`` reads them back from that
            file; any other value uses the ``mean_geometry``/``std_geometry``
            arguments as given.
        mean_geometry: Per-feature mean (scalar or 1-D array).
        std_geometry: Per-feature standard deviation (scalar or 1-D array).
        is_distance: Unused; kept for backward compatibility.

    Returns:
        ``None`` for empty input, otherwise the list
        ``[features, laplacians, labels, ids, n_samples, mean, std]`` where
        ``features`` is ``(n_samples, 128, n_features)`` float32.

    Raises:
        ValueError: If a sample has more than 128 vertices.
    """
    if not inFIDDic:
        return None

    stats_file = "./lib/data/_config_22.txt"
    # The max number of vertices in a group (sample); smaller ones are padded.
    maxnum_vertices = 128

    vertices_Geometry, adjacencies, labels, inFIDs = [], [], [], []
    for k in inFIDDic:
        label, Node_coords, Node_features = inFIDDic[k]
        assert len(Node_coords) == len(Node_features)
        subObject_size = len(Node_coords)

        # Binary target: original label 3 is the positive class.
        label = 1 if label == 3 else 0

        # Delaunay graph of the group. Writing all three edges of every
        # simplex sets each undirected edge at least once, which yields the
        # same 0/1 matrix as walking the neighbour table.
        points = np.array(Node_coords)
        tri = Delaunay(points[:, 0:2])
        adjacency = np.zeros((subObject_size, subObject_size))
        simplices = tri.simplices
        for a, b in ((0, 1), (1, 2), (2, 0)):
            adjacency[simplices[:, a], simplices[:, b]] = 1
            adjacency[simplices[:, b], simplices[:, a]] = 1

        # In order to make the calculation simpler, only the distance between
        # the center points of the buildings is used here. According to the
        # author's experience, the closest distance of two building outlines
        # could be a better option for this task.
        distances = sklearn.metrics.pairwise.pairwise_distances(
            points[:, 0:2], metric="euclidean", n_jobs=1)
        adjacency = scipy.sparse.csr_matrix(adjacency * distances)

        assert subObject_size == points.shape[0]
        assert isinstance(adjacency, scipy.sparse.csr_matrix)

        # Copy the features so standardising never touches the caller's data.
        vertices_Geometry.append(np.array(Node_features, dtype=np.float64))
        adjacencies.append(adjacency)
        labels.append(label)
        inFIDs.append(k)

    # Standardising: statistics come from the training set (data_type == 1)
    # and are reused for validation/test data.
    if data_type == 1:
        concatenate_Geometry = np.concatenate(vertices_Geometry, axis=0)
        mean_geometry = concatenate_Geometry.mean(axis=0)
        std_geometry = concatenate_Geometry.std(axis=0)
        np.savetxt(stats_file,
                   np.vstack((mean_geometry, std_geometry)),
                   fmt='%.18f')
    elif data_type == -1:
        # For the extra experiment: import the mean and std of the train set.
        # ndmin=2 keeps the (2, n_features) layout for a single feature.
        conc = np.loadtxt(stats_file, ndmin=2)
        mean_geometry, std_geometry = conc[0, :], conc[1, :]
        print(
            "\n========import the mean and std of train dataset from text file========\n"
        )

    # Broadcasting handles both per-feature vectors and scalar defaults.
    vertices_Geometry = [(feat - mean_geometry) / std_geometry
                         for feat in vertices_Geometry]

    # Padding.
    assert len(vertices_Geometry) == len(adjacencies) == len(labels)
    graph_vertices_geo, graph_adjacencies = [], []
    for feat, adj in zip(vertices_Geometry, adjacencies):
        if len(feat) > maxnum_vertices:
            raise ValueError(
                "sample has {} vertices, more than the supported {}".format(
                    len(feat), maxnum_vertices))
        graph_vertices_geo.append(
            np.pad(feat, ((0, maxnum_vertices - len(feat)), (0, 0)),
                   'constant',
                   constant_values=0))
        extra = maxnum_vertices - adj.shape[0]
        graph_adjacencies.append(
            np.pad(adj.toarray(), ((0, extra), (0, extra)),
                   'constant',
                   constant_values=0))

    # Collecting.
    graph_vertices_geo = np.stack(graph_vertices_geo, axis=0).astype(
        np.float32)  # NSample x NVertices x NFeature
    graph_adjacencies = np.stack(graph_adjacencies, axis=0).astype(
        np.float32)  # NSample x NVertices x NVertices
    graph_labels = np.array(labels).astype(np.int64)  # NSample
    graph_inFIDs = np.array(inFIDs).astype(np.int64)  # NSample
    graph_size = graph_labels.shape[0]  # NSample

    # NOTE(review): np.stack assumes graph.laplacian returns same-shaped
    # dense arrays here - confirm against the project's graph module.
    graph_Laplacian = np.stack([
        graph.laplacian(
            scipy.sparse.csr_matrix(A), normalized=True, rescaled=True)
        for A in graph_adjacencies
    ],
                               axis=0)

    return [
        graph_vertices_geo, graph_Laplacian, graph_labels, graph_inFIDs,
        graph_size, mean_geometry, std_geometry
    ]
def ABIDE_save(num_subjects, filename):
    """Build siamese training/testing pairs from ABIDE and save them to disk.

    Args:
        num_subjects: Number of subjects to end up in the training set;
            ``int(1.2 * num_subjects)`` subject ids are requested in total.
        filename: Output path handed to ``np.savez`` (which appends ``.npz``
            when the name does not already end with it).

    Returns:
        None. The archive holds ``name1``..``name8``: training pairs, pair
        labels, test pairs, test labels, the index pairs, training site
        labels, test site labels and the training indices.

    Raises:
        ValueError: If the number of training indices is not a multiple of 6
            (raised by the reshape below).
    """
    rs = 33
    print("Random state is %d" % rs)
    prng = np.random.RandomState(rs)

    # Split into training, validation and testing sets
    training_num = num_subjects
    # Original author's note: presumably the extra 20% leaves room for a
    # held-out set once subjects are moved into training below.
    lines = int(1.2 * num_subjects)

    # Get subject features
    atlas = 'ho'
    kind = 'correlation'
    subject_IDs = abide_utils.get_ids(lines)

    # Get all subject networks
    networks = abide_utils.load_all_networks(subject_IDs,
                                             kind,
                                             atlas_name=atlas)
    X = np.array(networks)

    # Get ROI coordinates
    coords = abide_utils.get_atlas_coords(atlas_name=atlas)

    # Get subject labels
    label_dict = abide_utils.get_subject_label(subject_IDs,
                                               label_name='DX_GROUP')
    y = np.array([int(label_dict[x]) - 1 for x in sorted(label_dict)])

    # Get site ID, encoded as the index into the sorted unique site names
    site = abide_utils.get_subject_label(subject_IDs, label_name='SITE_ID')
    unq = np.unique(list(site.values())).tolist()
    site = np.array([unq.index(site[x]) for x in sorted(site)])

    # Choose site IDs to include in the analysis; the mask is computed once
    # and applied to all three aligned arrays.
    site_mask = range(20)
    keep = np.in1d(site, site_mask)
    X = X[keep]
    y = y[keep]
    site = site[keep]

    tr_idx, test_idx = split_data(site, 0.6)
    prng.shuffle(test_idx)
    # Subjects that need to be moved from the testing to the training set
    subs_to_add = training_num - len(tr_idx)
    tr_idx.extend(test_idx[:subs_to_add])
    test_idx = test_idx[subs_to_add:]
    print("The test indices are the following: ")
    print(test_idx)

    # Arrange the training indices in rows of 6 and take, for each of the
    # three column pairs, the full cross product of its two columns.
    all_combs = []
    tr_mat = np.array(tr_idx).reshape([len(tr_idx) // 6, 6])
    for i in range(3):
        x1 = tr_mat[:, i * 2].flatten()
        x2 = tr_mat[:, i * 2 + 1].flatten()
        combs = np.transpose([np.tile(x1, len(x2)), np.repeat(x2, len(x1))])
        all_combs.append(combs)
    all_combs = np.vstack(all_combs)

    _, m, f = X.shape
    X_train = np.ones((all_combs.shape[0], m, f, 2), dtype=np.float32)
    y_train = np.ones(all_combs.shape[0], dtype=np.int32)
    site_train = np.ones(all_combs.shape[0], dtype=np.int32)

    # Pair label / site label is 1 when both subjects agree, 0 otherwise.
    for i in range(all_combs.shape[0]):
        X_train[i, :, :, 0] = X[all_combs[i, 0], :, :]
        X_train[i, :, :, 1] = X[all_combs[i, 1], :, :]
        if y[all_combs[i, 0]] != y[all_combs[i, 1]]:
            y_train[i] = 0  # -1
        if site[all_combs[i, 0]] != site[all_combs[i, 1]]:
            site_train[i] = 0

    print("Training samples shape")
    print(X_train.shape)

    # Get the graph structure
    # NOTE(review): the Laplacians below are neither saved nor returned;
    # kept because the helper calls are opaque from here - consider removing.
    dist, idx = graph.distance_scipy_spatial(coords, k=10, metric='euclidean')
    A = graph.adjacency(dist, idx).astype(np.float32)
    graphs = []
    for i in range(3):
        graphs.append(A)

    # Calculate Laplacians
    L = [graph.laplacian(A, normalized=True) for A in graphs]

    print("Number of controls in the dataset: ")
    print(y.sum())

    # Prepare training testing and validation sets
    X_test, y_test, site_test = prepare_pairs(X, y, site, test_idx)

    # Saving training data for comparison with ann siamese. Only keyword
    # arrays are passed: a positional '.npz' would be stored as a stray
    # `arr_0` entry instead of affecting the file name.
    np.savez(filename,
             name1=X_train,
             name2=y_train,
             name3=X_test,
             name4=y_test,
             name5=all_combs,
             name6=site_train,
             name7=site_test,
             name8=tr_idx)

    return None
# Select the data the feature graph is built from. The switch is hard-coded:
# the embeddings are used, the transposed data matrix is the alternative.
if True:
    graph_data = train.embeddings.astype(np.float32)
else:
    graph_data = train.data.T.astype(np.float32).toarray()

#del train, test

#%% [markdown]
# # Feature graph

#%%
t_start = time.process_time()
# Nearest-neighbour graph over the rows of graph_data; neighbour count and
# metric come from the command-line flags.
dist, idx = graph.distance_sklearn_metrics(graph_data,
                                           k=FLAGS.number_edges,
                                           metric=FLAGS.metric)
A = graph.adjacency(dist, idx)
# Compare the undirected edge count with the k * |V| / 2 estimate.
print("{} > {} edges".format(A.nnz // 2,
                             FLAGS.number_edges * graph_data.shape[0] // 2))
# NOTE(review): second argument 0 presumably means no edge is replaced -
# confirm against graph.replace_random_edges.
A = graph.replace_random_edges(A, 0)
# Coarsen the graph; `perm` is the vertex reordering reused on the data below.
graphs, perm = coarsening.coarsen(A,
                                  levels=FLAGS.coarsening_levels,
                                  self_connections=False)
# One normalized Laplacian per coarsened graph.
L = [graph.laplacian(A, normalized=True) for A in graphs]
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
#graph.plot_spectrum(L)

#del graph_data, A, dist, idx

#%%
t_start = time.process_time()
# Apply the coarsening permutation to the training data (densified for the
# call, then stored back as CSR).
train_data = scipy.sparse.csr_matrix(
    coarsening.perm_data(train_data.toarray(), perm))