Example #1
        def grid_graph(m, corners=False):
            z = graph.grid(m)
            # compute pairwise distance
            dist, idx = graph.distance_sklearn_metrics(z, k=number_edges, metric=metric)
            A = graph.adjacency(dist, idx)  # build adjacency matrix
            # Connections are only vertical or horizontal on the grid.
            # Corner vertices are connected to 2 neighbors only.
            if corners:
                A = A.toarray()
                A[A < A.max()/1.5] = 0
                A = scipy.sparse.csr_matrix(A)
                print('{} edges'.format(A.nnz))

            print("{} > {} edges".format(A.nnz//2, number_edges*m**2//2))
            return A
Example #2
def grid_graph(m, corners=False):
    k=8
    z = graph.grid(m)
    dist, idx = graph.distance_sklearn_metrics(z, k=8, metric='euclidean')
    A = graph.adjacency(dist, idx)

    # Connections are only vertical or horizontal on the grid.
    # Corner vertices are connected to 2 neighbors only.
    if corners:
        import scipy.sparse
        A = A.toarray()
        A[A < A.max()/1.5] = 0
        A = scipy.sparse.csr_matrix(A)
        print('{} edges'.format(A.nnz))

    print("{} > {} edges".format(A.nnz//2, k*m**2//2))
    return A
Example #3
def grid_graph(m, corners=False):
    """
    返回 graph 的连接矩阵(symmetric, nearest k neighbors)
    """
    num_edges = 8
    z = graph.grid(m)
    dist, idx = graph.distance_sklearn_metrics(z, k=8, metric='euclidean')
    A = graph.adjacency(dist, idx)
    print('A.nnz: {}'.format(A.nnz))
    if corners:
        import scipy.sparse
        A = A.toarray()
        A[A < A.max()/1.5] = 0
        A = scipy.sparse.csr_matrix(A)
        print('{} edges'.format(A.nnz))

    print("{} > {} edges".format(A.nnz // 2, num_edges * (m**2) // 2))
    return A
Example #4
def grid_graph(m, r, corners=False):
    z, z_theta = graph.grid_sphere(
        m, r)  # z is in Cartesian coordinates while z_theta is in polar coordinates
    print('z shape is ' + str(np.array(z).shape) + ', z_theta shape is ' +
          str(np.array(z_theta).shape))
    dist, idx = graph.distance_sklearn_metrics(np.array(z),
                                               k=FLAGS.number_edges,
                                               metric=FLAGS.metric)
    A = graph.adjacency(dist, idx)

    # Connections are only vertical or horizontal on the grid.
    # Corner vertices are connected to 2 neighbors only.
    if corners:
        import scipy.sparse
        A = A.toarray()
        A[A < A.max() / 1.5] = 0
        A = scipy.sparse.csr_matrix(A)
        print('{} edges'.format(A.nnz))

    print("{} > {} edges".format(A.nnz // 2, FLAGS.number_edges * m**2 // 2))
    return z, z_theta, A
Example #5
 def build_graph(cls, args):
     number_edges = args.number_edges
     metric = args.metric
     normalized_laplacian = args.normalized_laplacian
     coarsening_levels = args.coarsening_levels
     data_dir = 'data/20news'
     embed_path = os.path.join(data_dir, 'embeddings.npy')
     graph_data = np.load(embed_path).astype(np.float32)
     dist, idx = graph.distance_sklearn_metrics(graph_data,
                                                k=number_edges,
                                                metric=metric)
     adj_matrix = graph.adjacency(dist, idx)
     print("{} > {} edges".format(adj_matrix.nnz // 2,
                                  number_edges * graph_data.shape[0] // 2))
     adj_matrix = graph.replace_random_edges(adj_matrix, 0)
     graphs, perm = coarsening.coarsen(adj_matrix,
                                       levels=coarsening_levels,
                                       self_connections=False)
     laplacians = [
         graph.laplacian(g, normalized=normalized_laplacian) for g in graphs
     ]
     cls.perm = perm
     cls.graphs = graphs
     cls.laplacians = laplacians
Example #6
# The second thing we need is a **graph between features**, i.e. an adjacency matrix $A \in \mathbb{R}^{d_x \times d_x}$.
# Structuring data with graphs is very flexible: it can accommodate both structured and unstructured data.
# 1. **Structured data**.
#     1. The data is structured by a Euclidean domain, e.g. $x_i$ represents an image, a sound or a video. We can use a classical ConvNet with 1D, 2D or 3D convolutions, or a graph ConvNet with a line or grid graph (losing the orientation, however).
#     2. The data is structured by a graph, e.g. the data lies on a transportation, energy, brain or social network.
# 2. **Unstructured data**. We could use a fully connected network, but the learning and computational complexities would be large. An alternative is to construct a sparse similarity graph between features (or between samples) and use a graph ConvNet, effectively structuring the data and drastically reducing the number of parameters through weight sharing. As with classical ConvNets, the number of parameters is independent of the input size.
#
# There are many ways, supervised or unsupervised, to construct a graph given some data. And the better the graph, the better the performance! For this example we'll define the adjacency matrix as a simple similarity measure between features. Below are the choices one has to make when constructing such a graph.
# 1. The distance function. We'll use the Euclidean distance $d_{ij} = \|x_i - x_j\|_2$.
# 2. The kernel. We'll use the Gaussian kernel $a_{ij} = \exp(-d_{ij}^2 / \sigma^2)$.
# 3. The type of graph. We'll use a $k$ nearest neighbors (kNN) graph.

# In[ ]:

dist, idx = graph.distance_scipy_spatial(X_train.T, k=10, metric='euclidean')
A = graph.adjacency(dist, idx).astype(np.float32)

assert A.shape == (d, d)
print('d = |V| = {}, k|V| < |E| = {}'.format(d, A.nnz))
plt.spy(A, markersize=2, color='black')
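
# The graph.adjacency call above already weights the kNN edges with a Gaussian
# kernel. Below is a minimal sketch of one way such a weighting can be built by
# hand, assuming `dist` and `idx` are the (d, k) distance and neighbor-index
# arrays returned above and that the kNN search excludes each point itself; the
# kernel-width choice and the symmetrization are illustrative, not necessarily
# the library's exact implementation.
import numpy as np
import scipy.sparse

k = dist.shape[1]
sigma2 = np.mean(dist[:, -1])**2                   # common kernel-width choice: mean k-th neighbor distance
weights = np.exp(-dist**2 / sigma2)                # Gaussian edge weights a_ij
rows = np.repeat(np.arange(d), k)                  # source vertex of each of the d*k edges
W = scipy.sparse.csr_matrix((weights.ravel(), (rows, idx.ravel())), shape=(d, d))
W = W.maximum(W.T)                                 # symmetrize: undirected kNN similarity graph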

# To be able to pool graph signals, we first need to coarsen the graph, i.e. to find which vertices to group together. At the end we'll have multiple graphs, like a pyramid, each at one level of resolution. The finest graph is where the input data lies; the coarsest graph is where the data at the output of the graph convolutional layers lies. That data, of reduced spatial dimensionality, can then be fed to a fully connected layer.
#
# The parameter here is the number of times to coarsen the graph. Each coarsening approximately reduces the size of the graph by a factor of two. Thus if you want a pooling of size 4 in the first layer followed by a pooling of size 2 in the second, you'll need to coarsen $\log_2(4 \times 2) = 3$ times.
#
# After coarsening we rearrange the vertices (and add fake vertices) such that pooling a graph signal is analogous to pooling a 1D signal. See the [paper] for details.
#
# [paper]: https://arxiv.org/abs/1606.09375

# In[ ]:
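# A minimal sketch of the coarsening step described above, assuming `A` is the
# feature adjacency built earlier and `X_train` / `X_test` are dense arrays
# with one column per graph vertex (the variable names below are illustrative):
graphs, perm = coarsening.coarsen(A, levels=3, self_connections=False)
L = [graph.laplacian(g, normalized=True) for g in graphs]
X_train = coarsening.perm_data(X_train, perm)  # reorder and pad columns so graph pooling works like 1D pooling
X_test = coarsening.perm_data(X_test, perm)
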
Example #7
for i in range(all_combs.shape[0]):
    X_train[i, :, :, 0] = X[all_combs[i, 0], :, :]
    X_train[i, :, :, 1] = X[all_combs[i, 1], :, :]
    if y[all_combs[i, 0]] != y[all_combs[i, 1]]:
        y_train[i] = 0  # -1
    if site[all_combs[i, 0]] != site[all_combs[i, 1]]:
        site_train[i] = 0

print("Training samples shape")
print(X_train.shape)


# Get the graph structure
dist, idx = graph.distance_scipy_spatial(coords, k=10, metric='euclidean')
A = graph.adjacency(dist, idx).astype(np.float32)

graphs = []
for i in range(3):
    graphs.append(A)

# Calculate Laplacians
L = [graph.laplacian(A, normalized=True) for A in graphs]

# Number of nodes in graph and features
print("Number of controls in the dataset: ")
print(y.sum())

# Prepare training testing and validation sets
X_test, y_test, site_test = prepare_pairs(X, y, site, test_idx)
Example #8
def gcn_run(fname, train_num, batch_size):
    rs = 222

    print("Random state is %d" % rs)
    prng = np.random.RandomState(rs)

    np.random.seed(seed=222)
    data = np.load(fname + '.npz')

    X_train = np.array(data['name1'], dtype=np.float32)
    y_train = np.array(data['name2'], dtype=np.float32)

    X_test = np.array(data['name3'], dtype=np.float32)
    y_test = np.array(data['name4'], dtype=np.float32)

    all_combs = np.array(data['name5'], dtype=np.float32)
    site_train = np.array(data['name6'], dtype=np.float32)
    site_test = np.array(data['name7'], dtype=np.float32)
    tr_idx = np.array(data['name8'], dtype=np.float32)

    dist, idx = graph.distance_scipy_spatial(coords, k=10, metric='euclidean')
    A = graph.adjacency(dist, idx).astype(np.float32)

    graphs = []
    for i in range(3):
        graphs.append(A)

    # Calculate Laplacians
    L = [graph.laplacian(A, normalized=True) for A in graphs]

    n, m, f, _ = X_train.shape

    # Graph Conv-net
    features = 64
    K = 3
    params = dict()
    params['num_epochs'] = train_num
    params['batch_size'] = batch_size
    # params['eval_frequency'] = X_train.shape[0] / (params['batch_size'] * 2)
    params['eval_frequency'] = 1
    # Building blocks.
    params['filter'] = 'chebyshev5'
    params['brelu'] = 'b2relu'
    params['pool'] = 'apool1'

    # Architecture.
    params['F'] = [features,
                   features]  # Number of graph convolutional filters.
    params['K'] = [K, K]  # Polynomial orders.
    params['p'] = [1, 1]  # Pooling sizes.
    params['M'] = [1]  # Output dimensionality of fully connected layers.
    params['input_features'] = f
    params['lamda'] = 0.35
    params['mu'] = 0.6

    # Optimization.
    params['regularization'] = 5e-3
    params['dropout'] = 0.8
    params['learning_rate'] = 1e-2
    params['decay_rate'] = 0.95
    params['momentum'] = 0
    params['decay_steps'] = X_train.shape[0] / params['batch_size']

    params['dir_name'] = 'siamese_' + time.strftime("%Y_%m_%d_%H_%M") + '_feat' + str(params['F'][0]) + '_' + \
                         str(params['F'][1]) + '_K' + str(K) + '_state'

    print(params)

    # Run model
    model = models_siamese.siamese_cgcnn_cor(L, **params)

    print("Constructor finished")
    accuracy, loss, scores_summary, tr_error, test_error = model.fit(
        X_train, y_train, site_train, X_test, y_test, site_test)
    #print('Time per step: {:.2f} ms'.format(t_step*1000))

    # Save training
    tr_res = model.evaluate(X_train, y_train, site_train)

    # Evaluate test data
    print("Test accuracy is:")
    res = model.evaluate(X_test, y_test, site_test)
    print(res[0])

    return tr_error, test_error
Example #9
def constructingGraph(inFIDDic,
                      data_type,
                      mean_geometry=0,
                      std_geometry=1,
                      is_distance=True):
    # inFIDDic: {key: [label, node_coords, node_features]}
    if len(inFIDDic) < 1: return None

    vertices_Geometry, adjacencies, labels, inFIDs, process_count = [], [], [], [], 0

    for k in inFIDDic:
        [label, Node_coords, Node_features] = inFIDDic[k]
        assert len(Node_coords) == len(Node_features)
        subObject_size = len(Node_coords)

        # # 1 get the label of this sample.
        label = 1 if label == 3 else 0

        # # 3 get the adjacency graph of the building group (one sample).
        # #   MST, Delaunay, K-NN
        points = np.array(Node_coords)
        adjacency = np.zeros((subObject_size, subObject_size))

        tri = Delaunay(points[:, 0:2])

        for i in range(0, tri.nsimplex):
            if i > tri.neighbors[i, 2]:
                adjacency[tri.vertices[i, 0], tri.vertices[i, 1]] = 1
                adjacency[tri.vertices[i, 1], tri.vertices[i, 0]] = 1
            if i > tri.neighbors[i, 0]:
                adjacency[tri.vertices[i, 1], tri.vertices[i, 2]] = 1
                adjacency[tri.vertices[i, 2], tri.vertices[i, 1]] = 1
            if i > tri.neighbors[i, 1]:
                adjacency[tri.vertices[i, 2], tri.vertices[i, 0]] = 1
                adjacency[tri.vertices[i, 0], tri.vertices[i, 2]] = 1

        adjacency = scipy.sparse.coo_matrix(adjacency,
                                            shape=(subObject_size,
                                                   subObject_size))
        # To keep the calculation simple, only the distance between the buildings' center points is used here.
        # In the author's experience, the closest distance between two building outlines could be a better option for this task (see the sketch below).
        distances = sklearn.metrics.pairwise.pairwise_distances(
            points[:, 0:2], metric="euclidean", n_jobs=1)
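        # A sketch of that alternative (not used here): if the outline points of
        # buildings i and j were available as (n, 2) arrays `outline_i` and
        # `outline_j` (hypothetical names), their closest outline-to-outline
        # distance could be computed with
        #     scipy.spatial.distance.cdist(outline_i, outline_j).min()
        # and substituted for the centre-to-centre distances above.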

        if False:
            # K-nearest neighbor graph.
            # Distance matrix. Does it need to be normalized?
            idx = np.argsort(distances)[:, 1:1 + 1]
            distances.sort()
            distances = graph.adjacency(distances[:, 1:1 + 1], idx)
            adjacency = scipy.sparse.coo_matrix(
                np.ones((subObject_size, subObject_size)),
                shape=(subObject_size, subObject_size)).multiply(distances)
            # print(distances.toarray())# adjacency = adjacency.multiply(distances)
        else:
            adjacency = adjacency.multiply(distances)
            if False:
                # MST graph.
                adjacency = scipy.sparse.csgraph.minimum_spanning_tree(
                    adjacency)
                adjacency = scipy.sparse.csr_matrix(adjacency).toarray()
                adjacency += adjacency.T - np.diag(adjacency.diagonal())
            else:
                # Delaunay graph.
                adjacency = scipy.sparse.csr_matrix(adjacency).toarray()

        #if is_distance:
        #    # Distance matrix. is it necessary to be normalized?
        #    distances = sklearn.metrics.pairwise.pairwise_distances(points[:,0:2], metric="euclidean", n_jobs=1)
        #    adjacency = adjacency.multiply(distances)

        adjacency = scipy.sparse.csr_matrix(adjacency)
        assert subObject_size == points.shape[0]
        assert type(adjacency) is scipy.sparse.csr.csr_matrix

        # # 4 collecting the sample: vertice_Geometry,vertice_Fourier,adjacency,label.
        vertices_Geometry.append(Node_features)
        adjacencies.append(adjacency)
        labels.append(label)
        inFIDs.append(k)

    # preprocessing inputs.
    pro_method = True  # controls the preprocessing method (True: standardize, False: normalize)
    if pro_method:
        # standardizing
        if data_type == 1:
            # Calculate the mean and std of the train dataset; they will also be used for the validation and test datasets.
            concatenate_Geometry = np.concatenate(vertices_Geometry, axis=0)
            mean_geometry = concatenate_Geometry.mean(axis=0)
            std_geometry = concatenate_Geometry.std(axis=0)

            if data_type == 1:
                file = r'C:\Users\wh\Desktop\gcnn_classification\lib\data\_used_new22.txt'
                file = "./lib/data/_config_22.txt"
                conc = np.vstack((mean_geometry, std_geometry))
                np.savetxt(file, conc, fmt='%.18f')
        if data_type == -1:  # for the extra experiment.
            # Import the mean and std of train dataset.
            file = r'C:\Users\wh\Desktop\gcnn_classification\lib\data\_used_new22.txt'
            file = "./lib/data/_config_22.txt"
            conc = np.loadtxt(file)
            mean_geometry, std_geometry = conc[0, :], conc[1, :]
            mean_fourier, std_fourier = conc[0, :], conc[
                1, :]  # These two parameters are placeholders and do not matter.
            print(
                "\n========import the mean and std of train dataset from text file========\n"
            )
            #print(mean_geometry)
            #print(std_geometry)

        if True:
            # # The efficiency could be improved by vectorization.
            for i in range(0, len(vertices_Geometry)):
                vertices_shape = np.array((vertices_Geometry[i])).shape
                vertices_Geometry[i] -= np.tile(
                    mean_geometry, vertices_shape[0]).reshape(vertices_shape)
                vertices_Geometry[i] /= np.tile(
                    std_geometry, vertices_shape[0]).reshape(vertices_shape)

                # vertices_shape=np.array((vertices_Fourier[i])).shape
                # vertices_Fourier[i] -= np.tile(mean_fourier,vertices_shape[0]).reshape(vertices_shape)
                # vertices_Fourier[i] /= np.tile(std_fourier,vertices_shape[0]).reshape(vertices_shape)
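                # A vectorized sketch of the two statements above, assuming each
                # vertices_Geometry[i] is a 2-D ndarray and mean_geometry /
                # std_geometry are 1-D per-feature arrays; NumPy broadcasting
                # makes the np.tile calls unnecessary:
                #     vertices_Geometry[i] = (vertices_Geometry[i] - mean_geometry) / std_geometry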
    else:
        # normalizing; it does not work very well.
        if data_type == 1:
            # Calculate the min and max of the train dataset; they will also be used for the validation and test datasets.
            concatenate_Geometry = np.concatenate(vertices_Geometry, axis=0)
            mean_geometry = concatenate_Geometry.min(axis=0)
            std_geometry = concatenate_Geometry.max(axis=0)

            file = r'C:\Users\wh\Desktop\gcnn_classification\lib\data\_used2.txt'
            file = "./data/_config_ex.txt"
            conc = np.vstack((mean_geometry, std_geometry))
            np.savetxt(file, conc, fmt='%.18f')

        if not pro_method:
            # # The efficiency could be improved by vectorization.
            for i in range(0, len(vertices_Geometry)):
                vertices_shape = np.array((vertices_Geometry[i])).shape
                vertices_Geometry[i] = (vertices_Geometry[i] - np.tile(
                    mean_geometry, vertices_shape[0]).reshape(vertices_shape)
                                        ) / (std_geometry - mean_geometry)

    # padding.
    # the max number of vertices in a group (sample).
    maxnum_vertices = 128  #max([len(vertices_Geometry[i]) for i in range(0,len(vertices_Geometry))])
    graph_vertices_geo, graph_adjacencies = [], []

    assert len(vertices_Geometry) == len(adjacencies) == len(labels)

    #print(len(vertices_Geometry))
    #print(len(vertices_Geometry[i]))
    #print(np.pad(vertices_Geometry[i], ((0, maxnum_vertices-len(vertices_Geometry[i])),(0,0)), 'constant', constant_values=(0)).shape)
    #exit()

    for i in range(0, len(vertices_Geometry)):
        graph_vertices_geo.append(
            np.pad(vertices_Geometry[i],
                   ((0, maxnum_vertices - len(vertices_Geometry[i])), (0, 0)),
                   'constant',
                   constant_values=(0)))
        graph_adjacencies.append(
            np.pad(adjacencies[i].toarray(),
                   ((0, maxnum_vertices - adjacencies[i].shape[0]),
                    (0, maxnum_vertices - adjacencies[i].shape[0])),
                   'constant',
                   constant_values=(0)))
    # collecting.
    graph_vertices_geo = np.stack(graph_vertices_geo, axis=0).astype(
        np.float32)  #NSample x NVertices x NFeature
    graph_adjacencies = np.stack(graph_adjacencies, axis=0).astype(
        np.float32)  #NSample x NVertices x NVertices
    graph_labels = np.array(labels).astype(np.int64)  #NSample x 1
    graph_inFIDs = np.array(inFIDs).astype(np.int64)  #NSample x 1
    graph_size = graph_labels.shape[0]  #NSample
    graph_Laplacian = np.stack([
        graph.laplacian(
            scipy.sparse.csr_matrix(A), normalized=True, rescaled=True)
        for A in graph_adjacencies
    ],
                               axis=0)

    return [
        graph_vertices_geo, graph_Laplacian, graph_labels, graph_inFIDs,
        graph_size, mean_geometry, std_geometry
    ]


# load_data('gz1124i.json', [0.6, 0.2, 0.2])
Example #10
def ABIDE_save(num_subjects, filename):
    rs = 33
    print("Random state is %d" % rs)
    prng = np.random.RandomState(rs)

    # Split into training, validation and testing sets
    training_num = num_subjects
    lines = int(1.2 * num_subjects)
    # I am guessing this is to create a validation set within training data
    # Used in the next moving step

    # Get subject features
    atlas = 'ho'
    kind = 'correlation'

    subject_IDs = abide_utils.get_ids(lines)
    # Get all subject networks
    networks = abide_utils.load_all_networks(subject_IDs,
                                             kind,
                                             atlas_name=atlas)
    X = np.array(networks)

    # with open('GCN_train.pkl', 'wb') as f:  # Python 3: open(..., 'wb')
    #     pickle.dump(X, f, 2)
    # f.close()

    # Number of nodes
    nodes = X.shape[1]

    # Get ROI coordinates
    coords = abide_utils.get_atlas_coords(atlas_name=atlas)

    # Get subject labels
    label_dict = abide_utils.get_subject_label(subject_IDs,
                                               label_name='DX_GROUP')
    y = np.array([int(label_dict[x]) - 1 for x in sorted(label_dict)])

    # Get site ID
    site = abide_utils.get_subject_label(subject_IDs, label_name='SITE_ID')
    unq = np.unique(list(site.values())).tolist()
    site = np.array([unq.index(site[x]) for x in sorted(site)])

    # Choose site IDs to include in the analysis
    site_mask = range(20)
    X = X[np.in1d(site, site_mask)]
    y = y[np.in1d(site, site_mask)]
    site = site[np.in1d(site, site_mask)]

    tr_idx, test_idx = split_data(site, 0.6)
    # training_num = int(0.6 * X.shape[0])

    prng.shuffle(test_idx)
    subs_to_add = training_num - len(
        tr_idx)  # subjects that need to be moved from testing to training set
    tr_idx.extend(test_idx[:subs_to_add])
    test_idx = test_idx[subs_to_add:]
    print("The test indices are the following: ")
    print(test_idx)

    all_combs = []
    tr_mat = np.array(tr_idx).reshape([int(len(tr_idx) / 6), 6])
    for i in range(3):
        x1 = tr_mat[:, i * 2].flatten()
        x2 = tr_mat[:, i * 2 + 1].flatten()
        combs = np.transpose([np.tile(x1, len(x2)), np.repeat(x2, len(x1))])
        all_combs.append(combs)

    all_combs = np.vstack(all_combs)

    # print(all_combs.shape)
    n, m, f = X.shape
    X_train = np.ones((all_combs.shape[0], m, f, 2), dtype=np.float32)
    y_train = np.ones(all_combs.shape[0], dtype=np.int32)
    site_train = np.ones(all_combs.shape[0], dtype=np.int32)

    for i in range(all_combs.shape[0]):
        X_train[i, :, :, 0] = X[all_combs[i, 0], :, :]
        X_train[i, :, :, 1] = X[all_combs[i, 1], :, :]
        if y[all_combs[i, 0]] != y[all_combs[i, 1]]:
            y_train[i] = 0  # -1
        if site[all_combs[i, 0]] != site[all_combs[i, 1]]:
            site_train[i] = 0

    print("Training samples shape")
    print(X_train.shape)

    # Get the graph structure
    dist, idx = graph.distance_scipy_spatial(coords, k=10, metric='euclidean')
    A = graph.adjacency(dist, idx).astype(np.float32)

    graphs = []
    for i in range(3):
        graphs.append(A)

    # Calculate Laplacians
    L = [graph.laplacian(A, normalized=True) for A in graphs]

    # Number of nodes in graph and features
    print("Number of controls in the dataset: ")
    print(y.sum())

    # Prepare training testing and validation sets
    X_test, y_test, site_test = prepare_pairs(X, y, site, test_idx)

    # Saving training data for comparison with ann siamese

    np.savez(filename + '.npz',
             name1=X_train,
             name2=y_train,
             name3=X_test,
             name4=y_test,
             name5=all_combs,
             name6=site_train,
             name7=site_test,
             name8=tr_idx)

    return None
Example #11
if True:
    graph_data = train.embeddings.astype(np.float32)
else:
    graph_data = train.data.T.astype(np.float32).toarray()

#del train, test

#%% [markdown]
# # Feature graph

#%%
t_start = time.process_time()
dist, idx = graph.distance_sklearn_metrics(graph_data,
                                           k=FLAGS.number_edges,
                                           metric=FLAGS.metric)
A = graph.adjacency(dist, idx)
print("{} > {} edges".format(A.nnz // 2,
                             FLAGS.number_edges * graph_data.shape[0] // 2))
A = graph.replace_random_edges(A, 0)
graphs, perm = coarsening.coarsen(A,
                                  levels=FLAGS.coarsening_levels,
                                  self_connections=False)
L = [graph.laplacian(A, normalized=True) for A in graphs]
print('Execution time: {:.2f}s'.format(time.process_time() - t_start))
#graph.plot_spectrum(L)
#del graph_data, A, dist, idx

#%%
t_start = time.process_time()
train_data = scipy.sparse.csr_matrix(
    coarsening.perm_data(train_data.toarray(), perm))