def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """Reduce masked fMRI data to ``output_size`` features via Ward clustering.

    A voxel adjacency graph is derived from ``mask_np`` and used as the
    connectivity constraint of a Ward ``FeatureAgglomeration``; the data are
    then projected onto the fitted clusters with ``transform``.

    Parameters
    ----------
    fmri_masked : array_like
        2-D data matrix to reduce. ``FeatureAgglomeration`` merges the
        *columns* of this matrix, and the connectivity graph has one node
        per in-mask voxel, so the number of columns has to match the number
        of voxels selected by ``mask_np``.
        NOTE(review): the previous docstring described this as
        (voxels, timepoints); that orientation does not match how the
        estimator is called here -- confirm against the callers.
    mask_img : object
        Mask image. Not used by this function; kept in the signature so
        existing callers keep working.
    mask_np : numpy.ndarray
        Mask array with (at least) three dimensions; its first three
        dimensions define the voxel grid passed to ``grid_to_graph``.
    output_size : int
        Number of clusters (features) the data are reduced to.

    Returns
    -------
    dict
        ``'data'``   : the reduced data returned by ``transform``.
        ``'labels'`` : cluster label of every input feature (``labels_``).
    """
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.feature_extraction import image

    # Restrict merges to spatially adjacent voxels inside the mask.
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)

    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity,
                                linkage='ward')
    ward.fit(fmri_masked)

    return {'data': ward.transform(fmri_masked), 'labels': ward.labels_}
def data_compression(fmri_masked, mask_img, mask_np, compression_dim):
    """Compress masked fMRI data with spatially constrained Ward clustering.

    Builds a voxel adjacency graph from ``mask_np``, fits a Ward
    ``FeatureAgglomeration`` with ``compression_dim`` clusters under that
    connectivity constraint, and projects the data onto the clusters.

    Parameters
    ----------
    fmri_masked : np.ndarray[ndim=2]
        The functional dataset that needs to be reduced. The estimator
        agglomerates the columns of this matrix.
        NOTE(review): the earlier docstring gave the shape as
        (voxels, timepoints); confirm the orientation against the callers,
        since the connectivity graph has one node per in-mask voxel.
    mask_img : object
        A nibabel image object of the mask. Not used in the body.
    mask_np : np.ndarray
        A numpy array of the mask; its first three dimensions give the grid
        size used to build the connectivity graph.
    compression_dim : int
        The number of elements that the data should be reduced to.

    Returns
    -------
    dict
        ``'compressor'`` : the fitted ``FeatureAgglomeration`` instance.
        ``'compressed'`` : the reduced data returned by ``transform``.
        ``'labels'``     : cluster label of every input feature.
    """
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.feature_extraction import image

    # Adjacency between neighbouring in-mask voxels.
    dims = mask_np.shape
    voxel_graph = image.grid_to_graph(
        n_x=dims[0], n_y=dims[1], n_z=dims[2], mask=mask_np)

    compressor = FeatureAgglomeration(
        n_clusters=compression_dim,
        connectivity=voxel_graph,
        linkage='ward',
    )
    compressor.fit(fmri_masked)

    result = {'compressor': compressor}
    result['compressed'] = compressor.transform(fmri_masked)
    result['labels'] = compressor.labels_
    return result
def cross_cluster_timeseries(data1, data2, roi_mask_nparray, n_clusters,
                             similarity_metric, affinity_threshold,
                             cluster_method='ward'):
    """Cluster one timeseries dataset by its relationship to a second one.

    The columns of ``data1`` are the items that receive a cluster label.
    Each of them is described by its similarity profile to every column of
    ``data2``; items with similar profiles end up in the same cluster.

    Parameters
    ----------
    data1 : array_like
        2-D matrix whose columns are clustered.
    data2 : array_like
        2-D matrix whose columns are the reference against which the
        columns of ``data1`` are compared. Must have the same number of
        rows as ``data1`` (both are transposed and passed to ``cdist``).
    roi_mask_nparray : numpy.ndarray, 'empty' or None
        Mask of the region being parcellated. When an array is given, its
        voxel adjacency graph constrains the Ward clustering. The string
        ``'empty'`` (legacy sentinel) or ``None`` requests unconstrained
        Ward clustering. Ignored by the spectral method.
    n_clusters : int
        Number of clusters.
    similarity_metric : str
        Any metric name accepted by ``scipy.spatial.distance.cdist``, e.g.
        'euclidean', 'correlation', 'minkowski', 'cityblock', 'seuclidean'.
        The cross distance is turned into a similarity as ``1 - distance``.
    affinity_threshold : float
        Similarity values below this threshold are set to 0, both in the
        cross-similarity and in the final similarity matrix.
    cluster_method : str, optional
        ``'ward'`` (default) selects Ward agglomeration; any other value
        selects spectral clustering on the precomputed similarity matrix.

    Returns
    -------
    y_pred : numpy.ndarray of int
        Predicted cluster label for every column of ``data1``.

    Examples
    --------
    np.random.seed(30)
    offset = np.random.randn(30)
    x1 = np.random.randn(200, 30) + 2 * offset
    x2 = np.random.randn(100, 30) + 44 * np.random.randn(30)
    x3 = np.random.randn(400, 30)
    sampledata1 = np.vstack((x1, x2, x3))

    np.random.seed(99)
    offset = np.random.randn(30)
    x1 = np.random.randn(200, 30) + 2 * offset
    x2 = np.random.randn(100, 30) + 44 * np.random.randn(30)
    x3 = np.random.randn(400, 30)
    sampledata2 = np.vstack((x1, x2, x3))

    # Columns are clustered, so transpose to cluster the 700 samples.
    cross_cluster_timeseries(sampledata1.T, sampledata2.T, None, 3,
                             'euclidean', 0.0)

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering
    """
    import numpy as np
    from scipy.spatial.distance import cdist, pdist, squareform
    from sklearn.cluster import FeatureAgglomeration, SpectralClustering
    from sklearn.feature_extraction import image
    from sklearn.preprocessing import normalize

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")
    dist_btwn_data_1_2 = np.array(
        cdist(data1.T, data2.T, metric=similarity_metric))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    # Undefined distances (e.g. constant timeseries) count as "no similarity".
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < affinity_threshold] = 0

    print("Calculating pairwise distances between voxels in ROI 1 ")
    # Compare the items of data1 through their similarity profiles to data2.
    dist_matrix = squareform(pdist(sim_btwn_data_1_2, metric='euclidean'))
    sim_matrix = 1 - normalize(dist_matrix, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    if cluster_method == 'ward':
        print("ward")
        # Comparing an ndarray with the 'empty' sentinel via != is an
        # elementwise operation whose truth value is ambiguous, so test the
        # sentinel explicitly (None is accepted as well).
        no_mask = roi_mask_nparray is None or (
            isinstance(roi_mask_nparray, str)
            and roi_mask_nparray == 'empty')
        if not no_mask:
            shape = roi_mask_nparray.shape
            connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                               n_z=shape[2],
                                               mask=roi_mask_nparray)
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        connectivity=connectivity,
                                        linkage='ward')
        else:
            print("Calculating Hierarchical Cross-clustering")
            # Ward linkage is always Euclidean; the explicit ``affinity``
            # keyword is deprecated in recent scikit-learn, so it is
            # left at its default.
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        linkage='ward')
        ward.fit(sim_matrix)
        y_pred = ward.labels_.astype(int)
    else:
        print("spectral")
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      random_state=5,
                                      affinity="precomputed",
                                      assign_labels='discretize')
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    return y_pred
def cluster_timeseries(X, roi_mask_nparray, n_clusters, similarity_metric,
                       affinity_threshold, cluster_method='ward'):
    """Cluster the columns of a timeseries matrix.

    Pairwise distances between the columns of ``X`` are converted into a
    thresholded similarity matrix, which is then clustered with Ward
    agglomeration or spectral clustering.

    Parameters
    ----------
    X : array_like
        2-D matrix whose columns are the items to cluster (``X.T`` is passed
        to ``pdist``).
    roi_mask_nparray : numpy.ndarray, 'empty' or None
        Mask of the region being parcellated. When an array is given, its
        voxel adjacency graph constrains the Ward clustering. The string
        ``'empty'`` (legacy sentinel) or ``None`` requests unconstrained
        Ward clustering. Ignored by the spectral method.
    n_clusters : int
        Number of clusters.
    similarity_metric : str
        Any metric name accepted by ``scipy.spatial.distance.pdist``
        (e.g. 'correlation', 'euclidean').
    affinity_threshold : float
        Similarity values below this threshold are set to 0.
    cluster_method : str, optional
        ``'ward'`` (default) selects Ward agglomeration; any other value
        selects spectral clustering on the precomputed similarity matrix.

    Returns
    -------
    y_pred : numpy.ndarray of int
        Predicted cluster label for every column of ``X``.

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering
    """
    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    from sklearn.cluster import FeatureAgglomeration, SpectralClustering
    from sklearn.feature_extraction import image
    from sklearn.preprocessing import normalize

    print('Beginning Calculating pairwise distances between voxels')
    X = np.array(X)
    X_dist = pdist(X.T, metric=similarity_metric)
    # Undefined distances are treated as "as far apart as possible". The max
    # must be taken with nanmax *before* any NaN is overwritten; the earlier
    # code zeroed NaNs through an alias first, so they ended up as distance 0.
    max_dist = np.nanmax(X_dist)
    X_dist = squareform(X_dist)
    X_dist[np.isnan(X_dist)] = max_dist

    sim_matrix = 1 - normalize(X_dist, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    if cluster_method == 'ward':
        print("ward")
        # Comparing an ndarray with the 'empty' sentinel via != is an
        # elementwise operation whose truth value is ambiguous, so test the
        # sentinel explicitly (None is accepted as well).
        no_mask = roi_mask_nparray is None or (
            isinstance(roi_mask_nparray, str)
            and roi_mask_nparray == 'empty')
        if not no_mask:
            shape = roi_mask_nparray.shape
            connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                               n_z=shape[2],
                                               mask=roi_mask_nparray)
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        connectivity=connectivity,
                                        linkage='ward')
        else:
            print("Calculating Hierarchical Clustering")
            # Ward linkage is always Euclidean; the explicit ``affinity``
            # keyword is deprecated in recent scikit-learn, so it is
            # left at its default.
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        linkage='ward')
        ward.fit(sim_matrix)
        y_pred = ward.labels_.astype(int)
    else:
        print("spectral")
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      random_state=5,
                                      affinity="precomputed",
                                      assign_labels='discretize')
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    return y_pred
def cross_cluster_timeseries(data1, data2, roi_mask_data, n_clusters,
                             similarity_metric, affinity_threshold,
                             cluster_method='ward', random_state=None):
    """Cluster one timeseries dataset by its relationship to a second one.

    The columns of ``data1`` are the items that receive a cluster label.
    Each of them is described by its distance profile to every column of
    ``data2``; items with similar profiles end up in the same cluster.

    Parameters
    ----------
    data1 : array_like
        2-D matrix whose columns are clustered.
    data2 : array_like
        2-D matrix whose columns are the reference against which the
        columns of ``data1`` are compared. Must have the same number of
        rows as ``data1`` (both are transposed and passed to ``cdist``).
    roi_mask_data : array_like or None
        Mask of the region of interest (ROI) being parcellated. When given,
        its voxel adjacency graph constrains the Ward clustering; ``None``
        requests unconstrained Ward clustering. Only used by ``'ward'``.
    n_clusters : int
        Number of clusters.
    similarity_metric : str
        Any metric name accepted by ``scipy.spatial.distance.cdist``, e.g.
        'euclidean', 'correlation', 'minkowski', 'cityblock', 'seuclidean'.
    affinity_threshold : float
        Entries of the final similarity matrix below this value are set
        to 0.
    cluster_method : {'ward', 'spectral', 'kmeans', 'gaussianmixture'}
        Which clustering method to use (case-insensitive).
    random_state : int or None, optional
        Seed forwarded to the spectral, k-means and Gaussian-mixture
        estimators. Not used by ``'ward'``.

    Returns
    -------
    y_pred : numpy.ndarray
        Predicted cluster label for every column of ``data1``.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported methods.

    Examples
    --------
    np.random.seed(30)
    offset = np.random.randn(30)
    x1 = np.random.randn(200, 30) + 2 * offset
    x2 = np.random.randn(100, 30) + 44 * np.random.randn(30)
    x3 = np.random.randn(400, 30)
    sampledata1 = np.vstack((x1, x2, x3))

    np.random.seed(99)
    offset = np.random.randn(30)
    x1 = np.random.randn(200, 30) + 2 * offset
    x2 = np.random.randn(100, 30) + 44 * np.random.randn(30)
    x3 = np.random.randn(400, 30)
    sampledata2 = np.vstack((x1, x2, x3))

    # Columns are clustered, so transpose to cluster the 700 samples.
    cross_cluster_timeseries(sampledata1.T, sampledata2.T, None, 3,
                             'euclidean', 0.0)

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html#scipy.spatial.distance.cdist
    http://scikit-learn.org/stable/modules/clustering.html#spectral-clustering
    """
    # Validate before the imports and the distance computation so that a
    # mistyped method fails immediately with a clear message instead of an
    # UnboundLocalError at the final return.
    supported_methods = ('ward', 'spectral', 'kmeans', 'gaussianmixture')
    cluster_method = cluster_method.lower()
    if cluster_method not in supported_methods:
        raise ValueError(
            "Unknown value for cluster_method: '%s'. Expected one of %s."
            % (cluster_method, ', '.join(supported_methods)))

    import numpy as np
    from scipy.spatial.distance import pdist, cdist, squareform
    from sklearn.preprocessing import normalize
    from sklearn.feature_extraction import image
    from sklearn.cluster import (FeatureAgglomeration, KMeans,
                                 SpectralClustering)
    from sklearn.mixture import GaussianMixture

    dist_btwn_data_1_2 = np.array(
        cdist(data1.T, data2.T, metric=similarity_metric))
    # Undefined distances are treated as "as far apart as possible".
    max_dist = np.nanmax(dist_btwn_data_1_2)
    dist_btwn_data_1_2[np.isnan(dist_btwn_data_1_2)] = max_dist

    # Compare the items of data1 through their distance profiles to data2.
    dist_matrix = squareform(pdist(dist_btwn_data_1_2, metric='euclidean'))
    sim_matrix = 1 - normalize(dist_matrix, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    if cluster_method == 'ward':
        if roi_mask_data is not None:
            shape = roi_mask_data.shape
            connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                               n_z=shape[2],
                                               mask=roi_mask_data)
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        connectivity=connectivity,
                                        linkage='ward')
        else:
            # Ward linkage is always Euclidean; the explicit ``affinity``
            # keyword is deprecated in recent scikit-learn, so it is
            # left at its default.
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        linkage='ward')
        ward.fit(sim_matrix)
        y_pred = ward.labels_.astype(int)

    elif cluster_method == 'spectral':
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity="precomputed",
                                      assign_labels='discretize',
                                      random_state=random_state)
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    elif cluster_method == 'kmeans':
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                        random_state=random_state)
        kmeans.fit(sim_matrix)
        y_pred = kmeans.labels_.astype(int)

    else:  # 'gaussianmixture' -- guaranteed by the validation above
        gaussianmixture = GaussianMixture(n_components=n_clusters,
                                          init_params='kmeans',
                                          random_state=random_state)
        y_pred = gaussianmixture.fit_predict(sim_matrix)

    return y_pred
def cluster_timeseries(X, roi_mask_data, n_clusters, similarity_metric,
                       affinity_threshold, cluster_method='ward',
                       random_state=None):
    """Cluster the columns of a timeseries matrix.

    Pairwise distances between the columns of ``X`` are converted into a
    thresholded similarity matrix, which is then handed to the selected
    clustering method.

    Parameters
    ----------
    X : array_like
        2-D matrix whose columns are the items to cluster (``X.T`` is passed
        to ``pdist``).
    roi_mask_data : array_like or None
        Mask of the region of interest (ROI) being parcellated. When given,
        its voxel adjacency graph constrains the Ward clustering; ``None``
        requests unconstrained Ward clustering. Only used by ``'ward'``.
    n_clusters : int
        Number of clusters.
    similarity_metric : str
        Any metric name accepted by ``scipy.spatial.distance.pdist``
        (e.g. 'correlation', 'euclidean').
    affinity_threshold : float
        Similarity values below this threshold are set to 0.
    cluster_method : {'ward', 'spectral', 'kmeans', 'gaussianmixture'}
        Which clustering method to use (case-insensitive).
    random_state : int or None, optional
        Seed forwarded to the spectral, k-means and Gaussian-mixture
        estimators. Not used by ``'ward'``.

    Returns
    -------
    y_pred : numpy.ndarray
        Predicted cluster label for every column of ``X``.

    Raises
    ------
    ValueError
        If ``cluster_method`` is not one of the supported methods.

    References
    ----------
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    http://scikit-learn.org/stable/modules/clustering.html
    """
    # Validate before the imports and the distance computation so that a
    # mistyped method fails immediately with a clear message instead of an
    # UnboundLocalError at the final return.
    supported_methods = ('ward', 'spectral', 'kmeans', 'gaussianmixture')
    cluster_method = cluster_method.lower()
    if cluster_method not in supported_methods:
        raise ValueError(
            "Unknown value for cluster_method: '%s'. Expected one of %s."
            % (cluster_method, ', '.join(supported_methods)))

    import numpy as np
    # Import the submodules explicitly: ``import scipy`` / ``import sklearn``
    # alone does not guarantee that ``.spatial`` / ``.preprocessing`` exist.
    from scipy.spatial.distance import pdist, squareform
    from sklearn.preprocessing import normalize
    from sklearn.feature_extraction import image
    from sklearn.cluster import (FeatureAgglomeration, SpectralClustering,
                                 KMeans)
    from sklearn.mixture import GaussianMixture

    X = np.array(X)
    X_dist = pdist(X.T, metric=similarity_metric)
    # Undefined distances are treated as "as far apart as possible".
    max_dist = np.nanmax(X_dist)
    X_dist = squareform(X_dist)
    X_dist[np.isnan(X_dist)] = max_dist

    sim_matrix = 1 - normalize(X_dist, norm='max')
    sim_matrix[sim_matrix < affinity_threshold] = 0

    print("Calculating Hierarchical Clustering")

    if cluster_method == 'ward':
        if roi_mask_data is not None:
            shape = roi_mask_data.shape
            connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                               n_z=shape[2],
                                               mask=roi_mask_data)
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        connectivity=connectivity,
                                        linkage='ward')
        else:
            # Ward linkage is always Euclidean; the explicit ``affinity``
            # keyword is deprecated in recent scikit-learn, so it is
            # left at its default.
            ward = FeatureAgglomeration(n_clusters=n_clusters,
                                        linkage='ward')
        ward.fit(sim_matrix)
        y_pred = ward.labels_.astype(int)

    elif cluster_method == 'spectral':
        spectral = SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity="precomputed",
                                      assign_labels='discretize',
                                      random_state=random_state)
        spectral.fit(sim_matrix)
        y_pred = spectral.labels_.astype(int)

    elif cluster_method == 'kmeans':
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                        random_state=random_state)
        kmeans.fit(sim_matrix)
        y_pred = kmeans.labels_.astype(int)

    else:  # 'gaussianmixture' -- guaranteed by the validation above
        gaussianmixture = GaussianMixture(n_components=n_clusters,
                                          init_params='kmeans',
                                          random_state=random_state)
        y_pred = gaussianmixture.fit_predict(sim_matrix)

    return y_pred