def test_grid_to_graph():
    # Checking that the function works with graphs containing no edges
    size = 2
    roi_size = 1
    # Generating two convex parts with one vertex
    # Thus, edges will be empty in _to_graph
    mask = np.zeros((size, size), dtype=np.bool)
    mask[0:roi_size, 0:roi_size] = True
    mask[-roi_size:, -roi_size:] = True
    mask = mask.reshape(size ** 2)
    A = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)
    assert_true(connected_components(A)[0] == 2)

    # Checking that the function works whatever the type of mask is
    mask = np.ones((size, size), dtype=np.int16)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask)
    assert_true(connected_components(A)[0] == 1)

    # Checking dtype of the graph
    mask = np.ones((size, size))
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.bool)
    assert_true(A.dtype == np.bool)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.int)
    assert_true(A.dtype == np.int)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask,
                      dtype=np.float64)
    assert_true(A.dtype == np.float64)
def test_connect_regions_with_grid():
    lena = sp.misc.lena()
    mask = lena > 50
    graph = grid_to_graph(*lena.shape, mask=mask)
    assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])

    mask = lena > 150
    graph = grid_to_graph(*lena.shape, mask=mask, dtype=None)
    assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])
def cluster_spatial_data(X, n_parcels, xyz=None, shape=None, mask=None,
                         method='ward', verbose=False):
    """Cluster the data using Ward's algorithm

    Parameters
    ==========
    X: array of shape (n_voxels, n_subjects)
       the functional data, across subjects
    n_parcels: int,
       the desired number of parcels
    xyz: array of shape (n_voxels, 3), optional
       positions of the voxels in grid coordinates
    shape: tuple, optional
       the domain shape (assuming a grid structure),
       alternative specification of positions
    mask: arbitrary array of arbitrary dimension, optional
       alternative specification of positions
    method: string, one of ['ward', 'spectral', 'kmeans'], optional
       clustering method

    Returns
    =======
    label: array of shape (n_voxels,)
       the resulting cluster assignment

    Note
    ====
    One of xyz, shape or mask needs to be provided
    """
    from sklearn.cluster import spectral_clustering, k_means
    if mask is not None:
        # the mask carries the grid shape, so it can be used on its own
        connectivity = grid_to_graph(*mask.shape, mask=mask)
    elif shape is not None:
        connectivity = grid_to_graph(*shape)
    elif xyz is not None:
        from sklearn.neighbors import kneighbors_graph
        n_neighbors = 2 * xyz.shape[1]
        connectivity = kneighbors_graph(xyz, n_neighbors=n_neighbors)
    else:
        raise ValueError('One of mask, shape or xyz has to be provided')

    if n_parcels == 1:
        return np.zeros(X.shape[0])
    if method == 'ward':
        connectivity = connectivity.tocsr()
        ward = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(X)
        label = ward.labels_
    elif method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((X[i] - X[j]) ** 2, 1).mean()
        connectivity.data = np.exp(- np.sum((X[i] - X[j]) ** 2, 1) /
                                   (2 * sigma))
        label = spectral_clustering(connectivity, n_clusters=n_parcels)
    elif method == 'kmeans':
        _, label, _ = k_means(X, n_parcels)
    else:
        raise ValueError('Unknown method for parcellation')
    return label
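# A minimal usage sketch for cluster_spatial_data above. The toy shapes and
# parcel count are illustrative only, and it assumes Ward is available at
# module level (as the function body does).
import numpy as np

X_demo = np.random.randn(1000, 5)  # 1000 voxels across 5 subjects
label_demo = cluster_spatial_data(X_demo, n_parcels=10, shape=(10, 10, 10))
assert label_demo.shape == (1000,)  # one parcel assignment per voxel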
def test_connect_regions_with_grid():
    try:
        face = sp.face(gray=True)
    except AttributeError:
        # Newer versions of scipy have face in misc
        from scipy import misc
        face = misc.face(gray=True)
    mask = face > 50
    graph = grid_to_graph(*face.shape, mask=mask)
    assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])

    mask = face > 150
    graph = grid_to_graph(*face.shape, mask=mask, dtype=None)
    assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])
def ward_cluster_land_mask(self, threshold=50):
    """
    Try to separate land from water using scikit-learn Ward clustering.

    The simple land_to_zeros method above does not distinguish shadow
    pixels on land from water pixels. The Ward clustering connectivity
    constraint should take care of that.
    """
    from sklearn.cluster import Ward
    from sklearn.feature_extraction.image import grid_to_graph
    import time
    # Get the last band. I'm assuming the last band will be the longest
    # wavelength.
    band = self.band_array[-1]
    # zero out pixels that are above the threshold
    band[np.where(band > threshold)] = 0
    X = np.reshape(band, (-1, 1))
    connectivity = grid_to_graph(*band.shape)
    st = time.time()
    n_clusters = 2
    ward = Ward(n_clusters=n_clusters, connectivity=connectivity).fit(X)
    label = np.reshape(ward.labels_, band.shape)
    print("Elapsed time: ", time.time() - st)
    return label
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    assert_equal(len(warning_list), 1)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    # test caching
    clustering = Ward(n_clusters=10, connectivity=connectivity,
                      memory=mkdtemp())
    clustering.fit(X)
    labels = clustering.labels_
    assert_true(np.size(np.unique(labels)) == 10)
    # Turn caching off now
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    # Check that we obtain the same solution with early-stopping of the
    # tree building
    clustering.compute_full_tree = False
    clustering.fit(X)
    np.testing.assert_array_equal(clustering.labels_, labels)
    clustering.connectivity = None
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
    # Check that we raise a TypeError on dense matrices
    clustering = Ward(n_clusters=10, connectivity=connectivity.todense())
    assert_raises(TypeError, clustering.fit, X)
    clustering = Ward(n_clusters=10,
                      connectivity=sparse.lil_matrix(
                          connectivity.todense()[:10, :10]))
    assert_raises(ValueError, clustering.fit, X)
def test_affinity_passed_to_fix_connectivity():
    # Test that the affinity parameter is actually passed to the pairwise
    # function
    size = 2
    rng = np.random.RandomState(0)
    X = rng.randn(size, size)
    mask = np.array([True, False, False, True])

    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask,
                                 return_as=np.ndarray)

    class FakeAffinity:
        def __init__(self):
            self.counter = 0

        def increment(self, *args, **kwargs):
            self.counter += 1
            return self.counter

    fa = FakeAffinity()
    linkage_tree(X, connectivity=connectivity, affinity=fa.increment)
    assert fa.counter == 3
def test_agglomerative_clustering_with_distance_threshold(linkage):
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn, linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))
        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = \
            tree_builder(X, connectivity=conn, n_clusters=None,
                         return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced, clusters_at_threshold)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def _fit_method(self, data):
    """Helper function which applies the clustering method on the masked data
    """
    mask_img_ = self.masker_.mask_img_
    if self.algorithm == 'minibatchkmeans':
        if self.verbose:
            print("[MiniBatchKMeans] Learning")
        labels = self._cache(_minibatch_kmeans_fit_method,
                             func_memory_level=1)(
            data.T, self.n_parcels, self.init, self.random_state,
            self.verbose)
        self.kmeans_labels_ = labels
    elif self.algorithm == 'featureagglomeration':
        if self.verbose:
            print("[Feature Agglomeration] Learning")
        mask_ = mask_img_.get_data().astype(np.bool)
        shape = mask_.shape
        if self.connectivity is None:
            self.connectivity = image.grid_to_graph(n_x=shape[0],
                                                    n_y=shape[1],
                                                    n_z=shape[2],
                                                    mask=mask_)
        labels = self._cache(_feature_agglomeration_fit_method,
                             func_memory_level=1)(
            data, self.n_parcels, self.connectivity, self.linkage)
        self.ward_labels_ = labels
def spatio_temporal_cluster_test_connectivity():
    """Test cluster level permutations with and without connectivity
    """
    try:
        try:
            from sklearn.feature_extraction.image import grid_to_graph
        except ImportError:
            from scikits.learn.feature_extraction.image import grid_to_graph
    except ImportError:
        return
    rng = np.random.RandomState(0)
    noise1_2d = rng.randn(condition1_2d.shape[0], condition1_2d.shape[1], 10)
    data1_2d = np.transpose(np.dstack((condition1_2d, noise1_2d)), [0, 2, 1])

    noise2_d2 = rng.randn(condition2_2d.shape[0], condition2_2d.shape[1], 10)
    data2_2d = np.transpose(np.dstack((condition2_2d, noise2_d2)), [0, 2, 1])

    conn = grid_to_graph(data1_2d.shape[-1], 1)

    threshold = dict(start=4.0, step=2)
    T_obs, clusters, p_values_conn, hist = \
        spatio_temporal_cluster_test([data1_2d, data2_2d],
                                     connectivity=conn, n_permutations=50,
                                     tail=1, seed=1, threshold=threshold)

    T_obs, clusters, p_values_no_conn, hist = \
        spatio_temporal_cluster_test([data1_2d, data2_2d],
                                     n_permutations=50, tail=1, seed=1,
                                     threshold=threshold)

    assert_equal(np.sum(p_values_conn < 0.05),
                 np.sum(p_values_no_conn < 0.05))
def segment_image(im_file, n_segments=5, alg='ac'):
    img = imread(im_file)
    img = img[:, :, 0]
    X = np.reshape(img, (-1, 1))

    if alg == 'ac':
        # Define the structure A of the data. Pixels connected to their
        # neighbors.
        connectivity = grid_to_graph(*img.shape)
        # Compute clustering
        print("Compute structured hierarchical clustering...")
        st = time.time()
        n_clusters = n_segments  # number of regions
        ward = AgglomerativeClustering(n_clusters=n_clusters,
                                       linkage='complete',
                                       connectivity=connectivity).fit(X)
        label = np.reshape(ward.labels_, img.shape)
    elif alg == 'dbscan':
        print("Compute DBScan clustering...")
        st = time.time()
        dbs = DBSCAN(eps=1).fit(X)
        label = np.reshape(dbs.labels_, img.shape)

    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)
    return label
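# Hypothetical call to segment_image above; 'photo.png' is a placeholder
# path, and the surrounding imports (imread, time, sklearn clusterers) are
# assumed to be the same as in the module.
label = segment_image('photo.png', n_segments=4, alg='ac')
print(label.shape)  # same 2D shape as the first channel of the input image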
def wardHierarchical(img):
    # Downsample the image first so the connectivity graph, the feature
    # matrix and the label map all share the same shape (the original code
    # built the graph from the full image but reshaped labels to the
    # downsampled shape, which cannot work).
    face = sp.misc.imresize(img, 0.10) / 255.
    connectivity = grid_to_graph(*face.shape)
    print("Compute structured hierarchical clustering...")
    st = time.time()
    n_clusters = 15  # number of regions
    ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                   connectivity=connectivity)
    X = np.reshape(face, (-1, 1))
    ward.fit(X)
    label = np.reshape(ward.labels_, face.shape)
    print("Elapsed time: ", time.time() - st)
    print("Number of pixels: ", label.size)
    print("Number of clusters: ", np.unique(label).size)
    plt.figure(figsize=(5, 5))
    plt.imshow(face, cmap=plt.cm.gray)
    for l in range(n_clusters):
        plt.contour(label == l, contours=1,
                    colors=[plt.cm.spectral(l / float(n_clusters)), ])
    plt.xticks(())
    plt.yticks(())
    plt.show()
def unsupervisedLearningTest03():
    # Connectivity-constrained clustering
    import numpy as np
    import scipy as sp
    import matplotlib.pyplot as plt
    import time
    from sklearn.feature_extraction.image import grid_to_graph
    from sklearn.cluster import AgglomerativeClustering
    from sklearn import cluster, datasets

    lena = sp.misc.lena()
    # Downsample the image by a factor of 4
    lena = (lena[::2, ::2] + lena[1::2, ::2] +
            lena[::2, 1::2] + lena[1::2, 1::2])
    X = np.reshape(lena, (-1, 1))

    # Define the structure A of the data. Pixels connected to their neighbors.
    # Turn the image into a graph so we can reason about its connectivity.
    connectivity = grid_to_graph(*lena.shape)

    print("Compute structured hierarchical clustering...")
    st = time.time()
    n_clusters = 15  # number of regions
    ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                                   connectivity=connectivity).fit(X)
    label = np.reshape(ward.labels_, lena.shape)
    print("Elapsed time: " + str(time.time() - st))
    print("Number of pixels: " + str(label.size))
    print("Number of clusters: " + str(np.unique(label).size))

    # Feature agglomeration
    digits = datasets.load_digits()
    images = digits.images
    X = np.reshape(images, (len(images), -1))
    connectivity = grid_to_graph(*images[0].shape)

    agglo = cluster.FeatureAgglomeration(connectivity=connectivity,
                                         n_clusters=32)
    agglo.fit(X)
    X_reduced = agglo.transform(X)
    X_approx = agglo.inverse_transform(X_reduced)
    images_approx = np.reshape(X_approx, images.shape)
def agglomerativeClusteringFeatures(image):
    connectivity = grid_to_graph(*image[:, :, 2].shape)
    X = np.reshape(image[:, :, 2], (-1, 1))
    ward = AgglomerativeClustering(n_clusters=150, linkage='ward',
                                   connectivity=connectivity).fit(X)
    labels = np.reshape(ward.labels_, image[:, :, 2].shape)
    averageIntensity = color.label2rgb(labels, image[:, :, 2], kind='avg')
    # areas = getAreas(labels)
    return averageIntensity
def test_connectivity_fixing_non_lil():
    # Check non regression of a bug if a non item assignable connectivity is
    # provided with more than one component.
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = AgglomerativeClustering(connectivity=c, linkage='ward')
    assert_warns(UserWarning, w.fit, x)
def test_height_linkage_tree():
    # Check that the height of the results of linkage tree is sorted.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for linkage_func in _TREE_BUILDERS.values():
        children, n_nodes, n_leaves, parent = linkage_func(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert len(children) + n_leaves == n_nodes
def test_height_ward_tree():
    """
    Check that the height of ward tree is sorted.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_nodes, n_leaves, parent = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(100, 50)
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert_true(np.size(np.unique(clustering.labels_)) == 10)
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
def fit(self, X, y):
    y = y.ravel()
    n_samples, n_lags, n_lats, n_lons = X.shape
    self.scaler.fit(X[:, -1].reshape(n_samples, -1))
    X = X.reshape(n_lags * n_samples, -1)
    connectivity = grid_to_graph(n_lats, n_lons)
    self.agglo.connectivity = connectivity
    X = self.scaler.transform(X)
    X = self.agglo.fit_transform(X)
    X = X.reshape(n_samples, -1)
    self.clf.fit(X, y)
def test_connectivity_fixing_non_lil():
    """
    Check non regression of a bug if a non item assignable connectivity is
    provided with more than one component.
    """
    # create dummy data
    x = np.array([[0, 0], [1, 1]])
    # create a mask with several components to force connectivity fixing
    m = np.array([[True, False], [False, True]])
    c = grid_to_graph(n_x=2, n_y=2, mask=m)
    w = Ward(connectivity=c)
    w.fit(x)
def fit(self, kshape=None):
    if kshape is not None:
        connectivity = grid_to_graph(*kshape)
        self.fit_parameters.update({"connectivity": connectivity})
    ward = AgglomerativeClustering(**self.fit_parameters)
    ward.fit(self.input_data)
    self.mapper_data = ward.labels_
    self.output_data = np.array([])
    self.output_space_size = ward.n_clusters
    self.model_attributes = {"n_clusters": ward.n_clusters,
                             "n_components": ward.n_components}
    self._log_model_results()
    return self
def test_spatio_temporal_cluster_connectivity():
    """Test spatio-temporal cluster permutations."""
    try:
        try:
            from sklearn.feature_extraction.image import grid_to_graph
        except ImportError:
            from scikits.learn.feature_extraction.image import grid_to_graph
    except ImportError:
        return
    condition1_1d, condition2_1d, condition1_2d, condition2_2d = \
        _get_conditions()

    rng = np.random.RandomState(0)
    noise1_2d = rng.randn(condition1_2d.shape[0], condition1_2d.shape[1], 10)
    data1_2d = np.transpose(np.dstack((condition1_2d, noise1_2d)), [0, 2, 1])

    noise2_d2 = rng.randn(condition2_2d.shape[0], condition2_2d.shape[1], 10)
    data2_2d = np.transpose(np.dstack((condition2_2d, noise2_d2)), [0, 2, 1])

    conn = grid_to_graph(data1_2d.shape[-1], 1)

    threshold = dict(start=4.0, step=2)
    T_obs, clusters, p_values_conn, hist = \
        spatio_temporal_cluster_test([data1_2d, data2_2d],
                                     connectivity=conn, n_permutations=50,
                                     tail=1, seed=1, threshold=threshold,
                                     buffer_size=None)

    buffer_size = data1_2d.size // 10
    T_obs, clusters, p_values_no_conn, hist = \
        spatio_temporal_cluster_test([data1_2d, data2_2d],
                                     n_permutations=50, tail=1, seed=1,
                                     threshold=threshold, n_jobs=2,
                                     buffer_size=buffer_size)

    assert_equal(np.sum(p_values_conn < 0.05),
                 np.sum(p_values_no_conn < 0.05))

    # make sure results are the same without buffer_size
    T_obs, clusters, p_values2, hist2 = \
        spatio_temporal_cluster_test([data1_2d, data2_2d],
                                     n_permutations=50, tail=1, seed=1,
                                     threshold=threshold, n_jobs=2,
                                     buffer_size=None)
    assert_array_equal(p_values_no_conn, p_values2)

    assert_raises(ValueError, spatio_temporal_cluster_test,
                  [data1_2d, data2_2d], tail=1, threshold=-2.)
    assert_raises(ValueError, spatio_temporal_cluster_test,
                  [data1_2d, data2_2d], tail=-1, threshold=2.)
    assert_raises(ValueError, spatio_temporal_cluster_test,
                  [data1_2d, data2_2d], tail=0, threshold=-1)
def ward_clustering(config, img_flat):
    X = np.reshape(img_flat, (-1, 1))
    connectivity = grid_to_graph(*img_flat.shape)
    ward = AgglomerativeClustering(n_clusters=config['ward_clusters'],
                                   linkage='ward',
                                   compute_full_tree=False,
                                   connectivity=connectivity).fit(X)
    ulab = np.unique(ward.labels_)
    out = []
    for u in ulab:
        inds = np.where(ward.labels_ == u)[0]
        hsh = hash(tuple(inds - inds[0]))
        out.append(hsh)
    return tuple(out)
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves, parent = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert_true(len(children) + n_leaves == n_nodes)
    # Check that ward_tree raises a ValueError with a connectivity matrix
    # of the wrong shape
    assert_raises(ValueError, ward_tree, X.T, np.ones((4, 4)))
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
    ward.fit(X)
    assert_true(np.size(np.unique(ward.labels_)) == 5)
    Xred = ward.transform(X)
    assert_true(Xred.shape[1] == 5)
    Xfull = ward.inverse_transform(Xred)
    assert_true(np.unique(Xfull[0]).size == 5)
def test_ward_fit_transform():
    """Test parcellation building and associated signal extraction.
    """
    # Generate toy data
    # define data structure
    shape = (5, 5, 5)
    mask = np.ones(shape, dtype=bool)
    connectivity = image.grid_to_graph(n_x=5, n_y=5, n_z=5, mask=mask)
    # data generation
    data1 = np.ones(shape)
    data1[1:3, 1:3, 1:3] = 2.
    data2 = np.ones(shape)
    data2[3:, 3:, 3:] = 4.
    data = np.ones((4, np.prod(shape)))  # 4 ravelized images
    data[0] = np.ravel(data1)
    data[1] = np.ravel(data2)

    # One image used for train, transform all
    parcelled_data, labels = _ward_fit_transform(data, [0], connectivity,
                                                 2, 0)
    # check parcelled_data
    assert_equal(parcelled_data.shape, (4, 2))
    assert_array_equal(
        np.sort(np.unique(parcelled_data[0])),  # order is hard to predict
        [1, 2])
    assert_array_equal(parcelled_data[2], [1, 1])
    assert_array_equal(parcelled_data[3], [1, 1])
    # check labels
    assert_equal(len(labels.shape), 1)
    assert_array_equal(np.unique(labels), [0, 1])

    # Two images used for train, transform all, add offset to labels
    parcelled_data, labels = _ward_fit_transform(data, [0, 1], connectivity,
                                                 3, 10)
    # check parcelled_data
    assert_equal(parcelled_data.shape, (4, 3))
    assert_array_equal(
        np.sort(np.unique(parcelled_data[0])),  # order is hard to predict
        [1, 2])
    assert_array_equal(
        np.sort(np.unique(parcelled_data[1])),  # order is hard to predict
        [1, 4])
    assert_array_equal(parcelled_data[2], [1, 1, 1])
    assert_array_equal(parcelled_data[3], [1, 1, 1])
    # check labels
    assert_equal(len(labels.shape), 1)
    assert_array_equal(np.unique(labels), [10, 11, 12])
def test_structured_linkage_tree():
    # Check that we obtain the correct solution for structured linkage trees.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    # Avoiding a mask with only 'True' entries
    mask[4:7, 4:7] = 0
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    for tree_builder in _TREE_BUILDERS.values():
        children, n_components, n_leaves, parent = \
            tree_builder(X.T, connectivity)
        n_nodes = 2 * X.shape[1] - 1
        assert_true(len(children) + n_leaves == n_nodes)
        # Check that ward_tree raises a ValueError with a connectivity matrix
        # of the wrong shape
        assert_raises(ValueError, tree_builder, X.T, np.ones((4, 4)))
        # Check that fitting with no samples raises an error
        assert_raises(ValueError, tree_builder, X.T[:0], connectivity)
def clustering(interactive: Interactive, api: API):
    window = api.application.document_windows[0]
    target_data_item = window.target_data_item
    ctx = iface.get_context()
    ds = iface.dataset_from_data_item(ctx, target_data_item)
    fy, fx = tuple(ds.shape.sig)
    y, x = tuple(ds.shape.nav)
    # roi = np.random.choice([True, False], tuple(ds.shape.nav),
    #                        p=[0.01, 0.99])
    # We only sample 5 % of the frame for the std deviation map
    # since the UDF still needs optimization
    std_roi = np.random.choice([True, False], tuple(ds.shape.nav),
                               p=[0.05, 0.95])
    roi = np.ones((y, x), dtype=bool)
    # roi = np.zeros((y, x), dtype=bool)
    # roi[:, :50] = True
    stddev_res = run_stddev(ctx=ctx, dataset=ds, roi=std_roi * roi)
    ref_frame = stddev_res['std']
    # sum_res = ctx.run_udf(udf=SumUDF(), dataset=ds)
    # ref_frame = sum_res['intensity'].data
    update_data(target_data_item, ref_frame)

    peaks = peak_local_max(ref_frame, min_distance=3, num_peaks=500)
    masks = sparse.COO(
        shape=(len(peaks), fy, fx),
        coords=(range(len(peaks)), peaks[..., 0], peaks[..., 1]),
        data=1
    )
    feature_udf = ApplyMasksUDF(
        mask_factories=lambda: masks,
        mask_dtype=np.uint8,
        mask_count=len(peaks),
        use_sparse=True
    )
    feature_res = ctx.run_udf(udf=feature_udf, dataset=ds, roi=roi)
    f = feature_res['intensity'].raw_data.astype(np.float32)
    f = np.log(f - np.min(f) + 1)
    feature_vector = f / np.abs(f).mean(axis=0)

    # too slow
    # nion_peaks = peaks / tuple(ds.shape.sig)
    # with api.library.data_ref_for_data_item(target_data_item):
    #     for p in nion_peaks:
    #         target_data_item.add_ellipse_region(*p, 0.01, 0.01)

    connectivity = scipy.sparse.csc_matrix(
        grid_to_graph(
            # Transposed!
            n_x=y,
            n_y=x,
        )
    )
    roi_connectivity = connectivity[roi.flatten()][:, roi.flatten()]
    threshold = interactive.get_float("Cluster distance threshold: ", 10)
    clusterer = AgglomerativeClustering(
        affinity='euclidean',
        distance_threshold=threshold,
        n_clusters=None,
        linkage='ward',
        connectivity=roi_connectivity,
    )
    clusterer.fit(feature_vector)
    labels = np.zeros((y, x), dtype=np.int32)
    labels[roi] = clusterer.labels_ + 1
    new_data = api.library.create_data_item_from_data(labels)
    window.display_data_item(new_data)
def _build_parcellations(all_subjects_data, mask, n_parcellations=100,
                         n_parcels=1000, n_bootstrap_samples=None,
                         random_state=None, memory=Memory(cachedir=None),
                         n_jobs=1, verbose=False):
    """Build the parcellations for the RPBI framework.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
      Masked subject images as an array.

    mask : ndarray of booleans
      Mask that has been applied on the initial images to obtain
      `all_subjects_data`.

    n_parcellations : int,
      The number of parcellations to be built and used to extract
      signal averages from the data.

    n_parcels : int,
      Number of parcels for the parcellations.

    n_bootstrap_samples : int,
      Number of subjects to be used to build the parcellations. The subjects
      are randomly drawn with replacement. If set to None, n_samples subjects
      are drawn, which corresponds to a bootstrap draw.

    random_state : int,
      Random numbers seed for reproducible results.

    memory : instance of joblib.Memory or string
      Used to cache the masking process. By default, no caching is done.
      If a string is given, it is the path to the caching directory.

    n_jobs : int,
      Number of parallel workers. If 0 is provided, all CPUs are used. A
      negative number indicates that all the CPUs except (|n_jobs| - 1) ones
      will be used.

    verbose : boolean,
      Activate verbose mode (default is False).

    Returns
    -------
    parcelled_data : np.ndarray, shape=(n_parcels_tot, n_subjs)
      Data for all subjects after mean signal extraction with all the
      parcellations that have been created.

    ward_labels : np.ndarray, shape=(n_vox * n_wards, )
      Voxel-to-parcel map for all the parcellations. Useful to perform
      inverse transforms.

    TODO
    ----
    - Deal with NaNs in the original data (WardAgglomeration cannot fit when
      NaNs are present in the data). Median imputation?

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    n_jobs = check_n_jobs(n_jobs)

    n_samples = all_subjects_data.shape[0]
    if n_bootstrap_samples is None:
        n_bootstrap_samples = n_samples

    # Compute connectivity
    shape = mask.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask)

    # Build parcellations
    draw = rng.randint(n_samples,
                       size=n_bootstrap_samples * n_parcellations)
    draw = draw.reshape((n_parcellations, -1))
    ret = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(cache(_ward_fit_transform, memory, verbose=verbose))(
            all_subjects_data, draw[i], connectivity, n_parcels,
            i * n_parcels)
        for i in range(n_parcellations))
    # reduce results
    parcelled_data_parts, ward_labels = zip(*ret)
    parcelled_data = np.hstack(parcelled_data_parts)
    ward_labels = np.ravel(ward_labels)

    return parcelled_data, ward_labels
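# A small call sketch for _build_parcellations above, with purely synthetic
# shapes: 8 subjects on a 5 x 5 x 5 grid, 2 parcellations of 10 parcels each.
# It assumes the module context provides _ward_fit_transform, cache,
# check_n_jobs and the other helpers the function itself relies on.
import numpy as np

mask_demo = np.ones((5, 5, 5), dtype=bool)
data_demo = np.random.randn(8, mask_demo.sum())
parcelled_data, ward_labels = _build_parcellations(
    data_demo, mask_demo, n_parcellations=2, n_parcels=10, random_state=0)
# parcelled_data stacks the per-parcellation mean signals; ward_labels
# concatenates the voxel-to-parcel maps (with per-parcellation offsets).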
def regionprops_3D(im):
    r"""
    Calculates various metrics for each labeled region in a 3D image.

    The ``regionprops`` method in **skimage** is very thorough for 2D images,
    but is a bit limited when it comes to 3D images, so this function aims
    to fill this gap.

    Parameters
    ----------
    im : array_like
        An image containing at least one labeled region. If a boolean image
        is received then the ``True`` voxels are treated as a single region
        labeled ``1``. Regions labeled 0 are ignored in all cases.

    Returns
    -------
    An augmented version of the list returned by skimage's ``regionprops``.
    Information, such as ``volume``, can be found for region A using the
    following syntax: ``result[A-1].volume``.

    Notes
    -----
    This function may seem slow compared to the skimage version, but that is
    because skimage defers calculation of certain properties until they are
    accessed, while this one evaluates everything (including the deferred
    properties from skimage's ``regionprops``).

    Regions can be identified using a watershed algorithm, which can be a bit
    tricky to obtain desired results. *PoreSpy* includes the SNOW algorithm,
    which may be helpful.

    """
    print('_' * 60)
    print('Calculating regionprops')

    results = regionprops(im, coordinates='xy')
    for i in tqdm(range(len(results))):
        mask = results[i].image
        mask_padded = sp.pad(mask, pad_width=1, mode='constant')
        temp = spim.distance_transform_edt(mask_padded)
        dt = extract_subsection(temp, shape=mask.shape)
        # ---------------------------------------------------------------
        # Slice indices
        results[i].slice = results[i]._slice
        # ---------------------------------------------------------------
        # Volume of regions in voxels
        results[i].volume = results[i].area
        # ---------------------------------------------------------------
        # Volume of bounding box, in voxels
        results[i].bbox_volume = sp.prod(mask.shape)
        # ---------------------------------------------------------------
        # Create an image of the border
        results[i].border = dt == 1
        # ---------------------------------------------------------------
        # Create an image of the maximal inscribed sphere
        r = dt.max()
        inv_dt = spim.distance_transform_edt(dt < r)
        results[i].inscribed_sphere = inv_dt < r
        # ---------------------------------------------------------------
        # Find surface area using marching cubes and analyze the mesh
        tmp = sp.pad(sp.atleast_3d(mask), pad_width=1, mode='constant')
        tmp = spim.convolve(tmp, weights=ball(1)) / 5
        verts, faces, norms, vals = marching_cubes_lewiner(volume=tmp,
                                                           level=0)
        results[i].surface_mesh_vertices = verts
        results[i].surface_mesh_simplices = faces
        area = mesh_surface_area(verts, faces)
        results[i].surface_area = area
        # ---------------------------------------------------------------
        # Find sphericity
        vol = results[i].volume
        r = (3 / 4 / sp.pi * vol)**(1 / 3)
        a_equiv = 4 * sp.pi * (r)**2
        a_region = results[i].surface_area
        results[i].sphericity = a_equiv / a_region
        # ---------------------------------------------------------------
        # Find skeleton of region
        results[i].skeleton = skeletonize_3d(mask)
        # ---------------------------------------------------------------
        # Volume of convex image, equal to area in 2D, so just translating
        results[i].convex_volume = results[i].convex_area
        # ---------------------------------------------------------------
        # Convert region grid to a graph
        am = grid_to_graph(*mask.shape, mask=mask)
        results[i].graph = am

    return results
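# A short usage sketch for regionprops_3D above, on a toy boolean volume.
# The cube size is arbitrary, and the helpers it relies on
# (extract_subsection, tqdm, marching_cubes_lewiner, ...) are assumed to be
# importable in this module, as in the function itself.
import numpy as np

im_demo = np.zeros((40, 40, 40), dtype=bool)
im_demo[10:30, 10:30, 10:30] = True  # one cubic region, labeled 1
props = regionprops_3D(im_demo)
print(props[0].volume)       # voxel count of region 1
print(props[0].sphericity)   # equivalent-sphere area / mesh surface area
print(props[0].graph.shape)  # adjacency graph over the region's voxels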
def generate_clustering_per_region(region):
    """
    Generate clustering from a series of 2D slices pertaining to a region
    (e.g. cervical).

    :param region: name of the region to process
    :return:
    """
    use_mask = True

    # Load data
    logger.info("Load data...")
    nii = nib.load(params.file_prefix_all + region + ext)
    data = nii.get_fdata()

    # Crop around the spinal cord, and only keep half of it. The way the
    # atlas was built, the right and left sides are perfectly symmetrical
    # (mathematical average). Hence, we can discard one half without losing
    # information.
    # TODO: parametrize this, and find center automatically
    # TODO: find cropping values per region
    if region == 'cervical' or region == 'lumbar':
        xmin, xmax = (45, 110)
        ymin, ymax = (75, 114)
    elif region == 'thoracic':
        xmin, xmax = (53, 100)
        ymin, ymax = (75, 105)
    else:
        xmin, xmax = (55, 94)
        ymin, ymax = (75, 95)
    data_crop = data[xmin:xmax, ymin:ymax, :]
    del data

    # If we have a mask of the white matter, we load it and crop it according
    # to the data_crop shape.
    if use_mask:
        # Load data
        nii_mask = nib.load(params.file_mask_prefix + region + ext)
        mask = nii_mask.get_data()
        # Crop, binarize
        mask_crop = mask[xmin:xmax, ymin:ymax, :] > 0.5
    else:
        mask_crop = np.ones(data_crop.shape[0:3]) > 0.5

    # Reshape
    ind_mask = np.where(mask_crop)
    mask1d = np.squeeze(mask_crop.reshape(-1, 1))

    # Standardize data
    logger.info("Standardize data...")
    # original_shape = data_crop.shape[0:3]
    data2d = data_crop.reshape(-1, data_crop.shape[3])
    scaler = StandardScaler()
    data2d_norm = scaler.fit_transform(data2d)
    del data2d

    # Build connectivity matrix
    logger.info("Build connectivity matrix...")
    connectivity = grid_to_graph(n_x=data_crop.shape[0],
                                 n_y=data_crop.shape[1],
                                 n_z=data_crop.shape[2],
                                 mask=mask_crop)
    del data_crop

    # Process Paxinos atlas for display
    nii_paxinos = nib.load(params.file_paxinos + '_' + region + ext)
    paxinos3d = np.mean(nii_paxinos.get_data(), axis=2)
    # Crop data
    paxinos3d = paxinos3d[xmin:xmax, ymin:ymax, :]
    # Clip between 0 and 1.
    # Note: we don't want to normalize, otherwise the background (which
    # should be 0) will have a non-zero value.
    paxinos3d = np.clip(paxinos3d, 0, 1)
    # TODO: crop Paxinos

    # Perform clustering
    logger.info("Run clustering...")
    num_clusters = [8, 10]  # [5, 6, 7, 8, 9, 10, 11]
    for n_cluster in num_clusters:
        logger.info("Number of clusters: {}".format(n_cluster))
        clustering = AgglomerativeClustering(linkage="ward",
                                             n_clusters=n_cluster,
                                             connectivity=connectivity)
        clustering.fit(data2d_norm[mask1d, :])
        logger.info("Reshape labels...")
        labels = np.zeros_like(mask_crop, dtype=np.int)
        # We add +1 because sklearn's first label has value "0", and we are
        # now going to use "0" as the background (i.e. not a label).
        labels[ind_mask] = clustering.labels_ + 1
        del clustering

        # Display clustering results
        logger.info("Generate figures...")
        fig = plt.figure(figsize=(20, 20))
        fig.subplots_adjust(hspace=0.1, wspace=0.1)
        for i in range(len(levels)):
            ax = fig.add_subplot(4, 4, i + 1)
            ax.imshow(labels[:, :, i], cmap='Spectral')
            plt.title("iz = {}".format(i), pad=18)
            plt.tight_layout()
        fig.savefig('clustering_results_ncluster{}_{}.png'.format(n_cluster,
                                                                  region))
        fig.clear()

        # Create 4D array: last dimension corresponds to the cluster number.
        # Cluster value is converted to 1.
        a = list(labels.shape)
        a.append(n_cluster)
        labels4d = np.zeros(a)
        for i_label in range(n_cluster):
            ind_label = np.argwhere(labels == i_label + 1)
            for i in ind_label:
                labels4d[i[0], i[1], i[2], i_label] = 1

        # Average across Z. Each cluster is coded between 0 and 1.
        labels3d = np.mean(labels4d, axis=2)

        # Display result of averaging
        logger.info("Generate figures...")
        fig = plt.figure(figsize=(7, 7))
        fig.suptitle('Averaged clusters (N={}) | Region: {}'.format(
            n_cluster, region), fontsize=20)

        # Display Paxinos
        # TODO: generalize BASE_COLORS for more than 8 labels
        ax = fig.add_subplot(1, 2, 1)
        ax.set_facecolor((1, 1, 1))
        for i_label in range(paxinos3d.shape[2]):
            labels_rgb = np.zeros([paxinos3d.shape[0], paxinos3d.shape[1], 4])
            for ix in range(paxinos3d.shape[0]):
                for iy in range(paxinos3d.shape[1]):
                    ind_color = list(params.colors.keys())[i_label]
                    labels_rgb[ix, iy] = colors.to_rgba(
                        params.colors[ind_color], paxinos3d[ix, iy, i_label])
            ax.imshow(labels_rgb)
        plt.axis('off')
        plt.title("Paxinos atlas", pad=18)
        plt.tight_layout()

        # Find label color corresponding best to the Paxinos atlas
        list_color = get_best_matching_color_with_paxinos(im=labels3d,
                                                          imref=paxinos3d)

        # Display clustering
        ax = fig.add_subplot(1, 2, 2)
        for i_label in range(n_cluster):
            labels_rgb = np.zeros([labels3d.shape[0], labels3d.shape[1], 4])
            for ix in range(labels3d.shape[0]):
                for iy in range(labels3d.shape[1]):
                    labels_rgb[ix, iy] = colors.to_rgba(
                        params.colors[list_color[i_label]],
                        labels3d[ix, iy, i_label])
            ax.imshow(labels_rgb)
        plt.axis('off')
        plt.title("Cluster map", pad=18)
        plt.tight_layout()
        fig.subplots_adjust(hspace=0, wspace=0.1)
        fig.savefig('clustering_results_avgz_{}_ncluster{}.png'.format(
            region, n_cluster))

    del data2d_norm
    logger.info("Done!")
kshape = (30, 20)
n_iter = 100
learning_rate = 0.01
n_colors = 100
spcolors = np.random.rand(n_colors, 3)

mapper = SOMMapper(kshape=kshape, n_iter=n_iter, learning_rate=learning_rate)
kohonen = mapper.fit_transform(spcolors)
U_Matrix = build_U_matrix(kohonen, kshape, topology="rect")

fig = plt.figure()
ax1 = fig.add_subplot(211)
ax1.imshow(np.split(kohonen, kshape[0], axis=0))
ax1.set_title("Kohonen Map")

# Clustering
n_clusters = 5  # number of regions
connectivity = grid_to_graph(kshape[0], kshape[1])
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                               connectivity=connectivity).fit(kohonen)
label = np.reshape(ward.labels_, kshape)
for l in range(n_clusters):
    ax1.contour(label == l, contours=1,
                colors=[plt.cm.spectral(l / float(n_clusters)), ])

# Second panel goes below the first (212, not 221, which would overlap ax1)
ax2 = fig.add_subplot(212)
ax2.imshow(np.split(U_Matrix, kshape[0], axis=0))
ax2.set_title("U_Matrix")
plt.show()
def _cluster(self, linkage='complete', k=6):
    C = grid_to_graph(self.x, self.y)
    X = np.array(self.weights).reshape(
        (self.x * self.y, self.weights[0][0].shape[0]))
    clusterer = AgglomerativeClustering(n_clusters=k, connectivity=C,
                                        affinity=self.dtw_fn,
                                        linkage=linkage)
    return clusterer.fit_predict(X)
# #############################################################################
# Generate data
orig_coins = coins()

# Resize it to 20% of the original size to speed up the processing
# Applying a Gaussian filter for smoothing prior to down-scaling
# reduces aliasing artifacts.
smoothened_coins = gaussian_filter(orig_coins, sigma=2)
rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect",
                         **rescale_params)

X = np.reshape(rescaled_coins, (-1, 1))

# #############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*rescaled_coins.shape)

# #############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
st = time.time()
n_clusters = 27  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                               connectivity=connectivity)
ward.fit(X)
label = np.reshape(ward.labels_, rescaled_coins.shape)
print("Elapsed time: ", time.time() - st)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)

# #############################################################################
# is quite simple, as our data are aligned on a rectangular grid.
#
# Let's pretend that our data were smaller -- a 3 × 3 grid. Thinking about
# each voxel as being connected to the other voxels it touches, we would
# need a 9 × 9 adjacency matrix. The first row of this matrix contains the
# voxels in the flattened data that the first voxel touches. Since it touches
# the second element in the first row and the first element in the second row
# (and is also a neighbor to itself), this would be::
#
#     [1, 1, 0, 1, 0, 0, 0, 0, 0]
#
# :mod:`sklearn.feature_extraction` provides a convenient function for this:

from sklearn.feature_extraction.image import grid_to_graph  # noqa: E402

mini_adjacency = grid_to_graph(3, 3).toarray()
assert mini_adjacency.shape == (9, 9)
print(mini_adjacency[0])

# %%
# In general the adjacency between voxels can be more complex, such as
# those between sensors in 3D space, or time-varying activation at brain
# vertices on a cortical surface. MNE provides several convenience functions
# for computing adjacency matrices, for example:
#
# * :func:`mne.channels.find_ch_adjacency`
# * :func:`mne.stats.combine_adjacency`
#
# See the :ref:`Statistics API <api_reference_statistics>` for a full list.
#
# MNE also ships with numerous built-in channel adjacency matrices from the
def connectivity(shape):
    from sklearn.feature_extraction import image
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2])
    return connectivity
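# Quick usage sketch for the connectivity() helper above; the 4 x 4 x 4
# volume shape is illustrative.
A = connectivity((4, 4, 4))
print(A.shape)  # (64, 64): one graph node per voxel of the flattened volume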
# which voxels are adjacent to each other. In our case this
# is quite simple, as our data are aligned on a rectangular grid.
#
# Let's pretend that our data were smaller -- a 3x3 grid. Thinking about
# each voxel as being connected to the other voxels it touches, we would
# need a 9x9 connectivity matrix. The first row should contain the elements
# in the ``.ravel()``'ed data that it touches. Since it touches the
# second element in the first row and the first element in the second row
# (and is also a neighbor to itself), this would be::
#
#     [1, 1, 0, 1, 0, 0, 0, 0, 0]
#
# :mod:`sklearn.feature_extraction` provides a convenient function for this:

from sklearn.feature_extraction.image import grid_to_graph  # noqa: E402

mini_connectivity = grid_to_graph(3, 3).toarray()
assert mini_connectivity.shape == (9, 9)
print(mini_connectivity[0])
del mini_connectivity

###############################################################################
# In general the connectivity between voxels can be more complex, such as
# those between sensors in 3D space, or time-varying activation at brain
# vertices on a cortical surface. MNE provides several convenience functions
# for computing connectivity/neighbor/adjacency matrices, see the
# :ref:`Statistics API <api_reference_statistics>`.
#
# Standard clustering
# ~~~~~~~~~~~~~~~~~~~
# Here, since our data are on a grid, we can use ``connectivity=None`` to
# trigger optimized grid-based code, and run the clustering algorithm.
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(
                n_clusters=10, connectivity=connectivity,
                memory=tempdir,
                linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert_true(np.size(np.unique(labels)) == 10)
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that we raise a ValueError with a connectivity matrix of the
        # wrong shape
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(
                connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)
        # Test that using ward with another metric than euclidean raises an
        # exception
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=connectivity.toarray(),
            affinity="manhattan",
            linkage="ward")
        assert_raises(ValueError, clustering.fit, X)

    # Test that using another metric than euclidean works with linkage
    # complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10,
            connectivity=None,
            affinity=affinity,
            linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
                                                         clustering.labels_),
                            1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
mkdir(write_dir)

###############################################################################
# Global parameters
n_clusters = 5000
test_set = ['left button press (auditory cue)']
do_soft_threshold = False

nifti_masker = NiftiMasker(mask=mask_image, smoothing_fwhm=False,
                           standardize=False, memory='nilearn_cache')
shape = mask.shape
connectivity = grid_to_graph(n_x=shape[0], n_y=shape[1], n_z=shape[2],
                             mask=mask)

# cross-validation scheme
subject_label = np.repeat(np.arange(n_subjects), len(ref))
cv = ShuffleSplit(n_subjects, n_iter=20, train_size=.9, test_size=.1,
                  random_state=2)


def do_parcel_connectivity(mask, n_clusters, ward):
    # Estimate parcel connectivity
    import scipy.sparse as sps
    n_voxels = mask.sum()
from sklearn.cluster import AgglomerativeClustering

# generate data
lena = sp.misc.lena()
# downsample the image by a factor of 4
lena = lena[::2, ::2] + \
    lena[1::2, ::2] + \
    lena[::2, 1::2] + \
    lena[1::2, 1::2]
X = np.reshape(lena, (-1, 1))

# define the structure A of the data.
# Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)

# compute clustering
print('Compute structured hierarchical clustering ...')
st = time.time()
n_clusters = 15  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                               connectivity=connectivity).fit(X)
label = np.reshape(ward.labels_, lena.shape)
print('Elapsed time: ', time.time() - st)
print('Number of pixels: ', label.size)
print('Number of clusters: ', np.unique(label).size)
def _raw_fit(self, data):
    """ Fits the parcellation method on this reduced data.

    Data are coming from a base decomposition estimator which computes
    the mask and reduces the dimensionality of images using randomized_svd.

    Parameters
    ----------
    data: ndarray
        Shape (n_samples, n_features)

    Returns
    -------
    labels: numpy.ndarray
        Labels to each cluster in the brain.

    connectivity: numpy.ndarray
        voxel-to-voxel connectivity matrix computed from a mask.
        Note that this attribute is returned only for selected methods
        such as 'ward', 'complete', 'average'.
    """
    valid_methods = self.VALID_METHODS
    if self.method is None:
        raise ValueError("Parcellation method is specified as None. "
                         "Please select one of the method in "
                         "{0}".format(valid_methods))
    if self.method is not None and self.method not in valid_methods:
        raise ValueError("The method you have selected is not implemented "
                         "'{0}'. Valid methods are in {1}".format(
                             self.method, valid_methods))

    # We delay importing Ward or AgglomerativeClustering, and at the same
    # time import the plotting module before that. Because sklearn.cluster
    # imports scipy hierarchy, and hierarchy imports matplotlib, we force
    # importing matplotlib first through our plotting module to avoid a
    # backend display error with matplotlib happening in Travis.
    try:
        from nilearn import plotting
    except Exception:
        pass

    components = MultiPCA._raw_fit(self, data)

    mask_img_ = self.masker_.mask_img_
    if self.verbose:
        print("[{0}] computing {1}".format(self.__class__.__name__,
                                           self.method))

    if self.method == 'kmeans':
        from sklearn.cluster import MiniBatchKMeans
        kmeans = MiniBatchKMeans(n_clusters=self.n_parcels,
                                 init='k-means++',
                                 random_state=self.random_state,
                                 verbose=max(0, self.verbose - 1))
        labels = self._cache(_estimator_fit,
                             func_memory_level=1)(components.T, kmeans)
    elif self.method == 'rena':
        rena = ReNA(mask_img_, n_clusters=self.n_parcels,
                    scaling=self.scaling, n_iter=self.n_iter,
                    memory=self.memory, memory_level=self.memory_level,
                    verbose=max(0, self.verbose - 1))
        method = 'rena'
        labels = \
            self._cache(_estimator_fit, func_memory_level=1)(components.T,
                                                             rena, method)
    else:
        mask_ = _safe_get_data(mask_img_).astype(np.bool)
        shape = mask_.shape
        connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                           n_z=shape[2], mask=mask_)
        from sklearn.cluster import AgglomerativeClustering
        agglomerative = AgglomerativeClustering(n_clusters=self.n_parcels,
                                                connectivity=connectivity,
                                                linkage=self.method,
                                                memory=self.memory)
        labels = self._cache(_estimator_fit,
                             func_memory_level=1)(components.T,
                                                  agglomerative)
        self.connectivity_ = connectivity

    # Avoid 0 label
    labels = labels + 1
    unique_labels = np.unique(labels)

    # Check that the appropriate number of labels was created
    if len(unique_labels) != self.n_parcels:
        n_parcels_warning = ('The number of generated labels does not '
                             'match the requested number of parcels.')
        warnings.warn(message=n_parcels_warning,
                      category=UserWarning,
                      stacklevel=3)
    self.labels_img_ = self.masker_.inverse_transform(labels)

    return self
    brain[labels == i] = 0

# Spatial smoothing to encourage smooth parcels
dim = np.shape(brain)
tc = tc.reshape((dim[0], dim[1], dim[2], -1))
n_tpts = tc.shape[-1]
for t in np.arange(n_tpts):
    tc[:, :, :, t] = gaussian_filter(tc[:, :, :, t], sigma=1)
tc = tc.reshape((-1, n_tpts))
tc = tc[brain.ravel() == 1, :]

# Functional parcellation with Ward clustering
print("Performing Ward Clustering")
mem = Memory(cachedir='.', verbose=1)
# Define connectivity based on brain mask
A = grid_to_graph(n_x=brain.shape[0], n_y=brain.shape[1],
                  n_z=brain.shape[2], mask=brain)
# Create ward object
ward = WardAgglomeration(n_clusters=n_parcels, connectivity=A.tolil(),
                         memory=mem)
ward.fit(tc.T)
template = np.zeros((dim[0], dim[1], dim[2]))
# labels start from 0, which is used for background
template[brain == 1] = ward.labels_ + 1

# Remove single voxels not connected to parcel
# for i in np.unique(template)[1:]:
#     labels, n_labels = label(template == i, structure=np.ones((3, 3, 3)))
#     if n_labels > 1:
#         for j in np.arange(n_labels):
#             if np.sum(labels == j + 1) < 10:
#                 template[labels == j + 1] = 0

# Saving the template
def test_permutation_connectivity_equiv():
    """Test cluster level permutations with and without connectivity."""
    try:
        try:
            from sklearn.feature_extraction.image import grid_to_graph
        except ImportError:
            from scikits.learn.feature_extraction.image import grid_to_graph
    except ImportError:
        return
    rng = np.random.RandomState(0)
    # subjects, time points, spatial points
    n_time = 2
    n_space = 4
    X = rng.randn(6, n_time, n_space)
    # add some significant points
    X[:, :, 0:2] += 10  # span two time points and two spatial points
    X[:, 1, 3] += 20  # span one time point
    max_steps = [1, 1, 1, 2, 1]
    # This will run the full algorithm in two ways, then the ST-algorithm
    # in two ways. All of these should give the same results.
    conns = [None,
             grid_to_graph(n_time, n_space),
             grid_to_graph(1, n_space),
             grid_to_graph(1, n_space),
             None]
    stat_map = None
    thresholds = [2, 2, 2, 2, dict(start=0.01, step=1.0)]
    sig_counts = [2, 2, 2, 2, 5]
    stat_fun = partial(ttest_1samp_no_p, sigma=1e-3)

    cs = None
    ps = None
    for thresh, count, max_step, conn in zip(thresholds, sig_counts,
                                             max_steps, conns):
        t, clusters, p, H0 = \
            permutation_cluster_1samp_test(
                X, threshold=thresh, connectivity=conn, n_jobs=2,
                max_step=max_step, stat_fun=stat_fun)
        # make sure our output datatype is correct
        assert_true(isinstance(clusters[0], np.ndarray))
        assert_true(clusters[0].dtype == bool)
        assert_array_equal(clusters[0].shape, X.shape[1:])

        # make sure all comparisons were done; for TFCE, no perm
        # should come up empty
        inds = np.where(p < 0.05)[0]
        assert_equal(len(inds), count)
        if isinstance(thresh, dict):
            assert_equal(len(clusters), n_time * n_space)
            assert_true(np.all(H0 != 0))
            continue
        this_cs = [clusters[ii] for ii in inds]
        this_ps = p[inds]
        this_stat_map = np.zeros((n_time, n_space), dtype=bool)
        for ci, c in enumerate(this_cs):
            if isinstance(c, tuple):
                this_c = np.zeros((n_time, n_space), bool)
                for x, y in zip(c[0], c[1]):
                    this_stat_map[x, y] = True
                    this_c[x, y] = True
                this_cs[ci] = this_c
                c = this_c
            this_stat_map[c] = True
        if cs is None:
            ps = this_ps
            cs = this_cs
        if stat_map is None:
            stat_map = this_stat_map
        assert_array_equal(ps, this_ps)
        assert_true(len(cs) == len(this_cs))
        for c1, c2 in zip(cs, this_cs):
            assert_array_equal(c1, c2)
        assert_array_equal(stat_map, this_stat_map)
def Clustering(image, amount_noise):
    # transforming image to appropriate input for clustering
    flatImg = np.reshape(image, [-1, 3])

    # Using MeanShift algorithm
    # Estimating bandwidth for meanshift algorithm
    bandwidth = estimate_bandwidth(flatImg, quantile=0.1, n_samples=100)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    print(". Using MeanShift Algorithm with", amount_noise, "noise.")
    ms.fit(flatImg)
    labels = ms.labels_

    # Finding and displaying the number of clusters
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print(". Number of estimated clusters using MeanShift: %d" % n_clusters_)

    # Displaying segmented image using MeanShift
    ms_segmentedImg = np.reshape(labels, original_image_shape[:2])
    ms_segmentedImg = label2rgb(ms_segmentedImg) * 255
    cv2.imshow("MeanShift segments", ms_segmentedImg.astype(np.uint8))
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # cv2.imwrite("MeanShiftSegmentedImage.png", ms_segmentedImg)
    print(". Done!")
    print(". Calculating scores")
    CalculatingMetrics(a_image, ms_segmentedImg)

    # Agglomerative clustering algorithm
    x, y, z = original_image.shape
    connectivity = grid_to_graph(n_x=x, n_y=y)
    print(". Using Agglomerative Clustering Algorithm with", amount_noise,
          "noise.")
    ac = AgglomerativeClustering(n_clusters=n_clusters_, linkage="ward",
                                 connectivity=connectivity)
    ac.fit(flatImg)
    labels = ac.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print(" Number of estimated clusters using Agglomerative clustering: %d"
          % n_clusters_)

    # Displaying segmented image using Agglomerative clustering
    ac_segmentedImg = np.reshape(labels, original_image_shape[:2])
    ac_segmentedImg = label2rgb(ac_segmentedImg) * 255
    cv2.imshow("Agglomerative clustering segmented image", ac_segmentedImg)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # cv2.imwrite("AgglomerativeSegmentedImage.png", ac_segmentedImg)
    print(". Done!")
    print(". Calculating scores")
    CalculatingMetrics(a_image, ac_segmentedImg)

    # KMeans algorithm
    print(". Using KMeans Clustering Algorithm with", amount_noise, "noise.")
    km = KMeans()
    km.fit(flatImg)
    labels = km.labels_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print(" Number of estimated clusters using KMeans: %d" % n_clusters_)

    # Displaying segmented image using KMeans
    km_segmentedImg = np.reshape(labels, original_image_shape[:2])
    km_segmentedImg = label2rgb(km_segmentedImg) * 255
    cv2.imshow("KMeans segmented image", km_segmentedImg)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # cv2.imwrite("KMeansSegmentedImage.png", km_segmentedImg)
    print(". Done!")
    print(". Calculating scores")
    CalculatingMetrics(a_image, km_segmentedImg)
def superpixel_tSVD(signals, Niter=3, nclusters=5, alpha=0.1,
                    grid_shape=None, min_ncomps=1, max_ncomps=100,
                    do_cleanup_label_maps=False, return_components=True):
    approx = []
    sh = signals.shape
    connectivity_ward = None
    if grid_shape is not None:
        connectivity_ward = grid_to_graph(*grid_shape)

    labels = None  # just to put this name into the outer context
    comps = {}

    if connectivity_ward is None:
        clusterer = clustering_dispatcher_['minibatchkmeans'](nclusters)
        clusterer.batch_size = min(clusterer.batch_size, len(signals))
        if clusterer.init_size is None:
            clusterer.init_size = 3 * nclusters
        clusterer.init_size = max(3 * nclusters, clusterer.init_size)
    else:
        clusterer = skclust.AgglomerativeClustering(
            nclusters, connectivity=connectivity_ward)

    for k in range(Niter):
        # could also "improve" signals for labeling by smoothing or
        # projection to low-rank spaces
        if nclusters > 1:
            label_signals = signals if k == 0 else np.mean(approx, 0)
            labels = clusterer.fit_predict(label_signals)
            if do_cleanup_label_maps:
                labels = cleanup_cluster_map(labels.reshape((len(labels), 1)),
                                             min_neighbors=2,
                                             niter=10).ravel()
        else:
            # one label per row (pixel) of the signals array
            labels = np.ones(len(signals), dtype=int)

        # alpha = k / Niter
        update_signals = ((1 - alpha) * signals + alpha * np.mean(approx, 0)
                          if k > 0 else signals)
        update = np.zeros_like(update_signals)
        comps = {}
        for ll in np.unique(labels):
            group = labels == ll
            u, s, vh = simple_tSVD(signals[group])
            comps[ll] = (u, s, vh)
            app = u @ np.diag(s) @ vh
            update[group] = app
        approx.append(update)

    if return_components:
        Ulist, Slist, Vhlist = [], [], []
        for ll in comps:
            u, s, vh = comps[ll]
            Slist.append(s)
            ui = np.zeros((sh[0], len(s)))
            ui[labels == ll] = u
            Ulist.append(ui)
            Vhlist.append(vh)
        U = np.hstack(Ulist)
        S = np.concatenate(Slist)
        Vh = np.vstack(Vhlist)
        return U, S, Vh
    else:
        kstart = 1 if Niter > 1 else 0
        approx = np.mean(approx[kstart:], 0)
        return approx
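# An illustrative call for superpixel_tSVD above (a sketch, assuming the
# module-level helpers simple_tSVD and clustering_dispatcher_ are available):
# 16 x 16 pixels observed over 50 frames, clustered on a grid.
import numpy as np

sig = np.random.randn(16 * 16, 50)
U, S, Vh = superpixel_tSVD(sig, Niter=2, nclusters=4, grid_shape=(16, 16))
# U stacks per-cluster spatial components; S and Vh hold the corresponding
# singular values and temporal components.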
def BMA_consensus_cluster_parallel(cfg, remote_path, remote_BOLD_fn,
                                   remote_mask_fn, Y, nifti_masker,
                                   num_vox, K_clus, K_clusters,
                                   parc, alpha, prop, nbItRFIR, onsets,
                                   durations, output_sub_parc,
                                   rescale=True, averg_bold=False):
    '''
    Performs all steps for one clustering case (K_clus given, number l
    of the parcellation given).
    remote_path: path on the cluster, where results will be stored
    '''
    import os
    import sys
    sys.path.append("/home/pc174679/pyhrf/pyhrf-tree_trunk/script/WIP/Scripts_IRMf_BB/Parcellations/")
    sys.path.append("/home/pc174679/pyhrf/pyhrf-tree_trunk/script/WIP/Scripts_IRMf_Adultes_Solv/")
    sys.path.append("/home/pc174679/pyhrf/pyhrf-tree_trunk/script/WIP/Scripts_IRMf_Adultes_Solv/Scripts_divers_utiles/Scripts_utiles/")
    sys.path.append('/home/pc174679/local/installations/consensus-cluster-0.6')

    from Random_parcellations import random_parcellations, subsample_data_on_time
    from Divers_parcellations_test import *
    from RFIR_evaluation_parcellations import JDE_estim, RFIR_estim, \
        clustering_from_RFIR
    from Random_parcellations import hrf_roi_to_vox
    from pyhrf.tools._io import remote_copy, remote_mkdir
    from nisl import io

    # nifti_masker.mask = remote_mask_fn

    # Creation of the necessary paths --> do not do here
    parc_name = 'Subsampled_data_with_' + str(K_clus) + 'clusters'
    parc_name_clus = parc_name + 'rnd_number_' + str(parc + 1)

    remote_sub = os.sep.join((remote_path, parc_name))
    # if not os.path.exists(remote_sub):
    #     os.makedirs(remote_sub)
    remote_sub_parc = os.sep.join((remote_sub, parc_name_clus))
    # if not os.path.exists(remote_sub_parc):
    #     os.makedirs(remote_sub_parc)

    output_RFIR_parc = os.sep.join((output_sub_parc, 'RFIR_estim'))

    ###################################
    # 1st STEP: SUBSAMPLING
    print('--- Subsample data ---')
    Ysub = subsample_data_on_time(Y, remote_mask_fn, K_clus, alpha, prop,
                                  nifti_masker, rescale=rescale)
    print('Ysub:', Ysub)
    print('remote_sub_parc:', remote_sub_parc)
    Ysub_name = ('Y_sub_' + str(K_clus) + 'clusters_' + 'rnd_number_' +
                 str(parc + 1) + '.nii')
    Ysub_fn = os.sep.join((remote_sub_parc, Ysub_name))
    Ysub_masked = nifti_masker.inverse_transform(Ysub).get_data()
    write_volume(Ysub_masked, Ysub_fn)

    ###################################
    # 2nd STEP: RFIR
    print('--- Perform RFIR estimation ---')
    remote_RFIR_parc_clus = os.sep.join((remote_sub_parc, 'RFIR_estim'))
    # if not os.path.exists(remote_RFIR_parc): os.makedirs(remote_RFIR_parc)
    # remote_RFIR_parc_clus = os.sep.join((remote_RFIR_parc, parc_name_clus))
    # if not os.path.exists(remote_RFIR_parc_clus):
    #     os.makedirs(remote_RFIR_parc_clus)

    print(' * output path for RFIR ', remote_RFIR_parc_clus)
    print(' * RFIR for subsampling nb ', str(parc + 1), ' with ', K_clus,
          ' clusters')
    # NOTE: the original passed the undefined name `remote_RFIR_parc` here
    # (it is only created in the commented-out code above); the defined
    # output path is used instead
    RFIR_estim(nbItRFIR, onsets, durations, Ysub_fn, remote_mask_fn,
               remote_RFIR_parc_clus, avg_bold=averg_bold)

    hrf_fn = os.sep.join((remote_RFIR_parc_clus, 'rfir_ehrf.nii'))
    # remote_copy([hrf_fn], remote_host, remote_user, remote_path)[0]

    ###################################
    # 3rd STEP: CLUSTERING FROM RFIR RESULTS
    name_hrf = 'rfir_ehrf.nii'

    from pyhrf.tools._io import write_volume, read_volume
    import nisl.io as ionisl
    from sklearn.feature_extraction import image
    from sklearn.cluster import WardAgglomeration
    from scipy.spatial.distance import cdist, pdist

    hrf_fn = os.sep.join((remote_RFIR_parc_clus, name_hrf))
    hrf = read_volume(hrf_fn)[0]
    hrf_t_fn = add_suffix(hrf_fn, 'transpose')
    # take only the 1st condition to parcellate
    write_volume(hrf[:, :, :, :, 0], hrf_t_fn)

    nifti_masker = ionisl.NiftiMasker(remote_mask_fn)
    Nm = nifti_masker.fit(hrf_t_fn)

    # features: coefficients of the HRF
    HRF = Nm.fit_transform(hrf_t_fn)

    mask, meta_data = read_volume(remote_mask_fn)
    shape = mask.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask)

    # features used for clustering
    features = HRF.transpose()

    ward = WardAgglomeration(n_clusters=K_clus, connectivity=connectivity,
                             memory='nisl_cache')
    ward.fit(HRF)
    labels_tot = ward.labels_ + 1

    # Kelbow, Perc_WSS, all_parc_from_RFIR_fns, all_parc_RFIR = \
    #     clustering_from_RFIR(K_clusters, remote_RFIR_parc_clus,
    #                          remote_mask_fn, name_hrf, plots=False)
    # labels_tot = all_parc_RFIR[str(Kelbow)]
    # to retrieve the clustering with as many clusters as determined
    # in K_clusters:
    # labels_tot = all_parc_RFIR[str(K_clus)]
    # Parcellation retrieved: for K=Kelbow
    # clusters_RFIR_fn = all_parc_from_RFIR[str(Kelbow)]
    # clustering_rfir_fn = os.path.join(remote_RFIR_parc_clus,
    #                                   'output_clustering_elbow.nii')
    # write_volume(read_volume(clusters_RFIR_fn)[0], clustering_rfir_fn,
    #              meta_bold)
    # labels_tot = nifti_masker.fit_transform([clusters_RFIR_fn])[0]
    # labels_tot = read_volume(clusters_RFIR_fn)[0]

    # labels_name = 'labels_' + str(int(K_clus)) + '_' + str(parc + 1) + '.pck'
    # name_f = os.sep.join((remote_sub_parc, labels_name))
    # pickle_labels = open(name_f, 'w')
    # cPickle.dump(labels_tot, f)
    # pickle_labels.close()
    # remote_copy(pickle_labels, remote_user, remote_host, output_sub_parc)

    #################################
    # Prepare consensus clustering
    print('Prepare consensus clustering')
    clustcount, totalcount = upd_similarity_matrix(labels_tot)
    print('results:', clustcount)

    return clustcount.astype(np.bool)
def test_cluster_permutation_with_connectivity():
    """Test cluster level permutations with connectivity matrix."""
    try:
        try:
            from sklearn.feature_extraction.image import grid_to_graph
        except ImportError:
            from scikits.learn.feature_extraction.image import grid_to_graph
    except ImportError:
        return

    condition1_1d, condition2_1d, condition1_2d, condition2_2d = \
        _get_conditions()

    n_pts = condition1_1d.shape[1]
    # we don't care about p-values in any of these, so do fewer permutations
    args = dict(seed=None, max_step=1, exclude=None, step_down_p=0,
                t_power=1, threshold=1.67, check_disjoint=False,
                n_permutations=50)

    did_warn = False
    for X1d, X2d, func, spatio_temporal_func in \
            [(condition1_1d, condition1_2d,
              permutation_cluster_1samp_test,
              spatio_temporal_cluster_1samp_test),
             ([condition1_1d, condition2_1d],
              [condition1_2d, condition2_2d],
              permutation_cluster_test,
              spatio_temporal_cluster_test)]:
        out = func(X1d, **args)
        connectivity = grid_to_graph(1, n_pts)
        out_connectivity = func(X1d, connectivity=connectivity, **args)
        assert_array_equal(out[0], out_connectivity[0])
        for a, b in zip(out_connectivity[1], out[1]):
            assert_array_equal(out[0][a], out[0][b])
            assert_true(np.all(a[b]))

        # test spatio-temporal w/o time connectivity (repeat spatial pattern)
        connectivity_2 = sparse.coo_matrix(
            linalg.block_diag(connectivity.asfptype().todense(),
                              connectivity.asfptype().todense()))

        if isinstance(X1d, list):
            X1d_2 = [np.concatenate((x, x), axis=1) for x in X1d]
        else:
            X1d_2 = np.concatenate((X1d, X1d), axis=1)

        out_connectivity_2 = func(X1d_2, connectivity=connectivity_2, **args)
        # make sure we were operating on the same values
        split = len(out[0])
        assert_array_equal(out[0], out_connectivity_2[0][:split])
        assert_array_equal(out[0], out_connectivity_2[0][split:])

        # make sure we really got 2x the number of original clusters
        n_clust_orig = len(out[1])
        assert_true(len(out_connectivity_2[1]) == 2 * n_clust_orig)

        # Make sure that we got the old ones back
        data_1 = set([np.sum(out[0][b[:n_pts]]) for b in out[1]])
        data_2 = set([np.sum(out_connectivity_2[0][a])
                      for a in out_connectivity_2[1][:]])
        assert_true(len(data_1.intersection(data_2)) == len(data_1))

        # now use the other algorithm
        if isinstance(X1d, list):
            X1d_3 = [np.reshape(x, (-1, 2, n_space)) for x in X1d_2]
        else:
            X1d_3 = np.reshape(X1d_2, (-1, 2, n_space))

        out_connectivity_3 = spatio_temporal_func(
            X1d_3, n_permutations=50, connectivity=connectivity,
            max_step=0, threshold=1.67, check_disjoint=True)
        # make sure we were operating on the same values
        split = len(out[0])
        assert_array_equal(out[0], out_connectivity_3[0][0])
        assert_array_equal(out[0], out_connectivity_3[0][1])

        # make sure we really got 2x the number of original clusters
        assert_true(len(out_connectivity_3[1]) == 2 * n_clust_orig)

        # Make sure that we got the old ones back
        data_1 = set([np.sum(out[0][b[:n_pts]]) for b in out[1]])
        data_2 = set([np.sum(out_connectivity_3[0][a[0], a[1]])
                      for a in out_connectivity_3[1]])
        assert_true(len(data_1.intersection(data_2)) == len(data_1))

        # test new versus old method
        out_connectivity_4 = spatio_temporal_func(
            X1d_3, n_permutations=50, connectivity=connectivity,
            max_step=2, threshold=1.67)
        out_connectivity_5 = spatio_temporal_func(
            X1d_3, n_permutations=50, connectivity=connectivity,
            max_step=1, threshold=1.67)

        # clusters could be in a different order
        sums_4 = [np.sum(out_connectivity_4[0][a])
                  for a in out_connectivity_4[1]]
        sums_5 = [np.sum(out_connectivity_4[0][a])
                  for a in out_connectivity_5[1]]
        sums_4 = np.sort(sums_4)
        sums_5 = np.sort(sums_5)
        assert_array_almost_equal(sums_4, sums_5)

        if not _force_serial:
            assert_raises(ValueError, spatio_temporal_func, X1d_3,
                          n_permutations=1, connectivity=connectivity,
                          max_step=1, threshold=1.67, n_jobs=-1000)

        # not enough TFCE params
        assert_raises(KeyError, spatio_temporal_func, X1d_3,
                      connectivity=connectivity, threshold=dict(me='hello'))

        # too extreme a start threshold
        with warnings.catch_warnings(record=True) as w:
            spatio_temporal_func(X1d_3, connectivity=connectivity,
                                 threshold=dict(start=10, step=1))
        if not did_warn:
            assert_true(len(w) == 1)
            did_warn = True

        # wrong start/step signs for a negative tail
        assert_raises(ValueError, spatio_temporal_func, X1d_3,
                      connectivity=connectivity, tail=-1,
                      threshold=dict(start=1, step=-1))
        assert_raises(ValueError, spatio_temporal_func, X1d_3,
                      connectivity=connectivity, tail=-1,
                      threshold=dict(start=-1, step=1))

        # wrong type for threshold
        assert_raises(TypeError, spatio_temporal_func, X1d_3,
                      connectivity=connectivity, threshold=[])

        # wrong value for tail
        assert_raises(ValueError, spatio_temporal_func, X1d_3,
                      connectivity=connectivity, tail=2)

        # make sure it actually found a significant point
        out_connectivity_6 = spatio_temporal_func(
            X1d_3, n_permutations=50, connectivity=connectivity,
            max_step=1, threshold=dict(start=1, step=1))
        assert_true(np.min(out_connectivity_6[2]) < 0.05)
def generate_spatial_data(shape=(40, 40), n_subjects=1, n_parcels=1,
                          mask=None, mu=None, sigma1=None, sigma2=None,
                          model='ward', seed=1, smooth=0, jitter=0.,
                          verbose=0):
    """Generate a dataset

    Parameters
    ==========
    shape: tuple, optional
        dimensions of the spatial model (assuming a grid)
    n_subjects: int, optional,
        the number of subjects considered
    n_parcels: int, optional,
        the number of generated parcels
    mask: array of shape (shape), optional
        domain-defining binary mask
    mu: array of shape (n_parcels), optional
        the mean of the simulated parcels
    sigma1: array of shape (n_parcels), optional
        the first-level variance of the simulated parcels
    sigma2: array of shape (n_parcels), optional
        the second-level variance of the simulated parcels
    model: string, one of ['ward', 'kmeans'], optional
        model used to generate the parcellation
    seed: int, optional,
        random generator seed
    smooth: float, optional,
        posterior smoothing of the data
    jitter: float, optional,
        spatial jitter on the positions
    verbose: boolean, optional,
        verbosity mode

    Returns
    =======
    xyz: array of shape (n_voxels, 3)
        the coordinates of the spatial data
    label: array of shape (n_voxels)
        indexes defining the spatial model
    X: array of shape (n_voxels, 1)
        signal attached to the voxels
    """
    from scipy.ndimage import gaussian_filter

    # Create the spatial model
    if mask is None:
        mask = np.ones(np.prod(shape))
        xyz = np.indices(shape).reshape(len(shape), np.prod(shape)).T
    else:
        xyz = np.vstack(np.where(mask)).T

    if model == 'kmeans':
        spatial_model = KMeans(n_clusters=n_parcels).fit(xyz)
        label = spatial_model.labels_
    elif model == 'ward':
        connectivity = grid_to_graph(*shape, mask=mask).tocsr()
        label = Ward(n_clusters=n_parcels, connectivity=connectivity).fit(
            np.random.randn(mask.sum(), 100)).labels_
        from sklearn import neighbors
        spatial_model = neighbors.KNeighborsClassifier(3)
        spatial_model.fit(xyz, label)
    else:
        raise ValueError('%s: unknown simulation model' % model)

    if jitter > 0:
        labels = [
            spatial_model.predict(
                xyz + jitter * np.random.rand(1, xyz.shape[1]))
            for subj in range(n_subjects)]

    X = np.zeros((xyz.shape[0], n_subjects))

    # Generate the functional data
    if mu is None:
        mu = np.zeros(n_parcels)
    if sigma1 is None:
        sigma1 = np.ones(n_parcels)
    if sigma2 is None:
        sigma2 = np.ones(n_parcels)

    beta_ = np.random.randn(n_subjects)
    for k in range(n_parcels):
        if jitter > 0:
            mask = [label_ == k for label_ in labels]
        else:
            mask = [label == k for subj in range(n_subjects)]
        x, subj = generate_data_jitter(mu[k], sigma1[k], sigma2[k], mask,
                                       seed=seed, beta=beta_ * sigma2[k])
        for n_subj in range(n_subjects):
            X[mask[n_subj], n_subj] = x[subj == n_subj]

    if smooth > 0:
        # smooth the data
        for subj in range(n_subjects):
            X[:, subj] = gaussian_filter(np.reshape(X[:, subj], shape),
                                         sigma=smooth).ravel()

    if verbose:
        fig = plt.figure(figsize=(10, 1.5))
        plt.subplot(1, n_subjects + 1, 1)
        plt.imshow(np.reshape(label, shape), interpolation='nearest',
                   cmap=plt.cm.spectral)
        plt.title('Template')
        plt.axis('off')
        for ns in range(n_subjects):
            plt.subplot(1, n_subjects + 1, 2 + ns)
            plt.imshow(np.reshape(X[:, ns], shape), interpolation='nearest')
            plt.title('subject %d' % ns)
            plt.axis('off')
        plt.subplots_adjust(left=.01, bottom=.01, right=.99, top=.99,
                            wspace=.05, hspace=.01)
        fig.set_figheight(1.5)

    return xyz, label, X
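# Standalone sketch of the spatial model used above: Ward clustering of
# random features under a grid connectivity constraint yields spatially
# contiguous parcels. This uses the modern scikit-learn API, where the
# old `Ward` estimator became AgglomerativeClustering(linkage='ward').
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.image import grid_to_graph

shape = (20, 20)
connectivity = grid_to_graph(*shape)
rng = np.random.RandomState(1)
labels = AgglomerativeClustering(
    n_clusters=5, linkage='ward',
    connectivity=connectivity).fit(rng.randn(np.prod(shape), 100)).labels_
print(np.bincount(labels))  # parcel sizes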
# #############################################################################
# Generate data
try:  # SciPy >= 0.16 has face in scipy.misc
    from scipy.misc import face
    face = face(gray=True)
except ImportError:
    face = sp.face(gray=True)

# Resize it to 10% of the original size to speed up the processing
face = sp.misc.imresize(face, 0.10) / 255.

X = np.reshape(face, (-1, 1))

# #############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*face.shape)

# #############################################################################
# Compute clustering
print("Compute structured hierarchical clustering...")
st = time.time()
n_clusters = 15  # number of regions
ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',
                               connectivity=connectivity)
ward.fit(X)
label = np.reshape(ward.labels_, face.shape)
print("Elapsed time: ", time.time() - st)
print("Number of pixels: ", label.size)
print("Number of clusters: ", np.unique(label).size)
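# The upstream scikit-learn example this snippet follows typically ends by
# drawing the cluster boundaries on top of the image; a sketch along those
# lines, using the `face`, `label`, and `n_clusters` defined above:
import matplotlib.pyplot as plt

plt.figure(figsize=(5, 5))
plt.imshow(face, cmap=plt.cm.gray)
for l in range(n_clusters):
    # contour of the binary mask of cluster l, one colour per cluster
    plt.contour(label == l,
                colors=[plt.cm.nipy_spectral(l / float(n_clusters))])
plt.axis('off')
plt.show()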
def test_cluster_permutation_with_adjacency(numba_conditional):
    """Test cluster level permutations with adjacency matrix."""
    from sklearn.feature_extraction.image import grid_to_graph
    condition1_1d, condition2_1d, condition1_2d, condition2_2d = \
        _get_conditions()

    n_pts = condition1_1d.shape[1]
    # we don't care about p-values in any of these, so do fewer permutations
    args = dict(seed=None, max_step=1, exclude=None, out_type='mask',
                step_down_p=0, t_power=1, threshold=1.67,
                check_disjoint=False, n_permutations=50)
    did_warn = False
    for X1d, X2d, func, spatio_temporal_func in \
            [(condition1_1d, condition1_2d,
              permutation_cluster_1samp_test,
              spatio_temporal_cluster_1samp_test),
             ([condition1_1d, condition2_1d],
              [condition1_2d, condition2_2d],
              permutation_cluster_test,
              spatio_temporal_cluster_test)]:
        out = func(X1d, **args)
        adjacency = grid_to_graph(1, n_pts)
        out_adjacency = func(X1d, adjacency=adjacency, **args)
        assert_array_equal(out[0], out_adjacency[0])
        for a, b in zip(out_adjacency[1], out[1]):
            assert_array_equal(out[0][a], out[0][b])
            assert np.all(a[b])

        # test spatio-temporal w/o time adjacency (repeat spatial pattern)
        adjacency_2 = sparse.coo_matrix(
            linalg.block_diag(adjacency.asfptype().todense(),
                              adjacency.asfptype().todense()))
        # nesting here is time then space:
        adjacency_2a = combine_adjacency(np.eye(2), adjacency)
        assert_array_equal(adjacency_2.toarray().astype(bool),
                           adjacency_2a.toarray().astype(bool))

        if isinstance(X1d, list):
            X1d_2 = [np.concatenate((x, x), axis=1) for x in X1d]
        else:
            X1d_2 = np.concatenate((X1d, X1d), axis=1)

        out_adjacency_2 = func(X1d_2, adjacency=adjacency_2, **args)
        # make sure we were operating on the same values
        split = len(out[0])
        assert_array_equal(out[0], out_adjacency_2[0][:split])
        assert_array_equal(out[0], out_adjacency_2[0][split:])

        # make sure we really got 2x the number of original clusters
        n_clust_orig = len(out[1])
        assert len(out_adjacency_2[1]) == 2 * n_clust_orig

        # Make sure that we got the old ones back
        data_1 = {np.sum(out[0][b[:n_pts]]) for b in out[1]}
        data_2 = {np.sum(out_adjacency_2[0][a])
                  for a in out_adjacency_2[1][:]}
        assert len(data_1.intersection(data_2)) == len(data_1)

        # now use the other algorithm
        if isinstance(X1d, list):
            X1d_3 = [np.reshape(x, (-1, 2, n_space)) for x in X1d_2]
        else:
            X1d_3 = np.reshape(X1d_2, (-1, 2, n_space))

        out_adjacency_3 = spatio_temporal_func(
            X1d_3, n_permutations=50, adjacency=adjacency, max_step=0,
            threshold=1.67, check_disjoint=True)
        # make sure we were operating on the same values
        split = len(out[0])
        assert_array_equal(out[0], out_adjacency_3[0][0])
        assert_array_equal(out[0], out_adjacency_3[0][1])

        # make sure we really got 2x the number of original clusters
        assert len(out_adjacency_3[1]) == 2 * n_clust_orig

        # Make sure that we got the old ones back
        data_1 = {np.sum(out[0][b[:n_pts]]) for b in out[1]}
        data_2 = {np.sum(out_adjacency_3[0][a[0], a[1]])
                  for a in out_adjacency_3[1]}
        assert len(data_1.intersection(data_2)) == len(data_1)

        # test new versus old method
        out_adjacency_4 = spatio_temporal_func(
            X1d_3, n_permutations=50, adjacency=adjacency, max_step=2,
            threshold=1.67)
        out_adjacency_5 = spatio_temporal_func(
            X1d_3, n_permutations=50, adjacency=adjacency, max_step=1,
            threshold=1.67)

        # clusters could be in a different order
        sums_4 = [np.sum(out_adjacency_4[0][a]) for a in out_adjacency_4[1]]
        sums_5 = [np.sum(out_adjacency_4[0][a]) for a in out_adjacency_5[1]]
        sums_4 = np.sort(sums_4)
        sums_5 = np.sort(sums_5)
        assert_array_almost_equal(sums_4, sums_5)

        if not _force_serial:
            pytest.raises(ValueError, spatio_temporal_func, X1d_3,
                          n_permutations=1, adjacency=adjacency,
                          max_step=1, threshold=1.67, n_jobs=-1000)

        # not enough TFCE params
        with pytest.raises(KeyError, match='threshold, if dict, must have'):
            spatio_temporal_func(X1d_3, adjacency=adjacency,
                                 threshold=dict(me='hello'))

        # too extreme a start threshold
        with pytest.warns(None) as w:
            spatio_temporal_func(X1d_3, adjacency=adjacency,
                                 threshold=dict(start=10, step=1))
        if not did_warn:
            assert len(w) == 1
            did_warn = True

        with pytest.raises(ValueError,
                           match='threshold.*<= 0 for tail == -1'):
            spatio_temporal_func(X1d_3, adjacency=adjacency, tail=-1,
                                 threshold=dict(start=1, step=-1))
        with pytest.warns(RuntimeWarning,
                          match='threshold.* is more extreme'):
            spatio_temporal_func(X1d_3, adjacency=adjacency, tail=1,
                                 threshold=dict(start=100, step=1))

        bad_con = adjacency.todense()
        with pytest.raises(ValueError,
                           match='must be a SciPy sparse matrix'):
            spatio_temporal_func(X1d_3, n_permutations=50,
                                 adjacency=bad_con, max_step=1,
                                 threshold=1.67)
        bad_con = adjacency.tocsr()[:-1, :-1].tocoo()
        with pytest.raises(ValueError,
                           match='adjacency.*the correct size'):
            spatio_temporal_func(X1d_3, n_permutations=50,
                                 adjacency=bad_con, max_step=1,
                                 threshold=1.67)
        with pytest.raises(TypeError, match='must be a'):
            spatio_temporal_func(X1d_3, adjacency=adjacency, threshold=[])
        with pytest.raises(ValueError,
                           match="Invalid value for the 'tail'"):
            with pytest.warns(None):  # sometimes ignoring tail
                spatio_temporal_func(X1d_3, adjacency=adjacency, tail=2)

        # make sure it actually found a significant point
        out_adjacency_6 = spatio_temporal_func(
            X1d_3, n_permutations=50, adjacency=adjacency, max_step=1,
            threshold=dict(start=1, step=1))
        assert np.min(out_adjacency_6[2]) < 0.05

        with pytest.raises(ValueError, match='not compatible'):
            with pytest.warns(RuntimeWarning, match='No clusters'):
                spatio_temporal_func(
                    X1d_3, n_permutations=50, adjacency=adjacency,
                    threshold=1e-3,
                    stat_fun=lambda *x: f_oneway(*x)[:-1],
                    buffer_size=None)
def test_cluster_permutation_t_test_with_connectivity():
    """Test cluster level permutations T-test with connectivity matrix."""
    try:
        try:
            from sklearn.feature_extraction.image import grid_to_graph
        except ImportError:
            from scikits.learn.feature_extraction.image import grid_to_graph
    except ImportError:
        return

    out = permutation_cluster_1samp_test(condition1_1d, n_permutations=500)
    connectivity = grid_to_graph(1, condition1_1d.shape[1])
    out_connectivity = permutation_cluster_1samp_test(
        condition1_1d, n_permutations=500, connectivity=connectivity)
    assert_array_equal(out[0], out_connectivity[0])
    for a, b in zip(out_connectivity[1], out[1]):
        assert_true(np.sum(out[0][a]) == np.sum(out[0][b]))
        assert_true(np.all(a[b]))

    # test spatio-temporal with no time connectivity (repeat spatial pattern)
    connectivity_2 = sparse.coo_matrix(
        linalg.block_diag(connectivity.asfptype().todense(),
                          connectivity.asfptype().todense()))
    condition1_2 = np.concatenate((condition1_1d, condition1_1d), axis=1)

    out_connectivity_2 = permutation_cluster_1samp_test(
        condition1_2, n_permutations=500, connectivity=connectivity_2)
    # make sure we were operating on the same values
    split = len(out[0])
    assert_array_equal(out[0], out_connectivity_2[0][:split])
    assert_array_equal(out[0], out_connectivity_2[0][split:])

    # make sure we really got 2x the number of original clusters
    n_clust_orig = len(out[1])
    assert_true(len(out_connectivity_2[1]) == 2 * n_clust_orig)

    # Make sure that we got the old ones back
    n_pts = condition1_1d.shape[1]
    data_1 = set([np.sum(out[0][b[:n_pts]]) for b in out[1]])
    data_2 = set([np.sum(out_connectivity_2[0][a[:n_pts]])
                  for a in out_connectivity_2[1][:]])
    assert_true(len(data_1.intersection(data_2)) == len(data_1))

    # now use the other algorithm
    condition1_3 = np.reshape(condition1_2, (40, 2, 350))
    out_connectivity_3 = mnestats.spatio_temporal_cluster_1samp_test(
        condition1_3, n_permutations=500, connectivity=connectivity,
        max_step=0, threshold=1.67, check_disjoint=True)
    # make sure we were operating on the same values
    split = len(out[0])
    assert_array_equal(out[0], out_connectivity_3[0][0])
    assert_array_equal(out[0], out_connectivity_3[0][1])

    # make sure we really got 2x the number of original clusters
    assert_true(len(out_connectivity_3[1]) == 2 * n_clust_orig)

    # Make sure that we got the old ones back
    data_1 = set([np.sum(out[0][b[:n_pts]]) for b in out[1]])
    data_2 = set([np.sum(out_connectivity_3[0][a[0], a[1]])
                  for a in out_connectivity_3[1]])
    assert_true(len(data_1.intersection(data_2)) == len(data_1))
X -= X.mean(axis=0)
X /= X.std(axis=0)
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
# the fitted attribute is best_estimator_ (with a trailing underscore)
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
def boo(subject_idx=0, cut_coords=None, n_components=20, n_clusters=2000,
        memory='nilearn_cache'):
    mem = Memory(cachedir='nilearn_cache')

    # ## Load the data ########################################################
    print("Fetch the data files from Internet")
    haxby_dataset = datasets.fetch_haxby(n_subjects=subject_idx + 1)
    print("Second, load the labels")
    haxby_labels = np.genfromtxt(haxby_dataset.session_target[0],
                                 skip_header=1, usecols=[0],
                                 dtype=basestring)

    # ## Find voxels of interest ##############################################
    print("Load the data.")
    anat_filename = haxby_dataset.anat[subject_idx]
    anat_img = nibabel.load(anat_filename)
    fmri_filename = haxby_dataset.func[subject_idx]
    fmri_raw_img = nibabel.load(fmri_filename)

    print("Build a mask based on the activations.")
    epi_masker = NiftiMasker(mask_strategy='epi', detrend=True,
                             standardize=True)
    epi_masker = mem.cache(epi_masker.fit)(fmri_raw_img)
    plot_roi(epi_masker.mask_img_, bg_img=anat_img, title='EPI mask',
             cut_coords=cut_coords)

    print("Normalize the (transformed) data")
    # zscore per pixel, over examples
    fmri_masked_vectors = epi_masker.transform(fmri_raw_img)
    fmri_normed_vectors = mem.cache(stats.mstats.zscore)(
        fmri_masked_vectors, axis=0)
    fmri_normed_img = epi_masker.inverse_transform(fmri_normed_vectors)

    print("Smooth the (spatial) data.")
    fmri_smooth_img = mem.cache(image.smooth_img)(fmri_normed_img, fwhm=7)

    print("Mask the MRI data.")
    masked_fmri_vectors = mem.cache(epi_masker.transform)(fmri_smooth_img)

    # ## Compute mean values based on condition matrix ########################
    condition_names = list(np.unique(haxby_labels))
    n_conditions = len(condition_names)
    n_good_voxels = masked_fmri_vectors.shape[1]

    mean_vectors = np.empty((n_conditions, n_good_voxels))
    for ci, condition in enumerate(condition_names):
        condition_vectors = masked_fmri_vectors[haxby_labels == condition, :]
        mean_vectors[ci, :] = condition_vectors.mean(axis=0)

    # ## Use similarity across conditions as the 4th dimension ################
    n_conds = len(condition_names)
    # integer division: this value is used as an array dimension
    n_compares = n_conds * (n_conds - 1) // 2

    p_vectors = np.zeros((n_compares, masked_fmri_vectors.shape[1]))
    comparison_text = []
    comparison_img = []
    idx = 0
    for i, cond in enumerate(condition_names):
        for j, cond2 in enumerate(condition_names[i + 1:]):
            print("Computing ttest for %s vs. %s." % (cond, cond2))
            _, p_vector = stats.ttest_ind(
                masked_fmri_vectors[haxby_labels == cond, :],
                masked_fmri_vectors[haxby_labels == cond2, :],
                axis=0)

            p_vector /= p_vector.max()  # normalize
            p_vector = -np.log10(p_vector)
            p_vector[np.isnan(p_vector)] = 0.
            p_vector[p_vector > 10.] = 10.

            p_img = epi_masker.inverse_transform(p_vector)
            comparison_img.append(p_img)
            comparison_text.append('%s vs. %s' % (cond, cond2))
            p_vectors[idx, :] = p_vector
            idx += 1

    # n_comparisons = n_conditions * (n_conditions - 1) / 2
    # similarity_vectors = np.empty((n_good_voxels, n_comparisons))
    # for vi in np.arange(n_good_voxels):
    #     similarity_vectors[vi, :] = pdist(mean_vectors[:, vi])

    # Compute a connectivity matrix (for constraining the clustering)
    mask_data = epi_masker.mask_img_.get_data().astype(np.bool)
    connectivity = sk_image.grid_to_graph(n_x=mask_data.shape[0],
                                          n_y=mask_data.shape[1],
                                          n_z=mask_data.shape[2],
                                          mask=mask_data)

    # Cluster (#2)
    start = time.time()
    ward = WardAgglomeration(n_clusters=n_clusters,
                             connectivity=connectivity, memory=memory)
    ward.fit(p_vectors)
    print("Ward agglomeration %d clusters: %.2fs" % (
        n_clusters, time.time() - start))

    # Compute an image with one ROI per label, and save to disk
    labels = ward.labels_ + 1  # Avoid 0 label - 0 means mask.
    labels_img = epi_masker.inverse_transform(labels)
    labels_img.to_filename('parcellation.nii')

    # Plot image with len(labels) ROIs, and store
    # the cut coordinates to reuse for all plots
    # and the figure for plotting all to a common axis
    first_plot = plot_roi(labels_img, title="Ward parcellation",
                          bg_img=anat_img)
    plt.show()
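# In later scikit-learn releases `WardAgglomeration` was replaced by
# `FeatureAgglomeration`; under that API the clustering step above would
# read roughly as follows (a sketch reusing the names from the function
# body, same behaviour assumed):
from sklearn.cluster import FeatureAgglomeration

ward = FeatureAgglomeration(n_clusters=n_clusters, linkage='ward',
                            connectivity=connectivity, memory=memory)
ward.fit(p_vectors)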
def execute(self, eopatch):
    """
    :param eopatch: Input EOPatch
    :type eopatch: EOPatch
    :return: Transformed EOPatch
    :rtype: EOPatch
    """
    data = self.construct_data(eopatch)

    # Reshape the data: AgglomerativeClustering only takes a 2D array
    # of feature vectors
    org_shape = data.shape
    data = np.reshape(data, (-1, org_shape[-1]))
    org_length = len(data)

    graph_args = {"n_x": org_shape[0], "n_y": org_shape[1]}
    locations = None

    # All connections to masked pixels are removed
    if self.mask_name is not None:
        mask = eopatch.mask_timeless[self.mask_name].squeeze()
        graph_args["mask"] = mask
        locations = [i for i, elem in enumerate(np.ravel(mask)) if elem == 0]
        data = np.delete(data, locations, axis=0)

    # If connectivity is not set, use pixel-to-pixel connections.
    # Test with `is None` rather than truthiness: sparse matrices
    # do not support bool()
    if self.connectivity is None:
        self.connectivity = grid_to_graph(**graph_args)

    model = AgglomerativeClustering(
        distance_threshold=self.distance_threshold,
        affinity=self.affinity,
        linkage=self.linkage,
        connectivity=self.connectivity,
        n_clusters=self.n_clusters,
        compute_full_tree=self.compute_full_tree,
    )

    model.fit(data)
    trimmed_labels = model.labels_

    if self.remove_small > 0:
        # Count how many pixels each cluster covers
        labels = np.zeros(model.n_clusters_)
        for i in trimmed_labels:
            labels[i] += 1

        # Set to -1 all pixels belonging to too small clusters
        for i, no_lab in enumerate(labels):
            if no_lab < self.remove_small:
                trimmed_labels[trimmed_labels == i] = -1

    # Transform the data back to its original shape, setting all masked
    # regions to -1
    if self.mask_name is not None:
        new_data = [-1] * org_length
        for i, val in zip(np.delete(range(org_length), locations),
                          trimmed_labels):
            new_data[i] = val
        trimmed_labels = new_data

    trimmed_labels = np.reshape(trimmed_labels, org_shape[:-1])

    eopatch[FeatureType.DATA_TIMELESS, self.new_feature_name] = \
        trimmed_labels[..., np.newaxis]

    return eopatch
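# A small standalone illustration of the masking behaviour relied on above:
# grid_to_graph only creates nodes for unmasked pixels, so the connectivity
# matrix matches the data that remains after np.delete.
import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

mask = np.ones((4, 4), dtype=bool)
mask[0, :] = False  # mask out the first row of pixels
graph = grid_to_graph(4, 4, mask=mask)
print(graph.shape)  # (12, 12): one node per unmasked pixel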
signal_mean = 100
signal_sd = 100
noise_sd = 0.01
gaussian_sd = 5
sigma = 1e-3  # sigma for the "hat" method
threshold = -stats.distributions.t.ppf(0.05, n_subjects - 1)
threshold_tfce = dict(start=0, step=0.2)
n_permutations = 1024  # number of clustering permutations (1024 for exact)

###############################################################################
# Construct simulated data
# ------------------------
#
# Make the connectivity matrix just next-neighbor spatially
n_src = width * width
connectivity = grid_to_graph(width, width)

# For each "subject", make a smoothed noisy signal with a centered peak
rng = np.random.RandomState(42)
X = noise_sd * rng.randn(n_subjects, width, width)
# Add a signal at the dead center
X[:, width // 2, width // 2] = signal_mean + rng.randn(n_subjects) * signal_sd
# Spatially smooth with a 2D Gaussian kernel
size = width // 2 - 1
gaussian = np.exp(-(np.arange(-size, size + 1) ** 2 / float(gaussian_sd ** 2)))
for si in range(X.shape[0]):
    for ri in range(X.shape[1]):
        X[si, ri, :] = np.convolve(X[si, ri, :], gaussian, 'same')
    for ci in range(X.shape[2]):
        X[si, :, ci] = np.convolve(X[si, :, ci], gaussian, 'same')
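# The simulated data above would then be fed into a clustering permutation
# test; a sketch of that step, reusing `X`, `connectivity`, `threshold`, and
# `n_permutations` from above (`n_subjects` and `width` are assumed defined
# earlier; the keyword is `adjacency` in recent MNE versions, `connectivity`
# in older ones):
from mne.stats import spatio_temporal_cluster_1samp_test

X_flat = X.reshape(n_subjects, 1, n_src)  # (observations, time, space)
t_obs, clusters, p_values, H0 = spatio_temporal_cluster_1samp_test(
    X_flat, adjacency=connectivity, n_jobs=1,
    threshold=threshold, n_permutations=n_permutations)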
X /= X.std(axis=0)
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

# #############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
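# The scikit-learn example this snippet is based on (feature agglomeration
# vs. univariate selection) typically continues by tuning the selection
# percentile the same way; a sketch of that continuation under the names
# defined above:
clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(
    coef_.reshape(1, -1))
coef_selection_ = coef_.reshape(size, size)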
from sklearn import datasets
import numpy as np
from sklearn.feature_extraction.image import grid_to_graph

digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
print(images[0].shape)
connectivity = grid_to_graph(*images[0].shape)
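# A quick sanity check on what grid_to_graph returns for the 8x8 digit
# images (a sketch): a sparse adjacency matrix with one node per pixel,
# forming a single connected component.
from scipy.sparse.csgraph import connected_components

print(connectivity.shape)  # (64, 64): one node per pixel
print(connected_components(connectivity)[0])  # 1: the grid is fully connected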