def test_perfect_checkerboard():
    """Check that noise-free checkerboards are recovered exactly.

    One ``SpectralBiclustering`` estimator is refit on a square, a tall and
    a wide noise-free checkerboard; every fit must reach a consensus score
    of 1 against the generated ground truth.
    """
    # XXX Previously failed on build bot (not reproducible)
    estimator = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        data, true_rows, true_cols = make_checkerboard(
            shape, 3, noise=0, random_state=0)
        estimator.fit(data)
        found = estimator.biclusters_
        assert consensus_score(found, (true_rows, true_cols)) == 1
def test_spectral_coclustering():
    # Test Dhillon's Spectral CoClustering on a simple problem.
    seed = 0
    param_grid = {
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1],
    }

    dense, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                        random_state=seed)
    # needs to be nonnegative before making it sparse
    dense -= dense.min()
    # threshold some values
    dense = np.where(dense < 1, 0, dense)

    # Every sample must belong to exactly one row/column cluster.
    one_membership_each = np.ones(30)
    inputs = (dense, csr_matrix(dense))

    for mat in inputs:
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=seed,
                                         **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), one_membership_each)
            assert_array_equal(model.columns_.sum(axis=0),
                               one_membership_each)
            found = model.biclusters_
            assert_equal(consensus_score(found, (rows, cols)), 1)

            _test_shape_indices(model)
def test_spectral_coclustering():
    """Fit SpectralCoclustering over a parameter grid on a simple problem.

    The same thresholded, non-negative matrix is fitted in dense and CSR
    form; each fit must assign every row and column to exactly one of the
    three clusters and reach a consensus score of 1.
    """
    seed = 0
    param_grid = {
        'svd_method': ['randomized', 'arpack'],
        'n_svd_vecs': [None, 20],
        'mini_batch': [False, True],
        'init': ['k-means++'],
        'n_init': [10],
        'n_jobs': [1],
    }

    dense, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                        random_state=seed)
    # needs to be nonnegative before making it sparse
    dense -= dense.min()
    # threshold some values
    dense = np.where(dense < 1, 0, dense)

    one_membership_each = np.ones(30)
    truth = (rows, cols)

    for mat in (dense, csr_matrix(dense)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=seed,
                                         **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), one_membership_each)
            assert_array_equal(model.columns_.sum(axis=0),
                               one_membership_each)
            assert_equal(consensus_score(model.biclusters_, truth), 1)
def test_perfect_checkerboard():
    """Currently skipped: noise-free checkerboard recovery check.

    The test raises ``SkipTest`` immediately. The statements after the
    raise are the disabled test body, kept so it can be re-enabled.
    """
    skip_reason = ("This test is failing on the buildbot, but cannot"
                   " reproduce. Temporarily disabling it until it can be"
                   " reproduced and fixed.")
    raise SkipTest(skip_reason)

    # Unreachable while the skip above is in place.
    estimator = SpectralBiclustering(3, svd_method="arpack", random_state=0)

    for shape in ((30, 30), (40, 30), (30, 40)):
        data, true_rows, true_cols = make_checkerboard(
            shape, 3, noise=0, random_state=0)
        estimator.fit(data)
        found = estimator.biclusters_
        assert_equal(consensus_score(found, (true_rows, true_cols)), 1)
def test_consensus_score_issue2445():
    """Score two bicluster sets that differ in their number of biclusters."""
    membership = [[True, True, False, False],
                  [False, False, True, True],
                  [False, False, False, True]]
    # Rows and columns use the same indicator pattern but separate arrays.
    a_rows = np.array(membership)
    a_cols = np.array(membership)

    keep = [0, 2]
    b_rows = a_rows[keep]
    b_cols = a_cols[keep]

    score = consensus_score((a_rows, a_cols), (b_rows, b_cols))
    # B contains 2 of the 3 biclusters in A, so score should be 2/3
    assert_almost_equal(score, 2.0 / 3.0)
def test_consensus_score_issue2445():
    """Consensus score when A and B hold different numbers of biclusters."""
    pattern = [[True, True, False, False],
               [False, False, True, True],
               [False, False, False, True]]
    a_rows, a_cols = np.array(pattern), np.array(pattern)

    subset = [0, 2]
    set_a = (a_rows, a_cols)
    set_b = (a_rows[subset], a_cols[subset])

    # B contains 2 of the 3 biclusters in A, so score should be 2/3
    assert_almost_equal(consensus_score(set_a, set_b), 2.0 / 3.0)
def bicluster_similarity(self, reference_model):
    """
    Compare this object's biclusters with those of a reference model.

    :param reference_model: The reference model; its ``biclusters_``
        attribute is compared against ``self.model.biclusters_``.
    :return: The consensus score (Hochreiter et al., 2010), i.e. the
        similarity of the two sets of biclusters.
    """
    own_biclusters = self.model.biclusters_
    reference_biclusters = reference_model.biclusters_
    return consensus_score(own_biclusters, reference_biclusters)
def test_consensus_score():
    """Check consensus_score on matching and non-matching bicluster sets."""
    a = [[True, True, False, False], [False, False, True, True]]
    b = a[::-1]

    # (first set, second set, expected score), checked in order.
    cases = [
        ((a, a), (a, a), 1),
        ((a, a), (b, b), 1),
        ((a, b), (a, b), 1),
        ((a, b), (b, a), 1),
        ((a, a), (b, a), 0),
        ((a, a), (a, b), 0),
        ((b, b), (a, b), 0),
        ((b, b), (b, a), 0),
    ]
    for first, second, expected in cases:
        assert_equal(consensus_score(first, second), expected)
def test_consensus_score():
    """Check consensus_score on matching and non-matching bicluster sets."""
    a = [[True, True, False, False], [False, False, True, True]]
    b = a[::-1]

    # Pairs expected to score 1, then pairs expected to score 0.
    scoring_one = [((a, a), (a, a)),
                   ((a, a), (b, b)),
                   ((a, b), (a, b)),
                   ((a, b), (b, a))]
    scoring_zero = [((a, a), (b, a)),
                    ((a, a), (a, b)),
                    ((b, b), (a, b)),
                    ((b, b), (b, a))]

    for first, second in scoring_one:
        assert consensus_score(first, second) == 1
    for first, second in scoring_zero:
        assert consensus_score(first, second) == 0
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    # One (name, value) override per fit, in dictionary order.
    overrides = [(name, value)
                 for name, values in non_default_params.items()
                 for value in values]

    for mat in (S, csr_matrix(S)):
        for name, value in overrides:
            model = SpectralBiclustering(
                n_clusters=3,
                n_init=3,
                init='k-means++',
                random_state=0,
            )
            model.set_params(**{name: value})

            if issparse(mat) and model.get_params().get('method') == 'log':
                # cannot take log of sparse matrix
                with pytest.raises(ValueError):
                    model.fit(mat)
                continue

            model.fit(mat)

            assert model.rows_.shape == (9, 30)
            assert model.columns_.shape == (9, 30)
            assert_array_equal(model.rows_.sum(axis=0),
                               np.repeat(3, 30))
            assert_array_equal(model.columns_.sum(axis=0),
                               np.repeat(3, 30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
def jaccard(self, estimator, train=None):
    """Score the estimator's biclusters against the stored ground truth.

    The score is the consensus score computed with Jaccard similarity. The
    Jaccard index achieves its minimum of zero when the biclusters do not
    overlap at all and its maximum of one when they are identical.

    Args:
        estimator (object): The fitted model holding bicluster estimates
            in its ``biclusters_`` attribute.
        train: Ignored.

    Returns:
        (float): The Jaccard coefficient value, or 0.0 when the estimator
            holds no row or no column biclusters.
    """
    rows, cols = estimator.biclusters_

    # Nothing to compare: an empty result scores zero.
    if not (len(rows) and len(cols)):
        return 0.0

    true_rows = self._rows[:, self.row_idx]
    true_cols = self._cols[:, self.col_idx]
    return consensus_score((rows, cols), (true_rows, true_cols),
                           similarity='jaccard')
def test_spectral_biclustering():
    # Test Kluger methods on a checkerboard dataset.
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=0)

    non_default_params = {'method': ['scale', 'log'],
                          'svd_method': ['arpack'],
                          'n_svd_vecs': [20],
                          'mini_batch': [True]}
    # One (name, value) override per fit, in dictionary order.
    overrides = [(name, value)
                 for name, values in non_default_params.items()
                 for value in values]

    for mat in (S, csr_matrix(S)):
        for name, value in overrides:
            model = SpectralBiclustering(
                n_clusters=3,
                n_init=3,
                init='k-means++',
                random_state=0,
            )
            model.set_params(**{name: value})

            if issparse(mat) and model.get_params().get('method') == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue

            model.fit(mat)

            assert_equal(model.rows_.shape, (9, 30))
            assert_equal(model.columns_.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0),
                               np.repeat(3, 30))
            assert_array_equal(model.columns_.sum(axis=0),
                               np.repeat(3, 30))
            assert_equal(consensus_score(model.biclusters_,
                                         (rows, cols)), 1)

            _test_shape_indices(model)
# Demo script: generate a bicluster dataset, shuffle it, fit
# SpectralCoclustering, print the consensus score and plot each stage.
# NOTE(review): make_biclusters, plt, sg and np are used below but are not
# imported in this visible span; presumably imported elsewhere in the file.
# NOTE(review): sklearn.cluster.bicluster is a legacy import path; newer
# scikit-learn releases expose SpectralCoclustering from sklearn.cluster.
# Confirm against the pinned version.
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

# Unshuffled data so the planted block structure is visible in the plot.
data, rows, columns = make_biclusters(shape=(300, 300), n_clusters=5,
                                      noise=5, shuffle=False,
                                      random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle the data; row_idx/col_idx are reused below to index the
# ground-truth indicators.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

# NOTE(review): the data is generated with n_clusters=5 but the model is
# fitted with n_clusters=6. Confirm the mismatch is intentional.
model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(data)
# Compare found biclusters with the ground truth indexed by the shuffle.
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.3f}".format(score))

# Reorder rows, then columns, by their fitted cluster labels.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
def jaccard_consensus(a, b):
    """Print and return the consensus score between bicluster sets a and b.

    The score comes from ``consensus_score`` with its default similarity
    setting (documented by scikit-learn as the Jaccard index).

    Args:
        a: First set of biclusters, passed to ``consensus_score`` as is.
        b: Second set of biclusters, passed to ``consensus_score`` as is.

    Returns:
        The consensus score. Earlier versions only printed the value and
        returned ``None``, so the similarity could not be used by callers.
    """
    score = consensus_score(a, b)
    print("consensus score: {:.3f}".format(score))
    return score
#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ ------------------------------------------------- File Name:biClusteringL Description : 双聚类,对行列同时进行聚类 Email : [email protected] Date:2018/1/1 """ from sklearn.cluster.bicluster import SpectralCoclustering from sklearn.datasets import make_biclusters from sklearn.metrics import consensus_score data, rows, columns = make_biclusters(shape=(300, 300), n_clusters=5, noise=0.5, random_state=0) model = SpectralCoclustering(n_clusters=5, random_state=0) model.fit(data) score = consensus_score(model.biclusters_, (rows, columns)) print('scores: {}'.format(score))
# Demo script: generate a checkerboard dataset, shuffle it, fit
# SpectralBiclustering, print the consensus score and plot each stage.
# NOTE(review): make_checkerboard, plt, sg, np, SpectralBiclustering and
# consensus_score are not imported in this visible span; presumably
# imported elsewhere in the file.

# The same tuple is passed to both the generator and the model.
n_clusters = (4, 3)
data, rows, columns = make_checkerboard(
    shape=(300, 300), n_clusters=n_clusters, noise=10,
    shuffle=False, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Original dataset")

# Shuffle the data; row_idx/col_idx are reused below to index the
# ground-truth indicators.
data, row_idx, col_idx = sg._shuffle(data, random_state=0)
plt.matshow(data, cmap=plt.cm.Blues)
plt.title("Shuffled dataset")

model = SpectralBiclustering(n_clusters=n_clusters, method='log',
                             random_state=0)
model.fit(data)
# Compare found biclusters with the ground truth indexed by the shuffle.
score = consensus_score(model.biclusters_,
                        (rows[:, row_idx], columns[:, col_idx]))
print("consensus score: {:.1f}".format(score))

# Reorder rows, then columns, by their fitted cluster labels.
fit_data = data[np.argsort(model.row_labels_)]
fit_data = fit_data[:, np.argsort(model.column_labels_)]
plt.matshow(fit_data, cmap=plt.cm.Blues)
plt.title("After biclustering; rearranged to show biclusters")

# Outer product of the sorted (1-based) row and column labels.
plt.matshow(np.outer(np.sort(model.row_labels_) + 1,
                     np.sort(model.column_labels_) + 1),
            cmap=plt.cm.Blues)
plt.title("Checkerboard structure of rearranged data")
# Display is disabled in this version of the script.
#plt.show()
def test_co_clustering():
    """Exploratory co-clustering run on real, simulated and blob data.

    Three sections run in order:

    1. Real data: load a subject image and two ROI masks, build a clipped
       correlation-similarity matrix between the ROIs and co-cluster it.
    2. Simulation: generate bicluster data, shuffle it, co-cluster it and
       print the consensus score against the generated ground truth.
    3. Cross-clustering: rebuild the ROI similarity matrix and fit a
       default ``SpectralCoclustering`` on it.

    The function relies on ``home`` and ``generate_simple_blobs`` being
    defined at module level, and on files at hard-coded paths.
    """
    # All imports are hoisted here and de-duplicated. The original
    # re-imported modules halfway through; a function-level
    # ``import scipy as sp`` makes ``sp`` a local name, so the earlier
    # ``sp.spatial.distance.cdist`` call raised UnboundLocalError.
    import numpy as np
    import nibabel as nb
    import scipy as sp
    import sklearn as sk
    from matplotlib import pyplot as plt
    from sklearn import cluster
    from sklearn.cluster.bicluster import SpectralCoclustering
    from sklearn.datasets import make_biclusters
    from sklearn.datasets import samples_generator as sg
    from sklearn.metrics import consensus_score

    # REAL DATA
    # NOTE(review): machine-specific absolute path; this will only run on
    # the original author's machine.
    subject_file = '/Users/aki.nikolaidis/Desktop/NKI_SampleData/A00060280/3mm_bandpassed_demeaned_filtered_antswarp.nii.gz'
    roi_mask_file = home + '/git_repo/basc/masks/BG_3mm.nii.gz'
    roi2_mask_file = home + '/git_repo/basc/masks/yeo2_3mm.nii.gz'

    data = nb.load(subject_file).get_data().astype('float32')
    print('Data Loaded')

    print('Setting up NIS')
    roi_mask_nparray = nb.load(roi_mask_file).get_data().astype(
        'float32').astype('bool')
    roi2_mask_nparray = nb.load(roi2_mask_file).get_data().astype(
        'float32').astype('bool')

    roi1data = data[roi_mask_nparray]
    roi2data = data[roi2_mask_nparray]

    #add code that uploads the roi1data and roi2data, divides by the mean and standard deviation of the timeseries
    roi1data = sk.preprocessing.normalize(roi1data, norm='l2')
    roi2data = sk.preprocessing.normalize(roi2data, norm='l2')

    # Similarity = 1 - correlation distance, with NaNs and negative values
    # set to zero.
    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[np.isnan(sim_btwn_data_1_2)] = 0
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0
    # Add small uniform noise (at most 0.01), then clip back to 1.
    sim_btwn_data_1_2 = sim_btwn_data_1_2 + (
        np.random.rand(*sim_btwn_data_1_2.shape)) / 100
    sim_btwn_data_1_2[sim_btwn_data_1_2 > 1] = 1
    # Two discarded ``sum(sum(... == np.inf / np.nan))`` expressions were
    # removed here: their results were never used, and ``== np.nan`` is
    # always False.

    model = SpectralCoclustering(n_clusters=5, random_state=0, n_init=100)
    model.fit(sim_btwn_data_1_2)

    # Reorder rows, then columns, by their fitted cluster labels.
    fit_data = sim_btwn_data_1_2[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()

    #SIMULATION DATA
    #Creating Simulated Data
    data, rows, columns = make_biclusters(shape=(300, 100), n_clusters=5,
                                          noise=5, shuffle=False,
                                          random_state=0)

    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Original dataset")

    data, row_idx, col_idx = sg._shuffle(data, random_state=0)
    plt.matshow(data, cmap=plt.cm.Blues)
    plt.title("Shuffled dataset")

    #Creating Model
    model = SpectralCoclustering(n_clusters=5, random_state=0)
    model.fit(data)
    score = consensus_score(model.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()

    ####################################################################
    ####################################################################
    # NOTE(review): data1/data2 are generated but never used; the distance
    # below is computed from roi1data/roi2data. Confirm which inputs were
    # intended.
    data1 = generate_simple_blobs(27)
    data2 = generate_simple_blobs(27)
    data2 = data2[0:150, :]

    print("Calculating Cross-clustering")
    print("Calculating pairwise distances between areas")

    dist_btwn_data_1_2 = np.array(
        sp.spatial.distance.cdist(roi1data, roi2data, metric='correlation'))
    sim_btwn_data_1_2 = 1 - dist_btwn_data_1_2
    sim_btwn_data_1_2[sim_btwn_data_1_2 < 0] = 0

    co_cluster = cluster.SpectralCoclustering()
    co_cluster.fit(sim_btwn_data_1_2)
    # NOTE(review): the ground truth and the plotted matrix below come
    # from the simulated dataset above, not from the ROI similarity matrix
    # that was just fitted. Confirm this comparison is intended.
    score = consensus_score(co_cluster.biclusters_,
                            (rows[:, row_idx], columns[:, col_idx]))

    print("consensus score: {:.3f}".format(score))

    fit_data = data[np.argsort(co_cluster.row_labels_)]
    fit_data = fit_data[:, np.argsort(co_cluster.column_labels_)]

    plt.matshow(fit_data, cmap=plt.cm.Blues)
    plt.title("After biclustering; rearranged to show biclusters")
    plt.show()