Example #1
0
def test_ward_clustering():
    """
    Check that we obtain the correct number of clusters with Ward clustering.

    Fits ``Ward`` with ``n_clusters=10`` and a connectivity graph built from
    a 10x10 grid on random data with 100 rows, then verifies that exactly 10
    distinct labels are produced.
    """
    np.random.seed(0)
    # Builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where accessing it raises AttributeError.
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(100, 50)
    # The mask is only used for its shape, which defines the grid.
    connectivity = grid_to_graph(*mask.shape)
    clustering = Ward(n_clusters=10, connectivity=connectivity)
    clustering.fit(X)
    assert np.size(np.unique(clustering.labels_)) == 10
Example #2
0
def test_height_ward_tree():
    """
    Check the node count of the tree returned by ``ward_tree``.

    The tree is built over the 100 columns of ``X`` (passed transposed) with
    a connectivity graph built from a 10x10 grid. The test verifies that the
    number of entries in ``children`` plus the number of leaves equals
    ``2 * n - 1`` for ``n = X.shape[1]``.

    NOTE(review): despite the test's name, nothing here inspects merge
    heights or their ordering; only the node count is asserted.
    """
    np.random.seed(0)
    # Builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where accessing it raises AttributeError.
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(50, 100)
    # The mask is only used for its shape, which defines the grid.
    connectivity = grid_to_graph(*mask.shape)
    # The second returned value is unused here; it is named as in the sibling
    # tests instead of being bound to ``n_nodes`` and immediately overwritten.
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert len(children) + n_leaves == n_nodes
Example #3
0
def test_structured_ward_tree():
    """
    Check that we obtain the correct solution for structured ward tree.

    Builds a Ward tree over the 100 columns of ``X`` (passed transposed) with
    a connectivity graph built from a 10x10 grid, then verifies that the
    number of entries in ``children`` plus the number of leaves equals
    ``2 * n - 1`` for ``n = X.shape[1]``.
    """
    np.random.seed(0)
    # Builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where accessing it raises AttributeError.
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(50, 100)
    # The mask is only used for its shape, which defines the grid.
    connectivity = grid_to_graph(*mask.shape)
    children, n_components, n_leaves = ward_tree(X.T, connectivity)
    n_nodes = 2 * X.shape[1] - 1
    assert len(children) + n_leaves == n_nodes
def test_cluster_permutation_t_test_with_connectivity():
    """Test cluster level permutations T-test with connectivity matrix.

    Runs ``permutation_cluster_1samp_test`` on ``condition1`` twice, once
    without and once with a connectivity graph built by
    ``grid_to_graph(1, condition1.shape[1])``, and checks that both runs
    agree. The test does nothing when scikits.learn cannot be imported.
    """
    try:
        from scikits.learn.feature_extraction.image import grid_to_graph
    except ImportError:
        # Optional dependency missing: there is nothing to compare against.
        return

    plain = permutation_cluster_1samp_test(condition1, n_permutations=500)
    graph = grid_to_graph(1, condition1.shape[1])
    structured = permutation_cluster_1samp_test(
        condition1, n_permutations=500, connectivity=graph)

    # The first returned element must be identical for both runs.
    assert_array_equal(plain[0], structured[0])
    # The second returned elements are walked pairwise (connectivity run
    # first); each pair must select the same total from the first element.
    for a, b in zip(structured[1], plain[1]):
        assert_true(np.sum(plain[0][a]) == np.sum(plain[0][b]))
        assert_true(np.all(a[b]))
Example #5
0
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case

    Fits ``WardAgglomeration`` with ``n_clusters=5`` and a connectivity graph
    built from a 10x10 grid on random data with 100 columns, then verifies:

    * exactly 5 distinct labels are produced;
    * ``transform`` yields 5 columns;
    * the first row returned by ``inverse_transform`` holds exactly 5
      distinct values.
    """
    np.random.seed(0)
    # Builtin ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where accessing it raises AttributeError.
    mask = np.ones([10, 10], dtype=bool)
    X = np.random.randn(50, 100)
    # The mask is only used for its shape, which defines the grid.
    connectivity = grid_to_graph(*mask.shape)
    ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
    ward.fit(X)
    assert np.size(np.unique(ward.labels_)) == 5

    Xred = ward.transform(X)
    assert Xred.shape[1] == 5
    Xfull = ward.inverse_transform(Xred)
    assert np.unique(Xfull[0]).size == 5
import pylab as pl
from scikits.learn.feature_extraction.image import grid_to_graph
from scikits.learn.cluster import Ward

###############################################################################
# Generate data
# NOTE(review): `sp`, `np` and `time` are bound outside this fragment.
lena = sp.lena()
# Downsample the image by a factor of 4
# Each statement below adds up the four pixels of every 2x2 block, which
# halves both dimensions; running it twice gives the factor of 4 per axis.
# Values are summed rather than averaged.
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
# All-True mask with the shape of the image, so every pixel is selected.
mask = np.ones_like(lena).astype(bool)
# Boolean indexing flattens the image; the result is one column with one row
# per pixel.
X = np.atleast_2d(lena[mask]).T

###############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)

###############################################################################
# Compute clustering
print "Compute structured hierarchical clustering..."
st = time.time()  # start of the timed section
n_clusters = 15 # number of regions
ward = Ward(n_clusters=n_clusters).fit(X, connectivity=connectivity)
# Put the per-pixel labels back into the image layout.
label = np.reshape(ward.labels_, mask.shape)
print "Elaspsed time: ", time.time() - st
print "Number of pixels: ", label.size
print "Number of clusters: ", np.unique(label).size

###############################################################################
# Plot the results on an image
pl.figure(figsize=(5, 5))
# Standardize X in place, column by column: subtract each column's mean, then
# divide by each column's standard deviation.
# NOTE(review): X, coef, snr and size are bound outside this fragment.
X -= X.mean(axis=0)
X /= X.std(axis=0)

# Noise-free target: X multiplied by the flattened coefficient array.
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
# Scale factor such that ||noise_coef * noise|| == ||y|| / exp(snr / 20).
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2) # cross-validation generator for model selection
ridge = BayesianRidge()
# Cache object rooted at the current directory; it is handed to the Ward step
# and used to wrap f_regression further down.
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
# Connectivity graph for a size x size grid.
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1)
clf.fit(X, y, cv=cv) # set the best parameters
# Read the coefficients from the last step of the selected estimator ('ridge'
# in the pipeline built above), map them back through the first step ('ward')
# with inverse_transform, and reshape the result to (size, size).
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression) # caching function
anova = feature_selection.SelectPercentile(f_regression)
# Rebinds `clf`: the grid-searched Ward pipeline above is no longer reachable
# through this name.
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Select the optimal percentage of features with grid search
# Module-level fixtures: two small data sets with their connectivity graphs
# and Ward trees.
# NOTE(review): n_samples is bound outside this fragment.
size = 4  # image size
roi_size = 2
X = np.zeros(size**2)
# X2 refers to the same full-size zero array; X is rebound to a masked
# selection below, while X2 keeps all size**2 entries.
X2 = X
# Generating two convex parts: roi_size x roi_size squares in the top-left
# and bottom-right corners of the size x size mask.
mask = np.zeros((size, size), dtype=bool)
mask[0:roi_size, 0:roi_size] = True
mask[-roi_size:, -roi_size:] = True
# Flatten the mask so it can index the flat array X.
mask = mask.reshape(size**2)
X = X[mask]
# making n_samples
# Broadcasting against an (n_samples, 1) column turns both arrays into 2-D
# arrays with n_samples rows. X2 stays all zeros; row i of X is filled with
# the value i.
X2 = X2 + np.zeros((n_samples, 1))
X = X + np.arange(n_samples).reshape((n_samples, 1))
Y = np.arange(n_samples)
# Generating the connectivity grids and ward trees
# Masked grid: n_components=2 presumably matches the two separate squares of
# the mask -- TODO confirm against grid_to_graph's neighborhood definition.
A = grid_to_graph(n_x=size, n_y=size, mask=mask)
children, n_components, n_leaves = ward_tree(X.T,
                                             connectivity=A,
                                             n_components=2)
# Keep the children as a plain Python list.
children = children.tolist()
# Full (unmasked) grid, treated as a single component.
A2 = grid_to_graph(n_x=size, n_y=size)
children2, n_components2, n_leaves2 = ward_tree(X2.T,
                                                connectivity=A2,
                                                n_components=1)
children2 = children2.tolist()


###############################################################################
# Test functions
def test_tree_roots():
    """
# Standardize X in place, column by column: subtract each column's mean, then
# divide by each column's standard deviation.
# NOTE(review): X, coef, snr and size are bound outside this fragment.
X -= X.mean(axis=0)
X /= X.std(axis=0)

# Noise-free target: X multiplied by the flattened coefficient array.
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
# Scale factor such that ||noise_coef * noise|| == ||y|| / exp(snr / 20).
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
# Cache object rooted at the current directory; it is handed to the Ward step
# and used to wrap f_regression further down.
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
# Connectivity graph for a size x size grid.
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10,
                         connectivity=A,
                         memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Candidate values for the 'ward' step's n_clusters parameter.
parameters = {'ward__n_clusters': [10, 20, 30]}
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters
# Read the coefficients from the last step of the selected estimator ('ridge'
# in the pipeline built above), map them back through the first step ('ward')
# with inverse_transform, and reshape the result to (size, size).
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
import pylab as pl
from scikits.learn.feature_extraction.image import grid_to_graph
from scikits.learn.cluster import Ward

###############################################################################
# Generate data
# NOTE(review): `sp`, `np` and `time` are bound outside this fragment.
lena = sp.lena()
# Downsample the image by a factor of 4
# Each statement below adds up the four pixels of every 2x2 block, which
# halves both dimensions; running it twice gives the factor of 4 per axis.
# Values are summed rather than averaged.
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
lena = lena[::2, ::2] + lena[1::2, ::2] + lena[::2, 1::2] + lena[1::2, 1::2]
# All-True mask with the shape of the image, so every pixel is selected.
mask = np.ones_like(lena).astype(bool)
# Boolean indexing flattens the image; the result is one column with one row
# per pixel.
X = np.atleast_2d(lena[mask]).T

###############################################################################
# Define the structure A of the data. Pixels connected to their neighbors.
connectivity = grid_to_graph(*lena.shape)

###############################################################################
# Compute clustering
print "Compute structured hierarchical clustering..."
st = time.time()  # start of the timed section
n_clusters = 15  # number of regions
ward = Ward(n_clusters=n_clusters).fit(X, connectivity=connectivity)
# Put the per-pixel labels back into the image layout.
label = np.reshape(ward.labels_, mask.shape)
print "Elaspsed time: ", time.time() - st
print "Number of pixels: ", label.size
print "Number of clusters: ", np.unique(label).size

###############################################################################
# Plot the results on an image
pl.figure(figsize=(5, 5))
import numpy as np
import pylab as pl

from scikits.learn import datasets, cluster
from scikits.learn.feature_extraction.image import grid_to_graph

# Load the digits data set and flatten every image into one row of X.
digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
# Connectivity graph over the pixel grid of a single image.
connectivity = grid_to_graph(*images[0].shape)

# Ward agglomeration with 32 clusters, constrained by the pixel-grid graph.
agglo = cluster.WardAgglomeration(connectivity=connectivity,
                                  n_clusters=32)

agglo.fit(X)
X_reduced = agglo.transform(X)

# Map the reduced data back and restore the original image layout.
X_restored = agglo.inverse_transform(X_reduced)
images_restored = np.reshape(X_restored, images.shape)
pl.figure(1, figsize=(4, 3.5))
pl.clf()
pl.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91)
# First four images: originals go into positions 1-4 of a 3x4 subplot grid.
for i in range(4):
    pl.subplot(3, 4, i+1)
    pl.imshow(images[i], cmap=pl.cm.gray,
                vmax=16, interpolation='nearest')
    # Hide the axis ticks.
    pl.xticks(())
    pl.yticks(())
    # The title is attached to a single subplot only (i == 1).
    if i == 1:
        pl.title('Original data')
    # Select the subplot directly below (positions 5-8).
    pl.subplot(3, 4, 4+i+1)
import numpy as np
import pylab as pl

from scikits.learn import datasets, cluster
from scikits.learn.feature_extraction.image import grid_to_graph

# Load the digits data set and flatten every image into one row of X.
digits = datasets.load_digits()
images = digits.images
X = np.reshape(images, (len(images), -1))
# Connectivity graph over the pixel grid of a single image.
connectivity = grid_to_graph(*images[0].shape)

# Ward agglomeration with 32 clusters, constrained by the pixel-grid graph.
agglo = cluster.WardAgglomeration(connectivity=connectivity, n_clusters=32)

agglo.fit(X)
X_reduced = agglo.transform(X)

# Map the reduced data back and restore the original image layout.
X_restored = agglo.inverse_transform(X_reduced)
images_restored = np.reshape(X_restored, images.shape)
# Reuse figure number 1 and clear it before drawing.
pl.figure(1, figsize=(4, 3.5))
pl.clf()
pl.subplots_adjust(left=.01, right=.99, bottom=.01, top=.91)
for i in range(4):
    pl.subplot(3, 4, i + 1)
    pl.imshow(images[i], cmap=pl.cm.gray, vmax=16, interpolation='nearest')
    pl.xticks(())
    pl.yticks(())
    if i == 1:
        pl.title('Original data')
    pl.subplot(3, 4, 4 + i + 1)
    pl.imshow(images_restored[i],
              cmap=pl.cm.gray,
# Module-level fixtures: two small data sets with their connectivity graphs
# and Ward trees.
# NOTE(review): n_samples is bound outside this fragment.
size = 4  # image size
roi_size = 2
X = np.zeros(size**2)
# X2 refers to the same full-size zero array; X is rebound to a masked
# selection below, while X2 keeps all size**2 entries.
X2 = X
# Generating two convex parts: roi_size x roi_size squares in the top-left
# and bottom-right corners of the size x size mask.
mask = np.zeros((size, size), dtype=bool)
mask[0:roi_size, 0:roi_size] = True
mask[-roi_size:, -roi_size:] = True
# Flatten the mask so it can index the flat array X.
mask = mask.reshape(size**2)
X = X[mask]
# making n_samples
# Broadcasting against an (n_samples, 1) column turns both arrays into 2-D
# arrays with n_samples rows. X2 stays all zeros; row i of X is filled with
# the value i.
X2 = X2 + np.zeros((n_samples, 1))
X = X + np.arange(n_samples).reshape((n_samples, 1))
Y = np.arange(n_samples)
# Generating the connectivity grids and ward trees
# Masked grid: n_components=2 presumably matches the two separate squares of
# the mask -- TODO confirm against grid_to_graph's neighborhood definition.
A = grid_to_graph(n_x=size, n_y=size, mask=mask)
children, n_components, n_leaves = ward_tree(X.T, connectivity=A,
        n_components=2)
# Keep the children as a plain Python list.
children = children.tolist()
# Full (unmasked) grid, treated as a single component.
A2 = grid_to_graph(n_x=size, n_y=size)
children2, n_components2, n_leaves2 = ward_tree(X2.T, connectivity=A2,
        n_components=1)
children2 = children2.tolist()


###############################################################################
# Test functions
def test_tree_roots():
    """
    Tests that the function returns the right roots.
    """