from collections import defaultdict

import numpy as np
# Assumed imports: the `getattr(settings, ...)` idiom below matches Django's
# settings object, and KMeans is the old scikits.learn estimator used
# throughout these snippets. `google_TSP` is a project-specific route
# optimizer assumed to be importable from elsewhere in the codebase.
from django.conf import settings
from scikits.learn.cluster import KMeans


def cluster_objects(objects, optimize_within_clusters=False, round_trip=False,
                    initial=None):
    """
    Return a list of objects clustered by geographical position.

    :param objects: The list of objects or a queryset. The objects
    must be an instance of PointGeoTag or implement
    `get_point_coordinates(self, as_string=False, inverted=False)` to
    obtain the coordinates

    :param optimize_within_clusters: a boolean specifying if the
    clusters should be ordered based on the (near-)optimal route.

    :returns: A list of clusters. Example: [[<p1>, <p2>], [<p3>, <p4>, <p5>]]
    """
    # Keep only the objects that actually have coordinates, so that the
    # rows of X stay aligned with the objects assigned to clusters below.
    located = [i for i in objects
               if i.get_point_coordinates(as_string=False, inverted=True)]
    X = np.array([list(i.get_point_coordinates(as_string=False, inverted=True))
                  for i in located])

    # Affinity propagation.
    # This way we can determine the number of clusters automatically
    # X_norms = np.sum(X*X, axis=1)
    # S = - X_norms[:,np.newaxis] - X_norms[np.newaxis,:] + 2 * np.dot(X, X.T)
    # p = 10*np.median(S)
    # af = AffinityPropagation()
    # af.fit(S, p)
    # n_clusters_ = len(af.cluster_centers_indices_)

    n_items = len(X)
    max_items = getattr(settings, 'ITEMS_PER_BUCKET', 10) - 1
    # Round up so that no cluster is asked to hold more than max_items
    # objects, and always request at least one cluster.
    n_clusters = max(1, (n_items + max_items - 1) // max_items)

    # KMeans. 
    # If we want a pre-specified number of clusters this is the way to go 
    km = KMeans(k=n_clusters, init='k-means++')
    km.fit(X)

    cluster_dict = defaultdict(list)
    for i, cluster_id in enumerate(km.labels_):
        cluster_dict[cluster_id].append(located[i])

    clusters = cluster_dict.values()
    if optimize_within_clusters:
        if initial:
            result = []
            for cluster in clusters:
                if initial in cluster:
                    cluster.remove(initial)
                    cluster.insert(0, initial)
                    result.insert(0, google_TSP(cluster, round_trip=round_trip))
                else:
                    result.append(google_TSP(cluster, round_trip=round_trip))
            return result
        else:
            return [google_TSP(cluster, round_trip=round_trip) for cluster in clusters]
    return clusters
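
# A hedged usage sketch for cluster_objects. `Stop` is a hypothetical
# stand-in for a PointGeoTag-like object; only the method name and signature
# come from the docstring above, and the coordinate order is an assumption.
class Stop(object):
    def __init__(self, lon, lat):
        self.lon, self.lat = lon, lat

    def get_point_coordinates(self, as_string=False, inverted=False):
        coords = (self.lat, self.lon) if inverted else (self.lon, self.lat)
        return '%s,%s' % coords if as_string else coords

stops = [Stop(2.17, 41.38), Stop(2.15, 41.39), Stop(-3.70, 40.42)]
clusters = cluster_objects(stops)  # e.g. [[<Stop>, <Stop>], [<Stop>]]
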
Example #2
def cluster_centroids(x, k=32, max_iter=300, km_kwargs=None):
    '''Return cluster centroids sorted by ascending Euclidean norm.'''
    # Honor the max_iter argument (it was previously hard-coded) and avoid
    # a mutable default for km_kwargs.
    km = KMeans(k, init='k-means++', max_iter=max_iter, **(km_kwargs or {}))
    trained = km.fit(x)
    centroids = trained.cluster_centers_
    ind = np.argsort(np.linalg.norm(centroids, axis=1))
    return centroids[ind]
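
# A minimal usage sketch; the random 2-D data and k=8 are illustrative
# assumptions, not part of the original snippet.
x = np.random.randn(500, 2)
centroids = cluster_centroids(x, k=8)
print centroids.shape  # (8, 2); row 0 has the smallest norm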
##############################################################################
# Generate sample data
import time

import numpy as np
# Import paths assume the old scikits.learn-era API used elsewhere in
# these snippets.
from scikits.learn.cluster import KMeans, MiniBatchKMeans
from scikits.learn.datasets.samples_generator import make_blobs

np.random.seed(0)

batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1200, centers=centers, cluster_std=0.7)

##############################################################################
# Compute clustering with KMeans

k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)
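
# A brief follow-up sketch printing the wall-clock timings gathered above,
# so the two estimators can be compared at a glance.
print 'KMeans fit:          %.3fs' % t_batch
print 'MiniBatchKMeans fit: %.3fs' % t_mini_batch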
Example #4
import numpy as np
from scikits.learn.cluster import KMeans

n_points_per_cluster = 250
n_clusters = 3
n_points = n_points_per_cluster * n_clusters
means = np.array([[1, 1], [-1, -1], [1, -1]])
std = 0.6

X = np.empty((0, 2))
for i in range(n_clusters):
    X = np.r_[X, means[i] + std * np.random.randn(n_points_per_cluster, 2)]

################################################################################
# Compute clustering with KMeans
km = KMeans(init='k-means++', k=3, n_init=1)
km.fit(X)

labels = km.labels_
cluster_centers = km.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print "number of estimated clusters : %d" % n_clusters_

################################################################################
# Plot result
import pylab as pl
from itertools import cycle

pl.figure(1)
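
# A sketch completing the truncated plot, following the pattern of the
# classic scikit-learn clustering demos: one color per cluster, with each
# cluster center drawn as a larger circle.
colors = cycle('bgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    members = labels == k
    center = cluster_centers[k]
    pl.plot(X[members, 0], X[members, 1], col + '.')
    pl.plot(center[0], center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=14)
pl.title('Estimated number of clusters: %d' % n_clusters_)
pl.show()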
Example #5
print

################################################################################
# Digits dataset clustering using Self-Organizing Map
# (`SelfOrganizingMap`, `pseudo_F`, `data` and `n_samples` are assumed to be
# defined earlier in the original script.)
from time import time

print "Self-Organizing Map "
t0 = time()
grid_width = 4
som = SelfOrganizingMap(size=grid_width, n_iterations=n_samples*5,
                        learning_rate=1)
som.fit(data)
print "done in %0.3fs" % (time() - t0)
print

F = pseudo_F(data, som.labels_, som.neurons_)
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
print

################################################################################
# Digits dataset clustering using Kmeans

print "KMeans "
t0 = time()
km = KMeans(init='k-means++', k=grid_width**2, n_init=10)
km.fit(data)
print "done in %0.3fs" % (time() - t0)
print

F = pseudo_F(data, km.labels_, km.cluster_centers_)
print 'pseudo_F %0.2f | %0.2f%%' % (F, 100 * (F / (1 + F)))
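
# A minimal sketch of the `pseudo_F` helper used above, assuming it computes
# the Calinski-Harabasz pseudo-F statistic (between-cluster dispersion over
# within-cluster dispersion, scaled by the degrees of freedom). The name and
# signature come from the calls above; the body is an assumption.
import numpy as np

def pseudo_F(X, labels, centroids):
    n, n_c = len(X), len(centroids)
    mean = X.mean(axis=0)
    sizes = np.bincount(labels, minlength=n_c)
    # Between-group sum of squares: cluster sizes weighted by the squared
    # distance of each centroid from the global mean.
    ssb = np.sum(sizes * np.sum((centroids - mean) ** 2, axis=1))
    # Within-group sum of squares: squared distance of each point from its
    # assigned centroid.
    ssw = np.sum((X - centroids[labels]) ** 2)
    return (ssb / (n_c - 1)) / (ssw / (n - n_c))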
Example #6
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import division
import numpy as np
import pickle
from scikits.learn.cluster import KMeans
import logging 
import time

logging.basicConfig(level=logging.DEBUG)

LEARN_SIZE = 100
K = 8
ITER = 10 

# Open the pickles in binary mode (and avoid shadowing the `file` builtin).
moto, plane = [pickle.load(open(name, 'rb'))
               for name in ['moto', 'plane']]

logging.info('Data loaded') 

# Stack the first LEARN_SIZE descriptor arrays of each class into a single
# training matrix (renamed from `all`, which shadows the builtin).
m = np.vstack([v for f, v in moto.items()[0:LEARN_SIZE]])
p = np.vstack([v for f, v in plane.items()[0:LEARN_SIZE]])
samples = np.vstack([m, p])

km = KMeans(k=K, max_iter=ITER)
km.fit(samples)

filename = 'centroids_%d_%d_%d' % (LEARN_SIZE, K, ITER)
pickle.dump(km.cluster_centers_, open(filename, 'wb'))
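
# A hedged sketch of how the pickled centroids might later be used to
# quantize an image's descriptors into a bag-of-visual-words histogram.
# The nearest-centroid assignment and K-bin histogram are assumptions;
# only `filename` and the centroids come from the snippet above.
def quantize(descriptors, centroids):
    '''Histogram of nearest-centroid assignments for one descriptor array.'''
    # Squared distance from every descriptor to every centroid.
    d2 = ((descriptors[:, np.newaxis, :]
           - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
    words = d2.argmin(axis=1)
    hist, _ = np.histogram(words, bins=np.arange(len(centroids) + 1))
    return hist

centroids = pickle.load(open(filename, 'rb'))
# e.g. histogram = quantize(moto.values()[0], centroids)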
