vectorizer = Vectorizer(max_features=10000)
X = vectorizer.fit_transform((open(f).read() for f in filenames))

X = Normalizer(norm="l2", copy=False).transform(X)

print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X.shape
print


###############################################################################
# Now sparse MiniBatchKmeans

print "_" * 80

mbkm = MiniBatchKMeans(init="random", k=true_k, max_iter=10, random_state=13,
                       chunk_size=1000, tol=0.0, n_init=1)

print "Clustering sparse data with %s" % str(mbkm)
print

t0 = time()
mbkm.fit(X)
print "done in %0.3fs" % (time() - t0)

ri = randindex(labels, mbkm.labels_)
vmeasure = metrics.v_measure_score(labels, mbkm.labels_)
print "Homogeneity: %0.3f" % metrics.homogeneity_score(labels, mbkm.labels_)
print "Completeness: %0.3f" % metrics.completeness_score(labels, mbkm.labels_)
print "V-measure: %0.3f" % vmeasure
print "Rand-Index: %.3f" % ri
print
##############################################################################
# Compute clustering with Means

k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result

fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
Example #3
0
##############################################################################
# Compute clustering with Means

k_means = KMeans(init='k-means++', k=3)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

##############################################################################
# Compute clustering with MiniBatchKMeans

mbk = MiniBatchKMeans(init='k-means++', k=3, chunk_size=batch_size)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

##############################################################################
# Plot result

fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per