def test_silhouette():
    """Check silhouette_score on iris with precomputed, raw, sampled and sparse input."""
    iris = datasets.load_iris()
    features = iris.data
    targets = iris.target
    half = int(features.shape[0] / 2)

    # The true labels are used, so the score is expected to be positive.
    dist = pairwise_distances(features, metric='euclidean')
    score_precomputed = silhouette_score(dist, targets, metric='precomputed')
    assert score_precomputed > 0

    # Letting silhouette_score compute the distances must give the same value.
    score_euclidean = silhouette_score(features, targets, metric='euclidean')
    assert_almost_equal(score_precomputed, score_euclidean)

    # Same comparison on a random half of the samples (fixed seed).
    score_precomputed = silhouette_score(
        dist, targets, metric='precomputed', sample_size=half, random_state=0)
    score_euclidean = silhouette_score(
        features, targets, metric='euclidean', sample_size=half, random_state=0)
    assert score_precomputed > 0
    assert score_euclidean > 0
    assert_almost_equal(score_euclidean, score_precomputed)

    # Distances derived from a sparse copy of the features.
    sparse_features = csr_matrix(features)
    dist = pairwise_distances(sparse_features, metric='euclidean')
    score_precomputed = silhouette_score(dist, targets, metric='precomputed')
    assert score_precomputed > 0
def test3(isKmeans):
    """Cluster the rows of the ``data3Test`` CSV into 16 groups and print the outcome.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Every remaining line is split on commas and parsed
    as floats.

    Args:
        isKmeans: if truthy, cluster with ``KMeans``; otherwise use
            ``AgglomerativeClustering`` with Ward linkage.
    """
    n_clusters = 16

    rows = []
    # Text mode and the built-in next() work on both Python 2 and Python 3
    # (binary mode yields bytes on Python 3, which cannot be split on ',').
    with open(data3Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                rows.append(line.split(','))

    # Convert explicitly rather than handing string arrays to scikit-learn.
    data = np.array(rows, dtype=float)
    print(data)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)
        compareResult(cluster_labels)
    else:
        ward = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward')
        ward.fit(data)
        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)
        compareResult(ward.labels_)
def test3(isKmeans):
    """Cluster the rows of the ``data3Test`` CSV into 16 groups and print the outcome.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Every remaining line is split on commas and parsed
    as floats.

    Args:
        isKmeans: if truthy, cluster with ``KMeans``; otherwise use
            ``AgglomerativeClustering`` with Ward linkage.
    """
    n_clusters = 16

    rows = []
    # Text mode and the built-in next() work on both Python 2 and Python 3
    # (binary mode yields bytes on Python 3, which cannot be split on ',').
    with open(data3Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                rows.append(line.split(','))

    # Convert explicitly rather than handing string arrays to scikit-learn.
    data = np.array(rows, dtype=float)
    print(data)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)
        compareResult(cluster_labels)
    else:
        ward = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward')
        ward.fit(data)
        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)
        compareResult(ward.labels_)
def test2(isKmeans):
    """Cluster the two-column ``data2Test`` CSV into 15 groups and print the outcome.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Every remaining line must hold exactly two
    comma-separated values, which are parsed as floats.

    Args:
        isKmeans: if truthy, cluster with ``KMeans``; otherwise use
            ``AgglomerativeClustering`` with Ward linkage.
    """
    n_clusters = 15

    rows = []
    # Text mode and the built-in next() work on both Python 2 and Python 3
    # (binary mode yields bytes on Python 3, which cannot be split on ',').
    with open(data2Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                # Unpacking enforces exactly two fields per row.
                value1, value2 = line.split(',')
                rows.append([value1, value2])

    # Convert explicitly rather than handing string arrays to scikit-learn.
    data = np.array(rows, dtype=float)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)
        print(cluster_labels)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)
    else:
        ward = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward')
        ward.fit(data)
        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)
def silhouette_score(self, data, labels, metric='euclidean', sample_size=None,
                     random_state=None, **kwds):
    """Compute the mean Silhouette Coefficient of all samples.

    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample. The Silhouette Coefficient for a sample is
    ``(b - a) / max(a, b)``. To clarify, ``b`` is the distance between a
    sample and the nearest cluster that the sample is not a part of.
    Note that the Silhouette Coefficient is only defined if the number of
    labels satisfies ``2 <= n_labels <= n_samples - 1``.

    This function returns the mean Silhouette Coefficient over all samples.
    To obtain the values for each sample, use :func:`silhouette_samples`.

    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters. Negative values generally indicate that a sample
    has been assigned to the wrong cluster, as a different cluster is more
    similar.

    Read more in the :ref:`User Guide <silhouette_coefficient>`.

    Args:
        data : array [n_samples_a, n_samples_a] if metric == "precomputed",
            or [n_samples_a, n_features] otherwise.
            Array of pairwise distances between samples, or a feature array.
        labels : array, shape = [n_samples]
            Predicted labels for each sample.
        metric : string, or callable
            The metric to use when calculating distance between instances
            in a feature array. If metric is a string, it must be one of
            the options allowed by :func:`metrics.pairwise.pairwise_distances
            <sklearn.metrics.pairwise.pairwise_distances>`. If data is the
            distance array itself, use ``metric="precomputed"``.
        sample_size : int or None
            The size of the sample to use when computing the Silhouette
            Coefficient on a random subset of the data.
            If ``sample_size is None``, no sampling is used.
        random_state : int, RandomState instance or None, optional (default=None)
            The generator used to randomly select a subset of samples.
            If int, random_state is the seed used by the random number
            generator; if RandomState instance, random_state is the random
            number generator; if None, the random number generator is the
            RandomState instance used by `np.random`. Used when
            ``sample_size is not None``.
        **kwds : optional keyword parameters
            Any further parameters are passed directly to the distance
            function. If using a scipy.spatial.distance metric, the
            parameters are still metric dependent. See the scipy docs for
            usage examples.

    Returns:
        silhouette : float
            Mean Silhouette Coefficient for all samples.
    """
    # Forward the options by keyword so the call keeps working when the
    # delegate declares them keyword-only (recent scikit-learn releases do;
    # ``unsupervised`` is presumably scikit-learn's module -- confirm).
    return unsupervised.silhouette_score(
        data,
        labels,
        metric=metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds
    )
def test1():
    """Cluster the two-column ``data1Test`` CSV into 100 groups with KMeans.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Every remaining line must hold exactly two
    comma-separated values, which are parsed as floats. The cluster labels
    and a silhouette score estimated on a 5000-sample subset are printed,
    then the result is passed to ``showChartKmeans``.
    """
    n_clusters = 100

    rows = []
    # Text mode and the built-in next() work on both Python 2 and Python 3
    # (binary mode yields bytes on Python 3, which cannot be split on ',').
    with open(data1Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                # Unpacking enforces exactly two fields per row.
                value1, value2 = line.split(',')
                rows.append([value1, value2])

    # Convert explicitly rather than handing string arrays to scikit-learn.
    data = np.array(rows, dtype=float)

    clusterer = KMeans(n_clusters)
    cluster_labels = clusterer.fit_predict(data)
    print(cluster_labels)
    print(silhouette_score(data, cluster_labels, sample_size=5000, metric='euclidean'))
    showChartKmeans(clusterer, False, data, n_clusters)
def test2(isKmeans):
    """Cluster the two-column ``data2Test`` CSV into 15 groups and print the outcome.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Every remaining line must hold exactly two
    comma-separated values, which are parsed as floats.

    Args:
        isKmeans: if truthy, cluster with ``KMeans``; otherwise use
            ``AgglomerativeClustering`` with Ward linkage.
    """
    n_clusters = 15

    rows = []
    # Text mode and the built-in next() work on both Python 2 and Python 3
    # (binary mode yields bytes on Python 3, which cannot be split on ',').
    with open(data2Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                # Unpacking enforces exactly two fields per row.
                value1, value2 = line.split(',')
                rows.append([value1, value2])

    # Convert explicitly rather than handing string arrays to scikit-learn.
    data = np.array(rows, dtype=float)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)
        print(cluster_labels)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)
    else:
        ward = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward')
        ward.fit(data)
        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)
def test_no_nan():
    """Regression test for issue 960: a one-sample cluster must not yield nan."""
    # Cluster 0 holds exactly one sample, the layout that used to make
    # silhouette_score return nan (see bug #960).
    cluster_ids = np.array([1, 0, 1, 1, 1])
    n_points = len(cluster_ids)

    # The actual distances are irrelevant; any matrix of the right shape will do.
    rng = np.random.RandomState(0)
    dist = rng.rand(n_points, n_points)

    score = silhouette_score(dist, cluster_ids, metric='precomputed')
    assert_false(np.isnan(score))
def test1():
    """Cluster the two-column ``data1Test`` CSV into 100 groups with KMeans.

    The first line of the file is treated as a header and skipped; blank
    lines are ignored. Every remaining line must hold exactly two
    comma-separated values, which are parsed as floats. The cluster labels
    and a silhouette score estimated on a 5000-sample subset are printed,
    then the result is passed to ``showChartKmeans``.
    """
    n_clusters = 100

    rows = []
    # Text mode and the built-in next() work on both Python 2 and Python 3
    # (binary mode yields bytes on Python 3, which cannot be split on ',').
    with open(data1Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                # Unpacking enforces exactly two fields per row.
                value1, value2 = line.split(',')
                rows.append([value1, value2])

    # Convert explicitly rather than handing string arrays to scikit-learn.
    data = np.array(rows, dtype=float)

    clusterer = KMeans(n_clusters)
    cluster_labels = clusterer.fit_predict(data)
    print(cluster_labels)
    print(silhouette_score(data, cluster_labels, sample_size=5000, metric='euclidean'))
    showChartKmeans(clusterer, False, data, n_clusters)
def silhouette_example(dset='CB1', neuropil='Optic_Glomeruli', seed=0, clusterer='kmedoids'):
    """Print the mean silhouette score for every cached clustering size ``k``.

    Loads the expression matrix for ``dset``/``neuropil``, keeps the cached
    clusterings matching the given ``seed`` (column ``repeat``) and
    ``clusterer``, and prints ``k`` together with its euclidean silhouette
    score for each value of ``k``.
    """
    print('Reading dataset')
    dataset = ExpressionDataset(neuropil_dir(dset, neuropil))
    X = dataset.Xarray()

    print('Reading clusterings')
    clusterings = read_clusterings_cache(dset=dset, neuropil=neuropil)

    # Restrict to a single seed and clusterer (adjust the selection if needed).
    selection = 'repeat==%d and clusterer=="%s"' % (seed, clusterer)
    clusterings = clusterings.query(selection)

    # One silhouette score per number of clusters.
    for k, group in clusterings.groupby('k'):
        print('Computing mean silhoutte for k=%d' % k)
        assert len(group) == 1
        # sklearn expects labels to be in [0...k-1]
        raw_labels = group['labels'].iloc[0]
        labels = np.array(relabel_to_0k(raw_labels))
        # sample_size=1000 could be passed here to subsample.
        score = silhouette_score(X, labels, metric='euclidean', random_state=0)
        print(k, score)
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                                     n_samples=None):
    """Fit one k-means model per candidate ``k`` and keep the one with the best silhouette score.

    For every ``k`` in ``n_clusters_list`` a k-means model is fitted, its mean
    silhouette score is recorded and a two-panel figure (per-cluster
    silhouette values, PCA scatter of the samples and centers) is rendered
    through ``plt2MD``. The model with the highest score is used to predict
    the final labels and to build the markdown report.

    Args:
        table: input table; it must support ``len()``, ``copy()`` and column
            assignment (presumably a pandas DataFrame -- confirm with caller).
        input_cols: feature columns, resolved through ``check_col_type``.
        n_clusters_list: candidate numbers of clusters, tried in order.
        prediction_col: name of the column that receives the predicted labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state`` and to
            ``_kmeans_samples_plot``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict: ``{'out_table': ..., 'model': ...}`` where ``out_table`` is a copy
        of ``table`` with ``prediction_col`` added and ``model`` holds
        ``best_k``, ``best_centers``, ``best_model``, ``input_cols`` and the
        report under ``_repr_brtc_``.
    """
    feature_names, features = check_col_type(table, input_cols)

    if n_samples is None:
        n_samples = len(table)
    inputarr = features

    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    # With a single PCA component the same axis is plotted against itself.
    single_component = pca2.shape[1] == 1
    y_component = 0 if single_component else 1
    if single_component:
        y_axis_label = "Feature space for the 1st feature"
    else:
        y_axis_label = "Feature space for the 2nd feature"

    silhouette_list = []
    models = []
    images = []
    previous_fig = None
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed,
                           copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            in_cluster = predict == i
            si = samples[in_cluster]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][in_cluster], pca2[:, y_component][in_cluster], color=color)

        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, y_component],
                    marker='x', edgecolors=1, s=200, color=colors)
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel(y_axis_label)

        plt.tight_layout()
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

        # plt.clf() only clears a figure; without an explicit close every k
        # would leave one more figure open. The figure closed here is not the
        # current one, so pyplot's current (cleared) figure is left as before.
        if previous_fig is not None:
            plt.close(previous_fig)
        previous_fig = fig

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_sse = best_model.inertia_

    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)

    # Silhouette score against the candidate k values (evenly spaced ticks).
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhoutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - Sum of square error: {best_sse}.
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, best_sse=best_sse, fig_pca=fig_pca,
               fig_centers=fig_centers, fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4,
                                     precompute_distances='auto', seed=None, n_jobs=1, algorithm='auto',
                                     n_samples=None):
    """Fit one k-means model per candidate ``k`` and keep the one with the best silhouette score.

    For every ``k`` in ``n_clusters_list`` a k-means model is fitted, its mean
    silhouette score is recorded and a two-panel figure (per-cluster
    silhouette values, 2-component PCA scatter of the samples and centers)
    is rendered through ``plt2MD``. The model with the highest score is used
    to predict the final labels and to build the markdown report.

    Args:
        table: input table; it must support ``len()``, ``copy()``, selection
            by ``input_cols`` and column assignment (presumably a pandas
            DataFrame -- confirm with caller).
        input_cols: feature columns selected from ``table``.
        n_clusters_list: candidate numbers of clusters, tried in order.
        prediction_col: name of the column that receives the predicted labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict: ``{'out_table': ..., 'model': ...}`` where ``out_table`` is a copy
        of ``table`` with ``prediction_col`` added and ``model`` holds
        ``best_k``, ``best_centers``, ``best_model``, ``input_cols`` and the
        report under ``_repr_brtc_``.
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    previous_fig = None
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed,
                           copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            in_cluster = predict == i
            si = samples[in_cluster]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][in_cluster], pca2[:, 1][in_cluster], color=color)

        # Red line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

        # plt.clf() only clears a figure; without an explicit close every k
        # would leave one more figure open. The figure closed here is not the
        # current one, so pyplot's current (cleared) figure is left as before.
        if previous_fig is not None:
            plt.close(previous_fig)
        previous_fig = fig

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score against the candidate k values (evenly spaced ticks).
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, fig_pca=fig_pca,
               fig_centers=fig_centers, fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
zip(combinations(unique_labels, 2), values): indices_a = np.where(labels == label_a)[0] inter_dist[indices_a] = np.minimum(values_a, inter_dist[indices_a]) del indices_a indices_b = np.where(labels == label_b)[0] inter_dist[indices_b] = np.minimum(values_b, inter_dist[indices_b]) del indices_b return inter_dist if __name__ == '__main__': import time from sklearn.metrics.cluster.unsupervised import silhouette_score np.random.seed(0) X = np.random.random((10000, 100)) y = np.repeat(np.arange(100), 100) t0 = time.time() s = silhouette_score(X, y) t = time.time() - t0 print 'Scikit silhouette (%fs): %f' % (t, s) t0 = time.time() s = silhouette_score_block(X, y) t = time.time() - t0 print 'Block silhouette (%fs): %f' % (t, s) t0 = time.time() s = silhouette_score_block(X, y, n_jobs=2) t = time.time() - t0 print 'Block silhouette parallel (%fs): %f' % (t, s)
indices_a = np.where(labels == label_a)[0] inter_dist[indices_a] = np.minimum(values_a, inter_dist[indices_a]) del indices_a indices_b = np.where(labels == label_b)[0] inter_dist[indices_b] = np.minimum(values_b, inter_dist[indices_b]) del indices_b return inter_dist if __name__ == '__main__': import time from sklearn.metrics.cluster.unsupervised import silhouette_score np.random.seed(0) X = np.random.random((10000, 100)) y = np.repeat(np.arange(100), 100) t0 = time.time() s = silhouette_score(X, y) t = time.time() - t0 print 'Scikit silhouette (%fs): %f' % (t, s) t0 = time.time() s = silhouette_score_block(X, y) t = time.time() - t0 print 'Block silhouette (%fs): %f' % (t, s) t0 = time.time() s = silhouette_score_block(X, y, n_jobs=2) t = time.time() - t0 print 'Block silhouette parallel (%fs): %f' % (t, s)
def test_non_encoded_labels():
    """Offsetting every label by a constant must not change the silhouette score."""
    iris = datasets.load_iris()
    features = iris.data
    targets = iris.target

    shifted_score = silhouette_score(features, targets + 10)
    plain_score = silhouette_score(features, targets)
    assert_equal(shifted_score, plain_score)
def test_non_numpy_labels():
    """Plain Python lists must give the same silhouette score as numpy arrays."""
    iris = datasets.load_iris()
    features = iris.data
    targets = iris.target

    list_score = silhouette_score(list(features), list(targets))
    array_score = silhouette_score(features, targets)
    assert_equal(list_score, array_score)
# Raw instances, drawn on whichever axes are current at this point.
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.title('Instances')
plt.scatter(x1, x2)

# One colour/marker pair per cluster label (8 entries each).
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']

clusters = [2, 3, 4, 5, 8]
sc_scores = []

# Slot 1 of the 3x2 grid is skipped, so the per-k panels occupy slots 2..6.
for subplot_counter, t in enumerate(clusters, start=2):
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)

    # Plot every point with the colour and marker of its assigned cluster.
    for i, l in enumerate(kmeans_model.labels_):
        plt.plot(
            x1[i],
            x2[i],
            color=colors[l],
            marker=markers[l],
            ls='None',
        )
    plt.xlim([0, 10])
    plt.ylim([0, 10])

    sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
    sc_scores.append(sc_score)
    plt.title('K=%s, silhouette Coefficient=%0.03f' % (t, sc_score))

# Separate figure: silhouette score as a function of the number of clusters.
plt.figure()
plt.plot(clusters, sc_scores, '*-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')
plt.show()