def test_non_encoded_labels():
    # Silhouette results must not depend on the actual label values: the iris
    # targets are relabelled with ``2 * label + 10`` and both the aggregate
    # score and the per-sample values are compared with those obtained from
    # the original labels.
    iris = datasets.load_iris()
    X, labels = iris.data, iris.target
    relabelled = labels * 2 + 10

    # Aggregate score.
    score_relabelled = silhouette_score(X, relabelled)
    score_original = silhouette_score(X, labels)
    assert score_relabelled == score_original

    # Per-sample coefficients.
    samples_relabelled = silhouette_samples(X, relabelled)
    samples_original = silhouette_samples(X, labels)
    assert_array_equal(samples_relabelled, samples_original)
def test_non_encoded_labels():
    # Silhouette results must not depend on the actual label values: the iris
    # targets are relabelled with ``2 * label + 10`` and both the aggregate
    # score and the per-sample values are compared with those obtained from
    # the original labels.
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target

    # A plain ``assert`` replaces the nose-style ``assert_equal`` helper:
    # pytest rewrites bare asserts and reports both operands on failure, so
    # the helper adds nothing and only ties the test to a legacy import.
    assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
    assert_array_equal(
        silhouette_samples(X, labels * 2 + 10),
        silhouette_samples(X, labels))
def calc_metrics(topics, cluster_assignments, raw_data, soft_clustering=True):
    """Compute per-topic KPIs and store them on each topic record in place.

    Parameters
    ----------
    topics : dict
        Maps a cluster index to a dict-like record; every record is extended
        through ``update`` with the KPIs listed below.
    cluster_assignments : numpy.ndarray
        With ``soft_clustering=True`` it is indexed as a 2-D array with one
        column per cluster (``[:, c_idx]``); each sample's hard cluster is
        the column holding its largest value. With ``soft_clustering=False``
        it is compared element-wise against the cluster index and passed to
        ``silhouette_samples`` as the label array.
    raw_data
        Sample data handed to ``silhouette_samples`` unchanged.
    soft_clustering : bool, default True
        Selects which of the two interpretations above is used.

    Returns
    -------
    None
        ``topics`` is updated in place. Soft mode writes ``avg_weight``,
        ``median_weight``, ``std_weight``, ``article_ratio``, ``top_ratio``
        and ``silhouette_score``; hard mode writes ``article_ratio`` and
        ``silhouette_score``.
    """
    print("Calculate KPIs...")
    total_count = cluster_assignments.shape[0]

    if not soft_clustering:
        # Hard clustering: the assignments already are the labels.
        per_sample_silhouette = silhouette_samples(raw_data,
                                                   cluster_assignments)
        for c_idx, topic in topics.items():
            in_cluster = cluster_assignments == c_idx
            article_ratio = np.count_nonzero(in_cluster) / total_count
            silhouette_score = np.mean(per_sample_silhouette[in_cluster])
            topic.update({
                "article_ratio": article_ratio,
                "silhouette_score": silhouette_score
            })
        return

    # Soft clustering: aggregate the weights column by column.
    weight_sums = np.sum(cluster_assignments, axis=0)
    nonzero_counts = np.count_nonzero(cluster_assignments, axis=0)
    # The last column of an ascending argsort is the index of the largest
    # weight in each row, i.e. the sample's dominant cluster.
    hard_cluster = np.argsort(cluster_assignments, axis=1)[:, -1]
    per_sample_silhouette = silhouette_samples(raw_data, hard_cluster)

    for c_idx, topic in topics.items():
        n_nonzero = nonzero_counts[c_idx]
        avg_weight = weight_sums[c_idx] / n_nonzero
        article_ratio = n_nonzero / total_count

        # Spread statistics only consider samples with a positive weight.
        column = cluster_assignments[:, c_idx]
        positive_weights = column[column > 0]
        std = np.std(positive_weights)
        median = np.median(positive_weights)

        # Share of the samples touching this cluster for which it is also
        # the dominant one.
        is_dominant = hard_cluster == c_idx
        top_ratio = np.count_nonzero(is_dominant) / n_nonzero
        silhouette_score = np.mean(per_sample_silhouette[is_dominant])

        topic.update({
            "avg_weight": avg_weight,
            "median_weight": median,
            "std_weight": std,
            "article_ratio": article_ratio,
            "top_ratio": top_ratio,
            "silhouette_score": silhouette_score
        })
    return
def gerar_grafico_silhueta(amostras, y_aca):
    """Draw and show a silhouette plot for a clustering.

    Each sample becomes one horizontal bar whose length is its silhouette
    coefficient (Euclidean metric). Bars are stacked cluster by cluster,
    sorted in ascending order inside each cluster, and a dashed red vertical
    line marks the mean coefficient over all samples.

    Parameters
    ----------
    amostras
        Sample data passed to ``cluster.silhouette_samples``.
    y_aca
        Cluster label of every sample; compared element-wise against each
        unique label to select the samples of a cluster.
    """
    cluster_labels = np.unique(y_aca)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = cluster.silhouette_samples(
        amostras, y_aca, metric='euclidean')

    yticks = []
    band_start = 0
    for position, label in enumerate(cluster_labels):
        # Boolean indexing yields a copy, so the in-place sort does not
        # touch ``silhouette_vals``.
        band_vals = silhouette_vals[y_aca == label]
        band_vals.sort()
        band_end = band_start + len(band_vals)

        plt.barh(range(band_start, band_end),
                 band_vals,
                 height=1.0,
                 edgecolor='none',
                 color=cm.jet(float(position) / n_clusters))

        # The tick sits in the middle of the cluster's band.
        yticks.append((band_start + band_end) / 2.)
        band_start = band_end

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color='red', linestyle="--")

    # Tick labels are the cluster labels shifted by one.
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()
def test_silhouette_nonzero_diag(dtype):
    # Make sure silhouette_samples requires diagonal to be zero.
    # Non-regression test for #12178

    # Pairwise distances of six 1-D points: a matrix with a zero diagonal.
    points = np.array([[0.2, 0.1, 0.12, 1.34, 1.11, 1.6]], dtype=dtype).T
    dists = pairwise_distances(points)
    labels = [0, 0, 0, 1, 1, 1]
    eps = np.finfo(dists.dtype).eps

    # A diagonal entry of 10 * eps is tolerated.
    dists[2, 2] = eps * 10
    silhouette_samples(dists, labels, metric='precomputed')

    # A diagonal entry above 100 * eps (here 1000 * eps) must be rejected.
    dists[2, 2] = eps * 1000
    with pytest.raises(ValueError, match='contains non-zero'):
        silhouette_samples(dists, labels, metric='precomputed')
def test_silhouette_paper_example():
    # Explicitly check per-sample results against Rousseeuw (1987)
    # Data from Table 1: strict lower triangle of the 12 x 12 dissimilarity
    # matrix, one matrix row per source line.
    lower = [5.58,
             7.00, 6.50,
             7.08, 7.00, 3.83,
             4.83, 5.08, 8.17, 5.83,
             2.17, 5.75, 6.67, 6.92, 4.92,
             6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
             3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
             2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
             6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
             5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
             4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92]
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    # Mirror the lower triangle to obtain the full symmetric matrix.
    D += D.T

    # Row/column order of D.
    names = ['BEL', 'BRA', 'CHI', 'CUB', 'EGY', 'FRA', 'IND', 'ISR', 'USA',
             'USS', 'YUG', 'ZAI']

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {'USA': .43, 'BEL': .39, 'FRA': .35, 'ISR': .30, 'BRA': .22,
                 'EGY': .20, 'ZAI': .19, 'CUB': .40, 'USS': .34, 'CHI': .33,
                 'YUG': .26, 'IND': -.04}
    score1 = .28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {'USA': .47, 'FRA': .44, 'BEL': .42, 'ISR': .37, 'EGY': .02,
                 'ZAI': .28, 'BRA': .25, 'IND': .17, 'CUB': .48, 'USS': .44,
                 'YUG': .31, 'CHI': .31}
    score2 = .33

    for labels, expected, score in [(labels1, expected1, score1),
                                    (labels2, expected2, score2)]:
        # Reorder the expected values to follow the rows of D.
        expected = [expected[name] for name in names]
        labels = np.array(labels)

        # pytest.approx only builds a comparison object: it has to be
        # compared with ``==`` inside an ``assert``, otherwise nothing is
        # checked (and a second positional argument would be taken as the
        # relative tolerance, not as the value under test).
        # we check to 2dp because that's what's in the paper
        samples = silhouette_samples(D, labels, metric='precomputed')
        assert samples == pytest.approx(expected, abs=1e-2)
        mean_score = silhouette_score(D, labels, metric='precomputed')
        assert mean_score == pytest.approx(score, abs=1e-2)
def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.], [1.], [1.], [2.], [3.], [3.]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Expected per-sample values, cluster by cluster:
    #   Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    #   Cluster 1: intra-cluster = [.5, .5, 1]
    #              inter-cluster = [1, 1, 1]
    #              silhouette    = [.5, .5, 0]
    #   Cluster 2: intra-cluster = [0, 0]
    #              inter-cluster = [arbitrary, arbitrary]
    #              silhouette    = [1., 1.]
    expected_samples = [0, .5, .5, 0, 1, 1]

    # The aggregate score must be a real number ...
    overall = silhouette_score(X, labels)
    assert not np.isnan(overall)

    # ... and the per-sample values must match the table above exactly.
    per_sample = silhouette_samples(X, labels)
    assert_array_equal(per_sample, expected_samples)
def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.], [1.], [1.], [2.], [3.], [3.]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]
    silhouette = silhouette_score(X, labels)
    # A plain ``assert not`` replaces the nose-style ``assert_false`` helper:
    # pytest rewrites bare asserts, so the helper only ties the test to a
    # legacy import.
    assert not np.isnan(silhouette)
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
def test_silhouette_paper_example():
    # Explicitly check per-sample results against Rousseeuw (1987)
    # Data from Table 1: strict lower triangle of the 12 x 12 dissimilarity
    # matrix, one matrix row per source line.
    # fmt: off
    lower = [
        5.58,
        7.00, 6.50,
        7.08, 7.00, 3.83,
        4.83, 5.08, 8.17, 5.83,
        2.17, 5.75, 6.67, 6.92, 4.92,
        6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
        3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
        2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
        6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
        5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
        4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92,
    ]
    # fmt: on
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    # Mirror the lower triangle to obtain the full symmetric matrix.
    D += D.T

    # Row/column order of D.
    names = [
        "BEL",
        "BRA",
        "CHI",
        "CUB",
        "EGY",
        "FRA",
        "IND",
        "ISR",
        "USA",
        "USS",
        "YUG",
        "ZAI",
    ]

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {
        "USA": 0.43,
        "BEL": 0.39,
        "FRA": 0.35,
        "ISR": 0.30,
        "BRA": 0.22,
        "EGY": 0.20,
        "ZAI": 0.19,
        "CUB": 0.40,
        "USS": 0.34,
        "CHI": 0.33,
        "YUG": 0.26,
        "IND": -0.04,
    }
    score1 = 0.28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {
        "USA": 0.47,
        "FRA": 0.44,
        "BEL": 0.42,
        "ISR": 0.37,
        "EGY": 0.02,
        "ZAI": 0.28,
        "BRA": 0.25,
        "IND": 0.17,
        "CUB": 0.48,
        "USS": 0.44,
        "YUG": 0.31,
        "CHI": 0.31,
    }
    score2 = 0.33

    for labels, expected, score in [
        (labels1, expected1, score1),
        (labels2, expected2, score2),
    ]:
        # Reorder the expected values to follow the rows of D.
        expected = [expected[name] for name in names]
        labels = np.array(labels)

        # pytest.approx only builds a comparison object: it has to be
        # compared with ``==`` inside an ``assert``, otherwise nothing is
        # checked (and a second positional argument would be taken as the
        # relative tolerance, not as the value under test).
        # we check to 2dp because that's what's in the paper
        samples = silhouette_samples(D, labels, metric="precomputed")
        assert samples == pytest.approx(expected, abs=1e-2)
        mean_score = silhouette_score(D, labels, metric="precomputed")
        assert mean_score == pytest.approx(score, abs=1e-2)
plt.scatter(centroids[:,0], centroids[:,1], marker='x', color='k', s=50, linewidth=5) plt.show() pairs = itertools.combinations(centroids, 2) for item in pairs: plot__kmeans_scatter_in_2d(item) # ## 6. Calculate the silhouette coefficients. (10 points) # In[9]: silhouette_coefficient = silhouette_score(X, kmeans_model.labels_, metric='sqeuclidean') silhouette_coefficient # ## 7. Assuming that the data is ordered by class labels, print the average silhouette coefficient for each class. (20 points) # In[ ]: silhouette_samples(X, labels, metric='sqeuclidean') # In[ ]: