import functools

import numpy as np

from sklearn.metrics import pairwise_distances, pairwise_distances_chunked
from sklearn.metrics.cluster import unsupervised
# Private helpers; in newer scikit-learn releases the module path is
# sklearn.metrics.cluster._unsupervised.
from sklearn.metrics.cluster.unsupervised import (_silhouette_reduce,
                                                  check_number_of_labels)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y


def intra_cluster_distance(X, labels, *, metric='euclidean', **kwds):
    """Compute mean intra-cluster and inter-cluster distances per sample.

    Returns the two arrays the silhouette coefficient is built from: for
    each sample, the mean distance to the other members of its own cluster
    and the mean distance to the nearest other cluster.
    """
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])

    # A precomputed distance matrix must have a (numerically) zero diagonal.
    if metric == 'precomputed':
        atol = np.finfo(X.dtype).eps * 100
        if np.any(np.abs(np.diagonal(X)) > atol):
            raise ValueError(
                'The precomputed distance matrix contains non-zero '
                'elements on the diagonal. Use np.fill_diagonal(X, 0).')

    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    check_number_of_labels(len(le.classes_), n_samples)

    kwds['metric'] = metric
    reduce_func = functools.partial(_silhouette_reduce,
                                    labels=labels, label_freqs=label_freqs)
    results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func,
                                              **kwds))
    intra_clust_dists, inter_clust_dists = results
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)

    # Average over the *other* members of each sample's cluster; for
    # singleton clusters the denominator is 0 and the resulting division
    # warning is silenced below.
    denom = (label_freqs - 1).take(labels, mode='clip')
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom

    return intra_clust_dists, inter_clust_dists
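# A minimal usage sketch for the helper above, assuming scikit-learn's
# make_blobs and KMeans are available (the demo function itself is
# illustrative, not part of the original module). It recombines the two
# returned arrays into the usual per-sample silhouette coefficient.
def _demo_intra_cluster_distance():
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X_demo, _ = make_blobs(n_samples=100, centers=3, random_state=0)
    demo_labels = KMeans(n_clusters=3, random_state=0).fit_predict(X_demo)
    intra, inter = intra_cluster_distance(X_demo, demo_labels)
    # Per-sample silhouette coefficient: (b - a) / max(a, b).
    return (inter - intra) / np.maximum(intra, inter)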
def davies_bouldin_index(X, labels, metric='euclidean'):
    """Compute the Davies-Bouldin index.

    For each cluster, the index takes the worst-case ratio of within-cluster
    distances to the between-centroid distance, then averages over clusters.
    Lower values indicate more compact, better separated clusters.

    Parameters
    ----------
    X : array-like, shape (``n_samples``, ``n_features``)
        List of ``n_features``-dimensional data points. Each row corresponds
        to a single data point.

    labels : array-like, shape (``n_samples``,)
        Predicted labels for each sample.

    Returns
    -------
    score : float
        The resulting Davies-Bouldin index.

    References
    ----------
    .. [1] Davies, David L.; Bouldin, Donald W. (1979). "A Cluster Separation
       Measure". IEEE Transactions on Pattern Analysis and Machine
       Intelligence. PAMI-1 (2): 224-227.
    """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)
    check_number_of_labels(n_labels, n_samples)

    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, X.shape[1]), dtype=np.float32)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        centroids[k] = mean_k
        # Mean distance of the cluster's members to its centroid.
        intra_dists[k] = np.average(
            pairwise_distances(cluster_k, [mean_k], metric=metric))

    centroid_distances = pairwise_distances(centroids, metric=metric)
    with np.errstate(divide='ignore', invalid='ignore'):
        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
                np.all(centroid_distances == 0.0):
            return 0.0
        scores = (intra_dists[:, None] + intra_dists) / centroid_distances

    # Discard the inf values produced by the zero diagonal of
    # centroid_distances (a cluster compared with itself) before taking
    # the per-cluster maximum.
    scores[scores == np.inf] = np.nan
    # Average the worst ratio per cluster so a single float is returned,
    # as the docstring promises.
    return np.mean(np.nanmax(scores, axis=1))
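# A quick sanity check for davies_bouldin_index, again assuming
# scikit-learn's make_blobs and KMeans (the demo function is illustrative
# only): tight, well-separated blobs should score close to 0, since lower
# Davies-Bouldin values mean better clustering.
def _demo_davies_bouldin_index():
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X_demo, _ = make_blobs(n_samples=150, centers=3, cluster_std=0.3,
                           random_state=0)
    demo_labels = KMeans(n_clusters=3, random_state=0).fit_predict(X_demo)
    return davies_bouldin_index(X_demo, demo_labels)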
def new_silhouette_samples(X, labels, metric='precomputed', **kwds):
    """Silhouette samples for a precomputed distance matrix.

    No intermediate 'distances' object is built, since ``X`` already is the
    distance matrix. Samples labelled ``-1`` (noise, which ``LabelEncoder``
    encodes to 0) are excluded from the computation.
    """
    X = X.astype(np.float32)
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    unsupervised.check_number_of_labels(len(le.classes_), X.shape[0])
    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    # For sample i, store the mean distance of the cluster to which
    # it belongs in intra_clust_dists[i].
    intra_clust_dists = np.zeros(X.shape[0], dtype=X.dtype)

    # For sample i, store the mean distance of the second closest
    # cluster in inter_clust_dists[i].
    inter_clust_dists = np.full(X.shape[0], np.inf, dtype=X.dtype)

    for curr_label in range(len(unique_labels)):
        # Skip the noise label (-1 in the input, 0 after encoding; this
        # assumes noise is actually labelled -1).
        if curr_label == 0:
            continue

        # Find intra_clust_dist for all samples belonging to the same label.
        mask = labels == curr_label
        current_distances = X[mask]

        # Leave out the current sample itself.
        n_samples_curr_lab = n_samples_per_label[curr_label] - 1
        if n_samples_curr_lab != 0:
            intra_clust_dists[mask] = np.sum(
                current_distances[:, mask], axis=1) / n_samples_curr_lab

        # Now iterate over all other labels, finding the mean
        # cluster distance that is closest to every sample.
        for other_label in range(len(unique_labels)):
            if other_label != curr_label and other_label != 0:
                other_mask = labels == other_label
                other_distances = np.mean(
                    current_distances[:, other_mask], axis=1)
                inter_clust_dists[mask] = np.minimum(
                    inter_clust_dists[mask], other_distances)

    # Noise samples keep intra 0 and inter inf, so their score evaluates
    # to NaN and can be masked out by the caller.
    sil_samples = inter_clust_dists - intra_clust_dists
    sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # Score 0 for clusters of size 1, according to the paper.
    sil_samples[n_samples_per_label.take(labels) == 1] = 0
    return sil_samples
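# A minimal sketch of calling new_silhouette_samples with a precomputed
# matrix, assuming scipy's pdist/squareform to build it (an illustrative
# choice; any symmetric distance matrix with a zero diagonal works).
# Label -1 marks noise; LabelEncoder maps it to 0, which the function skips.
def _demo_new_silhouette_samples():
    from scipy.spatial.distance import pdist, squareform

    rng = np.random.RandomState(0)
    X_demo = rng.rand(10, 3)
    D = squareform(pdist(X_demo))
    demo_labels = np.array([1, 1, 1, 2, 2, 2, -1, -1, 1, 2])
    # Noise samples come back as NaN; clustered samples score in [-1, 1].
    return new_silhouette_samples(D, demo_labels)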
def silhouette_samples_memory_saving(X, labels, metric='euclidean', **kwds):
    """Silhouette samples computed without the full pairwise distance matrix.

    Relies on the external helpers ``euclidean_distances_sum`` and
    ``euclidean_distances_mean``; only the euclidean metric is supported,
    so the ``metric`` argument is currently unused.
    """
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    check_number_of_labels(len(le.classes_), X.shape[0])
    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    # For sample i, store the mean distance of the cluster to which
    # it belongs in intra_clust_dists[i].
    intra_clust_dists = np.zeros(X.shape[0], dtype=X.dtype)

    # For sample i, store the mean distance of the second closest
    # cluster in inter_clust_dists[i] (initialised to an array of inf
    # with a matching floating dtype).
    inter_clust_dists = np.inf + intra_clust_dists

    for curr_label in range(len(unique_labels)):
        # Find intra_clust_dist for all samples belonging to the same label.
        mask = labels == curr_label

        # Leave out the current sample itself.
        n_samples_curr_lab = n_samples_per_label[curr_label] - 1
        if n_samples_curr_lab != 0:
            intra_clust_dists[mask] = euclidean_distances_sum(
                X[mask, :]) / n_samples_curr_lab

        # Now iterate over all other labels, finding the mean
        # cluster distance that is closest to every sample.
        for other_label in range(len(unique_labels)):
            if other_label != curr_label:
                other_mask = labels == other_label
                other_distances = euclidean_distances_mean(
                    X[mask, :], X[other_mask, :])
                inter_clust_dists[mask] = np.minimum(inter_clust_dists[mask],
                                                     other_distances)

    sil_samples = inter_clust_dists - intra_clust_dists
    sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # Score 0 for clusters of size 1, according to the paper.
    sil_samples[n_samples_per_label.take(labels) == 1] = 0
    return sil_samples
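# euclidean_distances_sum and euclidean_distances_mean are assumed to be
# provided elsewhere (presumably as chunked or compiled helpers). Minimal
# NumPy reference implementations with the assumed signatures are sketched
# below; they are correct but materialize the full pairwise block, so they
# do not deliver the memory saving the function above is written for.
def euclidean_distances_sum(X):
    from sklearn.metrics.pairwise import euclidean_distances

    # Row-wise sum of distances within X; the zero diagonal (each sample
    # to itself) is included, which the caller corrects by dividing by
    # n - 1 rather than n.
    return euclidean_distances(X).sum(axis=1)


def euclidean_distances_mean(X, Y):
    from sklearn.metrics.pairwise import euclidean_distances

    # Row-wise mean distance from each sample in X to all samples in Y.
    return euclidean_distances(X, Y).mean(axis=1)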