def test_calinski_harabasz_score():
    rng = np.random.RandomState(seed=0)

    # Assert message when there is only one label
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabasz_score,
                         rng.rand(10, 2), np.zeros(10))

    # Assert message when all points are in different clusters
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabasz_score,
                         rng.rand(10, 2), np.arange(10))

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabasz_score(np.ones((10, 2)),
                                             [0] * 5 + [1] * 5))

    # Assert the value is 0. when all cluster means are equal
    assert_equal(0., calinski_harabasz_score([[-1, -1], [1, 1]] * 10,
                                             [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert_almost_equal(calinski_harabasz_score(X, labels),
                        45 * (40 - 4) / (5 * (4 - 1)))
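# A sanity check for the closed-form expected value above: a minimal
# sketch computing the Calinski-Harabasz index directly from its
# definition (between-cluster dispersion over within-cluster dispersion,
# each scaled by its degrees of freedom). ch_by_hand is an illustrative
# helper, not part of the test suite.
import numpy as np
from sklearn.metrics import calinski_harabasz_score

def ch_by_hand(X, labels):
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    n, k = X.shape[0], len(np.unique(labels))
    overall_mean = X.mean(axis=0)
    between, within = 0.0, 0.0
    for lab in np.unique(labels):
        cluster = X[labels == lab]
        mean = cluster.mean(axis=0)
        between += len(cluster) * ((mean - overall_mean) ** 2).sum()
        within += ((cluster - mean) ** 2).sum()
    return (between / (k - 1)) / (within / (n - k))

X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
     [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
assert np.isclose(ch_by_hand(X, labels),
                  calinski_harabasz_score(X, labels))  # both equal 108.0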
def get_clustering_metrics(train_data, cluster_labels,
                           ground_truth_labels=None):
    clustering_metric_dict = {}
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict['calinski_harabasz_score'] = calinski_harabasz_score(
        train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)
    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['fowlkes_mallows_score'] = fowlkes_mallows_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['normalized_mutual_info_score'] = \
            normalized_mutual_info_score(ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)
    return clustering_metric_dict
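# A usage sketch for get_clustering_metrics, assuming the metric
# functions above come from sklearn.metrics; the dataset and model
# below are illustrative only.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_rand_score, calinski_harabasz_score,
                             completeness_score, davies_bouldin_score,
                             fowlkes_mallows_score, homogeneity_score,
                             normalized_mutual_info_score, silhouette_score,
                             v_measure_score)

X, y = make_blobs(n_samples=300, centers=4, random_state=42)
pred = KMeans(n_clusters=4, n_init=10, random_state=42).fit_predict(X)
print(get_clustering_metrics(X, pred, ground_truth_labels=y))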
def test_calinski_harabasz_score():
    assert_raises_on_only_one_label(calinski_harabasz_score)
    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # Assert the value is 1. when all samples are equal
    assert 1. == calinski_harabasz_score(np.ones((10, 2)),
                                         [0] * 5 + [1] * 5)

    # Assert the value is 0. when all cluster means are equal
    assert 0. == calinski_harabasz_score([[-1, -1], [1, 1]] * 10,
                                         [0] * 10 + [1] * 10)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # pytest.approx on its own asserts nothing; compare against it explicitly
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
def evaluation(X_selected, X_test, n_clusters, y):
    """
    This function calculates NMI, Silhouette, Davies-Bouldin,
    Calinski-Harabasz and Purity of clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
        input data on the selected features
    X_test: {numpy array}, shape (n_samples, n_selected_features)
        data the fitted model is evaluated on
    n_clusters: {int}
        number of clusters
    y: {numpy array}, shape (n_samples,)
        true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    sil: {float}
        Silhouette score
    db: {float}
        Davies-Bouldin score
    ch: {float}
        Calinski-Harabasz score
    pur: {float}
        Purity
    """
    # precompute_distances and n_jobs were removed from KMeans in
    # scikit-learn 1.0, so they are no longer passed here.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                     max_iter=300, tol=0.0001, verbose=0,
                     random_state=None, copy_x=True)
    k_means.fit(X_selected)
    y_predict = k_means.predict(X_test)

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict,
                                       average_method='arithmetic')

    # calculate Silhouette score
    try:
        sil = silhouette_score(X_test, y_predict, metric='euclidean')
    except ValueError:
        sil = float('nan')
        app_logger.warning(
            'K-means labels are {0}; but y_predict are: {1}. Silhouette '
            'score requires predictions in 2 or more clusters.'.format(
                np.unique(k_means.labels_), np.unique(y_predict)),
            extra=LOGGER_EXTRA_OBJECT)

    # calculate Davies-Bouldin score
    try:
        db = davies_bouldin_score(X_test, y_predict)
    except ValueError:
        db = float('nan')
        app_logger.warning(
            'K-means labels are {0}; but y_predict are: {1}. Davies-Bouldin '
            'score requires predictions in 2 or more clusters.'.format(
                np.unique(k_means.labels_), np.unique(y_predict)),
            extra=LOGGER_EXTRA_OBJECT)

    # calculate Calinski-Harabasz score
    try:
        ch = calinski_harabasz_score(X_test, y_predict)
    except ValueError:
        ch = float('nan')
        app_logger.warning(
            'K-means labels are {0}; but y_predict are: {1}. Calinski-'
            'Harabasz score requires predictions in 2 or more clusters.'.format(
                np.unique(k_means.labels_), np.unique(y_predict)),
            extra=LOGGER_EXTRA_OBJECT)

    # calculate Purity
    pur = purity(y, y_predict)

    return nmi, sil, db, ch, pur
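# The purity helper is not defined in this snippet; a minimal sketch
# under the standard definition (fraction of samples falling in the
# majority true class of their assigned cluster). The implementation
# below is an assumption, not the project's actual helper.
import numpy as np
from sklearn.metrics.cluster import contingency_matrix

def purity(y_true, y_pred):
    # Rows of the contingency matrix are true classes, columns are
    # predicted clusters; take each cluster's majority class count.
    cm = contingency_matrix(y_true, y_pred)
    return cm.max(axis=0).sum() / cm.sum()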
def calinski_harabasz(dataset_values: DatasetValues):
    """Calinski, T.; Harabasz, J. (1974). A dendrite method for cluster
    analysis. Communications in Statistics - Theory and Methods, v.3,
    n.1, p.1-27.

    The objective is to maximize the value; the range is [0, +Inf).
    """
    if dataset_values.K == 1:
        return 0

    return calinski_harabasz_score(dataset_values.data,
                                   dataset_values.cluster_labels)
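# Why the K == 1 guard: calinski_harabasz_score raises ValueError when
# only one label is present, so the wrapper returns 0 instead of
# propagating the error. The DatasetValues stand-in below is purely
# illustrative; the real class is defined elsewhere in the project.
import numpy as np
from dataclasses import dataclass
from sklearn.metrics import calinski_harabasz_score

@dataclass
class DatasetValues:  # illustrative stand-in with the attributes used above
    data: np.ndarray
    cluster_labels: np.ndarray

    @property
    def K(self):
        return len(np.unique(self.cluster_labels))

single = DatasetValues(np.random.rand(10, 2), np.zeros(10, dtype=int))
assert calinski_harabasz(single) == 0  # one cluster: guard avoids ValueError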
def test_calinski_harabasz_score():
    assert_raises_on_only_one_label(calinski_harabasz_score)
    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabasz_score(np.ones((10, 2)),
                                             [0] * 5 + [1] * 5))

    # Assert the value is 0. when all cluster means are equal
    assert_equal(0., calinski_harabasz_score([[-1, -1], [1, 1]] * 10,
                                             [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # pytest.approx alone performs no assertion; compare explicitly
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
def _clustering_metrics(labels, X, digits):
    if X is None:
        SIL = None
        DB = None
        CH = None
    else:
        SIL = round(silhouette_score(X, labels), digits)
        DB = round(davies_bouldin_score(X, labels), digits)
        CH = round(calinski_harabasz_score(X, labels), digits)
    return SIL, DB, CH
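# The X is None branch lets callers skip the internal indices when the
# feature matrix is unavailable; an illustrative check:
assert _clustering_metrics([0, 1, 0], None, digits=3) == (None, None, None)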
def _eval_clustering(self, labels_true, labels_predicted):
    # To address when COP-KMeans fails to satisfy all constraints at a k:
    if labels_predicted is None:
        # return an all-None dictionary to expose in the final output
        return {"nmi": None,
                "ami": None,
                "ari": None,
                "fms": None,
                "v_measure": None,
                "bcubed_precision": None,
                "bcubed_recall": None,
                "bcubed_fscore": None,
                "Silhouette": None,
                "Calinski_harabasz": None,
                "Davies_Bouldin": None}

    nmi = normalized_mutual_info_score(labels_true, labels_predicted,
                                       average_method="max")
    ami = adjusted_mutual_info_score(labels_true, labels_predicted,
                                     average_method="arithmetic")
    ari = adjusted_rand_score(labels_true, labels_predicted)
    v_measure = v_measure_score(labels_true, labels_predicted, beta=1.0)
    fms = fowlkes_mallows_score(labels_true, labels_predicted)

    # Reshape labels for BCubed measures
    true_dict = self._reshape_labels_as_dicts(labels_true)
    pred_dict = self._reshape_labels_as_dicts(labels_predicted)

    bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
    bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
    bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

    # =====================================================================
    # Unsupervised Metrics
    # =====================================================================
    # Internal indices are undefined for a single cluster and for
    # one-point-per-cluster partitions, so skip them in those cases.
    if labels_predicted.nunique() not in (1, len(self.data)):
        sil = silhouette_score(X=self.data,
                               labels=labels_predicted,
                               metric=self.distance_metric,
                               random_state=13712)
        ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)
        dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
    else:
        sil = None
        ch = None
        dv = None

    ret = {}
    ret.update({"nmi": round(nmi, 4),
                "ami": round(ami, 4),
                "ari": round(ari, 4),
                "fms": round(fms, 4),
                "v_measure": round(v_measure, 4),
                "bcubed_precision": round(bcubed_precision, 4),
                "bcubed_recall": round(bcubed_recall, 4),
                "bcubed_fscore": round(bcubed_f1, 4),
                "Silhouette": round(sil, 4) if sil is not None else None,
                "Calinski_harabasz": round(ch, 4) if ch is not None else None,
                "Davies_Bouldin": round(dv, 4) if dv is not None else None})
    return ret
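# _reshape_labels_as_dicts is not shown in this snippet; a minimal
# sketch of the structure the bcubed package expects (item id mapped to
# a *set* of labels), assuming one flat label per item. Illustrative only.
def _reshape_labels_as_dicts(self, labels):
    # bcubed.precision/recall take clustering and gold-standard dicts
    # whose values are sets, so wrap each scalar label in a singleton set.
    return {idx: {label} for idx, label in enumerate(labels)}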
y_km = kmeans.fit_predict(scaled_df)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_


# In[ ]:


## Calinski-Harabasz Index


# In[247]:


from sklearn.metrics.cluster import calinski_harabasz_score
calinski_harabasz_score(scaled_df, labels)


# In[353]:


kmeans = KMeans(n_clusters=5)
kmeans.fit(scaled_df)
print(kmeans.cluster_centers_)
X = kmeans.cluster_centers_
y_km = kmeans.fit_predict(scaled_df)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_


# In[249]:
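# An illustrative follow-up (not from the original notebook): sweep k
# and compare Calinski-Harabasz scores to help choose a cluster count;
# assumes scaled_df is the scaled feature matrix from the earlier cells.
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import calinski_harabasz_score

for k in range(2, 11):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(scaled_df)
    print(k, calinski_harabasz_score(scaled_df, km.labels_))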
                     n_features=2
                     # , cluster_std=1.03
                     , shuffle=True, random_state=123)

# features scaling
scaled_feature = feature_scaling(features)

# Set k
n_clusters = len(np.unique(target))

# Create K-means object
k_means = KMeans(k=n_clusters, max_iter=100, plot_flag=True)

# Fit
predictions_fit = k_means.fit(scaled_feature)

# Predict
predictions_pre = k_means.predict(scaled_feature)

from sklearn.metrics.cluster import (adjusted_mutual_info_score,
                                     adjusted_rand_score,
                                     calinski_harabasz_score,
                                     completeness_score, contingency_matrix,
                                     davies_bouldin_score, silhouette_score)

print('adjusted_mutual_info_score:',
      adjusted_mutual_info_score(target, predictions_pre))
print('completeness_score:', completeness_score(target, predictions_pre))
print('adjusted_rand_score:', adjusted_rand_score(target, predictions_pre))
# Note: the three internal indices below are computed on the ground-truth
# partition (target), not on the k-means predictions.
print('calinski_harabasz_score:',
      calinski_harabasz_score(scaled_feature, target))
print('davies_bouldin_score:', davies_bouldin_score(scaled_feature, target))
print('contingency_matrix:\n', contingency_matrix(target, predictions_pre))
print('silhouette_score:', silhouette_score(scaled_feature, target))
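# feature_scaling is not defined in this snippet; a minimal sketch,
# assuming it standardizes each feature. StandardScaler is one common
# choice; the actual helper may differ.
from sklearn.preprocessing import StandardScaler

def feature_scaling(features):
    # Zero-mean, unit-variance scaling per feature.
    return StandardScaler().fit_transform(features)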
def _ch(X, labels, digits):
    return round(calinski_harabasz_score(X, labels), digits)
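# A quick illustrative check of the rounding helper, assuming
# calinski_harabasz_score comes from sklearn.metrics; the toy data is
# made up for the example.
import numpy as np
from sklearn.metrics import calinski_harabasz_score

X = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
labels = [0, 0, 1, 1]
print(_ch(X, labels, digits=2))  # rounded Calinski-Harabasz index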