def main():
    """Benchmark scikit-learn KMeans: time ``fit`` and ``predict`` and report.

    Reads its configuration from the module-level ``params`` object and uses
    the module-level ``bench`` helpers for data loading, timing and output.
    The initial centroids are chosen from ``params.filei``:

    * ``'k-means++'``  -> use scikit-learn's k-means++ initialisation;
    * any other non-None value -> load arrays from that path with ``np.load``;
    * ``None`` -> draw ``params.n_clusters`` random rows of the training data,
      seeded with ``params.seed``.

    The Davies-Bouldin score of the train and test predictions is reported as
    the quality figure.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load and convert generated data
    X_train, X_test, _, _ = bench.load_data(params)

    X_init: Any
    if params.filei == 'k-means++':
        X_init = 'k-means++'
    elif params.filei is not None:
        # Load initial centroids from specified path
        loaded = np.load(params.filei)
        X_init = {name: arr.astype(params.dtype)
                  for name, arr in loaded.items()}
        # NOTE(review): X_init is a dict at this point, so this check looks
        # like it can never be true -- confirm the intended behaviour.
        if isinstance(X_init, np.ndarray):
            params.n_clusters = X_init.shape[0]
    else:
        # or choose random centroids from training data
        np.random.seed(params.seed)
        centroids_idx = np.random.randint(
            low=0, high=X_train.shape[0], size=params.n_clusters)
        X_init = (X_train.iloc[centroids_idx].values
                  if hasattr(X_train, "iloc")
                  else X_train[centroids_idx])

    def fit_kmeans(X, X_init):
        # Build the estimator inside the timed function so construction is
        # measured together with fitting.
        estimator = KMeans(
            n_clusters=params.n_clusters,
            tol=params.tol,
            max_iter=params.maxiter,
            init=X_init,
            n_init=1,
        )
        estimator.fit(X)
        return estimator

    # Time fit
    fit_time, kmeans = bench.measure_function_time(
        fit_kmeans, X_train, X_init, params=params)
    train_predict = kmeans.predict(X_train)
    acc_train = davies_bouldin_score(X_train, train_predict)

    # Time predict
    predict_time, test_predict = bench.measure_function_time(
        kmeans.predict, X_test, params=params)
    acc_test = davies_bouldin_score(X_test, test_predict)

    bench.print_output(
        library='sklearn',
        algorithm='kmeans',
        stages=['training', 'prediction'],
        params=params,
        functions=['KMeans.fit', 'KMeans.predict'],
        times=[fit_time, predict_time],
        accuracy_type='davies_bouldin_score',
        accuracies=[acc_train, acc_test],
        data=[X_train, X_test],
        alg_instance=kmeans,
    )
def get_clustering_metrics(train_data, cluster_labels, ground_truth_labels=None):
    """Collect clustering quality metrics into a dict keyed by metric name.

    The silhouette, Calinski-Harabasz and Davies-Bouldin scores are always
    computed from ``train_data`` and ``cluster_labels``.  When
    ``ground_truth_labels`` is given, the label-agreement scores (v-measure,
    Fowlkes-Mallows, homogeneity, NMI, adjusted Rand, completeness) are added,
    each called as ``score(ground_truth_labels, cluster_labels)``.

    Returns:
        dict mapping metric name to the value returned by that metric.
    """
    scores = {
        'silhouette_score': silhouette_score(
            train_data, cluster_labels, random_state=42),
        'calinski_harabasz_score': calinski_harabasz_score(
            train_data, cluster_labels),
        'davies_bouldin_score': davies_bouldin_score(
            train_data, cluster_labels),
    }

    # Without reference labels only the internal metrics can be computed.
    if ground_truth_labels is None:
        return scores

    label_agreement_metrics = (
        ('v_measure_score', v_measure_score),
        ('fowlkes_mallows_score', fowlkes_mallows_score),
        ('homogeneity_score', homogeneity_score),
        ('normalized_mutual_info_score', normalized_mutual_info_score),
        ('adjusted_rand_score', adjusted_rand_score),
        ('completeness_score', completeness_score),
    )
    for metric_name, metric_fn in label_agreement_metrics:
        scores[metric_name] = metric_fn(ground_truth_labels, cluster_labels)
    return scores
def main():
    """Benchmark scikit-learn DBSCAN: time ``fit`` and report the result.

    Configuration comes from the module-level ``params`` object; data loading,
    timing and output go through the module-level ``bench`` helpers.  After
    fitting, ``params.n_clusters`` is set to the number of distinct labels
    excluding the ``-1`` label, and the Davies-Bouldin score of the labelling
    is reported.
    """
    from sklearn.cluster import DBSCAN
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load generated data
    X, _, _, _ = bench.load_data(params, add_dtype=True)

    # Create our clustering object
    # N.B. algorithm='auto' will select oneAPI Data Analytics Library (oneDAL)
    # brute force method when running daal4py-patched scikit-learn, and probably
    # 'kdtree' when running unpatched scikit-learn.
    dbscan = DBSCAN(
        eps=params.eps,
        n_jobs=params.n_jobs,
        min_samples=params.min_samples,
        metric='euclidean',
        algorithm='auto',
    )

    # Time fit
    fit_time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
    labels = dbscan.labels_

    # The -1 label is not counted as a cluster.
    distinct_labels = set(labels)
    noise_offset = 1 if -1 in labels else 0
    params.n_clusters = len(distinct_labels) - noise_offset

    acc = davies_bouldin_score(X, labels)

    bench.print_output(
        library='sklearn',
        algorithm='dbscan',
        stages=['training'],
        params=params,
        functions=['DBSCAN'],
        times=[fit_time],
        accuracies=[acc],
        accuracy_type='davies_bouldin_score',
        data=[X],
        alg_instance=dbscan,
    )
def evaluation(X_selected, X_test, n_clusters, y):
    """
    Fit K-means on X_selected, predict clusters for X_test and score the
    predictions with NMI, Silhouette, Davies-Bouldin, Calinski-Harabasz and
    purity.

    Input
    -----
        X_selected: {numpy array}, shape (n_samples, n_selected_features}
                input data on the selected features; K-means is fitted on it
        X_test: data whose cluster assignments are predicted and scored
        n_clusters: {int}
                number of clusters
        y: {numpy array}
                true labels; they are compared against the predictions made
                on X_test

    Output
    ------
        nmi: {float}
            Normalized Mutual Information
        sil: {float}
            Silhouette score, NaN if the metric raised ValueError
        db: {float}
            Davies-Bouldin score, NaN if the metric raised ValueError
        ch: {float}
            Calinski-Harabasz score, NaN if the metric raised ValueError
        pur:
            value returned by the purity() helper
    """
    # NOTE(review): `precompute_distances` and `n_jobs` were removed from
    # KMeans in scikit-learn 1.0 -- confirm the targeted scikit-learn version.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, precompute_distances=True, verbose=0,
                     random_state=None, copy_x=True, n_jobs=1)

    k_means.fit(X_selected)
    y_predict = k_means.predict(X_test)

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict, average_method='arithmetic')

    # calculate Silhouette score
    # The three internal metrics below fall back to NaN (and log a warning)
    # when the metric raises ValueError.
    try:
        sil = silhouette_score(X_test, y_predict, metric='euclidean')
    except ValueError:
        sil = float('nan')
        app_logger.warning('K-means lables are {0}; but y_predict are: {1}. Silhouette score requires predicts in 2 or more clusters.'.format(np.unique(k_means.labels_), np.unique(y_predict)), extra = LOGGER_EXTRA_OBJECT)

    # calculate Davies Bouldin
    try:
        db = davies_bouldin_score(X_test, y_predict)
    except ValueError:
        db = float('nan')
        app_logger.warning('K-means lables are {0}; but y_predict are: {1}. Davies Bouldin score requires predicts in 2 or more clusters.'.format(np.unique(k_means.labels_), np.unique(y_predict)), extra = LOGGER_EXTRA_OBJECT)

    # calculate Calinski Harabasz score
    try:
        ch = calinski_harabasz_score(X_test, y_predict)
    except ValueError:
        ch = float('nan')
        app_logger.warning('K-means lables are {0}; but y_predict are: {1}. Calinski Harabasz score requires predicts in 2 or more clusters.'.format(np.unique(k_means.labels_), np.unique(y_predict)), extra = LOGGER_EXTRA_OBJECT)

    # calculate Purity
    pur = purity(y, y_predict)

    return nmi, sil, db, ch, pur
'''
def _clustering_metrics(labels, X, digits): if X is None: SIL = None DB = None CH = None else: SIL = round(silhouette_score(X, labels),digits) DB = round(davies_bouldin_score(X, labels),digits) CH = round(calinski_harabasz_score(X, labels),digits) return SIL, DB, CH
def davies_bouldin(dataset_values: DatasetValues):
    """
    Davies, D. L.; Bouldin, D. W. (1979). A cluster separation measure.
    IEEE Trans. Pattern Anal. Mach. Intell., v.1, n.2, p.224–227.

    The objective is to minimize the value, which lies in [0, +Inf].

    Returns ``np.inf`` when ``dataset_values.K`` equals 1; otherwise returns
    ``davies_bouldin_score`` evaluated on ``dataset_values.data`` and
    ``dataset_values.cluster_labels``.
    """
    if dataset_values.K != 1:
        return davies_bouldin_score(dataset_values.data,
                                    dataset_values.cluster_labels)
    # K == 1 is mapped to the worst possible value of this minimised index.
    return np.inf
def test_davies_bouldin_score():
    """Check ``davies_bouldin_score`` on degenerate and hand-computed inputs.

    The expected values of the two general cases are asserted with
    ``== pytest.approx(expected)``.  A bare ``pytest.approx(value, expected)``
    statement only builds an object (treating ``expected`` as a relative
    tolerance) and never compares anything.
    """
    # Local import keeps this block self-contained.
    import warnings

    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case.
    # pytest.warns(None) is no longer accepted by recent pytest releases, so
    # the warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        davies_bouldin_score(X, labels)
    div_zero_warnings = [
        warning for warning in record
        if "divide by zero encountered" in str(warning.message)
    ]
    assert len(div_zero_warnings) == 0

    # General case - cluster have one sample
    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
def test_davies_bouldin_score():
    """Check ``davies_bouldin_score`` on degenerate and hand-computed inputs.

    The expected values of the two general cases are asserted with
    ``== pytest.approx(expected)``.  A bare ``pytest.approx(value, expected)``
    statement only builds an object (treating ``expected`` as a relative
    tolerance) and never compares anything.
    """
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
        0.0
    )

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score(
        [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
    ) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case:
    # any RuntimeWarning is promoted to an error inside this context.
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        davies_bouldin_score(X, labels)

    # General case - cluster have one sample
    X = [[0, 0], [2, 2], [3, 3], [5, 5]]
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
def test_davies_bouldin_score():
    """Check ``davies_bouldin_score`` on degenerate and hand-computed inputs.

    The expected values of the two general cases are asserted with
    ``== pytest.approx(expected)``.  A bare ``pytest.approx(value, expected)``
    statement only builds an object (treating ``expected`` as a relative
    tolerance) and never compares anything.
    """
    # Local import keeps this block self-contained.
    import warnings

    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case.
    # pytest.warns(None) is no longer accepted by recent pytest releases, so
    # the warnings are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        davies_bouldin_score(X, labels)
    div_zero_warnings = [
        warning for warning in record
        if "divide by zero encountered" in str(warning.message)
    ]
    assert len(div_zero_warnings) == 0

    # General case - cluster have one sample
    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
def test_davies_bouldin_score():
    """Check ``davies_bouldin_score`` on degenerate and hand-computed inputs.

    The expected values of the two general cases are asserted with
    ``== pytest.approx(expected)``.  A bare ``pytest.approx(value, expected)``
    statement only builds an object (treating ``expected`` as a relative
    tolerance) and never compares anything.
    """
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)

    # General case - cluster have one sample
    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
def __computeKmeansMetrics(self, data, predictedLabels, gsLabels, title, basePath, phase4Results):
    """Score one K-means labelling, append it to the results and dump matrices.

    Computes the Davies-Bouldin, adjusted Rand, completeness and purity scores
    of ``predictedLabels`` (the latter three against ``gsLabels``), joins them
    to ``phase4Results`` as a new column named ``title``, and writes two CSV
    files under ``basePath``: the output of ``clusteringMappingMetric`` and the
    plain confusion matrix.

    Returns:
        ``phase4Results`` joined with the new metrics column.
    """
    metrics = {
        "davies_bouldin_score": clusteringMetrics.davies_bouldin_score(data, predictedLabels),
        "adjusted_rand_score": clusteringMetrics.adjusted_rand_score(gsLabels, predictedLabels),
        "completeness_score": clusteringMetrics.completeness_score(gsLabels, predictedLabels),
        "purity_score": purity_score(gsLabels, predictedLabels),
    }

    mappedConfusion = clusteringMappingMetric(predictedLabels, gsLabels)
    plainConfusion = confusion_matrix(gsLabels, predictedLabels)

    # One column (named after `title`) with one row per metric.
    metricsColumn = pd.DataFrame.from_dict(metrics, orient='index', columns=[title])
    updatedResults = phase4Results.join(metricsColumn)

    mappingPath = basePath / f"{title}_kmeans_confusionMapping.csv"
    confusionPath = basePath / f"{title}_kmeans_confusion.csv"
    np.savetxt(mappingPath, mappedConfusion, delimiter=",", fmt='%i')
    np.savetxt(confusionPath, plainConfusion, delimiter=",", fmt='%i')

    return updatedResults
def computeMetrics(self, data, trueLabels, predictedLabels):
    """Score every algorithm's labelling against the data and the true labels.

    Args:
        data: samples passed to the Davies-Bouldin score.
        trueLabels: reference labels used by the label-agreement scores.
        predictedLabels: mapping of algorithm name -> predicted labels.

    Returns:
        A ``(metrics, confusionMatrixes)`` tuple of dicts, both keyed by
        algorithm name: ``metrics`` holds a dict of the four scores, and
        ``confusionMatrixes`` holds the output of ``clusteringMappingMetric``.
    """
    metrics = {}
    confusionMatrixes = {}

    for algorithmName, labels in predictedLabels.items():
        metrics[algorithmName] = {
            "davies_bouldin_score":
                clusteringMetrics.davies_bouldin_score(data, labels),
            "adjusted_rand_score":
                clusteringMetrics.adjusted_rand_score(trueLabels, labels),
            "completeness_score":
                clusteringMetrics.completeness_score(trueLabels, labels),
            "purity_score": purity_score(trueLabels, labels),
        }
        confusionMatrixes[algorithmName] = clusteringMappingMetric(
            labels, trueLabels)

    return metrics, confusionMatrixes
alg.fit(X) return alg # Time fit fit_time, kmeans = measure_function_time(kmeans_fit, X_train, params=params) train_predict = kmeans.predict(X_train) # Time predict predict_time, test_predict = measure_function_time(kmeans.predict, X_test, params=params) X_train_host = convert_to_numpy(X_train) train_predict_host = convert_to_numpy(train_predict) acc_train = davies_bouldin_score(X_train_host, train_predict_host) X_test_host = convert_to_numpy(X_test) test_predict_host = convert_to_numpy(test_predict) acc_test = davies_bouldin_score(X_test_host, test_predict_host) print_output(library='cuml', algorithm='kmeans', stages=['training', 'prediction'], columns=columns, params=params, functions=['KMeans.fit', 'KMeans.predict'], times=[fit_time, predict_time], accuracy_type='davies_bouldin_score', accuracies=[acc_train, acc_test],
def _eval_clustering(self, labels_true, labels_predicted): # To address when COP-KMeans fails to satisfy all constraints at a k: if labels_predicted is None: # return an empty dictionary to expose in the final output return {"nmi": None, "ami": None, "ari": None, "fms": None, "v_measure": None, "bcubed_precision": None, "bcubed_recall": None, "bcubed_fscore": None, "Silhouette": None, "Calinski_harabasz": None, "Davies_Bouldin": None } nmi = normalized_mutual_info_score(labels_true, labels_predicted, average_method="max") ami = adjusted_mutual_info_score(labels_true, labels_predicted, average_method="arithmetic") ari = adjusted_rand_score(labels_true, labels_predicted) v_measure = v_measure_score(labels_true, labels_predicted, beta=1.0) fms = fowlkes_mallows_score(labels_true, labels_predicted) # Reshape labels for BCubed measures true_dict = self._reshape_labels_as_dicts(labels_true) pred_dict = self._reshape_labels_as_dicts(labels_predicted) bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict) bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict) bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall) # ===================================================================== # Unsupervised Metrics # ===================================================================== if not labels_predicted.nunique() in (1, len(self.data)): sil = silhouette_score(X=self.data, labels=labels_predicted, metric=self.distance_metric, random_state=13712) ch = calinski_harabasz_score(X=self.data, labels=labels_predicted) dv = davies_bouldin_score(X=self.data, labels=labels_predicted) else: sil = None ch = None dv = None ret = {} ret.update({"nmi": round(nmi, 4), "ami": round(ami, 4), "ari": round(ari, 4), "fms": round(fms, 4), "v_measure": round(v_measure, 4), "bcubed_precision": round(bcubed_precision, 4), "bcubed_recall": round(bcubed_recall, 4), "bcubed_fscore": round(bcubed_f1, 4), "Silhouette": round(sil, 4 ) if sil is not None else None, 
"Calinski_harabasz": round(ch, 4 ) if ch is not None else None, "Davies_Bouldin": round(dv, 4 ) if dv is not None else None # Here goes the unsupervised indices }) return ret
def _cluster_constraint_based_estimate_k(self,
                                         constraints_size: float = 0.05,
                                         initialisation: str = "random",
                                         random_const: bool = True):
    """
    A controller that runs COP KMEANS and, when k is to be estimated, selects
    the best model over k = 2 .. len(self.data) - 1.

    Parameters
    ----------
    constraints_size : float
        Passed as ``prct`` to ``self._elicit_random_constraints`` when
        constraints are generated randomly.
    initialisation : str
        Passed as ``initialization`` to ``cop_kmeans``.
    random_const : bool
        True whether constraints are to be generated randomly; False if
        constraints are generated author-wise.

    Returns
    -------
    list
        If ``self.estimated_k`` is true: the clustering of the best COP
        KMEANS chosen via grid search, where "best" is the lowest
        ``Davies-Bouldin index * k`` (ties go to the smallest k); the chosen
        k is also appended to ``self.cand_k``.  Otherwise: the clustering
        produced by a single run with ``k=self.k``.

    Raises
    ------
    ValueError
        If k is being estimated and no candidate k produced a clustering.
    """
    # Generate the constraints once
    truth = self.true_labels.sort_index()
    if random_const:
        must_l, cant_l, _ = self._elicit_random_constraints(
            truth=truth, prct=constraints_size)
    else:
        must_l, cant_l, _ = self._elicit_author_based_const(truth=truth)

    # Fixed k: a single run is all that is needed.
    if not self.estimated_k:
        pred, _ = cop_kmeans(dataset=self.data.to_numpy(),
                             k=self.k,
                             ml=must_l,
                             cl=cant_l,
                             initialization=initialisation)
        return pred

    # k is to be estimated: grid search over the candidate values.
    stats = []
    for k in range(2, len(self.data)):
        try:
            pred, _ = cop_kmeans(dataset=self.data.to_numpy(),
                                 k=k,
                                 ml=must_l,
                                 cl=cant_l,
                                 initialization=initialisation)
            # A None prediction flags a failure to cluster at this k.
            if pred is not None:
                dbi = davies_bouldin_score(X=self.data, labels=pred)
                sil = silhouette_score(X=self.data, labels=pred)
                stats.append([k, dbi, sil, pred])
        except IndexError:
            print("\t\tGrid Search Early Termination: "
                  f"Attempting COP-KMEANS with k={k} was unworkable.")
            break

    # Without this guard the selection below fails with an opaque error.
    if not stats:
        raise ValueError(
            "COP-KMEANS produced no clustering for any candidate k.")

    # Detect the best k and the best clustering based on DB index
    df_stats = pd.DataFrame(stats, columns=["k", "dbi", "sil", "pred"])

    # Calculate the penalised score per each k
    df_stats["score"] = df_stats.dbi * df_stats.k

    # Pick the lowest db score
    best_record = df_stats[df_stats.score == df_stats.score.min()]

    # If more than one optimum is there,
    # opt for the smallest k as it is less likely to be an overfit
    best_record = best_record[best_record.k == best_record.k.min()]

    # Take the scalar explicitly: int() on a Series is deprecated in pandas.
    self.cand_k.append(int(best_record.k.iloc[0]))

    return best_record.pred.iloc[0]
def _db(X, labels, digits):
    """Return the Davies-Bouldin score of ``labels`` on ``X``, rounded to ``digits`` places."""
    score = davies_bouldin_score(X, labels)
    return round(score, digits)
help='The minimum number of samples required in a ' 'neighborhood to consider a point a core point') params = bench.parse_args(parser) # Load generated data X, _, _, _ = bench.load_data(params) # Create our clustering object dbscan = DBSCAN(eps=params.eps, min_samples=params.min_samples) # Time fit time, _ = bench.measure_function_time(dbscan.fit, X, params=params) labels = dbscan.labels_ X_host = bench.convert_to_numpy(X) labels_host = bench.convert_to_numpy(labels) acc = davies_bouldin_score(X_host, labels_host) params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0) bench.print_output(library='cuml', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], metrics=[acc], metric_type='davies_bouldin_score', data=[X], alg_instance=dbscan)
X, _, _, _ = bench.load_data(params, add_dtype=True) # Create our clustering object dbscan = DBSCAN(eps=params.eps, n_jobs=params.n_jobs, min_samples=params.min_samples, metric='euclidean', algorithm='auto') # N.B. algorithm='auto' will select DAAL's brute force method when running # daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched # scikit-learn. # Time fit time, _ = bench.measure_function_time(dbscan.fit, X, params=params) labels = dbscan.labels_ params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0) acc = davies_bouldin_score(X, labels) bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'], params=params, functions=['DBSCAN'], times=[time], accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X], alg_instance=dbscan)
n_features=2 #, cluster_std=1.03 , shuffle=True, random_state=123) # features scaling scaled_feature = feature_scaling(features) # Set k n_clusters = len(np.unique(target)) # Create K-means object k_means = KMeans(k=n_clusters, max_iter=100, plot_flag=True) # Fit predictions_fit = k_means.fit(scaled_feature) # Predict predictions_pre = k_means.predict(scaled_feature) from sklearn.metrics.cluster import adjusted_mutual_info_score \ , completeness_score, adjusted_rand_score, calinski_harabasz_score \ , davies_bouldin_score, contingency_matrix, silhouette_score print('adjusted_mutual_info_score:', adjusted_mutual_info_score(target, predictions_pre)) print('completeness_score:', completeness_score(target, predictions_pre)) print('adjusted_rand_score:', adjusted_rand_score(target, predictions_pre)) print('calinski_harabasz_score:', calinski_harabasz_score(scaled_feature, target)) print('davies_bouldin_score:', davies_bouldin_score(scaled_feature, target)) print('contingency_matrix:\n', contingency_matrix(target, predictions_pre)) print('silhouette_score:', silhouette_score(scaled_feature, target))