Example #1
0
def main():
    """Benchmark scikit-learn KMeans fit/predict and print the results.

    Configuration is read from the module-level ``params`` object and the
    ``bench`` helpers are used to load the data, time the calls and print the
    report. Clustering quality is reported as the Davies-Bouldin score of the
    predictions on the training set and on the test set.

    The initial centroids are chosen from ``params.filei``:

    * ``'k-means++'``: let KMeans use its k-means++ initialisation;
    * any other non-None value: path of a NumPy file holding the centroids;
      ``params.n_clusters`` is then overwritten with the number of rows;
    * ``None``: ``params.n_clusters`` rows of the training data drawn with
      ``np.random.randint`` after seeding with ``params.seed``.

    Raises
    ------
    ValueError
        If ``params.filei`` is an ``.npz`` archive that does not contain
        exactly one array, because it is then ambiguous which array holds
        the centroids.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load and convert generated data
    X_train, X_test, _, _ = bench.load_data(params)

    X_init: Any
    if params.filei == 'k-means++':
        X_init = 'k-means++'
    # Load initial centroids from specified path
    elif params.filei is not None:
        loaded = np.load(params.filei)
        if isinstance(loaded, np.ndarray):
            # Plain ``.npy`` file: the array itself holds the centroids.
            X_init = loaded
        else:
            # ``.npz`` archive: KMeans needs a single array, not a mapping of
            # arrays, so only an unambiguous one-array archive is accepted.
            array_names = list(loaded.files)
            if len(array_names) != 1:
                raise ValueError(
                    'Expected exactly one array of initial centroids in '
                    f'{params.filei!r}, found {len(array_names)}')
            X_init = loaded[array_names[0]]
        X_init = X_init.astype(params.dtype)
        # The number of clusters is dictated by the supplied centroids.
        params.n_clusters = X_init.shape[0]
    # or choose random centroids from training data
    else:
        np.random.seed(params.seed)
        centroids_idx = np.random.randint(low=0, high=X_train.shape[0],
                                          size=params.n_clusters)
        # Objects exposing ``iloc`` (e.g. pandas containers) need positional
        # indexing; anything else is indexed directly.
        if hasattr(X_train, "iloc"):
            X_init = X_train.iloc[centroids_idx].values
        else:
            X_init = X_train[centroids_idx]

    def fit_kmeans(X, X_init):
        # n_init=1: the initialisation is fixed above, so a single run is used.
        alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
                     max_iter=params.maxiter, init=X_init, n_init=1)
        alg.fit(X)
        return alg

    # Time fit
    fit_time, kmeans = bench.measure_function_time(fit_kmeans, X_train,
                                                   X_init, params=params)

    train_predict = kmeans.predict(X_train)
    acc_train = davies_bouldin_score(X_train, train_predict)

    # Time predict
    predict_time, test_predict = bench.measure_function_time(
        kmeans.predict, X_test, params=params)

    acc_test = davies_bouldin_score(X_test, test_predict)

    bench.print_output(library='sklearn', algorithm='kmeans',
                       stages=['training', 'prediction'],
                       params=params, functions=['KMeans.fit', 'KMeans.predict'],
                       times=[fit_time, predict_time],
                       accuracy_type='davies_bouldin_score',
                       accuracies=[acc_train, acc_test], data=[X_train, X_test],
                       alg_instance=kmeans)
Example #2
0
def get_clustering_metrics(train_data,
                           cluster_labels,
                           ground_truth_labels=None):
    """Collect clustering quality scores in a dict keyed by metric name.

    The three internal indices (silhouette, Calinski-Harabasz and
    Davies-Bouldin) are always computed from ``train_data`` and
    ``cluster_labels``. When ``ground_truth_labels`` is given, six
    label-agreement scores comparing it with ``cluster_labels`` are added.
    """
    scores = {
        'silhouette_score': silhouette_score(
            train_data, cluster_labels, random_state=42),
        'calinski_harabasz_score': calinski_harabasz_score(
            train_data, cluster_labels),
        'davies_bouldin_score': davies_bouldin_score(
            train_data, cluster_labels),
    }

    # Without reference labels only the internal indices can be reported.
    if ground_truth_labels is None:
        return scores

    # Every scorer below takes (reference labels, predicted labels).
    label_agreement_scorers = (
        ('v_measure_score', v_measure_score),
        ('fowlkes_mallows_score', fowlkes_mallows_score),
        ('homogeneity_score', homogeneity_score),
        ('normalized_mutual_info_score', normalized_mutual_info_score),
        ('adjusted_rand_score', adjusted_rand_score),
        ('completeness_score', completeness_score),
    )
    for metric_name, scorer in label_agreement_scorers:
        scores[metric_name] = scorer(ground_truth_labels, cluster_labels)

    return scores
Example #3
0
def main():
    """Benchmark scikit-learn DBSCAN training and print the results.

    Configuration is read from the module-level ``params`` object. After the
    timed fit, the number of clusters found is written to
    ``params.n_clusters`` and the Davies-Bouldin score of the labelling is
    reported as the accuracy figure.
    """
    from sklearn.cluster import DBSCAN
    from sklearn.metrics.cluster import davies_bouldin_score

    # Load generated data; only the first of the four returned items is used
    X, _, _, _ = bench.load_data(params, add_dtype=True)

    # N.B. algorithm='auto' will select oneAPI Data Analytics Library (oneDAL)
    # brute force method when running daal4py-patched scikit-learn, and probably
    #  'kdtree' when running unpatched scikit-learn.
    estimator_options = dict(eps=params.eps, n_jobs=params.n_jobs,
                             min_samples=params.min_samples,
                             metric='euclidean', algorithm='auto')
    dbscan = DBSCAN(**estimator_options)

    # Time fit
    fit_time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
    labels = dbscan.labels_

    # The label -1 is not counted as a cluster.
    distinct_label_count = len(set(labels))
    noise_label_count = 1 if -1 in labels else 0
    params.n_clusters = distinct_label_count - noise_label_count

    db_score = davies_bouldin_score(X, labels)

    bench.print_output(library='sklearn', algorithm='dbscan', stages=['training'],
                       params=params, functions=['DBSCAN'], times=[fit_time],
                       accuracies=[db_score],
                       accuracy_type='davies_bouldin_score',
                       data=[X], alg_instance=dbscan)
Example #4
0
def evaluation(X_selected, X_test, n_clusters, y):
    """
    This function fits K-means on X_selected, predicts cluster labels for
    X_test and scores those predictions.

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
            input data on the selected features, used to fit K-means
    X_test: {numpy array}
            data whose cluster labels are predicted and scored
            (presumably restricted to the same selected features as
            X_selected; confirm against the caller)
    n_clusters: {int}
            number of clusters
    y: {numpy array}
            true labels, compared with the labels predicted for X_test

    Output
    ------
    nmi: {float}
        Normalized Mutual Information (arithmetic averaging)
    sil: {float}
        Silhouette score (euclidean), NaN when it raises ValueError
    db: {float}
        Davies Bouldin score, NaN when it raises ValueError
    ch: {float}
        Calinski Harabasz score, NaN when it raises ValueError
    pur:
        value returned by purity(y, y_predict)
    """
    # `precompute_distances` and `n_jobs` are intentionally not passed: both
    # were removed from KMeans in scikit-learn 1.0 and passing them raises a
    # TypeError there. Neither affects the resulting clustering.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, verbose=0, random_state=None, copy_x=True)

    k_means.fit(X_selected)
    y_predict = k_means.predict(X_test)

    def _score_or_nan(metric_name, score_fn, **score_kwargs):
        # Score the X_test predictions; fall back to NaN (and log why) when
        # the metric rejects them, e.g. all predictions in a single cluster.
        try:
            return score_fn(X_test, y_predict, **score_kwargs)
        except ValueError:
            app_logger.warning(
                'K-means lables are {0}; but y_predict are: {1}. {2} score requires predicts in 2 or more clusters.'.format(
                    np.unique(k_means.labels_), np.unique(y_predict), metric_name),
                extra=LOGGER_EXTRA_OBJECT)
            return float('nan')

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict, average_method='arithmetic')

    # calculate Silhouette score
    sil = _score_or_nan('Silhouette', silhouette_score, metric='euclidean')

    # calculate Davies Bouldin
    db = _score_or_nan('Davies Bouldin', davies_bouldin_score)

    # calculate Calinski Harabasz score
    ch = _score_or_nan('Calinski Harabasz', calinski_harabasz_score)

    # calculate Purity
    pur = purity(y, y_predict)

    return nmi, sil, db, ch, pur

    '''
Example #5
0
def _clustering_metrics(labels, X, digits):
    if X is None:
        SIL = None
        DB = None
        CH = None
    else:
        SIL = round(silhouette_score(X, labels),digits)
        DB = round(davies_bouldin_score(X, labels),digits)
        CH = round(calinski_harabasz_score(X, labels),digits)

    return SIL, DB, CH
Example #6
0
def davies_bouldin(dataset_values: DatasetValues):
    """
   Davies, D. L.; Bouldin, D. W. (1979). A cluster separation measure. IEEE
   Trans. Pattern Anal. Mach. Intell., v.1, n.2, p.224-227.

   The objective is to minimize the value, which lies in [0, +Inf].
   When ``dataset_values.K`` is 1 the worst value, +Inf, is returned without
   calling ``davies_bouldin_score``.
    """
    has_single_cluster = dataset_values.K == 1
    if not has_single_cluster:
        return davies_bouldin_score(dataset_values.data,
                                    dataset_values.cluster_labels)

    return np.inf
def test_davies_bouldin_score():
    """Check davies_bouldin_score on degenerate and hand-computed inputs."""
    # Local import: only the divide-by-zero check below needs it.
    import warnings

    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # The comparison must be asserted: a bare ``pytest.approx(a, b)`` only
    # builds an approx object (with ``b`` as tolerance) and checks nothing.
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case.
    # ``pytest.warns(None)`` is no longer accepted by pytest, so the warnings
    # are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        davies_bouldin_score(X, labels)
    div_zero_warnings = [
        warning for warning in record
        if "divide by zero encountered" in str(warning.message)
    ]
    assert len(div_zero_warnings) == 0

    # General case - cluster have one sample
    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
def test_davies_bouldin_score():
    """Check davies_bouldin_score on degenerate and hand-computed inputs."""
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)), [0] * 5 + [1] * 5) == pytest.approx(
        0.0
    )

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score(
        [[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10
    ) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = (
        [[0, 0], [1, 1]] * 5
        + [[3, 3], [4, 4]] * 5
        + [[0, 4], [1, 3]] * 5
        + [[3, 1], [4, 0]] * 5
    )
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # The comparison must be asserted: a bare ``pytest.approx(a, b)`` only
    # builds an approx object (with ``b`` as tolerance) and checks nothing.
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case:
    # any RuntimeWarning is turned into an error inside this context.
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        davies_bouldin_score(X, labels)

    # General case - cluster have one sample
    X = [[0, 0], [2, 2], [3, 3], [5, 5]]
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5.0 / 4) / 3)
Example #9
0
def test_davies_bouldin_score():
    """Check davies_bouldin_score on degenerate and hand-computed inputs."""
    # Local import: only the divide-by-zero check below needs it.
    import warnings

    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # The comparison must be asserted: a bare ``pytest.approx(a, b)`` only
    # builds an approx object (with ``b`` as tolerance) and checks nothing.
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)

    # Ensure divide by zero warning is not raised in general case.
    # ``pytest.warns(None)`` is no longer accepted by pytest, so the warnings
    # are recorded with the standard library instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        davies_bouldin_score(X, labels)
    div_zero_warnings = [
        warning for warning in record
        if "divide by zero encountered" in str(warning.message)
    ]
    assert len(div_zero_warnings) == 0

    # General case - cluster have one sample
    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
def test_davies_bouldin_score():
    """Check davies_bouldin_score on degenerate and hand-computed inputs."""
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equals
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the mean cluster are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # The comparison must be asserted: a bare ``pytest.approx(a, b)`` only
    # builds an approx object (with ``b`` as tolerance) and checks nothing.
    assert davies_bouldin_score(X, labels) == pytest.approx(
        2 * np.sqrt(0.5) / 3)

    # General case - cluster have one sample
    X = ([[0, 0], [2, 2], [3, 3], [5, 5]])
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
Example #11
0
    def __computeKmeansMetrics(self, data, predictedLabels, gsLabels, title, basePath, phase4Results):
        """Score a K-means labelling, add the scores as a column and save the confusion matrices.

        The four scores are put in a one-column DataFrame named ``title`` that
        is joined onto ``phase4Results``; the joined frame is returned. Two CSV
        files (the mapped confusion matrix and the plain confusion matrix) are
        written under ``basePath``, which must support the ``/`` operator.
        """
        scores = {
            "davies_bouldin_score": clusteringMetrics.davies_bouldin_score(data, predictedLabels),
            "adjusted_rand_score": clusteringMetrics.adjusted_rand_score(gsLabels, predictedLabels),
            "completeness_score": clusteringMetrics.completeness_score(gsLabels, predictedLabels),
            "purity_score": purity_score(gsLabels, predictedLabels),
        }
        mappedConfusion = clusteringMappingMetric(predictedLabels, gsLabels)
        plainConfusion = confusion_matrix(gsLabels, predictedLabels)

        # One row per metric, one column named after this run.
        scoreColumn = pd.DataFrame.from_dict(scores, orient='index', columns=[title])
        phase4Results = phase4Results.join(scoreColumn)

        mappedPath = basePath / f"{title}_kmeans_confusionMapping.csv"
        plainPath = basePath / f"{title}_kmeans_confusion.csv"
        np.savetxt(mappedPath, mappedConfusion, delimiter=",", fmt='%i')
        np.savetxt(plainPath, plainConfusion, delimiter=",", fmt='%i')
        return phase4Results
Example #12
0
 def computeMetrics(self, data, trueLabels, predictedLabels):
     """Score every algorithm's labelling against the data and the true labels.

     ``predictedLabels`` maps an algorithm name to its predicted labels.
     Returns a pair ``(metrics, confusionMatrixes)``: the first maps each
     algorithm name to a dict of its four scores, the second maps it to the
     result of ``clusteringMappingMetric(labels, trueLabels)``.
     """
     scoresByAlgorithm = {}
     mappingByAlgorithm = {}
     for name, algoLabels in predictedLabels.items():
         scoresByAlgorithm[name] = {
             "davies_bouldin_score":
                 clusteringMetrics.davies_bouldin_score(data, algoLabels),
             "adjusted_rand_score":
                 clusteringMetrics.adjusted_rand_score(trueLabels, algoLabels),
             "completeness_score":
                 clusteringMetrics.completeness_score(trueLabels, algoLabels),
             "purity_score": purity_score(trueLabels, algoLabels),
         }
         mappingByAlgorithm[name] = clusteringMappingMetric(
             algoLabels, trueLabels)
     return scoresByAlgorithm, mappingByAlgorithm
Example #13
0
    alg.fit(X)
    return alg


# Time fit: the timing helper hands back the elapsed time together with the
# estimator returned by kmeans_fit
fit_time, kmeans = measure_function_time(kmeans_fit, X_train, params=params)
# Untimed prediction on the training data, only needed for the training score
train_predict = kmeans.predict(X_train)

# Time predict
predict_time, test_predict = measure_function_time(kmeans.predict,
                                                   X_test,
                                                   params=params)

# Score the training clustering on the converted copies of data and labels
# (presumably convert_to_numpy moves device arrays to host NumPy arrays;
# confirm against its definition)
X_train_host = convert_to_numpy(X_train)
train_predict_host = convert_to_numpy(train_predict)
acc_train = davies_bouldin_score(X_train_host, train_predict_host)

# Same conversion and scoring for the test predictions
X_test_host = convert_to_numpy(X_test)
test_predict_host = convert_to_numpy(test_predict)

acc_test = davies_bouldin_score(X_test_host, test_predict_host)

print_output(library='cuml',
             algorithm='kmeans',
             stages=['training', 'prediction'],
             columns=columns,
             params=params,
             functions=['KMeans.fit', 'KMeans.predict'],
             times=[fit_time, predict_time],
             accuracy_type='davies_bouldin_score',
             accuracies=[acc_train, acc_test],
    def _eval_clustering(self, labels_true, labels_predicted):
        # To address when COP-KMeans fails to satisfy all constraints at a k:
        if labels_predicted is None:
            # return an empty dictionary to expose in the final output
            return {"nmi": None,
                    "ami": None,
                    "ari": None,
                    "fms": None,
                    "v_measure": None,
                    "bcubed_precision": None,
                    "bcubed_recall": None,
                    "bcubed_fscore": None,
                    "Silhouette": None,
                    "Calinski_harabasz": None,
                    "Davies_Bouldin": None
                    }

        nmi = normalized_mutual_info_score(labels_true,
                                           labels_predicted,
                                           average_method="max")

        ami = adjusted_mutual_info_score(labels_true,
                                         labels_predicted,
                                         average_method="arithmetic")

        ari = adjusted_rand_score(labels_true,
                                  labels_predicted)

        v_measure = v_measure_score(labels_true,
                                    labels_predicted,
                                    beta=1.0)

        fms = fowlkes_mallows_score(labels_true,
                                    labels_predicted)

        # Reshape labels for BCubed measures
        true_dict = self._reshape_labels_as_dicts(labels_true)
        pred_dict = self._reshape_labels_as_dicts(labels_predicted)

        bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
        bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
        bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

        # =====================================================================
        # Unsupervised Metrics
        # =====================================================================
        if not labels_predicted.nunique() in (1, len(self.data)):
            sil = silhouette_score(X=self.data,
                                   labels=labels_predicted,
                                   metric=self.distance_metric,
                                   random_state=13712)

            ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)

            dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
        else:
            sil = None
            ch = None
            dv = None

        ret = {}
        ret.update({"nmi": round(nmi, 4),
                    "ami": round(ami, 4),
                    "ari": round(ari, 4),
                    "fms": round(fms, 4),
                    "v_measure": round(v_measure, 4),
                    "bcubed_precision": round(bcubed_precision, 4),
                    "bcubed_recall": round(bcubed_recall, 4),
                    "bcubed_fscore": round(bcubed_f1, 4),
                    "Silhouette": round(sil, 4
                                        ) if sil is not None else None,
                    "Calinski_harabasz": round(ch, 4
                                               ) if ch is not None else None,
                    "Davies_Bouldin": round(dv, 4
                                            ) if dv is not None else None
                    # Here goes the unsupervised indices
                    })

        return ret
    def _cluster_constraint_based_estimate_k(self,
                                             constraints_size: float = 0.05,
                                             initialisation: str = "random",
                                             random_const: bool = True):
        """
        A controller that runs COP KMEANS n-2 times and selects the best model.

        When ``self.estimated_k`` is truthy, COP-KMEANS is run for every k in
        ``range(2, len(self.data))``. Each k that yields a clustering is scored
        with the Davies-Bouldin index multiplied by k; the lowest score wins
        and ties are resolved in favour of the smallest k. The chosen k is
        appended to ``self.cand_k``. Otherwise COP-KMEANS is run once with
        ``self.k``.

        Parameters
        ----------
        constraints_size : float
            Passed as ``prct`` to ``_elicit_random_constraints``; only used
            when ``random_const`` is True.
        initialisation : str
            Passed as ``initialization`` to ``cop_kmeans``.
        random_const : bool
            True whether constraints are to be generated randomly;
            False if constraints are generated author-wise.

        Returns
        -------
        list or None
            The clustering of the best COP KMEANS chosen via grid search, or
            the single clustering obtained with ``self.k``. None when no
            clustering could be produced.

        """
        # Generate the constraints once
        truth = self.true_labels.sort_index()
        if random_const:
            must_l, cant_l, _ = self._elicit_random_constraints(
                truth=truth,
                prct=constraints_size)
        else:
            must_l, cant_l, _ = self._elicit_author_based_const(
                truth=truth)

        # If k is to be estimated:
        if self.estimated_k:
            stats = []

            for k in range(2, len(self.data)):
                try:
                    pred, _ = cop_kmeans(dataset=self.data.to_numpy(),
                                         k=k,
                                         ml=must_l,
                                         cl=cant_l,
                                         initialization=initialisation)
                    # Calculate DB index
                    if pred is not None:  # Flagging a failure to cluster
                        dbi = davies_bouldin_score(X=self.data, labels=pred)
                        sil = silhouette_score(X=self.data, labels=pred)
                        stats.append([k, dbi, sil, pred])
                except IndexError:
                    print("\t\tGrid Search Early Termination: "
                          f"Attempting COP-KMEANS with k={k} was unworkable.")
                    break

            # No k produced a clustering: report the failure the same way a
            # single failed COP-KMEANS run does, instead of crashing on an
            # empty table below.
            if not stats:
                return None

            # Detect the best k and the best clustering based on DB index
            df_stats = pd.DataFrame(stats, columns=["k", "dbi", "sil", "pred"])
            # Calculate the penalised score per each k
            df_stats["score"] = df_stats.dbi * df_stats.k
            # Pick the lowest db score
            best_record = df_stats[df_stats.score == df_stats.score.min()]

            # If more than one optimum is there,
            # opt for the smallest k as it is less likely to be an overfit
            best_record = best_record[best_record.k == best_record.k.min()]
            # Take the scalar explicitly: calling int() on a one-element
            # Series is deprecated in pandas.
            self.cand_k.append(int(best_record.k.iloc[0]))

            return best_record.pred.tolist()[0]
        else:
            pred, _ = cop_kmeans(dataset=self.data.to_numpy(),
                                 k=self.k,
                                 ml=must_l,
                                 cl=cant_l,
                                 initialization=initialisation)
            return pred
Example #16
0
def _db(X, labels, digits):
    """Return the Davies-Bouldin score of ``labels`` on ``X``, rounded with ``round(score, digits)``."""
    score = davies_bouldin_score(X, labels)
    return round(score, digits)
Example #17
0
                    help='The minimum number of samples required in a '
                    'neighborhood to consider a point a core point')
# Benchmark script body: time DBSCAN.fit on the loaded data and report the
# Davies-Bouldin score of the resulting labelling.
params = bench.parse_args(parser)

# Load generated data (only the first of the four returned items is used)
X, _, _, _ = bench.load_data(params)

# Create our clustering object
dbscan = DBSCAN(eps=params.eps, min_samples=params.min_samples)

# Time fit
time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
labels = dbscan.labels_

# The score is computed on converted copies of the data and labels
# (presumably convert_to_numpy moves device arrays to host NumPy arrays;
# confirm against its definition)
X_host = bench.convert_to_numpy(X)
labels_host = bench.convert_to_numpy(labels)

# NOTE(review): davies_bouldin_score is expected to raise when fewer than two
# distinct labels are found; confirm that is acceptable for this benchmark.
acc = davies_bouldin_score(X_host, labels_host)
# The label -1 is not counted as a cluster
params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0)

bench.print_output(library='cuml',
                   algorithm='dbscan',
                   stages=['training'],
                   params=params,
                   functions=['DBSCAN'],
                   times=[time],
                   metrics=[acc],
                   metric_type='davies_bouldin_score',
                   data=[X],
                   alg_instance=dbscan)
Example #18
0
# Benchmark script body: time DBSCAN.fit on the loaded data and report the
# Davies-Bouldin score of the resulting labelling.
# Only the first of the four items returned by the loader is used.
X, _, _, _ = bench.load_data(params, add_dtype=True)

# Create our clustering object
dbscan = DBSCAN(eps=params.eps,
                n_jobs=params.n_jobs,
                min_samples=params.min_samples,
                metric='euclidean',
                algorithm='auto')

# N.B. algorithm='auto' will select DAAL's brute force method when running
# daal4py-patched scikit-learn, and probably 'kdtree' when running unpatched
# scikit-learn.

# Time fit
time, _ = bench.measure_function_time(dbscan.fit, X, params=params)
labels = dbscan.labels_

# The label -1 is not counted as a cluster
params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
# NOTE(review): davies_bouldin_score is expected to raise when fewer than two
# distinct labels are found; confirm that is acceptable for this benchmark.
acc = davies_bouldin_score(X, labels)

bench.print_output(library='sklearn',
                   algorithm='dbscan',
                   stages=['training'],
                   params=params,
                   functions=['DBSCAN'],
                   times=[time],
                   accuracies=[acc],
                   accuracy_type='davies_bouldin_score',
                   data=[X],
                   alg_instance=dbscan)
Example #19
0
        n_features=2
        #, cluster_std=1.03
        ,
        shuffle=True,
        random_state=123)
    # features scaling
    scaled_feature = feature_scaling(features)
    # Set k
    n_clusters = len(np.unique(target))
    # Create K-means object
    k_means = KMeans(k=n_clusters, max_iter=100, plot_flag=True)
    # Fit
    predictions_fit = k_means.fit(scaled_feature)
    # Predict
    predictions_pre = k_means.predict(scaled_feature)

    from sklearn.metrics.cluster import adjusted_mutual_info_score \
        , completeness_score, adjusted_rand_score, calinski_harabasz_score \
        , davies_bouldin_score, contingency_matrix, silhouette_score

    print('adjusted_mutual_info_score:',
          adjusted_mutual_info_score(target, predictions_pre))
    print('completeness_score:', completeness_score(target, predictions_pre))
    print('adjusted_rand_score:', adjusted_rand_score(target, predictions_pre))
    print('calinski_harabasz_score:',
          calinski_harabasz_score(scaled_feature, target))
    print('davies_bouldin_score:',
          davies_bouldin_score(scaled_feature, target))
    print('contingency_matrix:\n', contingency_matrix(target, predictions_pre))
    print('silhouette_score:', silhouette_score(scaled_feature, target))