Example #1
import numpy as np
from sklearn.metrics import calinski_harabasz_score
# These helpers live in sklearn's test utilities (sklearn.utils._testing in
# recent releases, sklearn.utils.testing in older ones).
from sklearn.utils._testing import (assert_almost_equal, assert_equal,
                                    assert_raise_message)


def test_calinski_harabasz_score():
    rng = np.random.RandomState(seed=0)

    # Assert message when there is only one label
    assert_raise_message(ValueError,
                         "Number of labels is", calinski_harabasz_score,
                         rng.rand(10, 2), np.zeros(10))

    # Assert message when all points are in different clusters
    assert_raise_message(ValueError,
                         "Number of labels is", calinski_harabasz_score,
                         rng.rand(10, 2), np.arange(10))

    # Assert the value is 1. when all samples are equal
    assert_equal(1.,
                 calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5))

    # Assert the value is 0. when all cluster means are equal
    assert_equal(
        0.,
        calinski_harabasz_score([[-1, -1], [1, 1]] * 10, [0] * 10 + [1] * 10))

    # General case (with non-numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 + [[0, 4], [1, 3]] * 5 +
         [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert_almost_equal(calinski_harabasz_score(X, labels),
                        45 * (40 - 4) / (5 * (4 - 1)))
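
The expected value 45 * (40 - 4) / (5 * (4 - 1)) is just the Calinski-Harabasz
definition worked out by hand for this data: n = 40 samples, k = 4 clusters,
between-cluster dispersion 180 and within-cluster dispersion 20, so
CH = (180 / 3) / (20 / 36) = 108. A minimal sketch of that computation
(ch_by_hand is an illustrative helper, not part of the test suite):

import numpy as np

def ch_by_hand(X, labels):
    # Calinski-Harabasz index: (SSB / (k - 1)) / (SSW / (n - k)), where SSB is
    # the between-cluster and SSW the within-cluster sum of squared distances.
    X, labels = np.asarray(X, dtype=float), np.asarray(labels)
    n, k = len(X), len(np.unique(labels))
    overall_mean = X.mean(axis=0)
    ssb = ssw = 0.0
    for c in np.unique(labels):
        cluster = X[labels == c]
        mean = cluster.mean(axis=0)
        ssb += len(cluster) * ((mean - overall_mean) ** 2).sum()
        ssw += ((cluster - mean) ** 2).sum()
    return (ssb / (k - 1)) / (ssw / (n - k))

# ch_by_hand(X, labels) -> 108.0 == 45 * (40 - 4) / (5 * (4 - 1))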
Example #2
from sklearn.metrics import (silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score, v_measure_score,
                             fowlkes_mallows_score, homogeneity_score,
                             normalized_mutual_info_score,
                             adjusted_rand_score, completeness_score)


def get_clustering_metrics(train_data,
                           cluster_labels,
                           ground_truth_labels=None):
    clustering_metric_dict = {}
    # random_state only has an effect when silhouette_score subsamples the
    # data (i.e. when sample_size is passed)
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict[
        'calinski_harabasz_score'] = calinski_harabasz_score(
            train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)

    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict[
            'fowlkes_mallows_score'] = fowlkes_mallows_score(
                ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict[
            'normalized_mutual_info_score'] = normalized_mutual_info_score(
                ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)

    return clustering_metric_dict
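
A quick usage sketch on synthetic data (the make_blobs dataset and the KMeans
labels below are illustrative assumptions, not part of the original function):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y_true = make_blobs(n_samples=200, centers=3, random_state=42)
y_pred = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(X)
for name, value in get_clustering_metrics(X, y_pred, y_true).items():
    print(f'{name}: {value:.3f}')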
Example #3
import numpy as np
import pytest
from sklearn.metrics import calinski_harabasz_score
# The assert_raises_on_* helpers are defined alongside this test in
# sklearn's test module.


def test_calinski_harabasz_score():
    assert_raises_on_only_one_label(calinski_harabasz_score)

    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # Assert the value is 1. when all samples are equal
    assert 1. == calinski_harabasz_score(np.ones((10, 2)), [0] * 5 + [1] * 5)

    # Assert the value is 0. when all cluster means are equal
    assert 0. == calinski_harabasz_score([[-1, -1], [1, 1]] * 10,
                                         [0] * 10 + [1] * 10)

    # General case (with non-numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 + [[0, 4], [1, 3]] * 5 +
         [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # pytest.approx only works inside a comparison; a bare call asserts nothing
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
Example #4
def evaluation(X_selected, X_test, n_clusters, y):
    """
    This function calculates NMI, Silhouette, Davies-Bouldin,
    Calinski-Harabasz and Purity for the clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
            training data restricted to the selected features
    X_test: {numpy array}, shape (n_test_samples, n_selected_features)
            test data on which the fitted model is evaluated
    n_clusters: {int}
            number of clusters
    y: {numpy array}, shape (n_test_samples,)
            true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    sil: {float}
        Silhouette score
    db: {float}
        Davies-Bouldin score
    ch: {float}
        Calinski-Harabasz score
    pur: {float}
        Purity
    """
    # Note: precompute_distances and n_jobs were removed from KMeans in
    # scikit-learn 1.0; drop them when running against a recent release.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                     max_iter=300, tol=0.0001, precompute_distances=True,
                     verbose=0, random_state=None, copy_x=True, n_jobs=1)

    k_means.fit(X_selected)
    y_predict = k_means.predict(X_test)
    
    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict, average_method='arithmetic')

    # calculate Silhouette score
    try:
        sil = silhouette_score(X_test, y_predict, metric='euclidean')
    except ValueError:
        sil = float('nan')
        app_logger.warning(
            'K-means labels are {0}, but y_predict labels are {1}. The '
            'silhouette score requires predictions in 2 or more clusters.'
            .format(np.unique(k_means.labels_), np.unique(y_predict)),
            extra=LOGGER_EXTRA_OBJECT)

    # calculate Davies-Bouldin score
    try:
        db = davies_bouldin_score(X_test, y_predict)
    except ValueError:
        db = float('nan')
        app_logger.warning(
            'K-means labels are {0}, but y_predict labels are {1}. The '
            'Davies-Bouldin score requires predictions in 2 or more clusters.'
            .format(np.unique(k_means.labels_), np.unique(y_predict)),
            extra=LOGGER_EXTRA_OBJECT)

    # calculate Calinski-Harabasz score
    try:
        ch = calinski_harabasz_score(X_test, y_predict)
    except ValueError:
        ch = float('nan')
        app_logger.warning(
            'K-means labels are {0}, but y_predict labels are {1}. The '
            'Calinski-Harabasz score requires predictions in 2 or more '
            'clusters.'
            .format(np.unique(k_means.labels_), np.unique(y_predict)),
            extra=LOGGER_EXTRA_OBJECT)

    # calculate Purity
    pur = purity(y, y_predict)

    return nmi, sil, db, ch, pur

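evaluation() leans on three names defined elsewhere in the project: purity,
app_logger and LOGGER_EXTRA_OBJECT. A minimal stand-in setup that makes the
snippet runnable could look like the sketch below; purity follows the standard
contingency-matrix definition, and every name here is an assumption rather
than the project's actual code:

import logging

import numpy as np
from sklearn.metrics.cluster import contingency_matrix

app_logger = logging.getLogger(__name__)  # stand-in for the project logger
LOGGER_EXTRA_OBJECT = {}  # hypothetical `extra` fields for the log records


def purity(y_true, y_pred):
    # Assign each cluster its majority true class; purity is the fraction of
    # samples that land in their cluster's majority class.
    cm = contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(cm, axis=0)) / np.sum(cm)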
Example #5
from sklearn.metrics import calinski_harabasz_score


def calinski_harabasz(dataset_values: DatasetValues):
    """Calinski, T.; Harabasz, J. (1974). A dendrite method for cluster
    analysis. Communications in Statistics - Theory and Methods, v.3, n.1,
    p.1-27.
    The objective is to maximize the value, which lies in [0, +Inf)."""

    if dataset_values.K == 1:
        return 0

    return calinski_harabasz_score(dataset_values.data,
                                   dataset_values.cluster_labels)
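
The K == 1 guard exists because calinski_harabasz_score itself raises a
ValueError ("Number of labels is ...") when the labeling is degenerate, as the
tests in the earlier examples assert; this wrapper maps that case to 0
instead. A quick standalone check (plain arrays stand in for the
project-specific DatasetValues):

import numpy as np
from sklearn.metrics import calinski_harabasz_score

X = np.random.RandomState(0).rand(10, 2)
try:
    calinski_harabasz_score(X, np.zeros(10))  # a single cluster
except ValueError as exc:
    print(exc)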
Example #6
import numpy as np
import pytest
from sklearn.metrics import calinski_harabasz_score
# assert_equal lives in sklearn's test utilities (sklearn.utils._testing in
# recent releases, sklearn.utils.testing in older ones); the
# assert_raises_on_* helpers are defined alongside this test.
from sklearn.utils._testing import assert_equal


def test_calinski_harabasz_score():
    assert_raises_on_only_one_label(calinski_harabasz_score)

    assert_raises_on_all_points_same_cluster(calinski_harabasz_score)

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabasz_score(np.ones((10, 2)),
                                             [0] * 5 + [1] * 5))

    # Assert the value is 0. when all cluster means are equal
    assert_equal(0., calinski_harabasz_score([[-1, -1], [1, 1]] * 10,
                                             [0] * 10 + [1] * 10))

    # General case (with non-numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # pytest.approx only works inside a comparison; a bare call asserts nothing
    assert calinski_harabasz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
Example #7
def _clustering_metrics(labels, X, digits):
    if X is None:
        SIL = None
        DB = None
        CH = None
    else:
        SIL = round(silhouette_score(X, labels), digits)
        DB = round(davies_bouldin_score(X, labels), digits)
        CH = round(calinski_harabasz_score(X, labels), digits)

    return SIL, DB, CH
Example #8
    def _eval_clustering(self, labels_true, labels_predicted):
        # Handle the case where COP-KMeans fails to satisfy all constraints
        # at a given k:
        if labels_predicted is None:
            # return an empty dictionary to expose in the final output
            return {"nmi": None,
                    "ami": None,
                    "ari": None,
                    "fms": None,
                    "v_measure": None,
                    "bcubed_precision": None,
                    "bcubed_recall": None,
                    "bcubed_fscore": None,
                    "Silhouette": None,
                    "Calinski_harabasz": None,
                    "Davies_Bouldin": None
                    }

        nmi = normalized_mutual_info_score(labels_true,
                                           labels_predicted,
                                           average_method="max")

        ami = adjusted_mutual_info_score(labels_true,
                                         labels_predicted,
                                         average_method="arithmetic")

        ari = adjusted_rand_score(labels_true,
                                  labels_predicted)

        v_measure = v_measure_score(labels_true,
                                    labels_predicted,
                                    beta=1.0)

        fms = fowlkes_mallows_score(labels_true,
                                    labels_predicted)

        # Reshape labels for BCubed measures
        true_dict = self._reshape_labels_as_dicts(labels_true)
        pred_dict = self._reshape_labels_as_dicts(labels_predicted)

        bcubed_precision = bcubed.precision(cdict=pred_dict, ldict=true_dict)
        bcubed_recall = bcubed.recall(cdict=pred_dict, ldict=true_dict)
        bcubed_f1 = bcubed.fscore(bcubed_precision, bcubed_recall)

        # =====================================================================
        # Unsupervised Metrics
        # =====================================================================
        if labels_predicted.nunique() not in (1, len(self.data)):
            sil = silhouette_score(X=self.data,
                                   labels=labels_predicted,
                                   metric=self.distance_metric,
                                   random_state=13712)

            ch = calinski_harabasz_score(X=self.data, labels=labels_predicted)

            dv = davies_bouldin_score(X=self.data, labels=labels_predicted)
        else:
            sil = None
            ch = None
            dv = None

        ret = {}
        ret.update({"nmi": round(nmi, 4),
                    "ami": round(ami, 4),
                    "ari": round(ari, 4),
                    "fms": round(fms, 4),
                    "v_measure": round(v_measure, 4),
                    "bcubed_precision": round(bcubed_precision, 4),
                    "bcubed_recall": round(bcubed_recall, 4),
                    "bcubed_fscore": round(bcubed_f1, 4),
                    "Silhouette": round(sil, 4
                                        ) if sil is not None else None,
                    "Calinski_harabasz": round(ch, 4
                                               ) if ch is not None else None,
                    "Davies_Bouldin": round(dv, 4
                                            ) if dv is not None else None
                    # Here goes the unsupervised indices
                    })

        return ret
Example #9
# kmeans and scaled_df are defined in earlier notebook cells
y_km = kmeans.fit_predict(scaled_df)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_


# Calinski-Harabasz Index

from sklearn.metrics.cluster import calinski_harabasz_score
calinski_harabasz_score(scaled_df, labels)


kmeans = KMeans(n_clusters=5)
kmeans.fit(scaled_df)
print(kmeans.cluster_centers_)
X = kmeans.cluster_centers_
y_km = kmeans.fit_predict(scaled_df)
labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_


Example #10
        n_features=2,
        # cluster_std=1.03,
        shuffle=True,
        random_state=123)
    # feature scaling
    scaled_feature = feature_scaling(features)
    # Set k
    n_clusters = len(np.unique(target))
    # Create K-means object (note: a custom KMeans implementation is assumed
    # here; sklearn's KMeans has no `k` or `plot_flag` parameters)
    k_means = KMeans(k=n_clusters, max_iter=100, plot_flag=True)
    # Fit
    predictions_fit = k_means.fit(scaled_feature)
    # Predict
    predictions_pre = k_means.predict(scaled_feature)

    from sklearn.metrics.cluster import (adjusted_mutual_info_score,
                                         completeness_score,
                                         adjusted_rand_score,
                                         calinski_harabasz_score,
                                         davies_bouldin_score,
                                         contingency_matrix,
                                         silhouette_score)

    print('adjusted_mutual_info_score:',
          adjusted_mutual_info_score(target, predictions_pre))
    print('completeness_score:', completeness_score(target, predictions_pre))
    print('adjusted_rand_score:', adjusted_rand_score(target, predictions_pre))
    # Note: calinski_harabasz_score, davies_bouldin_score and silhouette_score
    # below are internal metrics computed here on the ground-truth partition
    # (target); pass predictions_pre instead to score the clustering itself.
    print('calinski_harabasz_score:',
          calinski_harabasz_score(scaled_feature, target))
    print('davies_bouldin_score:',
          davies_bouldin_score(scaled_feature, target))
    print('contingency_matrix:\n', contingency_matrix(target, predictions_pre))
    print('silhouette_score:', silhouette_score(scaled_feature, target))
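
For contrast, a minimal sklearn-only sketch of the same evaluation, scoring
the predicted labels rather than the ground-truth partition (the snippet above
uses a custom KMeans class; everything below, including the make_blobs setup,
is an assumption for illustration):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (adjusted_rand_score, calinski_harabasz_score,
                             davies_bouldin_score, silhouette_score)
from sklearn.preprocessing import StandardScaler

features, target = make_blobs(n_samples=300, n_features=2, shuffle=True,
                              random_state=123)
scaled = StandardScaler().fit_transform(features)
pred = KMeans(n_clusters=len(set(target)), n_init=10,
              random_state=123).fit_predict(scaled)

print('adjusted_rand_score:', adjusted_rand_score(target, pred))
print('calinski_harabasz_score:', calinski_harabasz_score(scaled, pred))
print('davies_bouldin_score:', davies_bouldin_score(scaled, pred))
print('silhouette_score:', silhouette_score(scaled, pred))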
Example #11
from sklearn.metrics import calinski_harabasz_score


def _ch(X, labels, digits):
    return round(calinski_harabasz_score(X, labels), digits)
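
Usage is then a one-liner once data and labels exist (the synthetic setup
below is an assumption for illustration):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
print(_ch(X, labels, digits=2))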