Ejemplo n.º 1
0
def test_hdbscan_approximate_predict_score():
    """Check the error, warning and success paths of ``approximate_predict_scores``.

    Steps, in order:

    1. A clusterer fitted without prediction data must raise ``ValueError``.
    2. After prediction data is generated, a query point whose width differs
       from the one used below for valid queries must raise ``ValueError``.
    3. Scoring a valid point must emit the "no defined clusters" warning
       (presumably ``min_cluster_size=200`` leaves ``X`` without clusters --
       confirm against the fixture).
    4. For a clusterer fitted with ``prediction_data=True``, scoring the
       training data must reproduce ``outlier_scores_`` and stay in [0, 1].
    """
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)
    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)
    clusterer.generate_prediction_data()
    # wrong dimensions error
    assert_raises(ValueError, approximate_predict_scores, clusterer, np.array([[1, 2, 3]]))

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning regardless of the filters active in the test
        # session; otherwise a suppressed warning leaves ``caught`` empty.
        warnings.simplefilter("always")
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))
    # no clusters warning -- search all recorded warnings so an unrelated
    # warning emitted afterwards cannot hide the expected one.
    expected = 'Clusterer does not have any defined clusters'
    assert any(expected in str(item.message) for item in caught)

    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1
Ejemplo n.º 2
0
    def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
                min_cluster_size, allow_noise):
        """Cluster the vectors with HDBSCAN and score the result.

        Args:
            dim_reduced_vecs: Data passed to ``HDBSCAN.fit`` (euclidean metric).
            outlier_labels: Reference labels compared against both the
                cluster assignment and the outlier prediction.
            scores: Mapping updated in place with ``cluster_n``,
                ``homogeneity``, ``completeness`` and ``v_measure`` and then
                handed to ``get_scores``.
            contamination: Not used by this method.
            min_cluster_size: Forwarded to ``HDBSCAN``.
            allow_noise: When true, use ``clusterer.labels_`` as the cluster
                assignment; otherwise take the arg-max of the membership
                vectors.

        Returns:
            A tuple ``(scores, outlier_scores)`` where ``scores`` is the value
            returned by ``get_scores`` and ``outlier_scores`` is
            ``clusterer.outlier_scores_``.
        """
        print("Clustering ...")
        clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                            prediction_data=True,
                            metric="euclidean").fit(dim_reduced_vecs)
        print("Get prediction data ...")
        # NOTE(review): prediction_data=True is already passed above, so this
        # explicit call looks redundant -- confirm before removing.
        clusterer.generate_prediction_data()

        if allow_noise:
            cluster_pred = clusterer.labels_
        else:
            try:
                # NOTE(review): the first membership column is dropped before
                # the arg-max -- confirm that skipping it is intended.
                cluster_pred = np.argmax(
                    all_points_membership_vectors(clusterer)[:, 1:], axis=1)
            except IndexError:
                print(
                    "Got IndexError and will not enforce cluster membership (allow noise) ..."
                )
                print(all_points_membership_vectors(clusterer))
                cluster_pred = clusterer.labels_

        # scoring
        print("Get scores ...")

        # GLOSH: points whose outlier score exceeds the 0.9 quantile are
        # marked -1, all others 1.
        # NOTE(review): the quantile is hard-coded while ``contamination`` is
        # unused -- verify whether it was meant to drive this threshold.
        threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
        outlier_pred = np.where(clusterer.outlier_scores_ > threshold, -1, 1)

        # Computed once and reused for both the score mapping and the log line.
        cluster_n = len(np.unique(clusterer.labels_))
        homogeneity = homogeneity_score(outlier_labels, cluster_pred)

        scores["cluster_n"] = cluster_n
        scores["homogeneity"] = homogeneity
        scores["completeness"] = completeness_score(outlier_labels,
                                                    cluster_pred)
        scores["v_measure"] = v_measure_score(outlier_labels, cluster_pred)

        scores = get_scores(scores, outlier_labels, outlier_pred)

        # The two fields are separated by 18 spaces to keep the log line's
        # existing layout.
        padding = " " * 18
        print(
            f"Homogeneity - {homogeneity*100:.1f}{padding}cluster_n - {cluster_n}")

        return scores, clusterer.outlier_scores_