def test_on_trivial_input(inp):
    """Check both gap-based clusterers on the trivial input.

    The scenario targeted here is a single cluster made of a single point:
    wherever that point lies, each estimator is expected to report exactly
    the number of clusters carried by ``inp``.
    """
    # Only the expected cluster count and the points are needed here.
    _, n_clusters, _, pts = inp

    simple_gap = FirstSimpleGap().fit(pts)
    assert simple_gap.n_clusters_ == n_clusters

    histogram_gap = FirstHistogramGap().fit(pts)
    assert histogram_gap.n_clusters_ == n_clusters
def test_max_fraction_clusters(inp, max_frac):
    """Check that ``FirstSimpleGap`` and ``FirstHistogramGap`` respect the
    ``max_fraction`` constraint, if it is set.

    The number of clusters found by each estimator must not exceed
    ``floor(max_frac * n_points_per_cluster * n_clusters)``.
    """
    n_points_per_cluster, n_clusters, _, pts = inp
    # Largest admissible number of clusters for the given fraction
    # (the product is presumably the total number of points in ``pts``).
    upper_bound = np.floor(max_frac * n_points_per_cluster * n_clusters)

    # Same check for both estimators, in the same order as before.
    for estimator_cls in (FirstSimpleGap, FirstHistogramGap):
        clusterer = estimator_cls(max_fraction=max_frac)
        clusterer.fit_predict(pts)
        assert clusterer.n_clusters_ <= upper_bound
def test_firsthistogramgap(inp):
    """Check ``FirstHistogramGap`` on a multimodal distribution.

    With suitable parameters the estimator must find as many clusters as
    were generated (``n_clusters``), and every cluster must contain
    ``n_points_per_cluster`` points.
    """
    n_points_per_cluster, n_clusters, _, pts = inp

    clusterer = FirstHistogramGap(
        freq_threshold=0,
        max_fraction=1.,
        n_bins_start=5,
        affinity='euclidean',
        memory=None,
        linkage='single',
    )
    labels = clusterer.fit_predict(pts)
    found_labels, cluster_sizes = np.unique(labels, return_counts=True)

    # As many distinct labels as synthetic clusters.
    assert found_labels.shape[0] == n_clusters
    # Every cluster holds the expected number of points.
    assert_almost_equal(cluster_sizes, n_points_per_cluster)
def test_precomputed_distances(inp):
    """Verify that the clustering based on a distance matrix is the same as
    the clustering on points used to calculate that distance matrix.

    The two label vectors are compared as partitions of the row indices, so
    the check does not depend on how the clusters happen to be numbered.

    NOTE(review): this module defines a second ``test_precomputed_distances``
    further down. The later definition replaces this one at import time, so
    pytest never collects this test; one of the two should be renamed.
    """
    _, _, _, pts = inp
    dist_matrix = distance_matrix(pts, pts, p=2)

    # Identical settings for both estimators; only the affinity differs.
    shared_params = dict(freq_threshold=0, max_fraction=1., n_bins_start=5,
                         memory=None, linkage='single')

    fh_matrix = FirstHistogramGap(affinity='precomputed', **shared_params)
    preds_mat = fh_matrix.fit_predict(dist_matrix)

    fh = FirstHistogramGap(affinity='euclidean', **shared_params)
    preds = fh.fit_predict(pts)

    def get_partition_from_preds(labels):
        """From a vector of predictions (labels), get a set of frozensets,
        where each frozenset represents a cluster, and has the indices of
        rows of its elements."""
        # Labels are taken from the argument itself, so the helper gives a
        # true partition for whichever prediction vector it receives.
        return {frozenset(np.where(labels == c)[0])
                for c in np.unique(labels)}

    assert get_partition_from_preds(preds) == \
        get_partition_from_preds(preds_mat)
def test_precomputed_distances(inp):
    """Verify that the clustering based on ``distance_matrix`` is the same as
    the clustering on points, that were used to calculate that distance
    matrix.

    The two results are compared as partitions of the row indices rather
    than as raw label vectors: two identical clusterings may number their
    clusters differently, and that must not fail the test.

    NOTE(review): an earlier function in this module has the same name and
    is shadowed by this definition; one of the two should be renamed so
    that both tests are collected.
    """
    _, _, _, pts = inp
    dist_matrix = distance_matrix(pts, pts, p=2)

    fh_matrix = FirstHistogramGap(freq_threshold=0, max_fraction=None,
                                  n_bins_start=5, affinity='precomputed',
                                  memory=None, linkage='single')
    preds_mat = fh_matrix.fit_predict(dist_matrix)

    fh = FirstHistogramGap(freq_threshold=0, max_fraction=None,
                           n_bins_start=5, affinity='euclidean',
                           memory=None, linkage='single')
    preds = fh.fit_predict(pts)

    def _as_partition(labels):
        """Return the set of clusters in ``labels``, each one a frozenset of
        the row indices carrying the same label."""
        labels = np.asarray(labels)
        return {frozenset(np.where(labels == c)[0])
                for c in np.unique(labels)}

    # Equal label vectors always give equal partitions, so this accepts
    # everything the former element-wise label comparison accepted.
    assert _as_partition(preds) == _as_partition(preds_mat)