def test_shc_validation():
    """Check that invalid SHC configurations raise ``ValueError``.

    Three configurations are tried against a raw feature matrix:

    * ``n_clusters`` one larger than the number of samples,
    * a negative ``n_clusters``,
    * ``scoring_data="affinity"`` with every other parameter left at its
      default.
    """
    X, _ = generate_data(supervised=False, affinity=False)

    invalid_configurations = (
        {"n_clusters": len(X) + 1},
        {"n_clusters": -1},
        {"scoring_data": "affinity"},
    )
    for params in invalid_configurations:
        # Construction stays inside the context manager so the error is
        # accepted whether it is raised by the constructor or by fitting.
        with pytest.raises(ValueError):
            ScipyHierarchicalClustering(**params).fit_predict(X)
def test_shc_custom_affinity():
    """Check SHC with a callable (``euclidean_distances``) as affinity.

    The estimator is asked for four clusters on a raw feature matrix and
    the labels 0..3 are each expected to be assigned to 25 samples.
    """
    X, _ = generate_data(supervised=False, affinity=False)

    model = ScipyHierarchicalClustering(
        affinity=euclidean_distances,
        n_clusters=4,
    )
    cluster_sizes = np.bincount(model.fit_predict(X))

    assert_array_equal([25, 25, 25, 25], cluster_sizes)
def test_shc_precomputed_distance():
    """Check SHC when the input is passed with ``affinity="precomputed"``.

    The data is generated with ``affinity=True`` and clustered into four
    groups; the labels 0..3 are each expected to cover 25 samples.
    """
    X, _ = generate_data(supervised=False, affinity=True)

    model = ScipyHierarchicalClustering(affinity="precomputed", n_clusters=4)
    cluster_sizes = np.bincount(model.fit_predict(X))

    assert_array_equal([25, 25, 25, 25], cluster_sizes)
def test_shc_unsupervised_scoring_data_raw():
    """Check score-driven SHC clustering with ``scoring_data="raw"``.

    No ``n_clusters`` is given; a silhouette score with a euclidean metric
    is supplied as the scoring function instead.  The resulting labels
    0..3 are each expected to cover 25 samples.
    """
    X, _ = generate_data(supervised=False, affinity=False)

    silhouette = partial(silhouette_score, metric="euclidean")
    model = ScipyHierarchicalClustering(
        affinity=euclidean_distances,
        scoring=silhouette,
        scoring_data="raw",
    )
    cluster_sizes = np.bincount(model.fit_predict(X))

    assert_array_equal([25, 25, 25, 25], cluster_sizes)
def test_shc_unsupervised_scoring_data_affinity():
    """Check score-driven SHC clustering with ``scoring_data="affinity"``.

    Two scenarios are run with a silhouette score using
    ``metric="precomputed"``:

    1. a feature matrix combined with a callable affinity, and
    2. data generated with ``affinity=True`` combined with
       ``affinity="precomputed"``.

    In both, the labels 0..3 are each expected to cover 25 samples.
    """
    scenarios = (
        (False, euclidean_distances),  # passing feature matrix
        (True, "precomputed"),         # passing affinity matrix
    )
    for data_is_affinity, affinity in scenarios:
        X, _ = generate_data(supervised=False, affinity=data_is_affinity)
        silhouette = partial(silhouette_score, metric="precomputed")
        model = ScipyHierarchicalClustering(
            affinity=affinity,
            scoring=silhouette,
            scoring_data="affinity",
        )
        cluster_sizes = np.bincount(model.fit_predict(X))
        assert_array_equal([25, 25, 25, 25], cluster_sizes)
def test_shc_unsupervised_scoring_data_None():
    """Check score-driven SHC clustering when ``scoring_data`` is not given.

    The scoring callable takes only the predicted labels and always
    returns ``-inf``.  With that scorer the test expects all 100 samples
    to end up with label 0.
    """
    X, _ = generate_data(supervised=False, affinity=False)

    def _constant_score(labels_pred):
        # Deliberately ignores the labels: every candidate scores the same.
        return -np.inf

    model = ScipyHierarchicalClustering(
        affinity=euclidean_distances,
        scoring=_constant_score,
    )
    cluster_sizes = np.bincount(model.fit_predict(X))

    assert_array_equal([100], cluster_sizes)
def test_shc_threshold_after_fit():
    """Test setting ``threshold`` on an already fitted SHC estimator.

    The estimator is fitted with ``n_clusters=4`` and
    ``scoring_data="affinity"``.  The threshold is then set to
    ``linkage_[-4, 2]`` (presumably the merge distance stored in the
    fourth-from-last row of a SciPy-style linkage matrix -- TODO confirm).
    Afterwards the labels 0..3 are each expected to cover 25 samples.

    This test has a name distinct from ``test_shc_threshold``: the module
    defines that name a second time further down, which rebinds it, so
    under the shared name this test was silently never collected.
    """
    X, _ = generate_data(supervised=False, affinity=True)
    clusterer = ScipyHierarchicalClustering(
        affinity="precomputed", scoring_data="affinity", n_clusters=4
    )
    # Only the fitted state is needed here; the labels are read back from
    # ``labels_`` after the threshold has been changed.
    clusterer.fit_predict(X)

    clusterer.set_params(threshold=clusterer.linkage_[-4, 2])

    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
def test_shc_n_clusters():
    """Check that changing ``n_clusters`` after fitting updates ``labels_``.

    The estimator is fitted with four clusters, then ``n_clusters`` is set
    to ten through ``set_params`` and ``labels_`` is read again without an
    explicit refit; the number of distinct labels must follow each time.
    """
    X, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(affinity="precomputed", n_clusters=4)

    fitted_labels = model.fit_predict(X)
    assert_equal(len(np.unique(fitted_labels)), 4)

    model.set_params(n_clusters=10)
    updated_labels = model.labels_
    assert_equal(len(np.unique(updated_labels)), 10)
def test_shc_threshold():
    """Check how ``threshold`` interacts with ``n_clusters`` in SHC.

    While ``n_clusters`` is set, changing ``threshold`` must leave the
    labels untouched.  Once ``n_clusters`` is reset to ``None``, thresholds
    taken from ``linkage_[-5, 2]`` and ``linkage_[-4, 2]`` are expected to
    give five and four distinct labels respectively.
    """
    X, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(affinity="precomputed", n_clusters=2)

    # n_clusters has precedence over threshold
    labels_before = model.fit_predict(X)
    model.set_params(threshold=model.linkage_[-4, 2])
    labels_after = model.labels_
    assert_array_equal(labels_before, labels_after)
    assert_equal(len(np.unique(labels_before)), 2)

    # With n_clusters cleared, the threshold alone decides the cut.
    model.set_params(n_clusters=None, threshold=model.linkage_[-5, 2])
    five_way = model.labels_
    assert_equal(len(np.unique(five_way)), 5)

    model.set_params(threshold=model.linkage_[-4, 2])
    four_way = model.labels_
    assert_equal(len(np.unique(four_way)), 4)
def test_shc_default_euclidean():
    """Check SHC with only ``n_clusters`` specified.

    Every other parameter is left at its default (euclidean distance,
    according to the original test description).  On a raw feature matrix
    the labels 0..3 are each expected to cover 25 samples.
    """
    X, _ = generate_data(supervised=False, affinity=False)

    model = ScipyHierarchicalClustering(n_clusters=4)
    cluster_sizes = np.bincount(model.fit_predict(X))

    assert_array_equal([25, 25, 25, 25], cluster_sizes)