def test_v_measure_and_mutual_information(seed=36):
    # Check the relation between v_measure, entropy and mutual information:
    # the V-measure must match 2 * MI(a, b) / (H(a) + H(b)) and must equal
    # the normalized mutual information with arithmetic averaging.
    #
    # ``seed`` seeds the RandomState that generates both label vectors; it
    # is re-created for every sample size so each size starts from the
    # same random stream.
    #
    # The builtin ``int`` is used instead of the ``np.int`` alias, which
    # has been removed from recent NumPy releases.
    for size in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a = random_state.randint(0, 10, size)
        labels_b = random_state.randint(0, 10, size)

        v_measure = v_measure_score(labels_a, labels_b)

        # The entropy-based closed form is only compared at a precision of
        # zero decimal places (third positional argument).
        mutual_info = mutual_info_score(labels_a, labels_b)
        entropy_sum = entropy(labels_a) + entropy(labels_b)
        assert_almost_equal(v_measure, 2.0 * mutual_info / entropy_sum, 0)

        # Arithmetic-mean NMI is compared at the default precision.
        avg = 'arithmetic'
        assert_almost_equal(
            v_measure,
            normalized_mutual_info_score(labels_a, labels_b,
                                         average_method=avg))
def test_int_input():
    # Fit KMeans and MiniBatchKMeans on integer-typed data (dense and CSR)
    # and check that the centers come back as float64 and that the
    # clustering matches the expected labelling up to a permutation.
    points = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    expected_labels = [0, 1, 1, 0, 0, 1]

    for int_dtype in (np.int32, np.int64):
        X_dense = np.array(points, dtype=int_dtype)
        X_sparse = sp.csr_matrix(X_dense)
        first_two = X_dense[:2]

        fitted = []
        fitted.append(KMeans(n_clusters=2).fit(X_dense))
        fitted.append(
            KMeans(n_clusters=2, init=first_two, n_init=1).fit(X_dense))
        # mini batch kmeans is very unstable on such a small dataset,
        # hence the many inits
        fitted.append(
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_dense))
        fitted.append(
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_sparse))
        fitted.append(
            MiniBatchKMeans(n_clusters=2, batch_size=2, init=first_two,
                            n_init=1).fit(X_dense))
        fitted.append(
            MiniBatchKMeans(n_clusters=2, batch_size=2, init=first_two,
                            n_init=1).fit(X_sparse))

        for model in fitted:
            assert model.cluster_centers_.dtype == np.float64

        scores = np.array([v_measure_score(expected_labels, model.labels_)
                           for model in fitted])
        assert_array_almost_equal(scores, np.ones(scores.shape[0]))
def test_mini_batch_k_means_random_init_partial_fit():
    # Train a randomly initialised MiniBatchKMeans incrementally and check
    # that it recovers the true labelling.
    model = MiniBatchKMeans(n_clusters=n_clusters, init="random",
                            random_state=42)

    # online learning: feed the data as ten consecutive chunks
    chunks = np.array_split(X, 10)
    for chunk in chunks:
        model.partial_fit(chunk)

    # label the complete dataset with the incrementally trained model
    predicted = model.predict(X)
    assert v_measure_score(true_labels, predicted) == 1.0
def test_scaled_weights():
    # Multiplying every sample weight by the same constant must leave the
    # fitted labels and centers unchanged.
    halved_weights = 0.5 * np.ones(n_samples)
    base_estimators = (
        KMeans(n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
    )

    for base in base_estimators:
        unweighted = clone(base).fit(X)
        rescaled = clone(base).fit(X, sample_weight=halved_weights)

        label_agreement = v_measure_score(unweighted.labels_,
                                          rescaled.labels_)
        assert_almost_equal(label_agreement, 1.0)

        centers_unweighted = _sort_centers(unweighted.cluster_centers_)
        centers_rescaled = _sort_centers(rescaled.cluster_centers_)
        assert_almost_equal(centers_unweighted, centers_rescaled)
def test_unit_weights_vs_no_weights():
    # Fitting without sample weights must give the same result as fitting
    # with every weight equal to one.
    unit_weights = np.ones(n_samples)
    base_estimators = (
        KMeans(n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, random_state=42),
    )

    for base in base_estimators:
        without_weights = clone(base).fit(X)
        with_unit_weights = clone(base).fit(X, sample_weight=unit_weights)

        label_agreement = v_measure_score(without_weights.labels_,
                                          with_unit_weights.labels_)
        assert_almost_equal(label_agreement, 1.0)

        centers_plain = _sort_centers(without_weights.cluster_centers_)
        centers_unit = _sort_centers(with_unit_weights.cluster_centers_)
        assert_almost_equal(centers_plain, centers_unit)
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero: one
    # labelling puts every sample in a single cluster while the other
    # gives each sample its own cluster, and every score must then be
    # exactly 0.0 (not merely close to it).
    #
    # The builtin ``int`` replaces the ``np.int`` alias, which has been
    # removed from recent NumPy releases.
    for size in np.logspace(1, 4, 4).astype(int):
        labels_a = np.ones(size, dtype=int)
        labels_b = np.arange(size, dtype=int)

        # default averaging
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        assert v_measure_score(labels_a, labels_b) == 0.0
        assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0

        # every supported averaging method; passed by keyword so the call
        # also works where ``average_method`` is keyword-only
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
            assert normalized_mutual_info_score(
                labels_a, labels_b, average_method=method) == 0.0
def _check_fitted_model(km):
    # Shared sanity checks for a fitted k-means style estimator ``km``.

    # one center per expected cluster, each with the expected dimension
    fitted_centers = km.cluster_centers_
    assert fitted_centers.shape == (n_clusters, n_features)

    # as many distinct labels as expected clusters
    fitted_labels = km.labels_
    assert len(np.unique(fitted_labels)) == n_clusters

    # the assignment is perfect, up to a permutation of the labels
    assert v_measure_score(true_labels, fitted_labels) == 1.0
    assert km.inertia_ > 0.0

    # refitting on a dataset that is too small must raise
    expected_msg = "n_samples=1 should be >= n_clusters=%d" % km.n_clusters
    assert_raise_message(ValueError, expected_msg, km.fit, [[0., 1.]])
def test_k_means_function():
    # Exercise the functional k_means API directly.

    # swap stdout for an in-memory buffer while the verbose call runs,
    # restoring it afterwards even if the call fails
    saved_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        result = k_means(X, n_clusters=n_clusters, sample_weight=None,
                         verbose=True)
    finally:
        sys.stdout = saved_stdout
    centers, labels, inertia = result

    assert centers.shape == (n_clusters, n_features)
    assert len(np.unique(labels)) == n_clusters

    # the assignment is perfect, up to a permutation of the labels
    assert v_measure_score(true_labels, labels) == 1.0
    assert inertia > 0.0

    # passing explicit centers as init must emit a RuntimeWarning
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # asking for more clusters than there are samples is an error
    too_many_clusters = X.shape[0] + 1
    with pytest.raises(ValueError):
        k_means(X, n_clusters=too_many_clusters, sample_weight=None)

    # algorithm='elkan' raises TypeError on a sparse matrix
    elkan_msg = "algorithm='elkan' not supported for sparse input X"
    assert_raise_message(TypeError, elkan_msg, k_means, X=X_csr,
                         n_clusters=2, sample_weight=None,
                         algorithm="elkan")
def test_weighted_vs_repeated():
    # Giving a sample an integer weight of N must be equivalent to
    # repeating that sample N times in the training data.
    rng = np.random.RandomState(0)
    weights = rng.randint(1, 5, size=n_samples)
    X_repeated = np.repeat(X, weights, axis=0)

    candidates = (
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                        random_state=42),
    )

    for candidate in candidates:
        fit_weighted = clone(candidate).fit(X, sample_weight=weights)
        fit_repeated = clone(candidate).fit(X_repeated)

        # expand the weighted labels so they line up with the repeated rows
        expanded_labels = np.repeat(fit_weighted.labels_, weights)
        agreement = v_measure_score(fit_repeated.labels_, expanded_labels)
        assert_almost_equal(agreement, 1.0)

        # the centers are only compared for the non mini-batch estimators
        if isinstance(candidate, MiniBatchKMeans):
            continue
        assert_almost_equal(_sort_centers(fit_weighted.cluster_centers_),
                            _sort_centers(fit_repeated.cluster_centers_))
def test_beta_parameter():
    # Check that ``beta`` is honoured by both
    # homogeneity_completeness_v_measure and v_measure_score.
    labels_true = [0, 0, 0, 1, 1, 1]
    labels_pred = [0, 1, 0, 1, 2, 2]

    beta = 0.2
    expected_h = 0.67
    expected_c = 0.42
    # beta-weighted combination of the expected homogeneity/completeness
    expected_v = ((1 + beta) * expected_h * expected_c
                  / (beta * expected_h + expected_c))

    # all comparisons are made to two decimal places
    homogeneity, completeness, v_measure = (
        homogeneity_completeness_v_measure(labels_true, labels_pred,
                                           beta=beta))
    assert_almost_equal(homogeneity, expected_h, 2)
    assert_almost_equal(completeness, expected_c, 2)
    assert_almost_equal(v_measure, expected_v, 2)

    v_measure = v_measure_score(labels_true, labels_pred, beta=beta)
    assert_almost_equal(v_measure, expected_v, 2)
cocluster = SpectralCoclustering(n_clusters=len(categories), svd_method='arpack', random_state=0) kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, random_state=0) print("Vectorizing...") X = vectorizer.fit_transform(newsgroups.data) print("Coclustering...") start_time = time() cocluster.fit(X) y_cocluster = cocluster.row_labels_ print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_cocluster, y_true))) print("MiniBatchKMeans...") start_time = time() y_kmeans = kmeans.fit_predict(X) print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_kmeans, y_true))) feature_names = vectorizer.get_feature_names() document_names = list(newsgroups.target_names[i] for i in newsgroups.target) def bicluster_ncut(i): rows, cols = cocluster.get_indices(i) if not (np.any(rows) and np.any(cols)): import sys