def test_v_measure_and_mutual_information(seed=36):
    # Check relation between v_measure, entropy and mutual information
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        labels_a, labels_b = (random_state.randint(0, 10, i),
                              random_state.randint(0, 10, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
        avg = 'arithmetic'
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            normalized_mutual_info_score(labels_a, labels_b,
                                                         average_method=avg)
                            )
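Aside: the identity exercised above is V = 2 * MI(a, b) / (H(a) + H(b)), i.e. the V-measure equals the arithmetic-mean-normalized mutual information. A minimal standalone sketch, computing the label entropies by hand so it does not depend on the entropy helper the test imports:

# Standalone sketch of the identity checked above (not part of any test suite).
import numpy as np
from sklearn.metrics import mutual_info_score, v_measure_score

def label_entropy(labels):
    # empirical entropy of a labeling, in nats
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))

labels_a = np.array([0, 0, 1, 1, 2, 2])
labels_b = np.array([0, 0, 1, 2, 2, 2])
nmi = 2.0 * mutual_info_score(labels_a, labels_b) / (
    label_entropy(labels_a) + label_entropy(labels_b))
print(v_measure_score(labels_a, labels_b), nmi)  # agree up to float error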
def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # mini batch kmeans is very unstable on such a small dataset hence
            # we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2, batch_size=2,
                            init=init_int, n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2, batch_size=2,
                            init=init_int, n_init=1).fit(X_int_csr),
        ]

        for km in fitted_models:
            assert_equal(km.cluster_centers_.dtype, np.float64)

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([v_measure_score(expected_labels, km.labels_)
                           for km in fitted_models])
        assert_array_equal(scores, np.ones(scores.shape[0]))
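Aside on what the assertions above pin down: scikit-learn's k-means estimators expose cluster_centers_ as float64 even when fitted on integer data. A minimal sketch:

# Minimal sketch: integer input is accepted, fitted centers come back float64.
import numpy as np
from sklearn.cluster import KMeans

X_int = np.array([[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]],
                 dtype=np.int32)
km = KMeans(n_clusters=2, n_init=10).fit(X_int)
print(km.cluster_centers_.dtype)  # float64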
Example #3
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    from io import StringIO
    import sys
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")  # ensure the warning is recorded
        k_means(X, n_clusters=n_clusters, init=centers)
        assert_equal(len(w), 1)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X", k_means, X=X_csr, n_clusters=2,
                         sample_weight=None, algorithm="elkan")
    def test_fitted_model(self):

        # non centered, sparse centers to check the
        centers = np.array([
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [1.0, 1.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
            ])
        n_samples = 100
        n_clusters, n_features = centers.shape
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        cbook = CoodeBook(n_words=3)
        cbook = cbook.fit(X)  # TODO: is the reassignment needed, or would cbook.fit(X) alone do?

        # check that the number of cluster centers and distinct labels match
        # the expectation
        centers = cbook.get_dictionary()
        assert_equal(centers.shape, (n_clusters, n_features))

        labels = cbook.predict(X)
        assert_equal(np.unique(labels).shape[0], n_clusters)

        # check that the label assignments are perfect (up to a permutation)
        assert_equal(v_measure_score(true_labels, labels), 1.0)
        assert_greater(cbook.cluster_core.inertia_, 0.0)

        # check that the descriptor looks like the homogeneous PDF used
        # to create the original samples
        cbook_hist = cbook.get_BoF_descriptor(X)
        expected_value = 1.0 / cbook.n_words
        for bin_value in cbook_hist[0]:
            assert_less(round(abs(bin_value - expected_value), 3), 0.01)
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
    def calculate_scores(self):
        x, c, labels = self.x, self.c, self.labels
        self.v_measure = v_measure_score(c, labels)
        self.complete = completeness_score(c, labels)
        self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
        self.adjusted_rand = adjusted_rand_score(c, labels)
        self.silhouette = silhouette_score(x, c)
        self.purity, self.partial_purity = self.__purity__()
def test_exactly_zero_info_score():
    """Check numerical stability when information is exactly zero"""
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = np.ones(i, dtype=int), np.arange(i, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
Example #9
def test_k_means_perfect_init():
    try:
        p_suite = []#PY_suite(suite_name=u'perfect_init')
        for i in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            km = KMeans(init=centers.copy(), n_clusters=n_clusters,
                        random_state=42, n_init=1).fit(X)
            p_suite += [PY_raises(ValueError, km.fit, [[0., 1.]]),
                        PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                        PY_equals(km.cluster_centers_.shape,
                                  (n_clusters, n_features)),
                        PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                        PY_greater(km.inertia_, 0.0)]
        return p_suite
    except Exception:
        return 50
Example #10
def test_k_means_plus_plus_init_not_precomputed():
    try:
        p_suite = []#PY_suite(suite_name=u'plus_plus_init_not_precomputed')
        for i in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            km = KMeans(init="k-means++", n_clusters=n_clusters,
                        random_state=42, precompute_distances=False).fit(X)
            p_suite += [PY_raises(ValueError, km.fit, [[0., 1.]]),
                        PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                        PY_equals(km.cluster_centers_.shape,
                                  (n_clusters, n_features)),
                        PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                        PY_greater(km.inertia_, 0.0)]
        return p_suite
    except Exception:
        return 50
Example #11
def test_k_means_random_init_sparse():
    try:
        p_suite = []#PY_suite(suite_name=u'init_random_sparse')
        for i in range(10):
            X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                        cluster_std=1., random_state=42)
            X_csr = sp.csr_matrix(X)
            km = KMeans(init="random", n_clusters=n_clusters,
                        random_state=42).fit(X_csr)
            p_suite += [PY_raises(ValueError, km.fit, [[0., 1.]]),
                        PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                        PY_equals(km.cluster_centers_.shape,
                                  (n_clusters, n_features)),
                        PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                        PY_greater(km.inertia_, 0.0)]
        return p_suite
    except Exception:
        return 50
Example #12
    def test_accuracy(self):
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
                                    cluster_std=1., random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1, n_clusters=centers, init='random',
                             random_state=42)
        kmeans_sk.fit(X)

        accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
        accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)
        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        assert accuracy_h2o - accuracy_sk >= -0.1
def test_v_measure_and_mutual_information(seed=36):
    """Check relation between v_measure, entropy and mutual information"""
    for i in np.logspace(1, 4, 4).astype(int):
        random_state = np.random.RandomState(seed)
        # random_integers is gone from NumPy; randint's upper bound is
        # exclusive, so 11 keeps the original 0..10 label range
        labels_a, labels_b = (random_state.randint(0, 11, i),
                              random_state.randint(0, 11, i))
        assert_almost_equal(v_measure_score(labels_a, labels_b),
                            2.0 * mutual_info_score(labels_a, labels_b) /
                            (entropy(labels_a) + entropy(labels_b)), 0)
def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random", random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5*sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [KMeans(n_clusters=n_clusters, random_state=42),
                      MiniBatchKMeans(n_clusters=n_clusters, random_state=42)]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #17
def _check_fitted_model(km):
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = km.cluster_centers_
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = km.labels_
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(km.score_, 0.0)
def _check_fitted_model(km):
    centers = km.cluster_centers_
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = km.labels_
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_true(km.inertia_ > 0.0)

    # check error on dataset being too small
    assert_raises(ValueError, km.fit, [[0., 1.]])
Example #19
def test_k_means_perfect_init():
    try:
        p_suite = []  #PY_suite(suite_name=u'perfect_init')
        for i in range(10):
            X, true_labels = make_blobs(n_samples=n_samples,
                                        centers=centers,
                                        cluster_std=1.,
                                        random_state=42)
            km = KMeans(init=centers.copy(),
                        n_clusters=n_clusters,
                        random_state=42,
                        n_init=1).fit(X)
            p_suite += [
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, (n_clusters, n_features)),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0)
            ]
        return p_suite
    except Exception:
        return 50
Example #20
def test_k_means_plus_plus_init_not_precomputed():
    try:
        p_suite = []  #PY_suite(suite_name=u'plus_plus_init_not_precomputed')
        for i in range(10):
            X, true_labels = make_blobs(n_samples=n_samples,
                                        centers=centers,
                                        cluster_std=1.,
                                        random_state=42)
            km = KMeans(init="k-means++",
                        n_clusters=n_clusters,
                        random_state=42,
                        precompute_distances=False).fit(X)
            p_suite += [
                PY_raises(ValueError, km.fit, [[0., 1.]]),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_equals(km.cluster_centers_.shape, (n_clusters, n_features)),
                PY_equals(v_measure_score(true_labels, km.labels_), 1.0),
                PY_greater(km.inertia_, 0.0)
            ]
        return p_suite
    except Exception:
        return 50
Example #21
def evaluate(data, net, t, landmarks):
    out = net(torch.from_numpy(data).float(), False)
    print(time.time() - start_time)
    t = t.astype(float)
    out = out.detach().numpy()
    print('New score metric')
    print(score(out, t))
    cmap = colors.ListedColormap(['red', 'blue'])
    plt.scatter(out[:, 0], out[:, 1], c=t, cmap=cmap, marker='o')
    kmeans = KMeans(n_clusters=2)
    kmeans.fit(out)
    vmeasure = v_measure_score(t, kmeans.labels_)
    print(vmeasure)
Example #22
def test_unit_weights_vs_no_weights():
    # not passing any sample weights should be equivalent
    # to all weights equal to one
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #23
def clusterEvaluation(trueY, fittedY):
    result = dict()
    ## NMI denotes normalized mutual information
    ## ARS denotes adjusted rand score
    ## HS stands for homogeneity_score, 1 means perfect
    ## VM represents v_measure_score ranging [0, 1], 1.0 is perfectly complete labeling
    ## SS represents silhouette_score
    result['NMI'] = normalized_mutual_info_score(trueY, fittedY)
    result['ARS'] = adjusted_rand_score(trueY, fittedY)
    result['HS'] = homogeneity_score(trueY, fittedY)
    result['CS'] = completeness_score(trueY, fittedY)
    result['VM'] = v_measure_score(trueY, fittedY)
    return result
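A quick usage sketch for clusterEvaluation, assuming the sklearn.metrics imports the function relies on are in scope. It also illustrates that all five scores compare partitions rather than raw label values:

# Hedged usage sketch for the function above.
from sklearn.metrics import (normalized_mutual_info_score, adjusted_rand_score,
                             homogeneity_score, completeness_score,
                             v_measure_score)

true_y = [0, 0, 1, 1, 2, 2]
fitted_y = [1, 1, 0, 0, 2, 2]  # same partition, cluster ids permuted
print(clusterEvaluation(true_y, fitted_y))  # every score is 1.0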
Example #24
def run_kmeans(Xtrain,
               Ytrain,
               Xtest,
               Ytest,
               K=6,
               n_init=1,
               verbose=1,
               plotTSNE=False):
    # let's use the TF-IDF vectorizer
    tfidf = True

    # we use a dummy function as tokenizer and preprocessor,
    # since the texts are already preprocessed and tokenized.
    if tfidf:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    else:
        vec = CountVectorizer(preprocessor=identity, tokenizer=identity)

    ######## RUN K-MEANS ########
    km = KMeans(n_clusters=K, n_init=n_init, verbose=verbose)

    classifier = Pipeline([('vec', vec), ('cls', km)])

    classifier.fit(Xtrain)

    print("\n########## Development scores on train set:")
    print("adjusted rand score: ", adjusted_rand_score(Ytrain, km.labels_))
    print("v measure: ", v_measure_score(Ytrain, km.labels_))

    Yguess = classifier.predict(Xtest)

    print("\n########## Generalization scores on test set:")
    print("adjusted rand score: ", adjusted_rand_score(Ytest, Yguess))
    print("v measure: ", v_measure_score(Ytest, Yguess))

    if plotTSNE:
        # perform_tsne(Xtrain, Ytrain, clusterLabels=True)  # tSNE with gold labels
        perform_tsne(Xtrain, km.labels_, vec=vec,
                     clusterLabels=True)  # tSNE clustering
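Note that the snippet passes an identity callable as both preprocessor and tokenizer, which is not defined in the excerpt; presumably it is the project's usual no-op helper, along the lines of the sketch below. Defining it as a named module-level function rather than a lambda also keeps the resulting Pipeline picklable.

# Assumed helper (not shown in the excerpt): a no-op so the vectorizer
# accepts texts that are already preprocessed and tokenized.
def identity(x):
    return x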
Example #25
def _check_fitted_model(km):
    centers = km.cluster_centers_
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = km.labels_
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_true(km.inertia_ > 0.0)

    # check error on dataset being too small
    assert_raises(ValueError, km.fit, [[0., 1.]])
Example #26
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = np.ones(i, dtype=int),\
            np.arange(i, dtype=int)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b,
                                                  max_n_classes=1e4), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b,
                                     max_n_classes=1e4), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b,
                                                max_n_classes=1e4), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b,
                                                  max_n_classes=1e4), 0.0)
Example #27
def test_scaled_weights():
    # scaling all sample weights by a common factor
    # shouldn't change the result
    sample_weight = np.ones(n_samples)
    for estimator in [
            KMeans(n_clusters=n_clusters, random_state=42),
            MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
    ]:
        est_1 = clone(estimator).fit(X)
        est_2 = clone(estimator).fit(X, sample_weight=0.5 * sample_weight)
        assert_almost_equal(v_measure_score(est_1.labels_, est_2.labels_), 1.0)
        assert_almost_equal(_sort_centers(est_1.cluster_centers_),
                            _sort_centers(est_2.cluster_centers_))
Example #28
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int), np.arange(i, dtype=int))
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        assert v_measure_score(labels_a, labels_b) == 0.0
        assert adjusted_mutual_info_score(labels_a, labels_b) == 0.0
        assert normalized_mutual_info_score(labels_a, labels_b) == 0.0
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b,
                                              average_method=method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b,
                                                average_method=method) == 0.0
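The degenerate case these tests target: when one labeling is constant, its entropy is zero and the usual normalization would be 0/0, so scikit-learn defines the score as 0.0. A one-off sketch:

# Sketch of the degenerate case: H(labels_a) = 0, so normalizing by entropy
# would divide by zero; sklearn returns 0.0 by convention.
import numpy as np
from sklearn.metrics import normalized_mutual_info_score, v_measure_score

labels_a = np.ones(10, dtype=int)  # one cluster for everything
labels_b = np.arange(10)           # every point in its own cluster
print(normalized_mutual_info_score(labels_a, labels_b))  # 0.0
print(v_measure_score(labels_a, labels_b))               # 0.0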
Example #29
    def test_accuracy(self):
        from sklearn.cluster import KMeans as skKMeans
        n_samples = 100000
        centers = 10
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    cluster_std=1.,
                                    random_state=42)

        kmeans_h2o = KMeans(n_gpus=1, n_clusters=centers, random_state=42)
        kmeans_h2o.fit(X)
        kmeans_sk = skKMeans(n_init=1,
                             n_clusters=centers,
                             init='random',
                             random_state=42)
        kmeans_sk.fit(X)

        accuracy_h2o = v_measure_score(kmeans_h2o.labels_, true_labels)
        accuracy_sk = v_measure_score(kmeans_sk.labels_, true_labels)
        # We also want to be either better or at most 10% worse than SKLearn
        # Everything else is horrible and we probably should fix something
        assert accuracy_h2o - accuracy_sk >= -0.1
Example #30
    def testKMeansFunction(self):
        # test calling the k_means function directly

        # non centered, sparse centers to check the
        centers = np.array([
            [0.0, 5.0, 0.0, 0.0, 0.0],
            [1.0, 1.0, 4.0, 0.0, 0.0],
            [1.0, 0.0, 0.0, 5.0, 1.0],
        ])
        n_samples = 100
        n_clusters, n_features = centers.shape
        X, true_labels = make_blobs(n_samples=n_samples,
                                    centers=centers,
                                    cluster_std=1.,
                                    random_state=42)

        # catch output
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            cluster_centers, labels, inertia = k_means(X,
                                                       n_clusters=n_clusters,
                                                       sample_weight=None,
                                                       verbose=True,
                                                       init='k-means++')
        finally:
            sys.stdout = old_stdout
        centers = cluster_centers
        assert centers.shape == (n_clusters, n_features)

        labels = labels.fetch()
        assert np.unique(labels).shape[0] == n_clusters

        # check that the label assignments are perfect (up to a permutation)
        assert v_measure_score(true_labels, labels) == 1.0
        assert inertia > 0.0

        # check warning when centers are passed
        assert_warns(RuntimeWarning,
                     k_means,
                     X,
                     n_clusters=n_clusters,
                     sample_weight=None,
                     init=centers)

        # too many clusters desired
        with pytest.raises(ValueError):
            k_means(X,
                    n_clusters=X.shape[0] + 1,
                    sample_weight=None,
                    init='k-means++')
def test_exactly_zero_info_score():
    # Check numerical stability when information is exactly zero
    for i in np.logspace(1, 4, 4).astype(int):
        labels_a, labels_b = (np.ones(i, dtype=int),
                              np.arange(i, dtype=int))
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(v_measure_score(labels_a, labels_b), 0.0)
        assert_equal(adjusted_mutual_info_score(labels_a, labels_b), 0.0)
        assert_equal(normalized_mutual_info_score(labels_a, labels_b), 0.0)
        for method in ["min", "geometric", "arithmetic", "max"]:
            assert adjusted_mutual_info_score(labels_a, labels_b,
                                              average_method=method) == 0.0
            assert normalized_mutual_info_score(labels_a, labels_b,
                                                average_method=method) == 0.0
Example #32
def print_five_measures(target, predicted):
    print('homogeneity score:')
    print(homogeneity_score(target, predicted))

    print('completeness score:')
    print(completeness_score(target, predicted))

    print('V-measure:')
    print(v_measure_score(target, predicted))

    print('adjusted rand score:')
    print(adjusted_rand_score(target, predicted))

    print('adjusted mutual info score:')
    print(adjusted_mutual_info_score(target, predicted))
Example #33
def evaluate(clusters, typedict):
    """Given the predicted clusters and type dictionary, this function calculates homogeneity, completeness, and V-measure assuming the gold tags are the most frequent tags for each type in the type dict
    input:
        clusters (dict of int:Cluster): Clusters by id
        typedict (dict of str:Word): Word by wordform
    return:
        (float): homogeneity score
        (float): completeness score
        (float): V measure"""
    # The instructor completed this function in 7 lines including the return
    golds = []
    preds = []
    # Your code here
    return homogeneity_score(golds, preds), completeness_score(
        golds, preds), v_measure_score(golds, preds, beta=2.0)
Example #34
    def test_copac(self):
        """ Minimal test that COPAC runs at all. """
        k = 40
        mu = 10
        eps = 2
        alpha = 0.85
        copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha)
        y_pred = copac.fit_predict(self.X)
        v = v_measure_score(self.y, y_pred)
        # Must score perfectly on very simple data
        assert_equal(self.v, v)
        # Check correct labels_ attribute
        copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha)
        copac.fit(self.X)
        assert_array_equal(copac.labels_, y_pred)
def _check_fitted_model(km):
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = km.cluster_centers_
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = km.labels_
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(km.inertia_, 0.0)

    # check error on dataset being too small
    assert_raises(ValueError, km.fit, [[0., 1.]])
Example #36
def _check_fitted_model(km):
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = km.cluster_centers_
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = km.labels_
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(km.inertia_, 0.0)

    # check error on dataset being too small
    assert_raises(ValueError, km.fit, [[0., 1.]])
def test_clustering():
    matched = load_matched_data(MATCHED_DATA_FILE)
    for filename in glob.glob(os.path.join(CLUSTERS_PREDICTION_DIR, '*.fth')):
        num_clusters = int(filename.split('_clusters_')[1].split('_')[0])
        matched['cluster_uniform'] = random_unif_pred(num_clusters,
                                                      matched.shape[0])
        matched['cluster_exp'] = random_exp_pred(num_clusters,
                                                 matched.shape[0])

        print(filename)
        print(
            "(uniform) V-measure:",
            v_measure_score(matched.property_decoded, matched.cluster_uniform))
        print(
            "(uniform) AMI:",
            adjusted_mutual_info_score(matched.property_decoded,
                                       matched.cluster_uniform))
        print("(exp) V-measure:",
              v_measure_score(matched.property_decoded, matched.cluster_exp))
        print(
            "(exp) AMI:",
            adjusted_mutual_info_score(matched.property_decoded,
                                       matched.cluster_exp))
        print()
def sklearn_measures(U, V):
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    print(U_labels, V_labels)
    # V2_labels = np.nonzero(V2)[1]
    print('entro(U)=', sym.entropy(U_labels), 'entro(V)=', sym.entropy(V_labels),
          'entro(U,V)=', sym.mutual_info_score(U_labels, V_labels))
    res = [['ari', 'nmi', 'ami', 'vm'],
           [sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels)]]
    print(res)
    return res
Example #40
def _check_fitted_model(km):
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = km.cluster_centers_
    assert centers.shape == (n_clusters, n_features)

    labels = km.labels_
    assert np.unique(labels).shape[0] == n_clusters

    # check that the label assignments are perfect (up to a permutation)
    assert v_measure_score(true_labels, labels) == 1.0
    assert km.inertia_ > 0.0

    # check error on dataset being too small
    assert_raise_message(ValueError, "n_samples=1 should be >= n_clusters=%d"
                         % km.n_clusters, km.fit, [[0., 1.]])
Example #41
def k_means_clustering(training_data,
                       target_labels,
                       title='Contingency Matrix',
                       n_clusters=20,
                       random_state=0,
                       max_iter=1000,
                       n_init=30):
    start = time.time()
    km = KMeans(n_clusters=n_clusters,
                random_state=random_state,
                max_iter=max_iter,
                n_init=n_init)
    km.fit(training_data)
    print("Finished clustering in %f seconds" % (time.time() - start))

    cm = contingency_matrix(target_labels, km.labels_)
    # reorder to maximize along diagonal
    rows, cols = linear_sum_assignment(cm, maximize=True)
    new_cm = cm[rows[:, np.newaxis], cols]

    print("Show Contingency Matrix:")
    plot_contingency_table_20(new_cm, title=title)

    print("Report 5 Measures for K-Means Clustering")

    homogeneity = homogeneity_score(target_labels, km.labels_)
    completeness = completeness_score(target_labels, km.labels_)
    v_measure = v_measure_score(target_labels, km.labels_)
    adjusted_rand_index = adjusted_rand_score(target_labels, km.labels_)
    adjusted_mutual_info = adjusted_mutual_info_score(target_labels,
                                                      km.labels_)

    print("Homogeneity Score: %f" % homogeneity)
    print("Completeness Score: %f" % completeness)
    print("V-Measure Score: %f" % v_measure)
    print("Adjusted Rand Index: %f" % adjusted_rand_index)
    print("Adjusted Mutual Information: %f" % adjusted_mutual_info)

    results = {
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v_measure": v_measure,
        "adjusted_rand_index": adjusted_rand_index,
        "adjusted_mutual_info": adjusted_mutual_info
    }

    return results, km
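The diagonal-maximizing reorder above uses the Hungarian algorithm via scipy.optimize.linear_sum_assignment (maximize=True needs SciPy >= 1.4) to match each predicted cluster to a true class. An isolated sketch of just that step:

# Isolated sketch of the contingency-matrix reordering used above.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import contingency_matrix

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [1, 1, 2, 2, 0, 0]  # correct partition under permuted ids
cm = contingency_matrix(y_true, y_pred)
rows, cols = linear_sum_assignment(cm, maximize=True)
print(cm[rows[:, np.newaxis], cols])  # all the mass lands on the diagonal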
Example #42
    def compute_result(self, loss, preds, targets, stage):
        # Cluster embedded values using k-means.
        kmeans_input = preds.cpu().numpy()
        kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
        pred = kmeans.predict(kmeans_input)

        labels = targets.cpu().numpy()
        completeness = torch.Tensor([completeness_score(labels, pred)])
        hm = torch.Tensor([homogeneity_score(labels, pred)])
        nmi = torch.Tensor([v_measure_score(labels, pred)])

        # auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)
        result = pl.EvalResult(loss)
        result.log(f"{stage}_completeness", completeness, prog_bar=True)
        result.log(f"{stage}_hm", hm, prog_bar=True)
        result.log(f"{stage}_nmi", nmi, prog_bar=True)
        return result
def test():
    model.eval()
    z = model.encode(data.x, data.train_pos_edge_index)

    # Cluster embedded values using k-means.
    kmeans_input = z.cpu().numpy()
    kmeans = KMeans(n_clusters=7, random_state=0).fit(kmeans_input)
    pred = kmeans.predict(kmeans_input)

    labels = data.y.cpu().numpy()
    completeness = completeness_score(labels, pred)
    hm = homogeneity_score(labels, pred)
    nmi = v_measure_score(labels, pred)

    auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)

    return auc, ap, completeness, hm, nmi
    def evaluate(self):
        eval_result_dict = {}
        eval_result_dict['ami'] = adjusted_mutual_info_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['rand'] = adjusted_rand_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['comp'] = completeness_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['fow'] = fowlkes_mallows_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['hom'] = homogeneity_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['nmi'] = normalized_mutual_info_score(
            self.data['true_y'], self.data['pred_y'])
        eval_result_dict['v_score'] = v_measure_score(
            self.data['true_y'], self.data['pred_y'])
        return eval_result_dict
Example #45
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X,
                                                   n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning,
                 k_means,
                 X,
                 n_clusters=n_clusters,
                 sample_weight=None,
                 init=centers)

    # too many clusters desired
    assert_raises(ValueError,
                  k_means,
                  X,
                  n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X",
                         k_means,
                         X=X_csr,
                         n_clusters=2,
                         sample_weight=None,
                         algorithm="elkan")
Example #46
def test_beta_parameter():
    # test the beta parameter passed to
    # homogeneity_completeness_v_measure
    # and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    v_test = (1 + beta_test) * h_test * c_test / (beta_test * h_test + c_test)

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test
    )
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta_test)
    assert_almost_equal(v, v_test, 2)
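For reference, the quantity under test is the weighted harmonic mean V_beta = (1 + beta) * h * c / (beta * h + c); beta > 1 weights completeness more strongly, beta < 1 weights homogeneity. A small self-checking sketch (the beta keyword is available in recent scikit-learn releases):

# Sketch: recompute the beta-weighted V-measure from h and c directly.
from sklearn.metrics import homogeneity_completeness_v_measure

beta = 0.5
h, c, v = homogeneity_completeness_v_measure(
    [0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2], beta=beta)
assert abs(v - (1 + beta) * h * c / (beta * h + c)) < 1e-12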
Example #47
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, score = k_means(X, n_clusters=n_clusters)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignments are perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(score, 0.0)
Example #48
    def _check_fitted_model(self, km, n_clusters, n_features, true_labels):
        # check that the number of cluster centers and distinct labels match
        # the expectation
        centers = km.cluster_centers_
        self.assertEqual(centers.shape, (n_clusters, n_features))

        labels = km.labels_.fetch()
        self.assertEqual(np.unique(labels).shape[0], n_clusters)

        # check that the label assignments are perfect (up to a permutation)
        self.assertEqual(v_measure_score(true_labels, labels), 1.0)
        self.assertGreater(km.inertia_, 0.0)

        # check error on dataset being too small
        assert_raise_message(
            ValueError,
            "n_samples=1 should be >= n_clusters=%d" % km.n_clusters, km.fit,
            [[0., 1.]])
Example #49
    def _eval(self, ind, X, Y):

        if ind["fenotype"] == None:
            self.distance_creator.expand(ind)

        # evaluation using a pre-constructed distance matrix
        # with sklearn's agglomerative clustering algorithm
        # as was allowed in the Duvidas TP1 Moodle thread.
        d_matrix = cdist(X, X, metric=ind["fenotype"])
        d_matrix = numpy.nan_to_num(d_matrix)
        kmeans_instance = AgglomerativeClustering(n_clusters=self.classes,
                                                  affinity="precomputed",
                                                  linkage="single")

        # predicts and adapts the cluster numbers to be compatible with the
        # numbers given in the test CSV
        pred = kmeans_instance.fit_predict(d_matrix) + numpy.ones(len(X))
        # retrieve the generated clusters

        return v_measure_score(Y, pred)
Example #50
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    sample_weight = np.random.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [
        KMeans(init="k-means++", n_clusters=n_clusters, random_state=42),
        KMeans(init="random", n_clusters=n_clusters, random_state=42),
        KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42),
        MiniBatchKMeans(n_clusters=n_clusters, batch_size=10, random_state=42)
    ]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(
            v_measure_score(est_repeated.labels_, repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
def test_clustering():
    for filename in glob.glob(os.path.join(CLUSTERS_PREDICTION_DIR, '*.fth')):
        print()
        print('Looking at the', filename)

        clusterized = load_model_predictions(filename)
        if not clusterized.shape[0]:
            print('Empty predictions file.')
        else:
            matched = load_matched_data(MATCHED_DATA_FILE)
            matched = clusters4matched(matched, clusterized)
            print("Matched pairs are of shape",
                  matched[matched.cluster.notna()].shape)

            print("V-measure:",
                  v_measure_score(matched.property_decoded, matched.cluster))
            print(
                "AMI:",
                adjusted_mutual_info_score(matched.property_decoded,
                                           matched.cluster))
def test_weighted_vs_repeated():
    # a sample weight of N should yield the same result as an N-fold
    # repetition of the sample
    sample_weight = np.random.randint(1, 5, size=n_samples)
    X_repeat = np.repeat(X, sample_weight, axis=0)
    estimators = [KMeans(init="k-means++", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init="random", n_clusters=n_clusters,
                         random_state=42),
                  KMeans(init=centers.copy(), n_clusters=n_clusters,
                         random_state=42),
                  MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                  random_state=42)]
    for estimator in estimators:
        est_weighted = clone(estimator).fit(X, sample_weight=sample_weight)
        est_repeated = clone(estimator).fit(X_repeat)
        repeated_labels = np.repeat(est_weighted.labels_, sample_weight)
        assert_almost_equal(v_measure_score(est_repeated.labels_,
                                            repeated_labels), 1.0)
        if not isinstance(estimator, MiniBatchKMeans):
            assert_almost_equal(_sort_centers(est_weighted.cluster_centers_),
                                _sort_centers(est_repeated.cluster_centers_))
def test_beta_parameter():
    # test for when beta passed to
    # homogeneity_completeness_v_measure
    # and v_measure_score
    beta_test = 0.2
    h_test = 0.67
    c_test = 0.42
    v_test = ((1 + beta_test) * h_test * c_test
              / (beta_test * h_test + c_test))

    h, c, v = homogeneity_completeness_v_measure(
        [0, 0, 0, 1, 1, 1],
        [0, 1, 0, 1, 2, 2],
        beta=beta_test)
    assert_almost_equal(h, h_test, 2)
    assert_almost_equal(c, c_test, 2)
    assert_almost_equal(v, v_test, 2)

    v = v_measure_score(
        [0, 0, 0, 1, 1, 1],
        [0, 1, 0, 1, 2, 2],
        beta=beta_test)
    assert_almost_equal(v, v_test, 2)
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
start_time = time()
cocluster.fit(X)
y_cocluster = cocluster.row_labels_
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_cocluster, y_true)))

print("MiniBatchKMeans...")
start_time = time()
y_kmeans = kmeans.fit_predict(X)
print("Done in {:.2f}s. V-measure: {:.4f}".format(
    time() - start_time,
    v_measure_score(y_kmeans, y_true)))

feature_names = vectorizer.get_feature_names()
document_names = list(newsgroups.target_names[i] for i in newsgroups.target)


def bicluster_ncut(i):
    rows, cols = cocluster.get_indices(i)
    if not (np.any(rows) and np.any(cols)):