# Imports assumed by these examples: NumPy/SciPy, numpy.testing asserts, and
# the VonMisesFisherMixture estimator from the spherecluster package.
import numpy as np
import scipy as sp
import scipy.sparse  # makes sp.sparse available below
from numpy.testing import assert_almost_equal, assert_array_equal

from spherecluster import VonMisesFisherMixture


def test_integration_dense(params_in):
    n_clusters = 5
    n_examples = 20
    n_features = 100
    # Draw random points and project each row onto the unit hypersphere,
    # since vMF mixtures model directional (unit-norm) data.
    X = np.random.randn(n_examples, n_features)
    for ee in range(n_examples):
        X[ee, :] /= np.linalg.norm(X[ee, :])

    params_in.update({'n_clusters': n_clusters})
    movmf = VonMisesFisherMixture(**params_in)
    movmf.fit(X)

    assert movmf.cluster_centers_.shape == (n_clusters, n_features)
    assert len(movmf.concentrations_) == n_clusters
    assert len(movmf.weights_) == n_clusters
    assert len(movmf.labels_) == n_examples

    for center in movmf.cluster_centers_:
        assert_almost_equal(np.linalg.norm(center), 1.0)

    for concentration in movmf.concentrations_:
        assert concentration > 0

    for weight in movmf.weights_:
        assert not np.isnan(weight)

    plabels = movmf.predict(X)
    assert_array_equal(plabels, movmf.labels_)

    # log_likelihood returns an (n_clusters, n_examples) array; taking the
    # argmax over clusters for each example should reproduce the fitted labels.
    ll = movmf.log_likelihood(X)
    ll_labels = np.zeros(movmf.labels_.shape)
    for ee in range(n_examples):
        ll_labels[ee] = np.argmax(ll[:, ee])

    assert_array_equal(ll_labels, movmf.labels_)
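# `params_in` is a pytest fixture defined elsewhere in the original test
# module. A minimal stand-in that would drive both integration tests is
# sketched below; the parameter sets are illustrative assumptions, not the
# original fixture.
import pytest

@pytest.fixture(params=[
    {'posterior_type': 'soft'},
    {'posterior_type': 'hard'},
])
def params_in(request):
    return dict(request.param)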
def VonMisesFisherMixture_Model(vocab_embeddings, vocab, topics, rerank, rand):
    # Hard-assignment alternative:
    # vmf_soft = VonMisesFisherMixture(n_clusters=topics, posterior_type='hard', n_jobs=-1, random_state=rand).fit(vocab_embeddings)
    print("fitting vmf...")
    vmf_soft = VonMisesFisherMixture(n_clusters=topics,
                                     posterior_type='soft',
                                     n_jobs=-1,
                                     random_state=rand).fit(vocab_embeddings)

    # log_likelihood is (n_clusters, n_words); for each topic, rank the
    # vocabulary by descending log-likelihood under that mixture component.
    llh = vmf_soft.log_likelihood(vocab_embeddings)
    indices = []
    for i in range(topics):
        ranked_word_indices = llh[i, :].argsort()[::-1].astype(int)
        if rerank:
            indices.append(find_top_k_words(100, ranked_word_indices, vocab))
        else:
            indices.append(find_top_k_words(10, ranked_word_indices, vocab))

    return vmf_soft.predict(vocab_embeddings), indices
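# `find_top_k_words` is an external helper that is not part of this excerpt.
# Judging from the call sites above, it maps the k top-ranked vocabulary
# indices to their words; the stand-in below is a sketch under that
# assumption, not the original implementation.
def find_top_k_words(k, ranked_indices, vocab):
    # Look up the words for the first k indices in ranked order.
    return [vocab[idx] for idx in ranked_indices[:k]]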
def test_integration_sparse(params_in):
    n_clusters = 5
    n_examples = 20
    n_features = 100
    n_nonzero = 10
    # Build a sparse matrix with n_nonzero random entries per row. Note that
    # item assignment on a CSR matrix emits scipy's SparseEfficiencyWarning;
    # a LIL-based variant is sketched after this test.
    X = sp.sparse.csr_matrix((n_examples, n_features))
    for ee in range(n_examples):
        ridx = np.random.randint(n_features, size=n_nonzero)
        random_values = np.random.randn(n_nonzero)
        random_values = random_values / np.linalg.norm(random_values)
        X[ee, ridx] = random_values

    params_in.update({"n_clusters": n_clusters})
    movmf = VonMisesFisherMixture(**params_in)
    movmf.fit(X)

    assert movmf.cluster_centers_.shape == (n_clusters, n_features)
    assert len(movmf.concentrations_) == n_clusters
    assert len(movmf.weights_) == n_clusters
    assert len(movmf.labels_) == n_examples
    assert len(movmf.posterior_) == n_clusters

    for center in movmf.cluster_centers_:
        assert_almost_equal(np.linalg.norm(center), 1.0)

    for concentration in movmf.concentrations_:
        assert concentration > 0

    for weight in movmf.weights_:
        assert not np.isnan(weight)

    plabels = movmf.predict(X)
    assert_array_equal(plabels, movmf.labels_)

    ll = movmf.log_likelihood(X)
    ll_labels = np.zeros(movmf.labels_.shape)
    for ee in range(n_examples):
        ll_labels[ee] = np.argmax(ll[:, ee])

    assert_array_equal(ll_labels, movmf.labels_)
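# The CSR item assignment in test_integration_sparse works but triggers
# scipy's SparseEfficiencyWarning. One way to build the same kind of test
# matrix without the warning (a sketch, not part of the original tests) is to
# fill a LIL matrix and convert it to CSR afterwards:
def _random_sparse_unit_rows(n_examples, n_features, n_nonzero):
    # Fill row by row in LIL format, then convert to CSR for fitting.
    X = sp.sparse.lil_matrix((n_examples, n_features))
    for ee in range(n_examples):
        ridx = np.random.randint(n_features, size=n_nonzero)
        random_values = np.random.randn(n_nonzero)
        X[ee, ridx] = random_values / np.linalg.norm(random_values)
    return X.tocsr()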