Example #1
0
def test_whitening_tensor_e2_m1():
    rng = np.random.RandomState(12)

    n_features = 300
    n_components = 25
    min_count = 3
    alpha0 = 10.
    n_samples = rng.randint(100, 150)
    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # create M2 directly
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    m2_vals, m2_vecs = sp.linalg.eigsh(m2, k=n_components)
    # create whitening matrix
    W = whitening(m2_vals, m2_vecs)

    # optimized method
    wt_m1 = np.dot(W.T, m1)
    u1_2_3 = whitening_tensor_e2_m1(wt_m1, alpha0)

    # compute directly
    u1_2_3_true = _compute_e2_m1_directly(doc_word_mtx, W, wt_m1)
    assert_array_almost_equal(u1_2_3_true, u1_2_3)
Example #2
0
def test_whitening():

    rng = np.random.RandomState(1)

    n_features = 500
    n_components = 50
    min_count = 3
    alpha0 = 10.
    n_samples = rng.randint(100, 150)
    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # create M2 directly
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    m2_vals, m2_vecs = sp.linalg.eigsh(m2, k=n_components)
    # create whitening matrix
    W = whitening(m2_vals, m2_vecs)

    # check whitening matrix shape
    assert_equal(n_features, W.shape[0])
    assert_equal(n_components, W.shape[1])

    # M2(W, W) should be identity matrix
    identity = np.dot(np.dot(W.T, m2), W)
    assert_array_almost_equal(np.eye(n_components, n_components), identity)