Example 1
def test_whitening():

    rng = np.random.RandomState(1)

    n_features = 500
    n_components = 50
    min_count = 3
    alpha0 = 10.
    n_samples = rng.randint(100, 150)
    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # create M2 directly
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    m2_vals, m2_vecs = sp.linalg.eigsh(m2, k=n_components)
    # create whitening matrix
    W = whitening(m2_vals, m2_vecs)

    # check whitening matrix shape
    assert_equal(n_features, W.shape[0])
    assert_equal(n_components, W.shape[1])

    # M2(W, W) should be identity matrix
    identity = np.dot(np.dot(W.T, m2), W)
    assert_array_almost_equal(np.eye(n_components, n_components), identity)
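This snippet and the ones that follow omit their imports: np is numpy, sp is scipy.sparse (sp.linalg.eigsh and sp.linalg.svds live in scipy.sparse.linalg), the assert_* helpers are the usual numpy/scikit-learn testing utilities, and the moment helpers (first_order_moments, cooccurrence_expectation, second_order_moments, whitening, whitening_tensor_e2_m1) come from the package under test, whose import path is not shown here. The pure-NumPy sketch below is not the package's implementation; it only illustrates, under the assumption that whitening builds W as V * diag(vals)**-0.5 from the top eigenpairs of M2, why the test expects W.T M2 W to come out as the identity.

import numpy as np

rng = np.random.RandomState(0)

# small synthetic symmetric PSD matrix standing in for M2
n_features, n_components = 20, 5
A = rng.randn(n_features, n_features)
m2 = np.dot(A, A.T)

# top-k eigenpairs (eigh returns ascending eigenvalues; keep the last k)
vals, vecs = np.linalg.eigh(m2)
vals, vecs = vals[-n_components:], vecs[:, -n_components:]

# candidate whitening matrix: scale each eigenvector by 1 / sqrt(eigenvalue)
W = vecs / np.sqrt(vals)

# W.T M2 W = diag(vals)**-0.5 V.T M2 V diag(vals)**-0.5 = I_k
identity = np.dot(np.dot(W.T, m2), W)
np.testing.assert_array_almost_equal(np.eye(n_components), identity)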
Example 2
def test_whitening_tensor_e2_m1():
    rng = np.random.RandomState(12)

    n_features = 300
    n_components = 25
    min_count = 3
    alpha0 = 10.
    n_samples = rng.randint(100, 150)
    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # create M2 directly
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    m2_vals, m2_vecs = sp.linalg.eigsh(m2, k=n_components)
    # create whitening matrix
    W = whitening(m2_vals, m2_vecs)

    # optimized method
    wt_m1 = np.dot(W.T, m1)
    u1_2_3 = whitening_tensor_e2_m1(wt_m1, alpha0)

    # compute directly
    u1_2_3_true = _compute_e2_m1_directly(doc_word_mtx, W, wt_m1)
    assert_array_almost_equal(u1_2_3_true, u1_2_3)
Example 3
def test_first_order_moments():
    rng = np.random.RandomState(0)

    n_features = rng.randint(100, 500)
    n_samples = rng.randint(1000, 2000)
    doc_word_mtx = rng.randint(0, 10, size=(n_samples, n_features))

    word_cnts = doc_word_mtx.sum(axis=1).astype('float')
    result = (doc_word_mtx / word_cnts[:, np.newaxis]).sum(axis=0)
    result /= n_samples
    m1, ignored_cnt = first_order_moments(doc_word_mtx, min_words=0)
    assert_equal(0, ignored_cnt)
    assert_array_almost_equal(result, m1)
Example 4
def test_first_order_moments_with_ignored_count():
    rng = np.random.RandomState(0)

    n_features = 10
    n_samples = rng.randint(1000, 2000)
    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features))

    word_cnts = doc_word_mtx.sum(axis=1).astype('float')
    min_count = int(word_cnts.min() + 1)
    mask = (word_cnts >= min_count)

    result = (doc_word_mtx / word_cnts[:, np.newaxis])[mask, :].sum(axis=0)
    result /= mask.sum()
    m1, ignored_cnt = first_order_moments(doc_word_mtx, min_words=min_count)
    assert_greater(ignored_cnt, 0)
    assert_equal(mask.sum(), n_samples - ignored_cnt)
    assert_array_almost_equal(result, m1)

    # sparse matrix should return same result
    m1_2, ignored_cnt_2 = first_order_moments(
        sp.csr_matrix(doc_word_mtx), min_words=min_count)
    assert_equal(ignored_cnt, ignored_cnt_2)
    assert_array_almost_equal(m1, m1_2)
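Taken together, these two tests pin down the behaviour of first_order_moments: normalize each document's word counts to frequencies, drop documents with fewer than min_words total words, average the frequency vectors of the remaining documents, and also return how many documents were dropped. A minimal dense-only reference matching exactly what the tests assert (the name below is illustrative; the real routine also accepts sparse input) might look like:

import numpy as np

def first_order_moments_reference(doc_word_mtx, min_words):
    # dense-only sketch of the behaviour asserted by the two tests above
    doc_word_mtx = np.asarray(doc_word_mtx, dtype=float)
    word_cnts = doc_word_mtx.sum(axis=1)

    # ignore documents with fewer than `min_words` words in total
    mask = word_cnts >= min_words
    ignored_cnt = int((~mask).sum())

    # average the per-document word-frequency vectors of the kept documents
    freqs = doc_word_mtx[mask] / word_cnts[mask][:, np.newaxis]
    m1 = freqs.sum(axis=0) / mask.sum()
    return m1, ignored_cnt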
Example 5
def test_second_order_moments():
    # compare M2 built directly against the eigenvalues and
    # eigenvectors produced by the optimized method
    rng = np.random.RandomState(100)

    n_features = 500
    n_components = 50
    min_count = 3
    alpha0 = 10.
    n_samples = rng.randint(100, 150)
    doc_word_mtx = rng.randint(0, 3,
                               size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # create M2 directly
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    # decompose the directly-built M2 with a truncated SVD (the "true" result)
    m2_vecs_true, m2_vals_true, m2_vec_t = sp.linalg.svds(
        m2, k=n_components, which='LM', return_singular_vectors=True)

    # M2 is symmetric, so its left and right singular vectors coincide
    assert_array_almost_equal(m2_vecs_true, m2_vec_t.T)

    assert_true(np.all(m2_vals_true > 0.))
    # create M2 eigenvalues & eigenvectors with the optimized method
    m2_vals, m2_vecs = second_order_moments(n_components, e2, m1, alpha0)

    # make sure all eigenvalues are greater than 0.
    assert_true(np.all(m2_vals > 0.))
    assert_equal(m2_vals.shape[0], n_components)
    assert_equal(m2_vecs.shape[0], n_features)
    assert_equal(m2_vecs.shape[1], n_components)

    m2_reconstruct_true = np.dot(np.dot(m2_vecs_true, np.diag(m2_vals_true)),
                                 m2_vecs_true.T)
    m2_reconstruct = np.dot(np.dot(m2_vecs, np.diag(m2_vals)), m2_vecs.T)

    # compare reconstructed version
    assert_array_almost_equal(m2_reconstruct_true, m2_reconstruct, decimal=4)

    # compare original M2 with reconstructed version
    assert_array_almost_equal(m2, m2_reconstruct, decimal=4)
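The assertions above also spell out the contract of second_order_moments: it must return n_components eigenvalue/eigenvector pairs whose reconstruction matches M2 = (alpha0 + 1) * E2 - alpha0 * m1 m1^T built directly from the co-occurrence expectation and the first-order moment. A dense sketch of that contract follows (the function name is illustrative; the library routine presumably avoids materializing M2 as a dense n_features x n_features matrix):

import numpy as np
import scipy.sparse as sp

def second_order_moments_dense(n_components, e2, m1, alpha0):
    # dense sketch of the contract checked by test_second_order_moments
    e2_dense = e2.toarray() if sp.issparse(e2) else np.asarray(e2)

    # M2 = (alpha0 + 1) * E2 - alpha0 * outer(m1, m1)
    m2 = (alpha0 + 1.) * e2_dense - alpha0 * np.outer(m1, m1)

    # top-k eigenpairs of the symmetric M2 (largest magnitude, as eigsh does)
    vals, vecs = sp.linalg.eigsh(m2, k=n_components)
    return vals, vecs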