Beispiel #1
0
def test_whitening_tensor_e2_m1():
    # Check that whitening_tensor_e2_m1 returns the same values as the
    # reference helper _compute_e2_m1_directly on random count data.
    random_state = np.random.RandomState(12)

    vocab_size = 300
    rank = 25
    min_words = 3
    alpha0 = 10.
    n_docs = random_state.randint(100, 150)
    raw_counts = random_state.randint(0, 3, size=(n_docs, vocab_size))
    doc_word_mtx = sp.csr_matrix(raw_counts.astype('float'))

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_words)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_words)

    # Build a dense M2: scale E2 by (alpha0 + 1), then subtract the
    # broadcast product of (alpha0 * m1) with m1 taken as a column.
    scaled_m1 = alpha0 * m1
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= scaled_m1 * m1[:, np.newaxis]

    # Whitening matrix from the `rank` eigenpairs that eigsh returns for M2.
    eig_vals, eig_vecs = sp.linalg.eigsh(m2, k=rank)
    W = whitening(eig_vals, eig_vecs)

    # Value under test, computed from the whitened first moment.
    whitened_m1 = np.dot(W.T, m1)
    actual = whitening_tensor_e2_m1(whitened_m1, alpha0)

    # Reference value computed from the raw document-word matrix.
    expected = _compute_e2_m1_directly(doc_word_mtx, W, whitened_m1)
    assert_array_almost_equal(expected, actual)
Beispiel #2
0
def test_whitening():
    # The matrix returned by whitening() must have shape
    # (n_features, n_components) and satisfy W.T * M2 * W == identity.
    random_state = np.random.RandomState(1)

    vocab_size = 500
    rank = 50
    min_words = 3
    alpha0 = 10.
    n_docs = random_state.randint(100, 150)
    raw_counts = random_state.randint(0, 3, size=(n_docs, vocab_size))
    doc_word_mtx = sp.csr_matrix(raw_counts.astype('float'))

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_words)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_words)

    # Build a dense M2: scale E2 by (alpha0 + 1), then subtract the
    # broadcast product of (alpha0 * m1) with m1 taken as a column.
    scaled_m1 = alpha0 * m1
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= scaled_m1 * m1[:, np.newaxis]

    # Whitening matrix from the `rank` eigenpairs that eigsh returns for M2.
    eig_vals, eig_vecs = sp.linalg.eigsh(m2, k=rank)
    W = whitening(eig_vals, eig_vecs)

    # One row per feature, one column per component.
    n_rows, n_cols = W.shape[0], W.shape[1]
    assert_equal(vocab_size, n_rows)
    assert_equal(rank, n_cols)

    # M2(W, W) should be the identity matrix.
    left_product = np.dot(W.T, m2)
    whitened_m2 = np.dot(left_product, W)
    assert_array_almost_equal(np.eye(rank), whitened_m2)
Beispiel #3
0
def test_cooccurrence_expectation_simple():
    # Hand-computed check of cooccurrence_expectation on two tiny documents.
    doc_word_mtx = np.array([
        [1, 0, 3, 2],
        [4, 1, 5, 0]
    ])

    # outer(d, d) - diag(d) for the first document, d = [1, 0, 3, 2].
    pair_counts_doc0 = np.array([
        [0, 0, 3, 2],
        [0, 0, 0, 0],
        [3, 0, 6, 6],
        [2, 0, 6, 2],
    ])

    # outer(d, d) - diag(d) for the second document, d = [4, 1, 5, 0].
    pair_counts_doc1 = np.array([
        [12, 4, 20, 0],
        [4, 0, 5, 0],
        [20, 5, 20, 0],
        [0, 0, 0, 0],
    ])

    # Each document is scaled by cnt * (cnt - 1), where cnt is its total
    # word count (6 and 10 respectively); the two terms are then averaged.
    norm_doc0 = float(6 * 5)
    norm_doc1 = float(10 * 9)
    expected = (pair_counts_doc0 / norm_doc0) + (pair_counts_doc1 / norm_doc1)
    expected /= 2

    e2, n_ignored = cooccurrence_expectation(doc_word_mtx, min_words=3)

    # Both documents have at least 3 words, so none should be skipped.
    assert_equal(n_ignored, 0)
    assert_array_almost_equal(expected, e2.toarray())
Beispiel #4
0
def test_cooccurrence_expectation():
    # Compare cooccurrence_expectation against a straightforward per-document
    # reference computation on random count data.
    rng = np.random.RandomState(0)

    n_features = 100
    n_samples = rng.randint(100, 200)
    doc_word_mtx = rng.randint(0, 3, size=(n_samples, n_features)).astype('float')

    word_cnts = doc_word_mtx.sum(axis=1).astype('float')
    # Threshold one above the shortest document, so that at least one
    # document falls below it and has to be ignored.
    min_count = int(word_cnts.min() + 1)
    mask = (word_cnts >= min_count)

    # Reference: average over the kept documents of
    # (outer(d, d) - diag(d)) / (cnt * (cnt - 1)).
    result = np.zeros((n_features, n_features))
    # Iterate rows and counts in lockstep; this avoids the Python-2-only
    # ``xrange`` and works unchanged on Python 2 and 3.
    for doc_i, cnt in zip(doc_word_mtx, word_cnts):
        if cnt < min_count:
            continue
        result_i = (doc_i * doc_i[:, np.newaxis]) - np.diag(doc_i)
        result_i /= cnt * (cnt - 1)
        result += result_i
    result /= mask.sum()

    e2, ignored_cnt = cooccurrence_expectation(
        doc_word_mtx, min_words=min_count)

    e2_dense = e2.toarray()
    assert_greater(ignored_cnt, 0)
    assert_equal(mask.sum(), n_samples - ignored_cnt)
    assert_array_almost_equal(result, e2_dense)
    # cooccurrence should be symmetric
    assert_array_almost_equal(result, e2_dense.T)
    assert_true(np.all(e2_dense >= 0.))
Beispiel #5
0
def test_second_order_moments():
    # Compare the eigenvalues/eigenvectors returned by second_order_moments
    # against a reference decomposition and against a dense M2.
    random_state = np.random.RandomState(100)

    vocab_size = 500
    rank = 50
    min_words = 3
    alpha0 = 10.
    n_docs = random_state.randint(100, 150)
    raw_counts = random_state.randint(0, 3, size=(n_docs, vocab_size))
    doc_word_mtx = sp.csr_matrix(raw_counts.astype('float'))

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_words)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_words)

    # Build a dense M2: scale E2 by (alpha0 + 1), then subtract the
    # broadcast product of (alpha0 * m1) with m1 taken as a column.
    scaled_m1 = alpha0 * m1
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= scaled_m1 * m1[:, np.newaxis]

    # NOTE(review): the eigsh result for M2 is replaced right away by the
    # SVD of E2 (not M2) on the next statement, so the "reference" values
    # used below come from E2. Confirm this is intended.
    ref_vals, ref_vecs = sp.linalg.eigsh(m2, k=rank)
    ref_vecs, ref_vals, ref_vecs_t = sp.linalg.svds(
        e2, k=rank, which='LM', return_singular_vectors=True)

    # Left and right singular vectors are expected to coincide.
    assert_array_almost_equal(ref_vecs, ref_vecs_t.T)

    assert_true(np.all(ref_vals > 0.))

    # Eigenvalues and eigenvectors of M2 from the optimized method.
    m2_vals, m2_vecs = second_order_moments(rank, e2, m1, alpha0)

    # All returned eigenvalues must be strictly positive, and the shapes
    # must be (rank,) and (vocab_size, rank).
    assert_true(np.all(m2_vals > 0.))
    assert_equal(m2_vals.shape[0], rank)
    assert_equal(m2_vecs.shape[0], vocab_size)
    assert_equal(m2_vecs.shape[1], rank)

    # Rebuild matrices as V * diag(values) * V.T from both decompositions.
    ref_scaled = np.dot(ref_vecs, np.diag(ref_vals))
    reconstruct_ref = np.dot(ref_scaled, ref_vecs.T)
    opt_scaled = np.dot(m2_vecs, np.diag(m2_vals))
    reconstruct_opt = np.dot(opt_scaled, m2_vecs.T)

    # Compare the two reconstructed versions.
    assert_array_almost_equal(reconstruct_ref, reconstruct_opt, decimal=4)

    # Compare the original M2 with the optimized reconstruction.
    assert_array_almost_equal(m2, reconstruct_opt, decimal=4)