def test_whitening_tensor_e2_m1():
    """Optimized whitened-tensor computation must match the direct version."""
    random_state = np.random.RandomState(12)
    n_features = 300
    n_components = 25
    min_count = 3
    alpha0 = 10.
    n_samples = random_state.randint(100, 150)

    # random bag-of-words corpus, stored sparse like real input
    doc_word_mtx = random_state.randint(
        0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # build M2 directly from E2 and M1
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    m2_vals, m2_vecs = sp.linalg.eigsh(m2, k=n_components)

    # whitening matrix from the truncated eigendecomposition
    W = whitening(m2_vals, m2_vecs)

    # optimized path
    wt_m1 = np.dot(W.T, m1)
    u1_2_3 = whitening_tensor_e2_m1(wt_m1, alpha0)

    # brute-force reference must agree with the optimized result
    u1_2_3_true = _compute_e2_m1_directly(doc_word_mtx, W, wt_m1)
    assert_array_almost_equal(u1_2_3_true, u1_2_3)
def test_whitening():
    """Whitening matrix W must be (n_features, n_components) with W.T M2 W = I."""
    random_state = np.random.RandomState(1)
    n_features = 500
    n_components = 50
    min_count = 3
    alpha0 = 10.
    n_samples = random_state.randint(100, 150)

    # random bag-of-words corpus, stored sparse like real input
    doc_word_mtx = random_state.randint(
        0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # build M2 directly from E2 and M1
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    m2_vals, m2_vecs = sp.linalg.eigsh(m2, k=n_components)

    W = whitening(m2_vals, m2_vecs)

    # shape of the whitening matrix
    assert_equal(n_features, W.shape[0])
    assert_equal(n_components, W.shape[1])

    # whitening property: M2(W, W) should be the identity matrix
    identity = np.dot(np.dot(W.T, m2), W)
    assert_array_almost_equal(np.eye(n_components, n_components), identity)
def test_cooccurrence_expectation_simple():
    """Hand-checked co-occurrence expectation on a tiny two-document corpus."""
    doc_word_mtx = np.array([
        [1, 0, 3, 2],
        [4, 1, 5, 0]
    ])

    # per-document x x^T - diag(x), written out by hand
    pairs_doc1 = np.array([
        [0, 0, 3, 2],
        [0, 0, 0, 0],
        [3, 0, 6, 6],
        [2, 0, 6, 2],
    ])
    pairs_doc2 = np.array([
        [12, 4, 20, 0],
        [4, 0, 5, 0],
        [20, 5, 20, 0],
        [0, 0, 0, 0],
    ])

    # doc 1 has 6 words, doc 2 has 10: normalize each by cnt * (cnt - 1),
    # then average over the two documents
    expected = (pairs_doc1 / float(6 * 5)) + (pairs_doc2 / float(10 * 9))
    expected /= 2

    e2, ignored_cnt = cooccurrence_expectation(doc_word_mtx, min_words=3)
    # both documents are long enough, so none is ignored
    assert_equal(ignored_cnt, 0)
    assert_array_almost_equal(expected, e2.toarray())
def test_cooccurrence_expectation():
    """cooccurrence_expectation must match a naive per-document computation
    and skip documents with fewer than ``min_words`` words."""
    rng = np.random.RandomState(0)
    n_features = 100
    n_samples = rng.randint(100, 200)
    doc_word_mtx = rng.randint(
        0, 3, size=(n_samples, n_features)).astype('float')

    word_cnts = doc_word_mtx.sum(axis=1).astype('float')
    # threshold just above the shortest document so at least one is ignored
    min_count = int(word_cnts.min() + 1)
    mask = (word_cnts >= min_count)

    # naive reference: average of (x x^T - diag(x)) / (cnt * (cnt - 1))
    # over the documents that pass the threshold
    result = np.zeros((n_features, n_features))
    # BUG FIX: `xrange` is Python 2 only (NameError on Python 3); use `range`
    for i in range(n_samples):
        cnt = word_cnts[i]
        if cnt < min_count:
            continue
        doc_i = doc_word_mtx[i, :]
        result_i = (doc_i * doc_i[:, np.newaxis]) - np.diag(doc_i)
        result_i /= cnt * (cnt - 1)
        result += result_i
    result /= mask.sum()

    e2, ignored_cnt = cooccurrence_expectation(
        doc_word_mtx, min_words=min_count)
    e2_dense = e2.toarray()

    assert_greater(ignored_cnt, 0)
    assert_equal(mask.sum(), n_samples - ignored_cnt)
    assert_array_almost_equal(result, e2_dense)
    # cooccurrence should be symmetric
    assert_array_almost_equal(result, e2_dense.T)
    assert_true(np.all(e2_dense >= 0.))
def test_second_order_moments():
    """second_order_moments must reproduce the truncated eigendecomposition
    of the directly-built M2 matrix."""
    rng = np.random.RandomState(100)
    n_features = 500
    n_components = 50
    min_count = 3
    alpha0 = 10.
    n_samples = rng.randint(100, 150)
    doc_word_mtx = rng.randint(
        0, 3, size=(n_samples, n_features)).astype('float')
    doc_word_mtx = sp.csr_matrix(doc_word_mtx)

    m1, _ = first_order_moments(doc_word_mtx, min_words=min_count)
    e2, _ = cooccurrence_expectation(doc_word_mtx, min_words=min_count)

    # create M2 directly: M2 = (alpha0 + 1) * E2 - alpha0 * M1 M1^T
    m2 = (alpha0 + 1.) * e2.toarray()
    m2 -= (alpha0 * m1) * m1[:, np.newaxis]
    # reference decomposition computed straight from the dense M2.
    # BUG FIX: these values were previously overwritten by svds(e2, ...),
    # an SVD of E2 rather than M2, so the "true" reconstruction below was
    # built from the wrong matrix (different scale and missing the rank-1
    # M1 correction).
    m2_vals_true, m2_vecs_true = sp.linalg.eigsh(m2, k=n_components)
    assert_true(np.all(m2_vals_true > 0.))

    # create M2 eigen values & vectors with optimized method
    m2_vals, m2_vecs = second_order_moments(n_components, e2, m1, alpha0)

    # make sure all eigen values are greater than 0.
    assert_true(np.all(m2_vals > 0.))
    assert_equal(m2_vals.shape[0], n_components)
    assert_equal(m2_vecs.shape[0], n_features)
    assert_equal(m2_vecs.shape[1], n_components)

    # reconstruct the rank-k approximation from both decompositions
    m2_reconstruct_true = np.dot(
        np.dot(m2_vecs_true, np.diag(m2_vals_true)), m2_vecs_true.T)
    m2_reconstruct = np.dot(
        np.dot(m2_vecs, np.diag(m2_vals)), m2_vecs.T)

    # compare reconstructed version
    assert_array_almost_equal(
        m2_reconstruct_true, m2_reconstruct, decimal=4)
    # compare original M2 with reconstructed version
    assert_array_almost_equal(m2, m2_reconstruct, decimal=4)