def test_l1_regularization(solver):
    """L1 regularization should increase the total number of zero entries."""
    n_components = 3
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(6, 5))
    Y = np.abs(rng.randn(5, 4))

    l1_reg = 2.
    reg = CMF(n_components=n_components, solver=solver, l1_reg=l1_reg,
              random_state=42)
    model = CMF(n_components=n_components, solver=solver, l1_reg=0.,
                random_state=42)

    U_reg, V_reg, Z_reg = reg.fit_transform(X, Y)
    U_model, V_model, Z_model = model.fit_transform(X, Y)

    # Count exact zeros in each factor matrix.
    zeros_with_reg = sum(int(np.count_nonzero(M == 0))
                         for M in (U_reg, V_reg, Z_reg))
    zeros_without_reg = sum(int(np.count_nonzero(M == 0))
                            for M in (U_model, V_model, Z_model))

    msg = "solver: {}".format(solver)
    # If one matrix is full of zeros, it might make sense for the other
    # matrices to reduce their number of zeros; therefore we compare the
    # total number of zeros across all three factors.
    assert_greater(zeros_with_reg, zeros_without_reg, msg)
def test_analysis():
    """Smoke test: topic-term printing runs without raising."""
    rng = np.random.mtrand.RandomState(36)
    model = CMF(n_components=2, solver="newton", max_iter=1)
    vectorizer = CountVectorizer()
    docs = ["hello world", "goodbye world", "hello goodbye"]
    term_doc = csr_matrix(vectorizer.fit_transform(docs))
    Y = np.abs(rng.randn(3, 1))
    model.fit_transform(term_doc.T, Y)
    # Exercise both output modes of the analysis helper.
    model.print_topic_terms(vectorizer, importances=False)
    model.print_topic_terms(vectorizer, importances=True)
def test_transform_custom_init():
    """Smoke test: CMF.fit_transform accepts user-supplied factor matrices."""
    rng = np.random.RandomState(0)
    X = np.abs(rng.randn(6, 5))
    Y = np.abs(rng.randn(5, 1))
    n_components = 4

    # Scale the random initial factors to roughly match the data magnitude.
    scale = np.sqrt(X.mean() / n_components)
    U0 = np.abs(scale * rng.randn(6, n_components))
    V0 = np.abs(scale * rng.randn(5, n_components))
    scale = np.sqrt(Y.mean() / n_components)
    Z0 = np.abs(scale * rng.randn(1, n_components))

    estimator = CMF(solver='newton', n_components=n_components,
                    x_init='custom', y_init='custom', random_state=0)
    estimator.fit_transform(X, Y, U=U0, V=V0, Z=Z0)
def test_input_method_compatibility():
    """Smoke test for every pairing of x_init and y_init strategies."""
    rng = np.random.mtrand.RandomState(0)
    X = np.abs(rng.randn(6, 5))
    Y = np.abs(rng.randn(5, 6))
    n_components = 4

    # Custom initial factors, scaled to the data magnitude, used whenever
    # the 'custom' init is part of the combination under test.
    scale = np.sqrt(X.mean() / n_components)
    U0 = np.abs(scale * rng.randn(6, n_components))
    V0 = np.abs(scale * rng.randn(5, n_components))
    scale = np.sqrt(Y.mean() / n_components)
    Z0 = np.abs(scale * rng.randn(6, n_components))

    inits = [None, 'random', 'nndsvd', 'nndsvda', 'nndsvdar', 'custom']
    for x_init, y_init in itertools.product(inits, inits):
        estimator = CMF(n_components=n_components, solver='mu',
                        x_init=x_init, y_init=y_init,
                        random_state=0, max_iter=1)
        estimator.fit_transform(X, Y, U=U0, V=V0, Z=Z0)
def sparse_cmf_benchmark(solver):
    """Benchmark a CMF fit on a large sparse X paired with a dense Y."""
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(2000, 150))
    # Zero out different column patterns in the two halves to induce sparsity.
    X[:1000, 2 * np.arange(10) + 100] = 0
    X[1000:, 2 * np.arange(10)] = 0
    X_sparse = SP(X)
    Y = np.abs(rng.randn(150, 10))
    estimator = CMF(n_components=10, solver=solver, random_state=42,
                    max_iter=10)
    U, V, Z = estimator.fit_transform(X_sparse, Y)
def test_auto_compute_alpha():
    """alpha='auto' should shift emphasis from X toward Y versus alpha=0.5."""
    rng = np.random.mtrand.RandomState(36)
    X = rng.randn(10, 10)
    Y = rng.randn(10, 5)

    common = dict(n_components=2, solver="newton", x_init='svd', y_init='svd',
                  U_non_negative=False, V_non_negative=False,
                  Z_non_negative=False, random_state=0, max_iter=100)
    x_emphasis_model = CMF(alpha=0.5, **common)
    # 'auto' chooses alpha so that weight * number_of_elements is the same
    # constant for both X and Y.
    y_emphasis_model = CMF(alpha="auto", **common)

    U1, V1, Z1 = x_emphasis_model.fit_transform(X, Y)
    U2, V2, Z2 = y_emphasis_model.fit_transform(X, Y)

    # The auto model should reconstruct X worse but Y better.
    assert_greater(np.linalg.norm(np.dot(U2, V2.T) - X),
                   np.linalg.norm(np.dot(U1, V1.T) - X))
    assert_greater(np.linalg.norm(np.dot(V1, Z1.T) - Y),
                   np.linalg.norm(np.dot(V2, Z2.T) - Y))
def test_l2_regularization(solver):
    """L2 regularization should shrink the mean of every factor matrix."""
    n_components = 3
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(6, 5))
    Y = np.abs(rng.randn(5, 4))

    l2_reg = 2.
    reg = CMF(n_components=n_components, solver=solver, l2_reg=l2_reg,
              random_state=42)
    model = CMF(n_components=n_components, solver=solver, l2_reg=0.,
                random_state=42)

    U_reg, V_reg, Z_reg = reg.fit_transform(X, Y)
    U_model, V_model, Z_model = model.fit_transform(X, Y)

    msg = "solver: {}".format(solver)
    # Each regularized factor should have smaller coefficients on average.
    for unregularized, regularized in ((U_model, U_reg),
                                       (V_model, V_reg),
                                       (Z_model, Z_reg)):
        assert_greater(unregularized.mean(), regularized.mean(), msg)
def test_logit_link_optimization():
    """Both relations with logit links should be fit to low reconstruction error."""
    n_components = 5
    rng = np.random.mtrand.RandomState(42)
    # Data squashed through a sigmoid so it lies in (0, 1).
    X = 1 / (1 + np.exp(-rng.randn(6, 5)))
    Y = 1 / (1 + np.exp(-rng.randn(5, 4)))

    model = CMF(n_components=n_components, solver="newton", l2_reg=0.,
                random_state=42, x_link="logit", y_link="logit",
                U_non_negative=False, V_non_negative=False,
                Z_non_negative=False)
    U, V, Z = model.fit_transform(X, Y)
    assert_less(model.reconstruction_err_, 0.1)
def sparse_cmf_with_logits_benchmark(sample_ratio):
    """Benchmark the newton solver on sparse X with sigmoid-valued Y."""
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(2000, 150))
    # Zero out different column patterns in the two halves to induce sparsity.
    X[:1000, 2 * np.arange(10) + 100] = 0
    X[1000:, 2 * np.arange(10)] = 0
    X_sparse = SP(X)
    Y = expit(rng.randn(150, 10))
    estimator = CMF(n_components=10, solver="newton", random_state=42,
                    sg_sample_ratio=sample_ratio, max_iter=10)
    U, V, Z = estimator.fit_transform(X_sparse, Y)
def test_transform_after_fit_no_labels(solver):
    """transform() with Y=None should leave the shared factor V unchanged."""
    rng = np.random.mtrand.RandomState(36)
    X = rng.randn(7, 5)
    Y = rng.randn(5, 3)
    X_new = rng.randn(15, 5)

    model = CMF(n_components=2, solver=solver, x_init='svd', y_init='svd',
                U_non_negative=False, V_non_negative=False,
                Z_non_negative=False, random_state=0, max_iter=100)
    U_ft, V_ft, Z_ft = model.fit_transform(X, Y)
    U_t, V_t, Z_t = model.transform(X_new, None)
    assert_array_equal(V_t, V_ft)
def test_fit_nn_output(solver):
    """The factorization must not produce negative values for any init."""
    base = np.c_[5 * np.ones(5) - np.arange(1, 6),
                 5 * np.ones(5) + np.arange(1, 6)]
    X = base
    Y = base.T
    for init in (None, 'nndsvd', 'nndsvda', 'nndsvdar', 'random'):
        model = CMF(n_components=2, solver=solver, x_init=init, y_init=init,
                    random_state=0)
        U, V, Z = model.fit_transform(X, Y)
        # Every factor matrix must be entirely non-negative.
        assert_false(any((M < 0).any() for M in (U, V, Z)))
def test_logit_link_non_negative_optimization():
    """A logit-linked Y should co-train with a non-negative linear X."""
    n_components = 5
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(6, 5)
    X[X < 0] = 0  # clip to keep X compatible with the non-negativity flags
    Y = 1 / (1 + np.exp(-rng.randn(5, 4)))  # sigmoid-valued targets

    model = CMF(n_components=n_components, solver="newton", l2_reg=0.,
                random_state=42, y_link="logit",
                U_non_negative=True, V_non_negative=True,
                Z_non_negative=False,
                hessian_pertubation=0.2, max_iter=1000)
    U, V, Z = model.fit_transform(X, Y)
    assert_less(model.reconstruction_err_, 0.1)
def test_nonnegative_condition_for_newton_solver():
    """With all non-negativity flags off, every factor gains negative entries."""
    n_components = 3
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(6, 5))
    Y = np.abs(rng.randn(5, 4))

    model = CMF(n_components=n_components, solver="newton", l2_reg=0.,
                random_state=42, U_non_negative=False,
                V_non_negative=False, Z_non_negative=False)
    U, V, Z = model.fit_transform(X, Y)

    # If one value is negative in any matrix, since X and Y are
    # non-negative, all the other matrices will also need negative values.
    for factor in (U, V, Z):
        assert_less(np.min(factor), 0)
def test_stochastic_newton_solver_sparse_input():
    """Stochastic newton (sg_sample_ratio < 1) must match on dense vs sparse input."""
    rng = np.random.mtrand.RandomState(36)
    A = np.abs(rng.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0  # zero out alternating columns
    A_sparse = csr_matrix(A)
    B = np.abs(rng.randn(10, 5))
    B[2 * np.arange(5), :] = 0  # zero out alternating rows
    B_sparse = csr_matrix(B)

    est1 = CMF(n_components=5, solver="newton", x_init='svd', y_init='svd',
               U_non_negative=False, V_non_negative=False,
               Z_non_negative=False, sg_sample_ratio=0.5,
               random_state=0, max_iter=1000)
    est2 = clone(est1)

    dense_factors = est1.fit_transform(A, B)
    sparse_factors = est2.fit_transform(A_sparse, B_sparse)
    for dense_f, sparse_f in zip(dense_factors, sparse_factors):
        assert_array_almost_equal(dense_f, sparse_f)
def test_sparse_input(solver):
    """Sparse matrices must be accepted and give the same factors as dense."""
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0  # zero out alternating columns
    A_sparse = csr_matrix(A)
    B = np.abs(rng.randn(10, 5))
    B[2 * np.arange(5), :] = 0  # zero out alternating rows
    B_sparse = csr_matrix(B)

    est1 = CMF(solver=solver, n_components=5, x_init='random',
               y_init='random', random_state=0, tol=1e-2)
    est2 = clone(est1)

    dense_factors = est1.fit_transform(A, B)
    sparse_factors = est2.fit_transform(A_sparse, B_sparse)
    for dense_f, sparse_f in zip(dense_factors, sparse_factors):
        assert_array_almost_equal(dense_f, sparse_f)
def dense_cmf_with_logits_benchmark():
    """Benchmark the newton solver on dense non-negative X and Y.

    NOTE(review): despite the name, no logit link is configured here
    (compare sparse_cmf_with_logits_benchmark) — confirm intent.
    """
    rng = np.random.mtrand.RandomState(42)
    X = np.abs(rng.randn(2000, 150))
    Y = np.abs(rng.randn(150, 10))
    estimator = CMF(n_components=10, solver="newton", random_state=42,
                    max_iter=10)
    U, V, Z = estimator.fit_transform(X, Y)