def test_non_negative_factorization_consistency():
    # Test that the function is called in the same way, either directly
    # or through the NMF class
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    for init in ['random', 'nndsvd']:
        for solver in ('cd', 'mu'):
            W_nmf, H, _ = non_negative_factorization(
                A, init=init, solver=solver, random_state=1, tol=1e-2)
            W_nmf_2, _, _ = non_negative_factorization(
                A, H=H, update_H=False, init=init, solver=solver,
                random_state=1, tol=1e-2)

            model_class = NMF(init=init, solver=solver, random_state=1,
                              tol=1e-2)
            W_cls = model_class.fit_transform(A)
            W_cls_2 = model_class.transform(A)

            assert_array_almost_equal(W_nmf, W_cls, decimal=10)
            assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
def run(self):
    from numpy import array, reshape
    from sklearn.decomposition import non_negative_factorization
    from sklearn.ensemble import GradientBoostingRegressor

    ########################## Learning #################################
    W, H, _ = non_negative_factorization(
        X=self.train.data,
        n_components=self.train.data.shape[1],
        regularization='transformation',
        alpha=2 * self.alpha + self.beta,
        l1_ratio=self.beta / (2 * self.alpha + self.beta))
    Y = reshape(self.train.occupancy, (-1,))
    gblsr = GradientBoostingRegressor(loss='ls', n_estimators=500).fit(W, Y)
    #####################################################################

    ######################### Prediction ################################
    # Hold the learned dictionary H fixed at prediction time; without
    # update_H=False the passed H would be ignored and re-estimated.
    W, _, _ = non_negative_factorization(
        X=self.test.data,
        H=H,
        update_H=False,
        n_components=self.test.data.shape[1],
        regularization='transformation',
        alpha=2 * self.alpha + self.beta,
        l1_ratio=self.beta / (2 * self.alpha + self.beta))
    Y = gblsr.predict(W)
    Y[Y < 0] = 0
    predict_occupancy = array(Y)
    #####################################################################
    return reshape(predict_occupancy, (-1, 1))
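# A hedged note on run() above: if the model's own penalty is written as
# a * ||W||_F^2 + b * ||W||_1 with a = self.alpha and b = self.beta, and
# assuming the pre-1.0 sklearn convention
#     alpha * l1_ratio * ||W||_1 + 0.5 * alpha * (1 - l1_ratio) * ||W||_F^2,
# then alpha = 2a + b and l1_ratio = b / (2a + b) recover exactly those
# coefficients. A minimal sanity check of the algebra (illustrative only):
def _check_reg_mapping(a=1.5, b=0.5):
    alpha = 2 * a + b
    l1_ratio = b / (2 * a + b)
    assert abs(alpha * l1_ratio - b) < 1e-12              # L1 coefficient == b
    assert abs(0.5 * alpha * (1 - l1_ratio) - a) < 1e-12  # L2 coefficient == a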
def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
    if k == -1:
        k = self.num_cluster
    X_t = self.pre_processing()
    X = X_t.T
    fixed_W = pd.get_dummies(self.labels)
    fixed_W_t = fixed_W.T

    # Interpret W as H (transpose): the code can only fix H while
    # optimizing W, so we switch the matrices (invert their roles).
    learned_H_t, fixed_W_t_same, n_iter = decomp.non_negative_factorization(
        X_t.astype(np.float64), n_components=k, init='custom',
        random_state=0, update_H=False, H=fixed_W_t.astype(np.float64),
        alpha=alpha, l1_ratio=l1, max_iter=max_iter, shuffle=True,
        solver='cd', tol=rel_err, verbose=0)
    assert np.all(fixed_W_t == fixed_W_t_same)
    # self.cluster_labels = np.argmax(fixed_W_t_same.T, axis=1)

    # Now take the learned H, fix it, and learn W to see how well it worked.
    learned_W, learned_H_fix, n_iter = decomp.non_negative_factorization(
        X.astype(np.float64), n_components=k, init='custom',
        random_state=0, update_H=False, H=learned_H_t.T,
        alpha=alpha, l1_ratio=l1, max_iter=max_iter, shuffle=True,
        solver='cd', tol=rel_err, verbose=0)
    assert np.all(learned_H_t.T == learned_H_fix)
    self.cluster_labels = np.argmax(learned_W, axis=1)

    if np.any(np.isnan(learned_H_t)):
        raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, '
                        'data={3}x{4})'.format(alpha, k, l1,
                                               X.shape[0], X.shape[1]))
    if np.any(np.isnan(fixed_W_t)):
        raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, '
                        'data={3}x{4})'.format(alpha, k, l1,
                                               X.shape[0], X.shape[1]))
    # self.print_reconstruction_error(X, fixed_W_t, learned_H_t)
    self.dictionary = learned_H_t
    self.data_matrix = fixed_W_t
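# The transpose trick used in apply() above, distilled into a standalone
# hedged sketch: sklearn's non_negative_factorization can only hold H
# fixed (update_H=False), so to hold W fixed in X ~ W @ H we factorize
# X.T instead, since X.T ~ H.T @ W.T puts the fixed factor in the H slot.
# The helper name is illustrative, not a library API.
import numpy as np
from sklearn.decomposition import non_negative_factorization


def solve_H_with_fixed_W(X, W_fixed, **kwargs):
    # X: (m, n), W_fixed: (m, k); returns H: (k, n) with W_fixed held fixed
    Ht, _, _ = non_negative_factorization(
        X.T, H=np.ascontiguousarray(W_fixed.T, dtype=X.dtype),
        update_H=False, n_components=W_fixed.shape[1], **kwargs)
    return Ht.T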
def norm_nmf(data, k, init_weights=None, init_means=None, normalize_w=True,
             return_cost=True, write_progress_file=None, **kwargs):
    """
    Args:
        data (array): dense or sparse array with shape (genes, cells)
        k (int): number of cell types
        init_weights (array, optional): Initial value for W. Default: None
        init_means (array, optional): Initial value for M. Default: None
        normalize_w (bool): True if W should be normalized (so that each
            column sums to 1)
        **kwargs: misc arguments to NMF

    Returns:
        Two matrices M of shape (genes, k) and W of shape (k, cells)
    """
    data = cell_normalize(data)
    init = None
    if init_weights is not None or init_means is not None:
        init = 'custom'
        # sklearn can only hold H fixed (update_H=False), so to fix one
        # factor we pass it as H, transposing the problem when needed.
        if init_weights is None:
            init_weights_, _, n_iter = non_negative_factorization(
                data.T, n_components=k, init='custom',
                update_H=False, H=init_means.T)
            init_weights = init_weights_.T
        elif init_means is None:
            init_means, _, n_iter = non_negative_factorization(
                data, n_components=k, init='custom',
                update_H=False, H=init_weights)
        init_means = init_means.copy(order='C')
        init_weights = init_weights.copy(order='C')
    nmf = NMF(k, init=init, **kwargs)
    if write_progress_file is not None:
        progress = open(write_progress_file, 'w')
        progress.write(str(0))
        progress.close()
    M = nmf.fit_transform(data, W=init_means, H=init_weights)
    W = nmf.components_
    if normalize_w:
        W = W / W.sum(0)
    if return_cost:
        cost = 0
        if sparse.issparse(data):
            ws = sparse.csr_matrix(M)
            hs = sparse.csr_matrix(W)
            cost = 0.5 * ((data - ws.dot(hs)).power(2)).sum()
        else:
            cost = 0.5 * ((data - M.dot(W))**2).sum()
        return M, W, cost
    else:
        return M, W
def test_nmf_custom_init_dtype_error():
    # Check that an error is raised if custom H and/or W don't have the
    # same dtype as X.
    rng = np.random.RandomState(0)
    X = rng.random_sample((20, 15))
    H = rng.random_sample((15, 15)).astype(np.float32)
    W = rng.random_sample((20, 15))

    with pytest.raises(TypeError, match="should have the same dtype as X"):
        NMF(init='custom').fit(X, H=H, W=W)

    with pytest.raises(TypeError, match="should have the same dtype as X"):
        non_negative_factorization(X, H=H, update_H=False)
def test_non_negative_factorization_consistency():
    # Test that the function is called in the same way, either directly
    # or through the NMF class
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    W_nmf, H, _ = non_negative_factorization(A, random_state=1, tol=1e-2)
    W_nmf_2, _, _ = non_negative_factorization(A, H=H, update_H=False,
                                               random_state=1, tol=1e-2)

    model_class = NMF(random_state=1, tol=1e-2)
    W_cls = model_class.fit_transform(A)
    W_cls_2 = model_class.transform(A)

    assert_array_almost_equal(W_nmf, W_cls, decimal=10)
    assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
def test_init_default_deprecation():
    # Test FutureWarning on init default
    msg = (r"The 'init' value, when 'init=None' and "
           r"n_components is less than n_samples and "
           r"n_features, will be changed from 'nndsvd' to "
           r"'nndsvda' in 1.1 \(renaming of 0.26\).")
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(6, 5))
    with pytest.warns(FutureWarning, match=msg):
        nmf._initialize_nmf(A, 3)
    with pytest.warns(FutureWarning, match=msg):
        NMF().fit(A)
    with pytest.warns(FutureWarning, match=msg):
        non_negative_factorization(A)
def run_NMF():
    true_labels = labels
    adjs = [
        adjacency_matrix,
        adjacency_matrix_weights,
        adjacency_matrix_similarity,
    ]
    names = [
        'Adjacency Matrix: no weights\n',
        'Adjacency Matrix: likes-dislikes weights\n',
        'Adjacency Matrix: similarity weights\n',
    ]
    for adj, name in zip(adjs, names):
        W, H, _ = non_negative_factorization(adj, n_components=2,
                                             init='random')
        W = pd.DataFrame(W)
        H = pd.DataFrame(H)
        # Assign each node to the component with the larger activation.
        clusters = [
            1 if (H.iloc[0, i] < H.iloc[1, i]) else 0
            for i in range(H.shape[1])
        ]
        clusters = pd.Series(clusters)
        predicted_labels = list(clusters)
        print(name)
        print(classification_report(true_labels, predicted_labels))
        print('--------------------------------------------\n')
def h_to_a(self, h_comp, w, adj):
    # TODO construct kernel from random walk theory
    # TODO random walk is fast but least accurate model among graph completion algos
    # TODO check the literature for online nmf (OMF)
    # transform graph to kernel to parameterize
    # fNRI factorization of edges using the softmax
    # make it variational: make 10 projections to generate 10 different
    # permutations of the adjacency
    # adj will be all ones, assuming a fully connected graph at init
    # use nmf to sparsify adj, keeping plausible connections and reducing density
    adj_mats = []
    for k in range(10):
        w, h, n_iter = sk_dec.non_negative_factorization(
            X=adj, H=h_comp[k], W=w, init='custom',
            n_components=adj.shape[0])
        adj_mats.append(tf.matmul(w, h))
    # tf tensors are immutable, so stack the projections instead of
    # assigning into a preallocated tf.zeros tensor
    adj_mat_vec = tf.stack(adj_mats)

    # edges = gumbel_softmax(logits, tau=args.temp, hard=args.hard)
    # prob = my_softmax(logits, -1)
    # loss_kl = kl_categorical_uniform(prob, args.num_atoms, edge_types)
    return adj_mat_vec
def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
    if k == -1:
        k = self.num_cluster
    X = self.pre_processing()
    fixed_W = pd.get_dummies(self.labels)
    fixed_W_t = fixed_W.T

    # Interpret W as H (transpose): the code can only fix H while
    # optimizing W, so we switch the matrices (invert their roles).
    learned_H_t, fixed_W_t_same, n_iter = decomp.non_negative_factorization(
        X.astype(np.float64), n_components=k, init='custom', random_state=0,
        update_H=False, H=fixed_W_t.astype(np.float64), alpha=alpha,
        l1_ratio=l1, max_iter=max_iter, shuffle=True, solver='cd',
        tol=rel_err, verbose=0)

    init_W = fixed_W_t_same.T
    init_H = learned_H_t.T
    nmf = decomp.NMF(alpha=alpha, init='custom', l1_ratio=l1,
                     max_iter=max_iter, n_components=k, random_state=0,
                     shuffle=True, solver='cd', tol=rel_err, verbose=0)
    W = nmf.fit_transform(X.T, W=init_W, H=init_H)
    H = nmf.components_
    self.cluster_labels = np.argmax(W, axis=1)

    if np.any(np.isnan(H)):
        raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, '
                        'data={3}x{4})'.format(alpha, k, l1,
                                               X.shape[0], X.shape[1]))
    if np.any(np.isnan(W)):
        raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, '
                        'data={3}x{4})'.format(alpha, k, l1,
                                               X.shape[0], X.shape[1]))
    # self.print_reconstruction_error(X, W, H)
    self.dictionary = H.T
    self.data_matrix = W.T
def compute_exposure(self, X, P):
    '''
    Compute exposures from given signatures.

    Initialize an NMF object with components equal to the signatures,
    then run the equivalent of nmf.transform(X).
    '''
    K, M = P.shape
    # A hacky way to call sklearn's nmf function: fix H = P and only
    # update W (the exposures).
    nmf = self._new_NMF_model(K)
    E, P_, n_iter_ = non_negative_factorization(
        X=X, W=None, H=P, n_components=K,
        init=nmf.init, update_H=False, solver=nmf.solver,
        beta_loss=nmf.beta_loss, tol=nmf.tol, max_iter=nmf.max_iter,
        alpha=nmf.alpha, l1_ratio=nmf.l1_ratio, regularization='both',
        random_state=nmf.random_state, verbose=nmf.verbose,
        shuffle=nmf.shuffle)
    assert np.allclose(P_, P)
    # TODO: add new feature to change the norm to
    # KL or Itakura-Saito divergence
    err = np.linalg.norm(X - E.dot(P), 'fro')
    return E, err
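# Self-contained illustration of the core call in compute_exposure():
# with the signatures P passed as H and update_H=False, only the
# exposures E are optimized and H comes back unchanged. Synthetic data;
# no claims about the surrounding class.
import numpy as np
from sklearn.decomposition import non_negative_factorization

rng = np.random.RandomState(0)
P = rng.rand(4, 96)                      # 4 signatures over 96 categories
X = rng.rand(50, 4).dot(P)               # 50 samples mixing the signatures
E, P_out, _ = non_negative_factorization(
    X, H=P, update_H=False, n_components=4, max_iter=500, random_state=0)
assert np.allclose(P_out, P)             # H is returned untouched
print(np.linalg.norm(X - E.dot(P), 'fro'))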
def thresholding(X, rank=0, W_ini=[], H_ini=[]):
    '''
    Thresholding algorithm from "Binary Matrix Factorization with
    Applications" by Zhang et al.
    '''
    if rank == 0 and (W_ini == [] or H_ini == []):
        print("You have to provide initializations or a rank")
        return
    if W_ini == [] or H_ini == []:
        W_ini, H_ini, n_iter = non_negative_factorization(
            X, n_components=rank, solver='mu')
    W_ini, H_ini = utils.normalization(W_ini, H_ini)
    II = np.max(H_ini)
    testh = np.linspace(0, II, int(II / 0.01))
    ll = np.max(W_ini)
    testw = np.linspace(0, ll, int(ll / 0.01))
    # Grid-search the pair of thresholds minimizing the Frobenius
    # reconstruction error of the binarized factorization.
    temp = 10**10
    for i in range(len(testh)):
        newH = signstar(H_ini, testh[i])
        for j in range(len(testw)):
            newW = signstar(W_ini, testw[j])
            X_res = np.dot(newW, newH.T)
            X_res[X_res > 1] = 1
            newtemp = utils.frobenius(X, X_res)
            if newtemp < temp:
                temp = newtemp
                h = testh[i]
                w = testw[j]
    return (w, h)
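# signstar() is not defined in this snippet. In Zhang et al.'s
# thresholding scheme it binarizes a matrix at a cut-off; a plausible
# stand-in (an assumption, not the authors' code; the orientation of H
# depends on utils.normalization, which is also not shown) is:
import numpy as np


def signstar(M, threshold):
    # 1 where an entry exceeds the threshold, 0 elsewhere
    return (np.asarray(M) > threshold).astype(float)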
def test_nmf_decreasing():
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            if solver != 'mu' and beta_loss != 2:
                # not implemented
                continue
            W, H = W0.copy(), H0.copy()
            previous_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                loss = nmf._beta_divergence(X, W, H, beta_loss)
                if previous_loss is not None:
                    assert_greater(previous_loss, loss)
                previous_loss = loss
def delete_word_from_topic(self, topic_to_delete_from, word_to_delete,
                           top_words_in_topic):
    self.W = np.copy(self.nmf_matrix)
    H = np.copy(self.nmf_components)
    index_of_word_to_remove = self.features.index(
        word_to_delete.replace(' ', '_'))
    # Zero out the word in the chosen topic, then refit W with H fixed.
    H[topic_to_delete_from][index_of_word_to_remove] = 0
    self.W, self.H, n_iter = non_negative_factorization(
        self.vectorized_out, n_components=self.nr_of_topics, init='custom',
        random_state=0, update_H=False, H=H)
    self.nmf_matrix = np.copy(self.W)
    self.nmf_components = np.copy(self.H)
    self.doc_topic_dists = self.nmf_matrix / self.nmf_matrix.sum(
        axis=1)[:, None]
    self.doc_topic_dists = np.nan_to_num(self.doc_topic_dists,
                                         nan=1 / self.nr_of_topics)
    self.top_words_map = self._top_words_map()
    self.doc_topic_matrix_df = self._doc_topic_matrix_df()
def runCustom(sig, mix):
    print("mix: ", mix.shape)
    print("sig: ", sig.shape)
    # The roles of W and H are reversed here: sklearn's NMF only lets us
    # fix H, whereas we want to fix W, so we swap the roles and transpose.
    # H is now the signature matrix; W is now the mix matrix.
    print("running NMF with %d components" % sig.shape[1])
    W, H, n_iter = non_negative_factorization(
        mix.T, n_components=sig.shape[1], init='custom', solver='mu',
        beta_loss='kullback-leibler', max_iter=10000000, tol=1e-13,
        random_state=123456, update_H=False, H=sig.T)
    # normalize so each row sums to 1
    W = W / W.sum(axis=1, keepdims=1)
    return W.T, H.T
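# Hypothetical call of runCustom() above on synthetic data: `sig` holds
# one column per signature, `mix` one column per mixture, and the first
# returned matrix gives per-signature proportions summing to 1. All
# sizes below are illustrative.
import numpy as np

rng = np.random.RandomState(0)
sig = rng.rand(200, 5)                       # 200 features x 5 signatures
props = rng.dirichlet(np.ones(5), size=20)   # 20 mixtures over 5 signatures
mix = sig.dot(props.T)                       # 200 features x 20 mixtures
# W_t, H_t = runCustom(sig, mix)             # W_t: (5, 20) proportions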
def learn_representation(audio: np.ndarray,
                         win_length: int = 1024,
                         n_components: int = 100,
                         max_iter: int = 400,
                         init: str = None,
                         W: np.ndarray = None,
                         H: np.ndarray = None):
    mags, phases = get_magphase(audio, win_length=win_length)
    components, weights, n_iters = non_negative_factorization(
        mags, init=init, W=W, H=H, n_components=n_components,
        beta_loss="kullback-leibler", solver="mu", l1_ratio=1.0,
        alpha=0.1, max_iter=max_iter)
    # model = DictionaryLearning(n_components=n_components,
    #                            tol=1e-1,
    #                            fit_algorithm="cd",
    #                            transform_algorithm="lasso_cd",
    #                            positive_code=True,
    #                            positive_dict=True,
    #                            max_iter=max_iter)
    # weights = model.fit_transform(mags.T).T
    # components = model.components_.T
    # n_iters = model.n_iter_
    return components, weights, n_iters
def nmf_pooling(A, levels, binarize=False):
    S_list = []
    A_list = []
    S_prev = sp.eye(A.shape[0], dtype=np.float32)
    for i in range(max(levels) + 1):
        A = sp.csr_matrix(A, dtype=np.float32)
        if i in levels:
            A_list.append(A)
        n_nodes = A.shape[0]
        n_comp = np.maximum(n_nodes // 2, 2)
        _, H, _ = non_negative_factorization(
            A, n_components=n_comp, init='random', random_state=0,
            max_iter=10)
        H = sp.csr_matrix(H, dtype=np.float32)
        A = (H.dot(A)).dot(H.T)

        # binarize H (hard cluster assignment)
        if binarize:
            H = H.toarray()
            S_i = np.zeros_like(H)
            S_i[np.arange(len(H)), H.argmax(1)] = 1
            S_i = sp.csr_matrix(S_i, dtype=np.float32)
        else:
            S_i = H

        # save the right pooling matrices
        S_prev = S_i.dot(S_prev)
        if i + 1 == max(levels) + 1:
            S_list.append(S_prev)
        elif i + 1 in levels:
            S_list.append(S_prev)
            S_prev = sp.eye(A.shape[0], dtype=np.float32)
    return S_list, A_list
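# Illustrative use of nmf_pooling() above on a tiny random graph; the
# `levels` list selects which coarsening depths to keep. Graph size and
# density are arbitrary choices for the sketch.
import numpy as np
import scipy.sparse as sp

rng = np.random.RandomState(0)
A_dense = (rng.rand(16, 16) < 0.2).astype(np.float32)
A_sym = sp.csr_matrix(np.maximum(A_dense, A_dense.T))
S_list, A_list = nmf_pooling(A_sym, levels=[0, 1, 2])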
def fit_transform_split(self, topics, fixed_H, column):
    self.nr_of_topics = topics
    # Duplicate the column of the topic being split so W gains one extra
    # topic column.
    repeats = np.ones(len(self.nmf_matrix.T), dtype=int)
    repeats[column] = 2
    W = np.repeat(self.nmf_matrix.T, repeats, axis=0)
    self.W = W.T
    self.W = np.ascontiguousarray(self.W, dtype=np.float64)
    # self.W, self.H, n_iter = non_negative_factorization(
    #     self.vectorized_out, n_components=topics, init='custom',
    #     random_state=0, update_H=True, H=fixed_H, W=self.W)
    self.W, self.H, n_iter = non_negative_factorization(
        self.vectorized_out, n_components=topics, init='custom',
        random_state=0, update_H=False, H=fixed_H)
    self.nmf_matrix = np.copy(self.W)
    self.nmf_components = np.copy(self.H)
    self.doc_topic_dists = self.nmf_matrix / self.nmf_matrix.sum(
        axis=1)[:, None]
    self.doc_topic_dists = np.nan_to_num(self.doc_topic_dists,
                                         nan=1 / self.nr_of_topics)
    self.top_words_map = self._top_words_map()
    self.doc_topic_matrix_df = self._doc_topic_matrix_df()
    return
def lsa_compute(word_doc_matrix, n_topics: int, method='SVD',
                max_nmf_iter=10):
    """
    Computes LSA on word_doc_matrix, using factorization functions from
    `sklearn`.

    If `method` is "SVD" (default), it will use `randomized_svd`.
    If `method` is "NMF", it will use `non_negative_factorization`.

    Args:
        word_doc_matrix (matrix): matrix to factorize
        n_topics (int): number of "topics" to extract
        method (str): factorization method
        max_nmf_iter (int, optional): Sets the max number of iterations
            when calling `non_negative_factorization`. Default is 10.

    Returns:
        tuple of word_topic_matrix, topic_doc_matrix
    """
    logging.info(f"Computing LSA using {method} method...")
    if method == "SVD":
        U, _, VT = randomized_svd(word_doc_matrix, n_topics)
        return U, VT
    elif method == "NMF":
        W, H, _ = non_negative_factorization(
            word_doc_matrix,
            n_components=n_topics,
            max_iter=max_nmf_iter,
            random_state=0,
        )
        return W, H
    else:
        raise ValueError(f"Invalid value for method argument: {method}")
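# Example calls for lsa_compute() above (commented out since they depend
# on the surrounding module's imports); the count matrix is synthetic.
import numpy as np

rng = np.random.RandomState(0)
counts = rng.poisson(1.0, size=(500, 40)).astype(float)
# W, H = lsa_compute(counts, n_topics=5, method="NMF", max_nmf_iter=50)
# U, VT = lsa_compute(counts, n_topics=5)  # default SVD path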
def decompose_with_dict(spec, dic, max_iter=6000, alpha=0.5):
    """
    Get H given V and a fixed dictionary W.

    Example:
    >>> V = 10 * np.random.rand(100, 200)
    >>> W, H = decompose(V, k=50)
    >>> H2 = decompose_with_dict(V, W)

    :param spec: magnitude spectrogram V
    :param dic: fixed dictionary W
    :param max_iter: maximum number of iterations
    :param alpha: regularization strength
    :return: activation matrix H
    """
    k = dic.shape[1]
    # sklearn can only fix H, so transpose the problem: V.T ~ H.T @ W.T
    _act, _, n_iter = non_negative_factorization(
        np.transpose(spec), H=np.transpose(dic), update_H=False,
        alpha=alpha, l1_ratio=1, n_components=k, solver='cd',
        max_iter=max_iter)
    return np.transpose(_act)
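# Round-trip sketch of the pattern behind decompose()/decompose_with_dict():
# learn W and H once, then re-derive activations against the fixed
# dictionary via the transposed problem. Standalone, public sklearn API
# only; sizes and init choice are arbitrary.
import numpy as np
from sklearn.decomposition import non_negative_factorization

rng = np.random.RandomState(0)
V = 10 * rng.rand(40, 60)
W, H, _ = non_negative_factorization(V, n_components=8, init='nndsvda',
                                     max_iter=500, random_state=0)
H2t, _, _ = non_negative_factorization(
    V.T, H=np.ascontiguousarray(W.T), update_H=False, n_components=8,
    max_iter=500, random_state=0)
# H2t.T plays the role of H; both reconstructions should be comparable
err_joint = np.linalg.norm(V - W.dot(H))
err_refit = np.linalg.norm(V - W.dot(H2t.T))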
def test_nmf_multiplicative_update_sparse():
    # Compare sparse and dense input in multiplicative update NMF
    # Also test continuity of the results with respect to beta_loss parameter
    n_samples = 20
    n_features = 10
    n_components = 5
    alpha = 0.1
    l1_ratio = 0.5
    n_iter = 20

    # initialization
    rng = np.random.mtrand.RandomState(1337)
    X = rng.randn(n_samples, n_features)
    X = np.abs(X)
    X_csr = sp.csr_matrix(X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        # Reference with dense array X
        W, H = W0.copy(), H0.copy()
        W1, H1, _ = non_negative_factorization(
            X, W, H, n_components, init='custom', update_H=True,
            solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha,
            l1_ratio=l1_ratio, regularization='both', random_state=42)

        # Compare with sparse X
        W, H = W0.copy(), H0.copy()
        W2, H2, _ = non_negative_factorization(
            X_csr, W, H, n_components, init='custom', update_H=True,
            solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha,
            l1_ratio=l1_ratio, regularization='both', random_state=42)

        assert_array_almost_equal(W1, W2, decimal=7)
        assert_array_almost_equal(H1, H2, decimal=7)

        # Compare with almost the same beta_loss, since some values have a
        # specific behavior, but the results should be continuous w.r.t.
        # beta_loss
        beta_loss -= 1.e-5
        W, H = W0.copy(), H0.copy()
        W3, H3, _ = non_negative_factorization(
            X_csr, W, H, n_components, init='custom', update_H=True,
            solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha,
            l1_ratio=l1_ratio, regularization='both', random_state=42)

        assert_array_almost_equal(W1, W3, decimal=4)
        assert_array_almost_equal(H1, H3, decimal=4)
def test_non_negative_factorization_consistency(init, solver, alpha_W, alpha_H):
    # Test that the function is called in the same way, either directly
    # or through the NMF class
    max_iter = 500
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    W_nmf, H, _ = non_negative_factorization(
        A,
        init=init,
        solver=solver,
        max_iter=max_iter,
        alpha_W=alpha_W,
        alpha_H=alpha_H,
        random_state=1,
        tol=1e-2,
    )
    W_nmf_2, H, _ = non_negative_factorization(
        A,
        H=H,
        update_H=False,
        init=init,
        solver=solver,
        max_iter=max_iter,
        alpha_W=alpha_W,
        alpha_H=alpha_H,
        random_state=1,
        tol=1e-2,
    )

    model_class = NMF(
        init=init,
        solver=solver,
        max_iter=max_iter,
        alpha_W=alpha_W,
        alpha_H=alpha_H,
        random_state=1,
        tol=1e-2,
    )
    W_cls = model_class.fit_transform(A)
    W_cls_2 = model_class.transform(A)

    assert_allclose(W_nmf, W_cls)
    assert_allclose(W_nmf_2, W_cls_2)
def _assert_nmf_no_nan(X, beta_loss):
    W, H, _ = non_negative_factorization(
        X, n_components=n_components, solver='mu', beta_loss=beta_loss,
        random_state=0, max_iter=1000)
    assert_false(np.any(np.isnan(W)))
    assert_false(np.any(np.isnan(H)))
def _assert_nmf_no_nan(X, beta_loss):
    W, H, _ = non_negative_factorization(
        X,
        init="random",
        n_components=n_components,
        solver="mu",
        beta_loss=beta_loss,
        random_state=0,
        max_iter=1000,
    )
    assert not np.any(np.isnan(W))
    assert not np.any(np.isnan(H))
def test_nmf_decreasing(solver):
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.0

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init="random",
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1.0, 2.0, 2.5):
        if solver != "mu" and beta_loss != 2:
            # not implemented
            continue
        W, H = W0.copy(), H0.copy()
        previous_loss = None
        for _ in range(30):
            # one more iteration starting from the previous results
            W, H, _ = non_negative_factorization(
                X,
                W,
                H,
                beta_loss=beta_loss,
                init="custom",
                n_components=n_components,
                max_iter=1,
                alpha_W=alpha,
                solver=solver,
                tol=tol,
                l1_ratio=l1_ratio,
                verbose=0,
                random_state=0,
                update_H=True,
            )

            loss = (
                nmf._beta_divergence(X, W, H, beta_loss)
                + alpha * l1_ratio * n_features * W.sum()
                + alpha * l1_ratio * n_samples * H.sum()
                + alpha * (1 - l1_ratio) * n_features * (W**2).sum()
                + alpha * (1 - l1_ratio) * n_samples * (H**2).sum()
            )
            if previous_loss is not None:
                assert previous_loss > loss
            previous_loss = loss
def _nmf(self, X, nmf_kwargs):
    """
    Parameters
    ----------
    X : pandas.DataFrame,
        Normalized counts DataFrame to be factorized.

    nmf_kwargs : dict,
        Arguments to be passed to ``non_negative_factorization``
    """
    (usages, spectra, niter) = non_negative_factorization(X, **nmf_kwargs)
    return (spectra, usages)
def nmf():
    product_tensor = np.load('data/product_tensor.npy')
    X = product_tensor.sum(1)
    W, H, n_iter = non_negative_factorization(
        X, n_components=10, max_iter=500, regularization='both',
        init='random', solver='mu', beta_loss='kullback-leibler')
    print(n_iter, W.shape, H.shape)
    np.save('predictions/user_embed.npy', W)
    np.save('predictions/prod_embed.npy', H.T)
def nmf_sklearn(V, k, W=None, H=None, beta_loss="frobenius", verbose=False):
    """
    NMF with sklearn.
    """
    f = V.shape[0]
    t = V.shape[1]
    if W is None:
        W = np.random.uniform(size=(f, k))
    if H is None:
        H = np.random.uniform(size=(k, t))
    W, H, _ = non_negative_factorization(V, W, H, k, init="custom",
                                         solver="mu", beta_loss=beta_loss,
                                         verbose=verbose)
    return W, H
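# Usage sketch for nmf_sklearn() above on synthetic data; W @ H is a
# rank-k approximation, so the relative error is nonzero by design.
import numpy as np

rng = np.random.RandomState(0)
V = np.abs(rng.randn(64, 128))
W, H = nmf_sklearn(V, 8)
rel_err = np.linalg.norm(V - W.dot(H)) / np.linalg.norm(V)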
def split_once_sklearn(
    X,
    subset,
    W_parent,
    random_state: mtrand.RandomState,
    dtype: Union[np.float32, np.float64],
    tol,
    maxiter,
    init,
):
    m = X.shape[0]
    if len(subset) <= 3:
        cluster_subset = np.ones(len(subset), dtype=dtype)
        W_buffer_one = np.zeros((m, 2), dtype=dtype)
        H_buffer_one = np.zeros((2, len(subset)), dtype=dtype)
        priority_one = -1
    else:
        term_subset = np.where(np.sum(X[:, subset], axis=1) != 0)[0]
        X_subset = X[term_subset, :][:, subset]
        W = random_state.rand(len(term_subset), 2)
        H = random_state.rand(2, len(subset))
        W, H, n_iter_ = non_negative_factorization(
            X=X_subset,
            W=W,
            H=H,
            n_components=2,
            init=init,
            update_H=True,
            solver="cd",
            beta_loss=2,
            tol=tol,
            max_iter=maxiter,
            alpha=0,
            l1_ratio=0,
            regularization="both",
            random_state=random_state,
            verbose=0,
            shuffle=False,
        )
        cluster_subset = np.argmax(H, axis=0)
        W_buffer_one = np.zeros((m, 2), dtype=dtype)
        W_buffer_one[term_subset, :] = W
        H_buffer_one = H
        if len(np.unique(cluster_subset)) > 1:
            priority_one = compute_priority(W_parent, W_buffer_one,
                                            dtype=dtype)
        else:
            priority_one = -1
    return cluster_subset, W_buffer_one, H_buffer_one, priority_one
def nmf_init(data, clusters, k, init='enhanced'):
    """
    Runs enhanced NMF initialization from clusterings (Gong 2013).

    There are 3 options for init:
        enhanced - uses EIn-NMF from Gong 2013
        basic - uses means for W, assigns H such that the chosen cluster
            for a given cell has value 0.75 and all others have
            0.25/(k-1).
        nmf - uses means for W, and assigns H using the NMF objective
            while holding W constant.
    """
    init_w = np.zeros((data.shape[0], k))
    if sparse.issparse(data):
        for i in range(k):
            if data[:, clusters == i].shape[1] == 0:
                point = np.random.randint(0, data.shape[1])
                init_w[:, i] = data[:, point].toarray().flatten()
            else:
                init_w[:, i] = np.array(
                    data[:, clusters == i].mean(1)).flatten()
    else:
        for i in range(k):
            if data[:, clusters == i].shape[1] == 0:
                point = np.random.randint(0, data.shape[1])
                init_w[:, i] = data[:, point].flatten()
            else:
                init_w[:, i] = data[:, clusters == i].mean(1)

    init_h = np.zeros((k, data.shape[1]))
    if init == 'enhanced':
        distances = np.zeros((k, data.shape[1]))
        for i in range(k):
            for j in range(data.shape[1]):
                distances[i, j] = np.sqrt(
                    ((data[:, j] - init_w[:, i])**2).sum())
        for i in range(k):
            for j in range(data.shape[1]):
                init_h[i, j] = 1 / (
                    (distances[:, j] / distances[i, j])**(-2)).sum()
    elif init == 'basic':
        init_h = initialize_from_assignments(clusters, k)
    elif init == 'nmf':
        # sklearn can only fix H, so transpose to hold init_w constant
        init_h_, _, n_iter = non_negative_factorization(
            data.T, n_components=k, init='custom', update_H=False,
            H=init_w.T)
        init_h = init_h_.T
    return init_w, init_h
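# Hypothetical hand-off of nmf_init() output to sklearn's NMF with
# init='custom' (commented out; data, labels, and k are illustrative):
# import numpy as np
# from sklearn.decomposition import NMF
#
# data = np.abs(np.random.RandomState(0).randn(100, 60))   # genes x cells
# clusters = np.random.RandomState(1).randint(0, 4, size=60)
# W0, H0 = nmf_init(data, clusters, k=4, init='enhanced')
# model = NMF(n_components=4, init='custom')
# M = model.fit_transform(data, W=W0, H=H0)   # W0: (genes, k), H0: (k, cells)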
def decompose(spec, k, max_iter=6000, alpha=0.5):
    """
    Basic NMF tool; use it to get W and H.

    Example:
    >>> V = 10 * np.random.rand(100, 3000)
    >>> W, H = decompose(V, 50)

    :param spec: magnitude spectrogram V
    :param k: number of components
    :param max_iter: maximum number of iterations
    :param alpha: regularization strength
    """
    _dic, _act, n_iter = non_negative_factorization(
        spec, n_components=k, solver='cd', alpha=alpha, l1_ratio=1,
        max_iter=max_iter)
    return _dic, _act
def _assert_nmf_no_nan(X, beta_loss):
    W, H, _ = non_negative_factorization(
        X, init='random', n_components=n_components, solver='mu',
        beta_loss=beta_loss, random_state=0, max_iter=1000)
    assert not np.any(np.isnan(W))
    assert not np.any(np.isnan(H))