def fit_analyze_nmf(self):
    '''
    fit NMF
    print out stats
    print out summary.html
    print out model.html
    '''
    t0 = time.time()  # time it
    self.W, self.H, nmf_model = run_nmf(self.X2, self.kw_nmf)
    self.nmf_model = nmf_model
    t1 = time.time()
    print("finished in %4.4f min for %s" % ((t1 - t0) / 60, 'run_nmf'))
    print("w", self.W.shape, "h", self.H.shape)
    W_t = self.W.T
    print(self.W.shape, W_t.shape)
    print(len(W_t[0, :]), W_t[0, :10])
    print("range for W: (%.2f - %.2f); range for H: (%.2f - %.2f)"
          % (np.min(self.W), np.max(self.W), np.min(self.H), np.max(self.H)))
    # n_top_terms: module-level setting for how many terms to report per topic
    self.topic_terms = get_top_topics_terms(
        self.vectorizer, self.H, k_top_words=n_top_terms)
    self.print_topic_results_html()
    self.plot_hist_weight_best_topic_per_article()
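# get_top_topics_terms is not shown in this excerpt. Below is a minimal
# sketch of what such a helper could look like, assuming H has shape
# (n_topics, n_terms) and the vectorizer exposes get_feature_names_out()
# (scikit-learn >= 1.0); the name and return shape are hypothetical.
import numpy as np

def get_top_topics_terms_sketch(vectorizer, H, k_top_words=15):
    """Return {topic_index: [top terms]} for each NMF topic in H."""
    terms = np.asarray(vectorizer.get_feature_names_out())
    topic_terms = {}
    for topic_idx, weights in enumerate(H):
        top = np.argsort(weights)[::-1][:k_top_words]  # largest weights first
        topic_terms[topic_idx] = terms[top].tolist()
    return topic_terms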
def test_run_nmf_nokw():
    print('\nfunction: %s' % inspect.stack()[0][3])
    nmf_max_iter = 6000
    model_name = 'run3_1'
    kw_nmf = {'n_components': 2, 'max_iter': nmf_max_iter}  # for reference; not passed below
    X = np.array([random.random() for i in range(20)]).reshape((4, 5))
    print(X)
    W, H, nmf = run_nmf(X)  # no kw_nmf: exercise the defaults
    print(W)
    n.assert_equal(W.shape[1], 2)  # assumes run_nmf defaults to 2 components
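# run_nmf itself is not part of this excerpt. Assuming it wraps
# sklearn.decomposition.NMF and returns (W, H, model) as the test above
# expects, a minimal sketch could look like this (the name is suffixed to
# mark it as an illustration, not the project's actual implementation):
from sklearn.decomposition import NMF

def run_nmf_sketch(X, kw_nmf=None):
    kw = kw_nmf if kw_nmf is not None else {'n_components': 2}
    model = NMF(**kw)
    W = model.fit_transform(X)  # shape (n_samples, n_components)
    H = model.components_       # shape (n_components, n_features)
    return W, H, model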
def nmf(
    data: Union[MultimodalData, UnimodalData],
    n_components: int = 20,
    features: str = "highly_variable_features",
    space: str = "log",
    init: str = "nndsvdar",
    algo: str = "halsvar",
    mode: str = "batch",
    tol: float = 1e-4,
    use_gpu: bool = False,
    alpha_W: float = 0.0,
    l1_ratio_W: float = 0.0,
    alpha_H: float = 0.0,
    l1_ratio_H: float = 0.0,
    fp_precision: str = "float",
    n_jobs: int = -1,
    random_state: int = 0,
) -> None:
    """Perform Nonnegative Matrix Factorization (NMF) on the data using the Frobenius norm.

    Steps include feature selection, L2 normalization, NMF, and L2 normalization of the resulting coordinates.

    The calculation uses the `nmf-torch <https://github.com/lilab-bcb/nmf-torch>`_ package.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``20``
        Number of components to compute.

    features: ``str``, optional, default: ``"highly_variable_features"``
        Keyword in ``data.var`` to specify features used for NMF.

    space: ``str``, optional, default: ``"log"``
        Choose from ``log`` and ``expression``. ``log`` works on log-transformed expression space; ``expression`` works on the original expression space (normalized by total UMIs).

    init: ``str``, optional, default: ``"nndsvdar"``
        Method to initialize NMF. Options are ``random``, ``nndsvd``, ``nndsvda`` and ``nndsvdar``.

    algo: ``str``, optional, default: ``"halsvar"``
        Choose from ``mu`` (Multiplicative Update), ``hals`` (Hierarchical Alternating Least Squares), ``halsvar`` (a HALS variant that mimics ``bpp`` and can sometimes achieve better convergence) and ``bpp`` (alternating non-negative least squares with the Block Principal Pivoting method).

    mode: ``str``, optional, default: ``"batch"``
        Learning mode. Choose from ``batch`` and ``online``. Notice that ``online`` only works when ``beta=2.0``. For other beta losses, it falls back to the ``batch`` method.

    tol: ``float``, optional, default: ``1e-4``
        The tolerance used for the convergence check.

    use_gpu: ``bool``, optional, default: ``False``
        If ``True``, use the GPU if available. Otherwise, use the CPU only.

    alpha_W: ``float``, optional, default: ``0.0``
        A numeric scale factor which multiplies the regularization terms related to W. If zero or negative, no regularization regarding W is considered.

    l1_ratio_W: ``float``, optional, default: ``0.0``
        The ratio of the L1 penalty on W, which must be between 0 and 1. The ratio of the L2 penalty on W is thus (1 - l1_ratio_W).

    alpha_H: ``float``, optional, default: ``0.0``
        A numeric scale factor which multiplies the regularization terms related to H. If zero or negative, no regularization regarding H is considered.

    l1_ratio_H: ``float``, optional, default: ``0.0``
        The ratio of the L1 penalty on H, which must be between 0 and 1. The ratio of the L2 penalty on H is thus (1 - l1_ratio_H).

    fp_precision: ``str``, optional, default: ``"float"``
        The numeric precision of the results. Choose from ``float`` and ``double``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. -1 refers to using all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed to be set for reproducing results.

    Returns
    -------
    ``None``.

    Update ``data.obsm``:
        * ``data.obsm["X_nmf"]``: Scaled NMF coordinates of shape ``(n_cells, n_components)``. Each column is rescaled to unit L2 norm.
        * ``data.obsm["H"]``: The coordinate factor matrix of shape ``(n_cells, n_components)``.

    Update ``data.uns``:
        * ``data.uns["W"]``: The feature factor matrix of shape ``(n_HVFs, n_components)``.
        * ``data.uns["nmf_err"]``: The NMF loss.
        * ``data.uns["nmf_features"]``: Record the features used to perform NMF analysis.

    Examples
    --------
    >>> pg.nmf(data)
    """
    X = _select_and_scale_features(data, features=features, space=space)

    try:
        from nmf import run_nmf
    except ImportError as e:
        import sys

        logger.error(f"{e}\nNeed NMF-Torch! Try 'pip install nmf-torch'.")
        sys.exit(-1)

    H, W, err = run_nmf(
        X,
        n_components=n_components,
        init=init,
        algo=algo,
        mode=mode,
        tol=tol,
        n_jobs=eff_n_jobs(n_jobs),
        random_state=random_state,
        use_gpu=use_gpu,
        alpha_W=alpha_W,
        l1_ratio_W=l1_ratio_W,
        alpha_H=alpha_H,
        l1_ratio_H=l1_ratio_H,
        fp_precision=fp_precision,
    )

    data.uns["nmf_features"] = features  # record which features were used
    data.uns["W"] = np.ascontiguousarray(
        W.T, dtype=np.float32
    )  # cannot be varm because the number of features differs
    data.uns["nmf_err"] = err

    data.obsm["H"] = np.ascontiguousarray(H, dtype=np.float32)
    H = data.obsm["H"]
    data.obsm["X_nmf"] = H / np.linalg.norm(H, axis=0)
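# A short usage sketch for the function above. The input file name is a
# placeholder, and it assumes the usual pegasus preprocessing (QC,
# normalization, log transform, highly_variable_features) has already
# been run on `data`.
import numpy as np
import pegasus as pg

data = pg.read_input("example.zarr.zip")  # hypothetical input
pg.nmf(data, n_components=20)

print(data.obsm["H"].shape)  # (n_cells, n_components) cell loadings
print(data.uns["W"].shape)   # (n_HVFs, n_components) feature loadings
print(data.uns["nmf_err"])   # final NMF loss
# Each column of X_nmf has unit L2 norm by construction:
print(np.linalg.norm(data.obsm["X_nmf"], axis=0))  # approximately all ones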
def run_test(filename, algo, mode, k, n_jobs, fp=None, init='nndsvdar',
             loss='frobenius', tol=1e-4, max_iter=200, random_state=0,
             alpha=0.0, l1_ratio=0.0, chunk_size=5000):
    X = np.load(filename)
    print(X.shape)

    # Map the beta-loss name to its beta value.
    if loss == 'kullback-leibler':
        beta = 1
    elif loss == 'frobenius':
        beta = 2
    elif loss == 'itakura-saito':
        beta = 0
    else:
        raise ValueError("Beta loss not supported!")

    # if method == 'sklearn mu':
    #     model = sd.NMF(n_components=k, init=init, beta_loss=loss, tol=tol,
    #                    max_iter=max_iter, random_state=random_state,
    #                    solver='mu', alpha=alpha, l1_ratio=l1_ratio)
    #     with threadpool_limits(limits=n_jobs):
    #         ts_start = time.time()
    #         W1 = model.fit_transform(X)
    #         ts_end = time.time()
    #     H1 = model.components_
    #     err = beta_loss(torch.tensor(X), torch.tensor(W1 @ H1),
    #                     torch.tensor(W1), torch.tensor(H1),
    #                     l1_reg_H=alpha * l1_ratio, l2_reg_H=alpha * (1 - l1_ratio),
    #                     l1_reg_W=alpha * l1_ratio, l2_reg_W=alpha * (1 - l1_ratio),
    #                     beta=beta, epsilon=EPSILON)
    #     print(f"H has {np.sum(W1!=0)} non-zero elements, W has {np.sum(H1!=0)} "
    #           f"non-zero elements. Iterations: {model.n_iter_}.")
    # elif method == 'sklearn cd':
    #     model = sd.NMF(n_components=k, init=init, beta_loss=loss, tol=tol,
    #                    max_iter=max_iter, random_state=random_state,
    #                    solver='cd', alpha=alpha, l1_ratio=l1_ratio)
    #     with threadpool_limits(limits=n_jobs):
    #         ts_start = time.time()
    #         W1 = model.fit_transform(X)
    #         ts_end = time.time()
    #     H1 = model.components_
    #     err = beta_loss(torch.tensor(X), torch.tensor(W1 @ H1),
    #                     torch.tensor(W1), torch.tensor(H1),
    #                     l1_reg_H=alpha * l1_ratio, l2_reg_H=alpha * (1 - l1_ratio),
    #                     l1_reg_W=alpha * l1_ratio, l2_reg_W=alpha * (1 - l1_ratio),
    #                     beta=beta, epsilon=EPSILON)
    #     err_double = beta_loss(torch.tensor(X), torch.tensor(W1 @ H1),
    #                            torch.tensor(W1), torch.tensor(H1),
    #                            l1_reg_H=alpha * l1_ratio, l2_reg_H=alpha * (1 - l1_ratio),
    #                            l1_reg_W=alpha * l1_ratio, l2_reg_W=alpha * (1 - l1_ratio),
    #                            beta=beta, epsilon=EPSILON, dtype='double')
    #     print(f"H has {np.sum(W1!=0)} non-zero elements, W has {np.sum(H1!=0)} "
    #           f"non-zero elements. Iterations: {model.n_iter_}.")

    ts_start = time.time()
    H, W, err = run_nmf(X, k, init=init, beta_loss=loss, algo=algo, mode=mode,
                        tol=tol, n_jobs=n_jobs, random_state=random_state,
                        alpha_W=alpha, l1_ratio_W=l1_ratio,
                        alpha_H=alpha, l1_ratio_H=l1_ratio,
                        fp_precision='float')
    ts_end = time.time()

    # Recompute the loss independently to confirm the error reported by run_nmf.
    err_confirm = beta_loss(torch.tensor(X), torch.tensor(H @ W),
                            torch.tensor(H), torch.tensor(W),
                            beta=beta, epsilon=EPSILON,
                            l1_reg_H=alpha * l1_ratio, l2_reg_H=alpha * (1 - l1_ratio),
                            l1_reg_W=alpha * l1_ratio, l2_reg_W=alpha * (1 - l1_ratio))

    print(f"{algo} {mode} takes {ts_end - ts_start} s, with error {err} "
          f"({err_confirm} confirmed).")

    if fp is not None:
        fp.write(f"{algo} {mode},{ts_end - ts_start} s,{err}\n")
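# Hypothetical driver for the benchmark above: sweep every algorithm/mode
# pair on one dense matrix and append timings to a CSV. The file names, k,
# and n_jobs values are placeholders.
if __name__ == '__main__':
    with open('nmf_benchmark.csv', 'w') as fp:
        for algo in ('mu', 'hals', 'halsvar', 'bpp'):
            for mode in ('batch', 'online'):
                run_test('dense_matrix.npy', algo, mode, k=20, n_jobs=8, fp=fp)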