    def transform(self, X, y=None):
        """Transform the data X into the topic space of the fitted ensemble model.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            Corpus to be embedded into topic space

        y: Ignored

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents X into the topic space.
        """
        X = check_array(X, accept_sparse="csr")

        if not issparse(X):
            X = coo_matrix(X)
        else:
            X = X.tocoo()

        result = plsa_refit(
            X,
            self.components_,
            n_iter=50,
            n_iter_per_test=5,
            tolerance=0.001,
            random_state=self.random_state,
        )

        return result
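
# A minimal usage sketch for the transform method above (assumptions: the
# enclosing estimator class name and the variable names here are illustrative,
# not part of this module):
#
#     model = EnsembleTopics().fit(train_bow)   # hypothetical class name
#     embedding = model.transform(new_bow)      # shape (n_new_docs, n_topics)
#
# Each row of `embedding` gives the topic weights of the corresponding
# document with respect to the fitted stable topics.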
def ensemble_fit(
    X,
    estimated_n_topics=10,
    model="plsa",
    init="random",
    min_samples=3,
    min_cluster_size=4,
    n_starts=16,
    n_jobs=8,
    parallelism="dask",
    topic_combination="hellinger_umap",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-16,
    lift_factor=1,
    beta_loss=1,
    alpha=0.0,
    solver="mu",
    random_state=None,
):
    """Generate a set of stable topics by using an ensemble of topic models,
    clustering the results, and generating representative topics for each
    cluster. Then generate a set of document vectors based on the selected
    stable topics.

    Parameters
    ----------
    X: array or sparse matrix of shape (n_docs, n_words)
        The bag-of-words matrix for the corpus to train on.

    estimated_n_topics: int (optional, default=10)
        The estimated number of topics. Note that the final number of topics
        produced can differ from this value, and may be more or less than the
        provided value. Instead this value provides the algorithm with a
        suggestion of the approximate number of topics to use.

    model: string (optional, default="plsa")
        The topic modeling method to use (either "plsa" or "nmf").

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and
        (n_topics, n_words).

    min_samples: int (optional, default=3)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=4)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    n_starts: int (optional, default=16)
        The number of bootstrap sampled topic models to run -- the size of
        the ensemble.

    n_jobs: int (optional, default=8)
        The number of parallel jobs to run at a time.

    parallelism: string (optional, default="dask")
        The parallelism model to use. Should be one of "dask" or "joblib".

    topic_combination: string (optional, default="hellinger_umap")
        The method of combining ensemble topics into a set of stable topics.
        Should be one of:
            * ``"hellinger_umap"``
            * ``"hellinger"``
            * ``"kl_divergence"``

    n_iter: int (optional, default=100)
        The maximum number of iterations of EM to perform.

    n_iter_per_test: int (optional, default=10)
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float (optional, default=0.001)
        The threshold of relative improvement in log-likelihood required to
        continue iterations.

    e_step_thresh: float (optional, default=1e-16)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step
        falls below threshold then write a zero for P(z|w,d).

    lift_factor: int (optional, default=1)
        Importance factor to apply to lift -- if high lift values are
        important to you then larger lift factors will be beneficial.

    beta_loss: float or string (optional, default=1, i.e. 'kullback-leibler')
        The beta loss to use if using NMF for topic modeling.

    alpha: float (optional, default=0.0)
        The alpha parameter defining regularization if using NMF for topic
        modeling.

    solver: string (optional, default="mu")
        The choice of solver if using NMF for topic modeling. Should be
        either "cd" or "mu".

    random_state: int, RandomState instance or None (optional, default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    doc_vectors, stable_topics: arrays of shape (n_docs, M) and (M, n_words)
        The vectors giving the probability of topics for each document, and
        the stable topics produced by the ensemble.
""" X = check_array(X, accept_sparse="csr") if issparse(X): X_coo = X.tocoo() else: X_coo = coo_matrix(X) all_topics = ensemble_of_topics( X_coo, estimated_n_topics, model, n_jobs, n_starts, parallelism, init=init, n_iter=n_iter, n_iter_per_test=n_iter_per_test, tolerance=tolerance, e_step_thresh=e_step_thresh, lift_factor=1, beta_loss=beta_loss, alpha=alpha, solver=solver, random_state=random_state, ) if topic_combination in _topic_combiner: cluster_topics = _topic_combiner[topic_combination] else: raise ValueError("topic_combination must be one of {}".format( tuple(_topic_combiner.keys()))) stable_topics = cluster_topics(all_topics, min_samples, min_cluster_size) if lift_factor != 1: stable_topics **= lift_factor normalize(stable_topics, axis=1) if model == "plsa": doc_vectors = plsa_refit( X, stable_topics, e_step_thresh=e_step_thresh, random_state=random_state, ) elif model == "nmf": doc_vectors, _, _ = non_negative_factorization( X, H=stable_topics, n_components=stable_topics.shape[0], update_H=False, beta_loss=beta_loss, alpha=alpha, solver=solver, ) else: raise ValueError('Model must be one of "plsa" or "nmf"') return doc_vectors, stable_topics