Example 1
    def transform(self, X, y=None):
        """Transform the data X into the topic space of the fitted ensemble model.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            Corpus to be embedded into topic space

        y: Ignored

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents X into the topic space.
        """

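        # validate the input, then convert it to the COO format that plsa_refit expects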
        X = check_array(X, accept_sparse="csr")

        if not issparse(X):
            X = coo_matrix(X)
        else:
            X = X.tocoo()

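        # hold the fitted ensemble topics fixed and run EM only over the
        # per-document topic weights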
        result = plsa_refit(
            X,
            self.components_,
            n_iter=50,
            n_iter_per_test=5,
            tolerance=0.001,
            random_state=self.random_state,
        )

        return result
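A minimal usage sketch for the method above (assumptions: enstop's EnsembleTopicModel class with an n_components parameter, and scikit-learn's CountVectorizer to build the bag-of-words matrix; documents and variable names are illustrative, not from the source):

from sklearn.feature_extraction.text import CountVectorizer
from enstop import EnsembleTopicModel  # assumed import path

docs = ["the cat sat on the mat", "dogs chase cats", "stocks fell sharply"]
X = CountVectorizer().fit_transform(docs)  # sparse matrix of shape (n_docs, n_words)

model = EnsembleTopicModel(n_components=2).fit(X)
embedding = model.transform(X)  # array of shape (n_docs, n_topics)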
Example 2
def ensemble_fit(
    X,
    estimated_n_topics=10,
    model="plsa",
    init="random",
    min_samples=3,
    min_cluster_size=4,
    n_starts=16,
    n_jobs=8,
    parallelism="dask",
    topic_combination="hellinger_umap",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-16,
    lift_factor=1,
    beta_loss=1,
    alpha=0.0,
    solver="mu",
    random_state=None,
):
    """Generate a set of stable topics by using an ensemble of topic models and then clustering
    the results and generating representative topics for each cluster. The generate a set of
    document vectors based on the selected stable topics.

    Parameters
    ----------
    X: array or sparse matrix of shape (n_docs, n_words)
        The bag-of-words matrix for the corpus to train on.

    estimated_n_topics: int (optional, default=10)
        The estimated number of topics. Note that the final number of topics produced can differ
        from this value, and may be more or less than the provided value. Instead this value
        provides the algorithm with a suggestion of the approximate number of topics to use.

    model: string (optional, default="plsa")
        The topic modeling method to use (either "plsa" or "nmf")

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    min_samples: int (optional, default=3)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=4)
        The min_cluster_size parameter to use for HDBSCAN clustering.

    n_starts: int (optional, default=16)
        The number of bootstrap sampled topic models to run -- the size of the ensemble.

    n_jobs: int (optional, default=8)
        The number of parallel jobs to run at a time.

    parallelism: string (optional, default="dask")
        The parallelism model to use. Should be one of "dask" or "joblib".

    topic_combination: string (optional, default="hellinger_umap")
        The method of combining ensemble topics into a set of stable topics. Should be one of:
            * ``"hellinger_umap"``
            * ``"hellinger"``
            * ``"kl_divergence"``

    n_iter: int (optional, default=100)
        The maximum number of iterations of EM to perform.

    n_iter_per_test: int (optional, default=10)
        The number of iterations between tests for
        relative improvement in log-likelihood.

    tolerance: float (optional, default=0.001)
        The threshold of relative improvement in
        log-likelihood required to continue iterations.

    e_step_thresh: float (optional, default=1e-16)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below this threshold then a zero is written for P(z|w,d).

    lift_factor: int (optional, default=1)
        Importance factor to apply to lift -- if high lift values are important to
        you, then larger lift factors will be beneficial.

    beta_loss: float or string (optional, default=1)
        The beta loss to use if using NMF for topic modeling. The default of 1 is
        equivalent to "kullback-leibler".

    alpha: float (optional, default=0.0)
        The alpha parameter defining regularization if using NMF for topic modeling.

    solver: string, (optional, default="mu")
        The choice of solver if using NMF for topic modeling. Should be either "cd" or "mu".

    random_state: int, RandomState instance or None (optional, default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    doc_vectors, stable_topics: arrays of shape (n_docs, M) and (M, n_words)
        The vectors giving the probability of topics for each document, and the stable topics
        produced by the ensemble, where M is the number of stable topics found.
    """

    X = check_array(X, accept_sparse="csr")

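    # ensemble_of_topics consumes a COO-format sparse matrix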
    if issparse(X):
        X_coo = X.tocoo()
    else:
        X_coo = coo_matrix(X)

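    # Step 1: fit an ensemble of bootstrapped topic models, run in parallel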
    all_topics = ensemble_of_topics(
        X_coo,
        estimated_n_topics,
        model,
        n_jobs,
        n_starts,
        parallelism,
        init=init,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
        lift_factor=1,  # lift is applied once, below, to the final stable topics
        beta_loss=beta_loss,
        alpha=alpha,
        solver=solver,
        random_state=random_state,
    )

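    # Step 2: look up the requested method for combining ensemble topics into stable topics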
    if topic_combination in _topic_combiner:
        cluster_topics = _topic_combiner[topic_combination]
    else:
        raise ValueError("topic_combination must be one of {}".format(
            tuple(_topic_combiner.keys())))

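    # cluster the pooled topics (HDBSCAN) and take a representative topic per cluster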
    stable_topics = cluster_topics(all_topics, min_samples, min_cluster_size)

    if lift_factor != 1:
        stable_topics **= lift_factor
        # re-normalize rows to sum to 1 after applying lift (assumes an in-place
        # l1 normalize such as enstop.utils.normalize, not
        # sklearn.preprocessing.normalize, which returns a copy)
        normalize(stable_topics, axis=1)

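    # Step 3: with the stable topics held fixed, infer the per-document topic weights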
    if model == "plsa":
        doc_vectors = plsa_refit(
            X,
            stable_topics,
            e_step_thresh=e_step_thresh,
            random_state=random_state,
        )
    elif model == "nmf":
        doc_vectors, _, _ = non_negative_factorization(
            X,
            H=stable_topics,
            n_components=stable_topics.shape[0],
            update_H=False,
            beta_loss=beta_loss,
            alpha=alpha,
            solver=solver,
        )
    else:
        raise ValueError('Model must be one of "plsa" or "nmf"')

    return doc_vectors, stable_topics
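A hedged usage sketch for ensemble_fit (assumption: the function is importable from the enstop package; the corpus and parameter values are illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from enstop import ensemble_fit  # assumed import path

docs = ["the cat sat on the mat", "dogs chase cats", "stocks fell sharply"]
X = CountVectorizer().fit_transform(docs)

doc_vectors, stable_topics = ensemble_fit(
    X, estimated_n_topics=2, n_starts=4, n_jobs=2, parallelism="joblib"
)
print(doc_vectors.shape)    # (n_docs, M), M = number of stable topics found
print(stable_topics.shape)  # (M, n_words)

Here parallelism="joblib" is chosen because, per the docstring above, it is the alternative to "dask" and avoids needing a dask cluster for a small example.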