Example #1

from sklearn.decomposition import NMF
from sklearn.utils import check_random_state
from enstop.utils import normalize  # in-place row normalizer (module path assumed)

def nmf_topics(X, k, **kwargs):
    """Perform a boostrap sample from a corpus of documents and fit the sample using
    NMF to give a set of topic vectors, normalized such that the(z,w) entry of the
    returned array is the probability P(w|z) of word w occuring given the zth topic.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The bag of words representation of the corpus of documents.

    k: int
        The number of topics to generate.

    kwargs:
        Further keyword arguments that can be passed on to the ``NMF`` class.
        Note that ``bootstrap`` and ``random_state`` are also read by this
        function itself. Possibilities include:
            * ``init``
            * ``beta_loss``
            * ``alpha``
            * ``solver``

    Returns
    -------
    topics: array of shape (k, n_words)
        The topics generated from the bootstrap sample.
    """
    A = X.tocsr()
    if kwargs.get("bootstrap", True):
        rng = check_random_state(kwargs.get("random_state", None))
        bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0])
        B = A[bootstrap_sample_indices]
    else:
        B = A
    nmf = NMF(
        n_components=k,
        init=kwargs.get("init", "nndsvd"),
        beta_loss=kwargs.get("beta_loss", 1),
        alpha=kwargs.get("alpha", 0.0),
        solver=kwargs.get("solver", "mu"),
        random_state=kwargs.get("random_state", None),
    ).fit(B)
    topics = nmf.components_.copy()
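    # rows normalized in place so each topic is a probability distribution over words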
    normalize(topics, axis=1)
    return topics
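
A minimal usage sketch (assumptions: a toy corpus, sklearn's CountVectorizer, and a
scikit-learn version whose ``NMF`` still accepts the ``alpha`` parameter):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["apples and oranges", "oranges and bananas", "cars and trucks"]
X = CountVectorizer().fit_transform(docs)  # sparse (n_docs, n_words) count matrix
topics = nmf_topics(X, k=2, random_state=42)
print(topics.shape)  # (2, n_words); each row should sum to one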
Example #2

import numpy as np
from sklearn.utils import check_random_state
from enstop.utils import normalize  # in-place row normalizer (module path assumed)
from enstop.plsa import plsa_refit_inner  # numba EM inner loop (module path assumed)

def plsa_refit(
    X,
    topics,
    sample_weight,
    n_iter=50,
    n_iter_per_test=10,
    tolerance=0.005,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Routine for refitting values of P(z|d) given a fixed set of topics (
    i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics
    (given, for example, by an ensemble result).

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    topics: array of shape (n_topics, n_words)
        The fixed topics against which to fit the values of P(z|d).

    sample_weight: array of shape (n_docs,)
        Input document weights.

    n_iter: int (optional, default=50)
        The maximum number of iterations of EM to perform.

    n_iter_per_test: int (optional, default=10)
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float (optional, default=0.005)
        The threshold of relative improvement in log-likelihood required to
        continue iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    random_state: int, RandomState instance or None (optional, default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d: array of shape (n_docs, n_topics)
        The resulting model values of P(z|d) fit against the fixed topics.

    """
    A = X.tocoo().astype(np.float32)
    k = topics.shape[0]

    rng = check_random_state(random_state)
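    # random initialization of P(z|d), made row-stochastic by the in-place normalize below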
    p_z_given_d = rng.rand(A.shape[0], k)
    normalize(p_z_given_d, axis=1)
    p_z_given_d = p_z_given_d.astype(np.float32)
    topics = topics.astype(np.float32)

    p_z_given_d = plsa_refit_inner(
        A.row,
        A.col,
        A.data,
        topics,
        p_z_given_d,
        sample_weight,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    return p_z_given_d
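
A usage sketch (assumption: ``topics`` comes from an earlier fit, e.g. ``nmf_topics``
from Example #1, and documents are weighted uniformly):

import numpy as np

topics = nmf_topics(X, k=2, random_state=42)     # fixed P(w|z)
weights = np.ones(X.shape[0], dtype=np.float32)  # uniform document weights
p_z_given_d = plsa_refit(X, topics, weights, random_state=42)
print(p_z_given_d.shape)  # (n_docs, 2)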
Example #3

import numpy as np
from numpy.linalg import norm  # Euclidean vector norm, as in sklearn's NNDSVD init
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import non_negative_factorization
from enstop.utils import normalize  # in-place row normalizer (module path assumed)

def plsa_init(X, k, init="random", rng=np.random):
    """Initialize matrices for pLSA. Specifically, given data X, a number of topics
    k, and an initialization method, compute matrices for P(z|d) and P(w|z) that can
    be used to begin an EM optimization of pLSA.

    Various initialization approaches are available. The most straightforward is
    "random", which randomly initializes values for P(z|d) and P(w|z) and normalizes
    to make them probabilities. A second approach, borrowing from sklearn's NMF
    implementation, is to use a non-negative SVD approach ("nndsvd"). A third option
    is to use the fast coordinate descent version of NMF under Frobenius loss and
    then normalize to make probabilities ("nmf"). Finally, if the ``init`` parameter
    is a tuple of ndarrays, then these will be used, allowing for custom user-defined
    initializations.

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    k: int
        The number of topics for pLSA to fit with.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    rng: RandomState instance (optional, default=np.random)
        Seeded random number generator. Used for random initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        Initialized arrays suitable for passing to pLSA optimization methods.
    """

    n = X.shape[0]
    m = X.shape[1]

    if init == "random":
        p_w_given_z = rng.rand(k, m)
        p_z_given_d = rng.rand(n, k)

    elif init == "nndsvd":
        # Taken from sklearn NMF implementation
        U, S, V = randomized_svd(X, k)
        p_z_given_d, p_w_given_z = np.zeros(U.shape), np.zeros(V.shape)

        # The leading singular triplet is non-negative
        # so it can be used as is for initialization.
        p_z_given_d[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
        p_w_given_z[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])

        for j in range(1, k):
            x, y = U[:, j], V[j, :]

            # extract positive and negative parts of column vectors
            x_p, y_p = np.maximum(x, 0), np.maximum(y, 0)
            x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0))

            # and their norms
            x_p_nrm, y_p_nrm = norm(x_p), norm(y_p)
            x_n_nrm, y_n_nrm = norm(x_n), norm(y_n)

            m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm

            # choose update
            if m_p > m_n:
                u = x_p / x_p_nrm
                v = y_p / y_p_nrm
                sigma = m_p
            else:
                u = x_n / x_n_nrm
                v = y_n / y_n_nrm
                sigma = m_n

            lbd = np.sqrt(S[j] * sigma)
            p_z_given_d[:, j] = lbd * u
            p_w_given_z[j, :] = lbd * v

    elif init == "nmf":
        p_z_given_d, p_w_given_z, _ = non_negative_factorization(
            X,
            n_components=k,
            init="nndsvd",
            solver="cd",
            beta_loss=2,
            tol=1e-2,
            max_iter=100,
        )
    elif isinstance(init, (tuple, list)):
        p_z_given_d, p_w_given_z = init
    else:
        raise ValueError("Unrecognized init {}".format(init))

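    # normalize rows in place so both factors are probability distributions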
    normalize(p_w_given_z, axis=1)
    normalize(p_z_given_d, axis=1)

    return p_z_given_d, p_w_given_z
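
A minimal sketch (assuming the sparse bag-of-words matrix ``X`` from the earlier
examples):

import numpy as np

rng = np.random.RandomState(42)
p_z_given_d, p_w_given_z = plsa_init(X, k=2, init="random", rng=rng)
print(p_z_given_d.shape, p_w_given_z.shape)  # (n_docs, 2) and (2, n_words)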
Example #4

import numpy as np
from scipy.sparse import issparse, coo_matrix
from sklearn.utils import check_array
from sklearn.decomposition import non_negative_factorization
# `ensemble_of_topics`, `_topic_combiner` (a dict mapping combination-method names
# to clustering functions), in-place `normalize`, and `plsa_refit` are assumed to
# be defined elsewhere in the same package.

def ensemble_fit(
    X,
    estimated_n_topics=10,
    model="plsa",
    init="random",
    min_samples=3,
    min_cluster_size=4,
    n_starts=16,
    n_jobs=8,
    parallelism="dask",
    topic_combination="hellinger_umap",
    n_iter=100,
    n_iter_per_test=10,
    tolerance=0.001,
    e_step_thresh=1e-16,
    lift_factor=1,
    beta_loss=1,
    alpha=0.0,
    solver="mu",
    random_state=None,
):
    """Generate a set of stable topics by using an ensemble of topic models and then clustering
    the results and generating representative topics for each cluster. The generate a set of
    document vectors based on the selected stable topics.

    Parameters
    ----------
    X: array or sparse matrix of shape (n_docs, n_words)
        The bag-of-words matrix for the corpus to train on.

    estimated_n_topics: int (optional, default=10)
        The estimated number of topics. Note that the final number of topics produced can differ
        from this value, and may be more or less than the provided value. Instead this value
        provides the algorithm with a suggestion of the approximate number of topics to use.

    model: string (optional, default="plsa")
        The topic modeling method to use (either "plsa" or "nmf")

    init: string or tuple (optional, default="random")
        The intialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    min_samples: int (optional, default=3)
        The min_samples parameter to use for HDBSCAN clustering.

    min_cluster_size: int (optional, default=4)
        The min_cluster_size parameter to use for HDBSCAN clustering

    n_starts: int (optional, default=16)
        The number of bootstrap sampled topic models to run -- the size of the ensemble.

    n_jobs: int (optional, default=8)
        The number of parallel jobs to run at a time.

    parallelism: string (optional, default="dask")
        The parallelism model to use. Should be one of "dask" or "joblib".

    topic_combination: string (optional, default="hellinger_umap")
        The method of combining ensemble topics into a set of stable topics. Should be one of:
            * ``"hellinger_umap"``
            * ``"hellinger"``
            * ``"kl_divergence"``

    n_iter: int (optional, default=100)
        The maximum number of iterations of EM to perform.

    n_iter_per_test: int (optional, default=10)
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float (optional, default=0.001)
        The threshold of relative improvement in log-likelihood required to
        continue iterations.

    e_step_thresh: float (optional, default=1e-16)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    lift_factor: int (optional, default=1)
        Importance factor to apply to lift -- if high lift values are important to
        you, then larger lift factors will be beneficial.

    beta_loss: float or string (optional, default=1)
        The beta loss to use if using NMF for topic modeling. The default of 1
        corresponds to Kullback-Leibler loss.

    alpha: float (optional, default=0.0)
        The alpha parameter defining regularization if using NMF for topic modeling.

    solver: string, (optional, default="mu")
        The choice of solver if using NMF for topic modeling. Should be either "cd" or "mu".

    random_state: int, RandomState instance or None (optional, default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    doc_vectors, stable_topics: arrays of shape (n_docs, M) and (M, n_words)
        The vectors giving the probability of topics for each document, and the stable
        topics produced by the ensemble, where M is the number of stable topics found.
    """

    X = check_array(X, accept_sparse="csr")

    if issparse(X):
        X_coo = X.tocoo()
    else:
        X_coo = coo_matrix(X)

    all_topics = ensemble_of_topics(
        X_coo,
        estimated_n_topics,
        model,
        n_jobs,
        n_starts,
        parallelism,
        init=init,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
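        # lift is applied to the stable topics after clustering (below), so the
        # ensemble itself runs with lift_factor=1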
        lift_factor=1,
        beta_loss=beta_loss,
        alpha=alpha,
        solver=solver,
        random_state=random_state,
    )

    if topic_combination in _topic_combiner:
        cluster_topics = _topic_combiner[topic_combination]
    else:
        raise ValueError("topic_combination must be one of {}".format(
            tuple(_topic_combiner.keys())))

    stable_topics = cluster_topics(all_topics, min_samples, min_cluster_size)

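    # Powering topic probabilities by lift_factor > 1 sharpens high-lift words;
    # rows are then renormalized in place so they remain distributions.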
    if lift_factor != 1:
        stable_topics **= lift_factor
        normalize(stable_topics, axis=1)

    if model == "plsa":
        doc_vectors = plsa_refit(
            X,
            stable_topics,
            e_step_thresh=e_step_thresh,
            random_state=random_state,
        )
    elif model == "nmf":
        doc_vectors, _, _ = non_negative_factorization(
            X,
            H=stable_topics,
            n_components=stable_topics.shape[0],
            update_H=False,
            beta_loss=beta_loss,
            alpha=alpha,
            solver=solver,
        )
    else:
        raise ValueError('Model must be one of "plsa" or "nmf"')

    return doc_vectors, stable_topics
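
A usage sketch (assumptions: a bag-of-words setup as in Example #1, and
``parallelism="joblib"`` to avoid requiring dask; a reasonably sized corpus is
needed for the clustering step to find stable topics):

from sklearn.feature_extraction.text import CountVectorizer

docs = load_corpus()  # hypothetical helper returning a list of raw documents
X = CountVectorizer().fit_transform(docs)
doc_vectors, stable_topics = ensemble_fit(
    X,
    estimated_n_topics=5,
    model="plsa",
    n_starts=4,
    n_jobs=2,
    parallelism="joblib",
    random_state=42,
)
print(doc_vectors.shape, stable_topics.shape)  # (n_docs, M) and (M, n_words)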