def test_gaussian_mixture_aic_bic():
    # AIC/BIC of a fitted model should be close to the analytic values
    # derived from the entropy of a standard Gaussian fit to the data.
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 50, 3, 2
    X = rng.randn(n_samples, n_features)
    # entropy of the maximum-likelihood single Gaussian fit to X
    entropy = 0.5 * (fast_logdet(np.cov(X.T, bias=1)) +
                     n_features * (1 + np.log(2 * np.pi)))
    for cov_type in COVARIANCE_TYPE:
        model = GaussianMixture(n_components=n_components,
                                covariance_type=cov_type,
                                random_state=rng,
                                max_iter=200)
        model.fit(X)
        expected_aic = 2 * n_samples * entropy + 2 * model._n_parameters()
        expected_bic = (2 * n_samples * entropy +
                        np.log(n_samples) * model._n_parameters())
        tol = n_features / np.sqrt(n_samples)
        assert (model.aic(X) - expected_aic) / n_samples < tol
        assert (model.bic(X) - expected_bic) / n_samples < tol
def test_gaussian_mixture_n_parameters():
    # _n_parameters must match the analytic count for every covariance type.
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 50, 5, 2
    X = rng.randn(n_samples, n_features)
    expected = {'spherical': 13, 'diag': 21, 'tied': 26, 'full': 41}
    for cov_type in COVARIANCE_TYPE:
        model = GaussianMixture(n_components=n_components,
                                covariance_type=cov_type,
                                random_state=rng)
        model.fit(X)
        assert model._n_parameters() == expected[cov_type]
# Esempio n. 3 ("Example no. 3" — scrape artifact; commented out so the module parses)
# 0
def _fit_gmm(data, cov, k, reg, iter_num, init_kwargs):
    """Fit one GaussianMixture and score it with processBIC.

    input:
        data - nxk numpy matrix of data
        cov - covariance type passed to GaussianMixture
        k - # of mixture components
        reg - value for reg_covar (0 or 1e-6)
        iter_num - max EM iterations
        init_kwargs - extra GaussianMixture kwargs (empty, or the
            weights_init/means_init/precisions_init from agglomeration)
    output:
        c_hat - n array of cluster labels
        bic - BIC computed by processBIC (may be -inf on failure)
        gmm - the fitted GaussianMixture instance
    """
    gmm = GaussianMixture(
        n_components=k,
        covariance_type=cov,
        reg_covar=reg,
        max_iter=iter_num,
        verbose=0,
        verbose_interval=1,
        **init_kwargs,
    )
    c_hat = gmm.fit_predict(data)
    bic = processBIC(data, gmm.weights_, gmm.means_, gmm.covariances_, cov)
    return c_hat, bic, gmm


def _fit_with_fallback(data, cov, k, iter_num, init_kwargs):
    """Fit without regularization; refit with reg_covar=1e-6 on failure.

    A fit counts as failed if EM/BIC raises, if BIC is -inf, or if any
    component ends up with <= 1 member.
    output: (c_hat, bic, gmm, reg) where reg is the regularization used.
    """
    try:
        reg = 0
        c_hat, bic, gmm = _fit_gmm(data, cov, k, reg, iter_num, init_kwargs)
        if any(sum(c_hat == i) <= 1 for i in range(k)) or bic == -np.inf:
            raise ValueError("degenerate clustering")
    # Exception (not bare except) so KeyboardInterrupt/SystemExit still
    # propagate; covers numerical errors in EM/BIC and the degenerate-
    # cluster check above.
    except Exception:
        reg = 1e-6
        c_hat, bic, gmm = _fit_gmm(data, cov, k, reg, iter_num, init_kwargs)
    return c_hat, bic, gmm, reg


def cluster(data, aff, link, cov, k, c_true=None):
    """
    Cluster according to specified method
    input:
        data - nxk numpy matrix of data
        c_true - n array of true cluster membership
        aff - affinity, element of ['euclidean','manhattan','cosine'] or none for EM
              from scratch
        link - linkage, element of ['ward','complete','average','single'], or none for
                EM from scratch
        cov - covariance, element of ['full','tied','diag','spherical']
        k - # of clusters
    output:
        c_hat - n array of clustering results
        means - kxd array of means of mixture components
        bic - Bayes Information Criterion for this clustering
        ari - Adjusted Rand Index to comparing clustering result to true clustering
        reg - regularization parameter that was used in the clustering results
            (0 or 1e-6)
    """
    iter_num = 100
    if aff == "none" or link == "none":
        # EM from scratch: default (k-means) initialization.
        init_kwargs = {}
    else:
        # Seed EM from an agglomerative clustering of the data.
        one_hot = agglomerate(data, aff, link, k)
        weights, means, precisions = initialize_params(data, one_hot, cov)
        init_kwargs = {
            "weights_init": weights,
            "means_init": means,
            "precisions_init": precisions,
        }

    c_hat, bic, gmm, reg = _fit_with_fallback(data, cov, k, iter_num, init_kwargs)

    ari = adjusted_rand_score(c_true, c_hat) if c_true is not None else None
    return c_hat, gmm.means_, bic, ari, reg, gmm._n_parameters()