def test_gaussian_mixture_aic_bic():
    """Check that aic()/bic() agree with the analytic criteria.

    For standard-normal data the expected log-likelihood per sample is the
    (negative) entropy of the fitted Gaussian, so AIC/BIC can be predicted
    from the entropy estimate plus the parameter-count penalty.
    """
    random_state = np.random.RandomState(0)
    n_samples, n_features, n_components = 50, 3, 2
    X = random_state.randn(n_samples, n_features)

    # Entropy of a standard multivariate Gaussian, estimated from the
    # sample covariance (bias=1 -> MLE covariance).
    gaussian_entropy = 0.5 * (
        fast_logdet(np.cov(X.T, bias=1))
        + n_features * (1 + np.log(2 * np.pi))
    )

    # The tolerance scales like the Monte-Carlo error of the entropy estimate.
    tolerance = n_features / np.sqrt(n_samples)

    for covariance_type in COVARIANCE_TYPE:
        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type=covariance_type,
            random_state=random_state,
            max_iter=200,
        )
        gmm.fit(X)

        n_params = gmm._n_parameters()
        expected_aic = 2 * n_samples * gaussian_entropy + 2 * n_params
        expected_bic = 2 * n_samples * gaussian_entropy + np.log(n_samples) * n_params

        assert (gmm.aic(X) - expected_aic) / n_samples < tolerance
        assert (gmm.bic(X) - expected_bic) / n_samples < tolerance
def test_gaussian_mixture_n_parameters():
    """Verify _n_parameters() for every covariance type.

    Expected counts for 2 components in 5 dimensions:
    weights (k-1=1) + means (k*d=10) + covariance parameters, which
    depend on the covariance structure.
    """
    random_state = np.random.RandomState(0)
    n_samples, n_features, n_components = 50, 5, 2
    X = random_state.randn(n_samples, n_features)

    expected = {"spherical": 13, "diag": 21, "tied": 26, "full": 41}

    for covariance_type, n_params in expected.items():
        model = GaussianMixture(
            n_components=n_components,
            covariance_type=covariance_type,
            random_state=random_state,
        ).fit(X)
        assert model._n_parameters() == n_params
def _fit_gmm(data, cov, k, reg, max_iter, init=None):
    """Fit one GaussianMixture and score it with processBIC.

    Parameters
    ----------
    data : array-like of shape (n, d)
        Observations to cluster.
    cov : str
        Covariance type, one of ['full', 'tied', 'diag', 'spherical'].
    k : int
        Number of mixture components.
    reg : float
        Regularization added to the covariance diagonal (reg_covar).
    max_iter : int
        Maximum number of EM iterations.
    init : tuple or None
        Optional (weights, means, precisions) triple used to seed EM
        (produced by initialize_params); None lets EM initialize itself.

    Returns
    -------
    c_hat : ndarray of shape (n,)
        Predicted cluster labels.
    bic : float
        Bayes Information Criterion from processBIC.
    gmm : GaussianMixture
        The fitted model.
    """
    kwargs = dict(
        n_components=k,
        covariance_type=cov,
        reg_covar=reg,
        max_iter=max_iter,
        verbose=0,
        verbose_interval=1,
    )
    if init is not None:
        weights, means, precisions = init
        kwargs.update(
            weights_init=weights,
            means_init=means,
            precisions_init=precisions,
        )
    gmm = GaussianMixture(**kwargs)
    c_hat = gmm.fit_predict(data)
    bic = processBIC(data, gmm.weights_, gmm.means_, gmm.covariances_, cov)
    return c_hat, bic, gmm


def cluster(data, aff, link, cov, k, c_true=None):
    """
    Cluster according to specified method

    input:
        data - nxk numpy matrix of data
        c_true - n array of true cluster membership
        aff - affinity, element of ['euclidean','manhattan','cosine'],
              or none for EM from scratch
        link - linkage, element of ['ward','complete','average','single'],
               or none for EM from scratch
        cov - covariance, element of ['full','tied','diag','spherical']
        k - # of clusters

    output:
        c_hat - n array of clustering results
        means - kxd array of means of mixture components
        bic - Bayes Information Criterion for this clustering
        ari - Adjusted Rand Index comparing clustering result to true
              clustering (None when c_true is not given)
        reg - regularization parameter that was used in the clustering
              results (0 or 1e-6)
    """
    iter_num = 100

    # With no agglomeration requested, run EM from scratch; otherwise seed
    # EM with parameters derived from an agglomerative pre-clustering.
    if aff == "none" or link == "none":
        init = None
    else:
        one_hot = agglomerate(data, aff, link, k)
        init = initialize_params(data, one_hot, cov)

    try:
        # First attempt: no regularization.
        reg = 0
        c_hat, bic, gmm = _fit_gmm(data, cov, k, reg, iter_num, init)
        # Treat a degenerate fit (a component with <= 1 member, or a
        # non-finite BIC) the same as a numerical failure.
        if any(sum(c_hat == i) <= 1 for i in range(k)) or bic == -np.inf:
            raise ValueError
    except Exception:
        # except Exception (not bare except) so KeyboardInterrupt/SystemExit
        # still propagate. Retry with a small covariance regularizer if EM
        # or the BIC computation failed, or the fit was degenerate.
        reg = 1e-6
        c_hat, bic, gmm = _fit_gmm(data, cov, k, reg, iter_num, init)

    if c_true is not None:
        ari = adjusted_rand_score(c_true, c_hat)
    else:
        ari = None

    return c_hat, gmm.means_, bic, ari, reg, gmm._n_parameters()