import numpy as np
from scipy.stats import multivariate_normal
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# EM (the project's diagonal-covariance EM class) and mnist (a local MNIST
# reader/viewer helper) are assumed to be importable from this project.


def model_dict(X, Z, K):
    """Run all the models and store their features into one dictionary for plotting.

    Args:
        - X : (N, d) data
        - Z : (N,) labels
        - K : Number of clusters

    Returns:
        - models : Dictionary containing the means, covariance matrices (when
          they exist) and cluster labels of the models.
    """
    # Models dictionary
    models = dict()

    # Ground truth
    models["ground truth"] = {
        "mean": np.array([X[Z == k].mean(0) for k in range(len(np.unique(Z)))]),
        "cov": None,
        "labels": Z,
    }

    # Run diagonal EM
    em_diag = EM(K)
    em_diag.fit(X)
    models["diagonal EM"] = {
        "mean": em_diag.mus,
        "cov": np.array([np.diag(em_diag.Ds[k]) for k in range(K)]),
        "labels": em_diag.labels_,
    }

    # Run general EM
    em = GaussianMixture(K)
    em.fit(X)
    # Compute responsibilities by hand; r.argmax(0) below is equivalent to
    # em.predict(X).
    gaussians = np.array(
        [
            multivariate_normal.pdf(X, em.means_[k], em.covariances_[k])
            * em.weights_[k]
            for k in range(K)
        ]
    )
    r = gaussians / gaussians.sum(0)
    models["general EM"] = {
        "mean": em.means_,
        "cov": em.covariances_,
        "labels": r.argmax(0),
    }

    # Run K-means
    km = KMeans(K)
    km.fit(X)
    models["K-means"] = {"mean": km.cluster_centers_, "cov": None, "labels": km.labels_}

    return models
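
# A minimal usage sketch for model_dict, assuming the project-local EM class is
# available in scope. make_blobs, the sample counts, and the seed are
# illustrative choices, not from the original file.
def demo_model_dict():
    from sklearn.datasets import make_blobs

    # Three well-separated 2-D blobs with known labels.
    X, Z = make_blobs(n_samples=300, centers=3, random_state=0)
    models = model_dict(X, Z, K=3)
    for name, feats in models.items():
        print(name, "->", np.round(feats["mean"], 2))
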
def run_mnist():
    # FIXME: running EM on MNIST has the problem that all data collapses to one
    # class. This is because the likelihood for that class is slightly higher
    # than for all the others. It probably has to do with the variance being
    # lower for one component, from k-means, and that mattering more than
    # closeness to the mean for such high-dimensional data?
    # Running it with 0 iterations (i.e. on the k-means result) works fine: it
    # then finds different orientations of the digits.
    data_per_class = 20

    training_data = list(mnist.read("training"))
    dim_x, dim_y = np.shape(training_data[0][1])

    ones = [d[1] for d in training_data if d[0] == 1]
    fours = [d[1] for d in training_data if d[0] == 4]
    fives = [d[1] for d in training_data if d[0] == 5]

    ones = ones[:data_per_class]
    fours = fours[:data_per_class]
    fives = fives[:data_per_class]

    data = np.array(ones + fours + fives).reshape((-1, dim_x * dim_y))

    solver = EM(data=data, num_classes=3, num_nuisances=3)
    split_data, thetas = solver.fit(max_iter=1)

    for c, class_thetas in enumerate(thetas):
        for n, theta in enumerate(class_thetas):
            print(f"Prior: {theta.prior}, Var: {theta.variance}")
            mnist.show(theta.mean.reshape(28, 28))
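
# A diagnostic sketch for the FIXME above (an assumption, not part of the
# original solver): with 784-dimensional pixels, per-pixel likelihoods multiply
# into numbers differing by hundreds of orders of magnitude, so the component
# with the smallest variance can soak up responsibility 1.0 for every point.
# Computing responsibilities in log space with logsumexp keeps the
# normalization stable and makes the gaps between components inspectable.
# `means`, `variances`, and `priors` are hypothetical per-class parameters for
# isotropic Gaussians, not the attributes the EM class above actually stores.
def log_responsibilities(data, means, variances, priors):
    """Return (N, K) log-responsibilities for isotropic Gaussian components."""
    from scipy.special import logsumexp

    data = np.asarray(data, dtype=float)
    n, d = data.shape
    log_r = np.empty((n, len(means)))
    for k, (mu, var, pi) in enumerate(zip(means, variances, priors)):
        sq_dist = ((data - mu) ** 2).sum(axis=1)
        # Log of an isotropic Gaussian. The -d/2 * log(var) term is what lets a
        # slightly smaller variance dominate when d is large.
        log_r[:, k] = (
            np.log(pi) - 0.5 * d * np.log(2 * np.pi * var) - 0.5 * sq_dist / var
        )
    # Normalize over components in log space.
    return log_r - logsumexp(log_r, axis=1, keepdims=True)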