# GaussianMixture here is assumed to be scikit-learn's implementation
# (the keyword constructor and means_/covariances_/n_iter_ attributes match);
# `utils` is a project-local helper module.
from sklearn.mixture import GaussianMixture

import utils


def test(file, max_n_components, n_classes):
    print('GaussianMixture for set: ' + file)
    dataset = utils.dataset_reader(file)
    X, y = utils.data_process(dataset)
    list_sse = []
    list_nmi = []
    for n_components in range(1, max_n_components + 1):
        gmm = GaussianMixture(n_components=n_components)
        gmm.fit(X)
        y_hat = gmm.predict(X)
        sse = utils.sum_of_squared_errors(X, y_hat, gmm.means_)
        nmi = utils.normalized_mutual_information(y, n_classes, y_hat,
                                                  n_components)
        print('{0:2d} components, SSE: {1:.2f}, NMI: {2:.4f}'.format(
            n_components, sse, nmi))
        # print('iterations: ', gmm.n_iter_)
        # print(gmm.means_, gmm.covariances_, gmm.weights_)
        # print(gmm.lower_bound_)
        list_sse.append(sse)
        list_nmi.append(nmi)
    utils.plot_measure_vs_k('SSE', list_sse, range(1, max_n_components + 1))
    utils.plot_measure_vs_k('NMI', list_nmi, range(1, max_n_components + 1))
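# Usage sketch: the file name and class count below are illustrative
# assumptions, not values taken from the original code.
if __name__ == '__main__':
    test('iris.data', max_n_components=10, n_classes=3)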
def testPredictClasses(self):
    """
    Assert that torch.FloatTensor is handled correctly.
    """
    x = torch.randn(4, 2)
    n_components = np.random.randint(1, 100)

    model = GaussianMixture(n_components, x.size(1))
    model.fit(x)
    y = model.predict(x)

    # check that dimensionality of class memberships is (n)
    self.assertEqual(torch.Tensor(x.size(0)).size(), y.size())
def testPredictProbabilities(self):
    """
    Assert that torch.FloatTensor is handled correctly when returning
    class probabilities.
    """
    x = torch.randn(4, 2)
    n_components = np.random.randint(1, 100)

    model = GaussianMixture(n_components, x.size(1))
    model.fit(x)

    # check that y_p has dimensions (n, k)
    y_p = model.predict(x, probs=True)
    self.assertEqual(torch.Tensor(x.size(0), n_components).size(),
                     y_p.size())
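# Quick usage sketch of the torch-based (n_components, n_features) API the
# tests above exercise. The data is arbitrary and the import path of the
# GaussianMixture module is an assumption.
import torch

from gmm import GaussianMixture

x = torch.randn(100, 2)
model = GaussianMixture(5, x.size(1))
model.fit(x)
hard = model.predict(x)              # component labels, shape (100,)
soft = model.predict(x, probs=True)  # probabilities, shape (100, 5)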
from math import sqrt

import torch

from gmm import GaussianMixture  # project-local module; path assumed


def main():
    n, d = 300, 2

    # generate some data points ..
    data = torch.Tensor(n, d).normal_()
    # .. and shift them around to non-standard Gaussians
    data[:n//2] -= 1
    data[:n//2] *= sqrt(3)
    data[n//2:] += 1
    data[n//2:] *= sqrt(2)

    # Next, the Gaussian mixture is instantiated and ..
    n_components = 2
    model = GaussianMixture(n_components, d)
    model.fit(data)
    # .. used to predict the data points as they were shifted
    y = model.predict(data)

    plot(data, y)
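# main() relies on a plot() helper that is not shown. A minimal sketch of
# what it might do, assuming matplotlib: scatter the 2-D points, colored by
# their predicted mixture component.
import matplotlib.pyplot as plt


def plot(data, y):
    data, y = data.numpy(), y.numpy()
    plt.scatter(data[:, 0], data[:, 1], c=y, s=10)
    plt.title('GMM component assignments')
    plt.show()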
import numpy as np
from tqdm import tqdm

# GaussianMixture, NormalDistribution and get_hellinger_multivariate are
# project-local modules; their import paths are not shown in the original.


class Server():

    def __init__(self, args, init_dataset, clients, output_dir):
        self.random_state = None
        if args.seed:
            self.random_state = int(args.seed)
        self.model = GaussianMixture(X=init_dataset,
                                     n_components=args.components,
                                     random_state=self.random_state,
                                     is_quiet=True,
                                     init_params=args.init)
        self.init_dataset = init_dataset
        self.args = args
        self.rounds = args.rounds
        self.clients = clients
        self.fraction_clients = float(args.C)
        self.n_clients = int(args.K)
        self.n_clients_round = int(self.fraction_clients * self.n_clients)
        self.selected_clients = {}
        self.output_dir = output_dir
        self.metrics_history = {'aic': [], 'bic': [], 'll': []}

    def _select_round_clients(self, round):
        idxs_round_clients = np.random.choice(range(self.n_clients),
                                              self.n_clients_round,
                                              replace=False)
        selected_clients = [self.clients[idx] for idx in idxs_round_clients]
        self.selected_clients[round] = selected_clients
        return selected_clients

    def _set_parameters_from_clients_models(self, round_history):
        self.clients_means = []
        self.clients_covariances = []
        self.clients_weights = []
        for client_id in round_history:
            parameters = round_history[client_id]['parameters']
            self.clients_means.append(parameters['means'][-1])
            self.clients_covariances.append(parameters['covariances'][-1])
            self.clients_weights.append(parameters['weights'][-1])
        self.clients_means = np.array(self.clients_means)
        self.clients_covariances = np.array(self.clients_covariances)
        self.clients_weights = np.array(self.clients_weights)

    def _set_metrics_from_clients_models(self, round_history):
        self.clients_aic = []
        self.clients_bic = []
        self.clients_ll = []
        for client_id in round_history:
            metrics = round_history[client_id]['metrics']
            self.clients_aic.append(metrics['aic'][-1])
            self.clients_bic.append(metrics['bic'][-1])
            self.clients_ll.append(metrics['ll'][-1])
        self.clients_aic = np.array(self.clients_aic)
        self.clients_bic = np.array(self.clients_bic)
        self.clients_ll = np.array(self.clients_ll)

    def start_round(self, round):
        selected_clients = self._select_round_clients(round)
        round_history = {}
        pbar = tqdm(selected_clients)
        for client in pbar:
            pbar.set_description('Round: {}/{} | Client: {}'.format(
                round + 1, self.rounds, client.id))
            round_history[client.id] = client.fit(self.model,
                                                  self.args.local_epochs)
            if pbar.iterable[-1] == client:
                pbar.set_description('Round: {}/{} | Completed'.format(
                    round + 1, self.rounds))
        self._set_parameters_from_clients_models(round_history)
        self._set_metrics_from_clients_models(round_history)

    def _sort_clients_distributions(self, update_reference: bool = False):
        # Use the first client's components as the reference ordering.
        reference_distributions = []
        client_idx = 0  # first client
        for component_idx in range(self.args.components):
            distribution = NormalDistribution(
                self.clients_means[client_idx][component_idx],
                self.clients_covariances[client_idx][component_idx])
            reference_distributions.append(distribution)

        for client_idx in range(1, self.n_clients_round):
            selected_components_idxs = []
            for component_idx in range(self.args.components):
                distances = []
                distances_idxs = []
                for target_component_idx in range(self.args.components):
                    if target_component_idx in selected_components_idxs:
                        continue
                    means = self.clients_means[client_idx][
                        target_component_idx]
                    covariances = self.clients_covariances[client_idx][
                        target_component_idx]
                    target_distribution = NormalDistribution(
                        means, covariances)
                    distance = get_hellinger_multivariate(
                        reference_distributions[component_idx],
                        target_distribution)
                    distances.append(distance)
                    distances_idxs.append(target_component_idx)
                # Greedily match each reference component to the closest
                # still-unassigned component of the current client.
                min_idx = np.argmin(distances)
                selected_components_idxs.append(distances_idxs[min_idx])
            selected_components_idxs = np.array(selected_components_idxs)
            self.clients_means[client_idx] = self.clients_means[client_idx][
                selected_components_idxs]
            self.clients_covariances[client_idx] = self.clients_covariances[
                client_idx][selected_components_idxs]
            self.clients_weights[client_idx] = self.clients_weights[
                client_idx][selected_components_idxs]

            if update_reference is True:
                # Replace the reference with the running average of the
                # matched means and covariances.
                avg_means = []
                reference_means = [
                    reference_distributions[component_idx].means
                    for component_idx in range(self.args.components)
                ]
                avg_means.append(np.array(reference_means))
                avg_means.append(np.array(self.clients_means[client_idx]))
                avg_means = np.array(avg_means)

                avg_covariances = []
                reference_covariances = [
                    reference_distributions[component_idx].covariances
                    for component_idx in range(self.args.components)
                ]
                avg_covariances.append(np.array(reference_covariances))
                avg_covariances.append(
                    np.array(self.clients_covariances[client_idx]))
                avg_covariances = np.array(avg_covariances)

                # Means are weighted by gamma, covariances by gamma squared.
                gamma = 1 / avg_means.shape[0]
                avg_means = np.sum(avg_means * pow(gamma, 1), axis=0)
                avg_covariances = np.sum(avg_covariances * pow(gamma, 2),
                                         axis=0)

                reference_distributions = []
                for component_idx in range(self.args.components):
                    distribution = NormalDistribution(
                        avg_means[component_idx],
                        avg_covariances[component_idx])
                    reference_distributions.append(distribution)

    def average_clients_models(self,
                               use_hellinger_distance: bool = True,
                               update_reference: bool = False):
        if use_hellinger_distance is True:
            self._sort_clients_distributions(update_reference)
        gamma = 1 / self.n_clients_round  # same weight for every client
        self.avg_clients_means = np.sum(self.clients_means * pow(gamma, 1),
                                        axis=0)
        self.avg_clients_covariances = np.sum(
            self.clients_covariances * pow(gamma, 2), axis=0)
        self.avg_clients_weights = np.sum(
            self.clients_weights * pow(gamma, 1), axis=0)
        self.avg_clients_precisions_cholesky = \
            self.model.compute_precision_cholesky(
                self.avg_clients_covariances, self.model.covariance_type)
        params = (self.avg_clients_weights, self.avg_clients_means,
                  self.avg_clients_covariances,
                  self.avg_clients_precisions_cholesky)
        self.model.set_parameters(params)
        self.avg_clients_precisions = self.model.precisions_

    def update_server_model(self):
        # The model must be regenerated with the new average parameters;
        # updating it in place could leave it re-initialized with the
        # wrong parameters.
        self.model = GaussianMixture(
            X=self.init_dataset,
            n_components=self.args.components,
            random_state=self.random_state,
            is_quiet=True,
            init_params=self.args.init,
            weights_init=self.avg_clients_weights,
            means_init=self.avg_clients_means,
            precisions_init=self.avg_clients_precisions)

    def average_clients_metrics(self):
        self.metrics_history['aic'].append(np.mean(self.clients_aic))
        self.metrics_history['bic'].append(np.mean(self.clients_bic))
        self.metrics_history['ll'].append(np.mean(self.clients_ll))

    def plot(self, X, labels, round=None):
        self.model.plot(X, labels, self.args, self.output_dir, 'round', round)

    def compute_init_metrics(self, X):
        self.metrics_history['aic'].append(self.model.aic(X))
        self.metrics_history['bic'].append(self.model.bic(X))
        self.metrics_history['ll'].append(self.model.score(X))
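# _sort_clients_distributions depends on NormalDistribution and
# get_hellinger_multivariate, which are not shown. A sketch of what they
# might look like, using the closed-form Hellinger distance between two
# multivariate Gaussians (the class layout and array shapes are assumptions):
import numpy as np


class NormalDistribution:

    def __init__(self, means, covariances):
        self.means = np.asarray(means)
        self.covariances = np.asarray(covariances)


def get_hellinger_multivariate(p, q):
    # Bhattacharyya coefficient BC between N(mu_p, S_p) and N(mu_q, S_q),
    # then the Hellinger distance H = sqrt(1 - BC).
    diff = p.means - q.means
    avg_cov = (p.covariances + q.covariances) / 2
    bc = (np.linalg.det(p.covariances)**0.25
          * np.linalg.det(q.covariances)**0.25
          / np.sqrt(np.linalg.det(avg_cov)))
    bc *= np.exp(-0.125 * diff @ np.linalg.solve(avg_cov, diff))
    return np.sqrt(max(0.0, 1.0 - bc))  # clamp against rounding error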
def testEmMatchesSkLearn(self):
    """
    Assert that log-probabilities (E-step) and parameter updates (M-step)
    approximately match those of sklearn.
    """
    d = 20
    n_components = np.random.randint(1, 100)

    # (n, k, d)
    x = torch.randn(40, 1, d)
    # (n, d)
    x_np = np.squeeze(x.data.numpy())

    var_init = torch.ones(1, n_components, d) - .4

    model = GaussianMixture(n_components, d, var_init=var_init)
    model_sk = sklearn.mixture.GaussianMixture(
        n_components,
        covariance_type="diag",
        init_params="random",
        means_init=np.squeeze(model.mu.data.numpy()),
        precisions_init=np.squeeze(1. / np.sqrt(var_init.data.numpy())))
    model_sk._initialize_parameters(x_np, np.random.RandomState())

    log_prob_sk = model_sk._estimate_log_prob(x_np)
    log_prob = model._estimate_log_prob(x)

    # Test whether log-probabilities are approximately equal
    np.testing.assert_almost_equal(np.squeeze(log_prob.data.numpy()),
                                   log_prob_sk,
                                   decimal=2,
                                   verbose=True)

    _, log_resp_sk = model_sk._e_step(x_np)
    _, log_resp = model._e_step(x)

    # Test whether E-steps are approximately equal
    np.testing.assert_almost_equal(np.squeeze(log_resp.data.numpy()),
                                   log_resp_sk,
                                   decimal=0,
                                   verbose=True)

    # The M-step consumes log-responsibilities, not log-probabilities
    model_sk._m_step(x_np, log_resp_sk)
    pi_sk = model_sk.weights_
    mu_sk = model_sk.means_
    var_sk = model_sk.covariances_

    pi, mu, var = model._m_step(x, log_resp)

    # Test whether pi ..
    np.testing.assert_almost_equal(np.squeeze(pi.data.numpy()),
                                   pi_sk,
                                   decimal=1,
                                   verbose=True)
    # .. mu ..
    np.testing.assert_almost_equal(np.squeeze(mu.data.numpy()),
                                   mu_sk,
                                   decimal=1,
                                   verbose=True)
    # .. and var are approximately equal
    np.testing.assert_almost_equal(np.squeeze(var.data.numpy()),
                                   var_sk,
                                   decimal=1,
                                   verbose=True)
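# The three test methods above presumably belong to a unittest.TestCase
# subclass; a minimal harness with the imports they need (class name and
# GaussianMixture import path assumed):
import unittest

import numpy as np
import sklearn.mixture
import torch

from gmm import GaussianMixture


class GmmTest(unittest.TestCase):
    # testPredictClasses, testPredictProbabilities and
    # testEmMatchesSkLearn would be defined here.
    pass


if __name__ == '__main__':
    unittest.main()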
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 14 19:53:43 2018

@author: Garrett
"""
from kmeans import KMeans
#from sklearn.cluster import KMeans
from gmm import GaussianMixture
import numpy as np

X = np.array([[2, 2], [3, 4], [1, 0], [101, 2], [102, 4], [100, 0]])

kmeans = KMeans(n_clusters=2).fit(X)
#print(kmeans.labels_)
#print(kmeans.predict(np.array([[0, 0], [4, 4]])))
#print(kmeans.cluster_centers_)

gmm = GaussianMixture(n_components=2).fit(X)
print('gmm predict ', gmm.predict(X))
#print(gmm.predict(np.array([[0, 0], [4, 4]])))
print('gmm.means_ ', gmm.means_)
print('gmm.covariances_ ', gmm.covariances_)
print('gmm.n_iter', gmm.n_iter_)
train_dataset, train_dataset_labels, _ = get_dataset(args)
print_configuration(args, train_dataset, False)
save_configuration(args, train_dataset, output_dir, False)

# Init the Gaussian Mixture Model
seed = None
if args.seed:
    seed = int(args.seed)

# Prepare the server. The full train_dataset is used for initialization;
# the commented-out alternative samples 0.5% of it at random.
# init_dataset_size = int(train_dataset.shape[0] * 0.005)
# init_dataset = train_dataset[np.random.choice(train_dataset.shape[0], init_dataset_size, replace=False)]
init_dataset = train_dataset

model = GaussianMixture(X=init_dataset,
                        n_components=args.components,
                        random_state=seed,
                        init_params=args.init)
init_metrics = {
    'aic': model.aic(train_dataset),
    'bic': model.bic(train_dataset),
    'll': model.score(train_dataset)
}

model.fit(train_dataset, args.epochs, train_dataset_labels, args, output_dir)

predicted_labels = np.array(model.predict_proba(train_dataset).tolist())

print('\nSaving images...')
def gmm(opt):
    return GaussianMixture(opt.GMM_NUM_COMPONENTS)
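# Usage sketch: `opt` is whatever options object the caller provides; a
# SimpleNamespace stands in for it here, and the component count is an
# arbitrary example value.
from types import SimpleNamespace

opt = SimpleNamespace(GMM_NUM_COMPONENTS=8)
model = gmm(opt)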