def _train_GMM(self, data, n_components, train_inds=None, val_inds=None, loglikelihood=0):
    """
    Perform one training of a GMM, optionally on a train/validation split.

    :param data: [n_samples x n_dims] array of data points.
    :param n_components: Number of Gaussian components in the mixture.
    :param train_inds: Indices of training points (optional; used with val_inds).
    :param val_inds: Indices of validation points (optional; used with train_inds).
    :param loglikelihood: Running validation log-likelihood to accumulate onto.
    :return: (fitted GMM, accumulated validation log-likelihood)
    """
    if train_inds is not None and val_inds is not None:
        training_data, validation_data = CV.get_train_validation_set(data, train_inds, val_inds)
    else:
        training_data = np.copy(data)
        validation_data = np.copy(data)

    if self.data_weights_ is None and self.bias_factors_ is None:
        gmm = GaussianMixture(n_components=n_components, tol=self.convergence_tol_)

        # Train model on the current training data
        gmm.fit(training_data)

        # Check log-likelihood of validation data
        loglikelihood += gmm.score(validation_data)
    else:
        gmm = GMM.GaussianMixture(n_components=n_components,
                                  convergence_tol=self.convergence_tol_,
                                  verbose=self.verbose_)

        training_data_weights = self.data_weights_
        validation_data_weights = self.data_weights_
        training_bias_factors = self.bias_factors_

        if train_inds is not None and val_inds is not None:
            if self.data_weights_ is not None:
                training_data_weights, validation_data_weights = CV.get_train_validation_set(
                    self.data_weights_, train_inds, val_inds)
            if self.bias_factors_ is not None:
                training_bias_factors, validation_bias_factors = CV.get_train_validation_set(
                    self.bias_factors_, train_inds, val_inds)

        # Train model on the current training data
        gmm.fit(training_data, data_weights=training_data_weights,
                bias_factors=training_bias_factors)

        if training_bias_factors is not None and train_inds is not None and val_inds is not None:
            # Compute the weights of validation data using the validation data bias factors
            validation_data_weights = gmm.compute_data_weights(validation_data,
                                                               validation_bias_factors)

        # Check log-likelihood of validation data
        loglikelihood += gmm.loglikelihood(validation_data, data_weights=validation_data_weights)

    return gmm, loglikelihood
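# Usage sketch (not part of the original module): _train_GMM threads a running
# validation log-likelihood through successive calls, which is how the k-fold
# cross-validation loop in _fit_FE below accumulates a score per component count.
# `X`, `n_splits`, and `shuffle` are illustrative names.
#
#   train_inds, val_inds = CV.split_train_validation(X, n_splits, shuffle)
#   loglikelihood = 0
#   for k in range(n_splits):
#       gmm, loglikelihood = self._train_GMM(X, n_components,
#                                            train_inds[k], val_inds[k],
#                                            loglikelihood)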
def fit(self):
    """Optimize the model weights with EM, then retrain each retained GMM on the full dataset."""
    do_EM = True
    print('Training density model weights.')

    if do_EM:
        loglikelihood = -np.inf
        prev_loglikelihood = 0
        # Iterate EM until the validation log-likelihood stops changing.
        while np.abs(prev_loglikelihood - loglikelihood) > self.convergence_tol_:
            beta = self._expectation()
            self._maximization(beta)
            prev_loglikelihood = loglikelihood
            loglikelihood = self.loglikelihood(self.val_data_list_,
                                               list_of_validation_data=True)
    else:
        self.model_weights_ = opt.fmin_cg(self.objective_function, self.model_weights_)

    # Keep only models with nonzero weight
    self._sparisify_model()
    self._set_n_component_list()

    # Train each density model on the full dataset.
    print('Training each model on the full dataset.')
    for i_model in range(self.n_models_):
        n_components = self.GMM_list_[i_model].n_components_
        print(' - Training model with ' + str(n_components) + ' components')
        best_loglikelihood = -np.inf
        # Fit multiple times and keep the run with the highest log-likelihood.
        for i_iter in range(self.n_iterations_):
            density_model = GMM.GaussianMixture(n_components=n_components,
                                                convergence_tol=self.convergence_tol_)
            density_model.fit(self.data_)
            loglikelihood = density_model.loglikelihood(self.data_)
            if loglikelihood > best_loglikelihood:
                best_loglikelihood = loglikelihood
                self.GMM_list_[i_model] = density_model

    self.n_components_list_ = np.asarray(self.n_components_list_)
    return
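# Minimal sketch of how this fit() is typically driven (an assumption based on the
# LandscapeStacker construction in _fit_FE below; `stacker` and the keyword values
# are illustrative): the EM loop keeps updating model_weights_ until the validation
# log-likelihood changes by less than convergence_tol_.
#
#   stacker = FEC.LandscapeStacker(data, list_of_validation_data, list_of_GMMs,
#                                  n_splits=1, convergence_tol=1e-4,
#                                  n_iterations=1, model_weights=model_weights)
#   stacker.fit()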
def _fit_FE(self, data, set_density_model=True):
    """
    Fit density to data points.

    :param data: [n_samples x n_dims]
    :param set_density_model: If True, store the fitted density estimator on self.
    :return: free energy of points (and the density estimator if set_density_model is False)
    """
    best_n_components = self.min_n_components

    # Extract test set from the dataset
    n_points_test = int(self.test_set_perc_ * data.shape[0])
    data_orig = np.copy(data)
    data_weights_orig = np.copy(self.data_weights_)

    if n_points_test > 0:
        test_data = data[-n_points_test:, :]
        data = np.copy(data[0:-n_points_test, :])
        if self.data_weights_ is not None:
            self.data_weights_ = np.copy(self.data_weights_[0:-n_points_test, :])
    else:
        test_data = np.zeros((0, self.n_dims_))

    if self.stack_landscapes_:
        print('Estimating density with stacked GMMs.')
    else:
        print('Estimating density with GMM.')

    if self.data_weights_ is not None:
        print('Using weighted data to estimate GMM.')

    best_loglikelihood = -np.inf
    list_of_GMMs = []
    list_of_validation_data = []
    ICs = []

    # Get indices of training and validation datasets
    if self.n_splits_ > 1:
        train_inds, val_inds = CV.split_train_validation(data, self.n_splits_,
                                                         self.shuffle_data)

    # Determine number of components with k-fold cross-validation,
    # or store all estimated densities and then weight together.
    if self.max_n_components is not None:
        for n_components in range(self.min_n_components, self.max_n_components + 1,
                                  self.n_components_step):
            if self.verbose_:
                print('# Components = ' + str(n_components))

            if self.n_splits_ > 1 and not self.stack_landscapes_:
                loglikelihood = 0
                for i_split in range(self.n_splits_):
                    gmm, loglikelihood = self._train_GMM(data, n_components,
                                                         train_inds[i_split],
                                                         val_inds[i_split],
                                                         loglikelihood)
                # Keep best model
                if loglikelihood > best_loglikelihood:
                    best_loglikelihood = loglikelihood
                    best_n_components = n_components
            else:
                best_loglikelihood = -np.inf
                for i_iter in range(self.n_iterations_):
                    # Train GMM
                    gmm, loglikelihood = self._train_GMM(data, n_components)

                    # Store AIC/BIC of the first iteration (overwritten below
                    # whenever a better model is found).
                    if i_iter == 0:
                        if self.stack_landscapes_:
                            if self.data_weights_ is None:
                                ICs.append(gmm.aic(data))
                            else:
                                ICs.append(gmm.aic(data, self.data_weights_))
                        else:
                            if self.data_weights_ is None:
                                ICs.append(gmm.bic(data))
                            else:
                                ICs.append(gmm.bic(data, self.data_weights_))

                    # Keep best model
                    if loglikelihood > best_loglikelihood:
                        best_loglikelihood = loglikelihood
                        if i_iter == 0:
                            list_of_GMMs.append(GMM.GaussianMixture(n_components=n_components))
                        if self.stack_landscapes_:
                            ICs[-1] = gmm.aic(data)
                        else:
                            ICs[-1] = gmm.bic(data)
                        list_of_GMMs[-1].weights_ = gmm.weights_
                        list_of_GMMs[-1].means_ = gmm.means_
                        list_of_GMMs[-1].covariances_ = gmm.covariances_

    if self.stack_landscapes_:
        if self.max_n_components is None:
            gmm, _ = self._train_GMM(data, self.min_n_components)
            list_of_GMMs.append(gmm)

        # Convert information criteria into normalized model weights.
        ICs = np.asarray(ICs)
        model_weights = np.exp(-0.5 * (ICs - ICs.min()))
        model_weights /= model_weights.sum()

        # Fit mixture of density estimators using the validation data
        density_est = FEC.LandscapeStacker(data, list_of_validation_data, list_of_GMMs,
                                           n_splits=1,
                                           convergence_tol=self.convergence_tol_,
                                           n_iterations=self.n_iterations_,
                                           model_weights=model_weights)
        density = density_est.density(data_orig)
        if set_density_model:
            self.density_est_ = density_est
    else:
        # Estimate FE with best number of components (deduced from cross-validation)
        if self.n_splits_ > 1:
            print('Training final model with ' + str(best_n_components) + ' components.')
            best_loglikelihood = -np.inf
            density_est = GMM.GaussianMixture(n_components=best_n_components)
            # Fit multiple times to avoid convergence to a poor local optimum.
            for i_iter in range(self.n_iterations_):
                gmm, loglikelihood = self._train_GMM(data, best_n_components)
                if loglikelihood > best_loglikelihood:
                    best_loglikelihood = loglikelihood
                    density_est.weights_ = gmm.weights_
                    density_est.means_ = gmm.means_
                    density_est.covariances_ = gmm.covariances_
        else:
            ICs = np.asarray(ICs)
            self.BICs_ = np.copy(ICs)
            model_ind = ICs.argmin()
            gmm = list_of_GMMs[model_ind]
            best_n_components = gmm.weights_.shape[0]
            density_est = GMM.GaussianMixture(n_components=best_n_components)
            print('Identifying final model with ' + str(density_est.n_components_) + ' components.')
            density_est.weights_ = gmm.weights_
            density_est.means_ = gmm.means_
            density_est.covariances_ = gmm.covariances_

        density = density_est.density(data_orig)
        if set_density_model:
            self.density_est_ = density_est

    if set_density_model:
        # Compute the log-likelihood on the test set, if a test set exists.
        if n_points_test > 0:
            self.test_set_loglikelihood = self.density_est_.loglikelihood(test_data)
        return self._free_energy(density)
    else:
        return self._free_energy(density), density_est
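# Model-weighting detail (illustration only): the stacked branch above converts the
# per-model information criteria into normalized weights via
# w_i = exp(-0.5 * (IC_i - min(IC))), then w_i /= sum_j w_j. A self-contained example,
# assuming `ICs` holds AIC values for three candidate models:
#
#   import numpy as np
#   ICs = np.asarray([10.0, 12.0, 30.0])
#   w = np.exp(-0.5 * (ICs - ICs.min()))
#   w /= w.sum()   # approx. [0.73, 0.27, 0.00]: lower IC -> higher stacking weight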