Example #1
	def _train_GMM(self, data, n_components, train_inds=None, val_inds=None, loglikelihood=0):
		"""
		Perform one training of GMM.
		:param data:
		:param n_components:
		:return:
		"""

		if train_inds is not None and val_inds is not None:
			training_data, validation_data = CV.get_train_validation_set(data, train_inds, val_inds)
		else:
			# No split provided: train and validate on the full dataset
			training_data = np.copy(data)
			validation_data = np.copy(data)

		if self.data_weights_ is None and self.bias_factors_ is None:
			# Plain data: use the standard GaussianMixture
			gmm = GaussianMixture(n_components=n_components, tol=self.convergence_tol_)

			# Train model on the current training data
			gmm.fit(training_data)

			# Check log-likelihood of validation data
			loglikelihood += gmm.score(validation_data)
		else:
			# Weighted/biased data: use the custom GMM that supports data weights and bias factors
			gmm = GMM.GaussianMixture(n_components=n_components, convergence_tol=self.convergence_tol_, verbose=self.verbose_)

			training_data_weights = self.data_weights_
			validation_data_weights = self.data_weights_
			training_bias_factors = self.bias_factors_

			if train_inds is not None and val_inds is not None:
				if self.data_weights_ is not None:
					training_data_weights, validation_data_weights = CV.get_train_validation_set(self.data_weights_,
																								 train_inds, val_inds)

				if self.bias_factors_ is not None:
					training_bias_factors, validation_bias_factors = CV.get_train_validation_set(self.bias_factors_,
																								 train_inds, val_inds)

			# Train model on the current training data
			gmm.fit(training_data, data_weights=training_data_weights, bias_factors=training_bias_factors)

			if training_bias_factors is not None and train_inds is not None and val_inds is not None:
				# Compute the weights of validation data using the validation data bias factors
				validation_data_weights = gmm.compute_data_weights(validation_data, validation_bias_factors)

			# Check log-likelihood of validation data
			loglikelihood += gmm.loglikelihood(validation_data, data_weights=validation_data_weights)

		return gmm, loglikelihood
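The unweighted branch of `_train_GMM` follows scikit-learn's `GaussianMixture` API. Below is a minimal, self-contained sketch of the same train/validate pattern, assuming the imported `GaussianMixture` is scikit-learn's; the toy data, split size, and tolerance are illustrative assumptions only.

import numpy as np
from sklearn.mixture import GaussianMixture

# Toy 2-D data: two Gaussian blobs (illustrative assumption)
rng = np.random.default_rng(0)
data = np.vstack([rng.normal(0.0, 1.0, size=(500, 2)),
                  rng.normal(5.0, 1.0, size=(500, 2))])

# Simple train/validation split, as in _train_GMM
training_data, validation_data = data[:800], data[800:]

gmm = GaussianMixture(n_components=2, tol=1e-4)
gmm.fit(training_data)

# score() returns the mean per-sample log-likelihood of the validation data
loglikelihood = gmm.score(validation_data)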
Example #2
    def fit(self):
        """
        Train the density-model weights with EM, then refit each
        remaining model on the full dataset.
        """
        do_EM = True

        print('Training density model weights.')

        if do_EM:
            loglikelihood = -np.inf
            prev_loglikelihood = 0
            # Run EM until the validation log-likelihood changes by less
            # than the convergence tolerance.
            while (np.abs(prev_loglikelihood - loglikelihood) >
                   self.convergence_tol_):
                beta = self._expectation()
                self._maximization(beta)
                prev_loglikelihood = loglikelihood
                loglikelihood = self.loglikelihood(
                    self.val_data_list_, list_of_validation_data=True)
        else:
            self.model_weights_ = opt.fmin_cg(self.objective_function,
                                              self.model_weights_)

        # Keep only models with nonzero weight
        self._sparisify_model()
        self._set_n_component_list()

        # Train each density model on the full dataset.
        print('Training each model on the full dataset.')
        for i_model in range(self.n_models_):
            n_components = self.GMM_list_[i_model].n_components_
            print(' - Training model with ' + str(n_components) +
                  ' components')
            best_loglikelihood = -np.inf
            # Multiple random restarts; keep the fit with the highest log-likelihood
            for i_iter in range(self.n_iterations_):
                density_model = GMM.GaussianMixture(
                    n_components=n_components,
                    convergence_tol=self.convergence_tol_)
                density_model.fit(self.data_)
                loglikelihood = density_model.loglikelihood(self.data_)
                if loglikelihood > best_loglikelihood:
                    best_loglikelihood = loglikelihood
                    self.GMM_list_[i_model] = density_model

        self.n_components_list_ = np.asarray(self.n_components_list_)
        return
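EM only finds a local optimum, so `fit` retrains each surviving model several times and keeps the best restart. Here is a minimal sketch of that restart loop, using scikit-learn's `GaussianMixture` as a stand-in for `GMM.GaussianMixture`; the toy data, component count, and iteration count are assumptions.

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(1)
data = rng.normal(size=(1000, 2))  # toy data (illustrative assumption)

n_iterations = 5
best_loglikelihood = -np.inf
best_model = None
for i_iter in range(n_iterations):
    # A different random initialization for each restart
    model = GaussianMixture(n_components=3, random_state=i_iter)
    model.fit(data)
    loglikelihood = model.score(data)  # mean log-likelihood per sample
    if loglikelihood > best_loglikelihood:
        best_loglikelihood = loglikelihood
        best_model = model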
Example #3
    def _fit_FE(self, data, set_density_model=True):
        """
        Fit density to data points.
        :param data: [n_samples x n_dims]
        :return: free energy of points
        """

        best_n_components = self.min_n_components

        # Extract test set from the dataset
        n_points_test = int(self.test_set_perc_ * data.shape[0])
        data_orig = np.copy(data)
        data_weights_orig = None if self.data_weights_ is None else np.copy(self.data_weights_)

        if n_points_test > 0:
            test_data = data[-n_points_test:, :]
            data = np.copy(data[:-n_points_test, :])
            if self.data_weights_ is not None:
                self.data_weights_ = np.copy(self.data_weights_[:-n_points_test, :])
        else:
            test_data = np.zeros((0, self.n_dims_))

        if self.stack_landscapes_:
            print('Estimating density with stacked GMMs.')
        else:
            print('Estimating density with GMM.')

        if self.data_weights_ is not None:
            print('Using weighted data to estimate GMM.')

        best_loglikelihood = -np.inf
        list_of_GMMs = []
        list_of_validation_data = []
        ICs = []

        # Get indices of training and validation datasets
        if self.n_splits_ > 1:
            train_inds, val_inds = CV.split_train_validation(data, self.n_splits_, self.shuffle_data)

        # Determine number of components with k-fold cross-validation,
        # or store all estimated densities and then weight together.
        if self.max_n_components is not None:
            for n_components in range(self.min_n_components, self.max_n_components + 1, self.n_components_step):
                if self.verbose_:
                    print('# Components = ' + str(n_components))

                if self.n_splits_ > 1 and not self.stack_landscapes_:
                    loglikelihood = 0
                    for i_split in range(self.n_splits_):
                        gmm, loglikelihood = self._train_GMM(data, n_components, train_inds[i_split], val_inds[i_split], loglikelihood)

                    # Keep best model
                    if loglikelihood > best_loglikelihood:
                        best_loglikelihood = loglikelihood
                        best_n_components = n_components
                else:
                    best_loglikelihood = -np.inf
                    for i_iter in range(self.n_iterations_):
                        # Train GMM
                        gmm, loglikelihood = self._train_GMM(data, n_components)

                        # Record the AIC/BIC of the first fit; it is overwritten
                        # below whenever a better model is found
                        if i_iter == 0:
                            if self.stack_landscapes_:
                                if self.data_weights_ is None:
                                    ICs.append(gmm.aic(data))
                                else:
                                    ICs.append(gmm.aic(data, self.data_weights_))
                            else:
                                if self.data_weights_ is None:
                                    ICs.append(gmm.bic(data))
                                else:
                                    ICs.append(gmm.bic(data, self.data_weights_))

                        # Keep best model
                        if loglikelihood > best_loglikelihood:
                            best_loglikelihood = loglikelihood
                            if i_iter == 0:
                                list_of_GMMs.append(GMM.GaussianMixture(n_components=n_components))

                            if self.stack_landscapes_:
                                ICs[-1] = gmm.aic(data) if self.data_weights_ is None else gmm.aic(data, self.data_weights_)
                            else:
                                ICs[-1] = gmm.bic(data) if self.data_weights_ is None else gmm.bic(data, self.data_weights_)

                            list_of_GMMs[-1].weights_ = gmm.weights_
                            list_of_GMMs[-1].means_ = gmm.means_
                            list_of_GMMs[-1].covariances_ = gmm.covariances_

        if self.stack_landscapes_:
            if self.max_n_components is None:
                gmm, _ = self._train_GMM(data, self.min_n_components)
                list_of_GMMs.append(gmm)

            ICs = np.asarray(ICs)
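            # Akaike weights: w_i is proportional to exp(-(IC_i - IC_min) / 2),
            # normalized below so that the weights sum to one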
            model_weights = np.exp(-0.5 * (ICs - ICs.min()))
            model_weights /= model_weights.sum()

            # Fit a mixture of density estimators using the validation data
            density_est = FEC.LandscapeStacker(data, list_of_validation_data, list_of_GMMs, n_splits=1,
                                               convergence_tol=self.convergence_tol_, n_iterations=self.n_iterations_,
                                               model_weights=model_weights)

            density = density_est.density(data_orig)
            if set_density_model:
                self.density_est_ = density_est
        else:
            # Estimate FE with best number of components (deduced from cross-validation)
            if self.n_splits_ > 1:
                print('Training final model with ' + str(best_n_components) + ' components.')
                best_loglikelihood = -np.inf
                density_est = GMM.GaussianMixture(n_components=best_n_components)
                # Fit multiple times and keep the model with the highest log-likelihood
                for i_iter in range(self.n_iterations_):
                    gmm, loglikelihood = self._train_GMM(data, best_n_components)

                    if loglikelihood > best_loglikelihood:
                        best_loglikelihood = loglikelihood
                        density_est.weights_ = gmm.weights_
                        density_est.means_ = gmm.means_
                        density_est.covariances_ = gmm.covariances_
            else:
                ICs = np.asarray(ICs)
                self.BICs_ = np.copy(ICs)
                model_ind = ICs.argmin()
                gmm = list_of_GMMs[model_ind]
                best_n_components = gmm.weights_.shape[0]
                density_est = GMM.GaussianMixture(n_components=best_n_components)

                print('Identifying final model with ' + str(density_est.n_components_) + ' components.')

                density_est.weights_ = gmm.weights_
                density_est.means_ = gmm.means_
                density_est.covariances_ = gmm.covariances_

            density = density_est.density(data_orig)

            if set_density_model:
                self.density_est_ = density_est

        if set_density_model:
            # Evaluate the log-likelihood on the held-out test set, if one exists
            if n_points_test > 0:
                self.test_set_loglikelihood = self.density_est_.loglikelihood(test_data)
            return self._free_energy(density)
        else:
            return self._free_energy(density), density_est
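When `stack_landscapes_` is set, `_fit_FE` converts the collected AIC values into normalized model weights before stacking. The sketch below reproduces that selection step on its own, scanning component counts with scikit-learn's `GaussianMixture`; the toy data and the component range are illustrative assumptions.

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(2)
data = np.vstack([rng.normal(-3.0, 1.0, size=(400, 2)),
                  rng.normal(3.0, 1.0, size=(400, 2))])

ICs = []
for n_components in range(1, 6):
    gmm = GaussianMixture(n_components=n_components).fit(data)
    ICs.append(gmm.aic(data))  # gmm.bic(data) for BIC-based selection

ICs = np.asarray(ICs)
# Akaike weights: relative support for each candidate model
model_weights = np.exp(-0.5 * (ICs - ICs.min()))
model_weights /= model_weights.sum()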