def test_KMeansMachine():
    # Test a KMeansMachine

    means = np.array([[3, 70, 0], [4, 72, 0]], "float64")
    test_val = np.array([3, 70, 1], "float64")
    test_arr = np.array([[3, 70, 1], [5, 72, 0]], "float64")

    for transform in (to_numpy, to_dask_array):
        means, test_val, test_arr = transform(means, test_val, test_arr)

        # Initializes a KMeansMachine
        km = KMeansMachine(2)
        km.centroids_ = means

        # Distance and closest mean
        np.testing.assert_equal(km.transform(test_val)[0], np.array([1]))
        np.testing.assert_equal(km.transform(test_val)[1], np.array([6]))

        index = km.predict(test_val)
        assert index == 0

        indices = km.predict(test_arr)
        np.testing.assert_equal(indices, np.array([0, 1]))

        # Check __eq__ and is_similar_to
        km2 = KMeansMachine(2)
        assert km != km2
        assert not km.is_similar_to(km2)

        km2 = copy.deepcopy(km)
        assert km == km2
        assert km.is_similar_to(km2)

        km2.centroids_[0, 0] += 1
        assert km != km2
        assert not km.is_similar_to(km2)
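# The tests in this file exercise both eager and lazy inputs through two small
# helpers, to_numpy and to_dask_array. A minimal sketch of what they are
# assumed to do (the real helpers live in the package's test utilities):
import dask.array as da
import numpy as np


def to_numpy(*args):
    """Converts every input to a numpy array."""
    result = [np.asarray(a) for a in args]
    return result[0] if len(result) == 1 else result


def to_dask_array(*args):
    """Wraps every input in a dask array to exercise the lazy code path."""
    result = [da.from_array(np.asarray(a)) for a in args]
    return result[0] if len(result) == 1 else result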
def test_KMeansMachine():
    # Test a KMeansMachine

    means = numpy.array([[3, 70, 0], [4, 72, 0]], 'float64')
    mean = numpy.array([3, 70, 1], 'float64')

    # Initializes a KMeansMachine
    km = KMeansMachine(2, 3)
    km.means = means
    assert km.shape == (2, 3)

    # Sets and gets
    assert (km.means == means).all()
    assert (km.get_mean(0) == means[0, :]).all()
    assert (km.get_mean(1) == means[1, :]).all()
    km.set_mean(0, mean)
    assert (km.get_mean(0) == mean).all()

    # Distance and closest mean
    eps = 1e-10
    assert equals(km.get_distance_from_mean(mean, 0), 0, eps)
    assert equals(km.get_distance_from_mean(mean, 1), 6, eps)
    (index, dist) = km.get_closest_mean(mean)
    assert index == 0
    assert equals(dist, 0, eps)
    assert equals(km.get_min_distance(mean), 0, eps)

    # Loads and saves
    filename = str(tempfile.mkstemp(".hdf5")[1])
    km.save(bob.io.base.HDF5File(filename, 'w'))
    km_loaded = KMeansMachine(bob.io.base.HDF5File(filename))
    assert km == km_loaded

    # Resize
    km.resize(4, 5)
    assert km.shape == (4, 5)

    # Copy constructor and comparison operators
    km.resize(2, 3)
    km2 = KMeansMachine(km)
    assert km2 == km
    assert (km2 != km) is False
    assert km2.is_similar_to(km)
    means2 = numpy.array([[3, 70, 0], [4, 72, 2]], 'float64')
    km2.means = means2
    assert (km2 == km) is False
    assert km2 != km
    assert (km2.is_similar_to(km)) is False

    # Clean-up
    os.unlink(filename)
def test_trainer_exception():
    from nose.tools import assert_raises

    # Testing Inf
    machine = KMeansMachine(2, 2)
    data = numpy.array([[1.0, 2.0], [2, 3.], [1, 1.], [2, 5.], [numpy.inf, 1.0]])
    trainer = KMeansTrainer()
    assert_raises(ValueError, bob.learn.em.train, trainer, machine, data, 10)

    # Testing NaN
    machine = KMeansMachine(2, 2)
    data = numpy.array([[1.0, 2.0], [2, 3.], [1, numpy.nan], [2, 5.], [2.0, 1.0]])
    trainer = KMeansTrainer()
    assert_raises(ValueError, bob.learn.em.train, trainer, machine, data, 10)
def test_kmeans_a():
    # Trains a KMeansMachine
    # This file contains draws from two 1D Gaussian distributions:
    #   * 100 samples from N(-10,1)
    #   * 100 samples from N(10,1)
    data = bob.io.base.load(datafile("samplesFrom2G_f64.hdf5", __name__, path="../data/"))

    machine = KMeansMachine(2, 1)

    trainer = KMeansTrainer()
    # trainer.train(machine, data)
    bob.learn.em.train(trainer, machine, data)

    [variances, weights] = machine.get_variances_and_weights_for_each_cluster(data)
    variances_b = numpy.ndarray(shape=(2, 1), dtype=numpy.float64)
    weights_b = numpy.ndarray(shape=(2,), dtype=numpy.float64)
    machine.__get_variances_and_weights_for_each_cluster_init__(variances_b, weights_b)
    machine.__get_variances_and_weights_for_each_cluster_acc__(data, variances_b, weights_b)
    machine.__get_variances_and_weights_for_each_cluster_fin__(variances_b, weights_b)

    m1 = machine.get_mean(0)
    m2 = machine.get_mean(1)

    # Check means [-10,10] / variances [1,1] / weights [0.5,0.5]
    if m1 < m2:
        means = numpy.array([m1[0], m2[0]], 'float64')
    else:
        means = numpy.array([m2[0], m1[0]], 'float64')
    assert equals(means, numpy.array([-10., 10.]), 2e-1)
    assert equals(variances, numpy.array([1., 1.]), 2e-1)
    assert equals(weights, numpy.array([0.5, 0.5]), 1e-3)

    # The one-shot and the three-step (init/acc/fin) computations must agree.
    assert equals(variances, variances_b, 1e-8)
    assert equals(weights, weights_b, 1e-8)
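# The dunder methods above suggest a three-step streaming computation:
# zero-initialize accumulators, accumulate statistics over (possibly several)
# chunks of data, then finalize. A plain numpy sketch of that pattern, under
# that assumption (these helper names are hypothetical):
import numpy as np


def var_weight_init(n_clusters, dim):
    # Zero-initialized accumulators: per-cluster sums of x, x**2 and counts.
    return np.zeros((n_clusters, dim)), np.zeros((n_clusters, dim)), np.zeros(n_clusters)


def var_weight_acc(data, centroids, sum_x, sum_xx, counts):
    # Fold one chunk of data into the running statistics.
    labels = ((data[:, None, :] - centroids[None, :, :]) ** 2).sum(-1).argmin(1)
    for k in range(len(centroids)):
        chunk = data[labels == k]
        sum_x[k] += chunk.sum(axis=0)
        sum_xx[k] += (chunk ** 2).sum(axis=0)
        counts[k] += len(chunk)


def var_weight_fin(sum_x, sum_xx, counts):
    # Finalize: variance = E[x^2] - E[x]^2 per cluster, weight = cluster fraction.
    means = sum_x / counts[:, None]
    variances = sum_xx / counts[:, None] - means ** 2
    weights = counts / counts.sum()
    return variances, weights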
def test_kmeans_plus_plus():
    # Tests the K-Means++ initialization
    dim_c = 5
    dim_d = 7
    n_samples = 150
    data = numpy.random.randn(n_samples, dim_d)
    seed = 0

    # C++ implementation
    machine = KMeansMachine(dim_c, dim_d)
    trainer = KMeansTrainer()
    trainer.rng = bob.core.random.mt19937(seed)
    trainer.initialization_method = 'KMEANS_PLUS_PLUS'
    trainer.initialize(machine, data)

    # Python implementation
    py_machine = KMeansMachine(dim_c, dim_d)
    kmeans_plus_plus(py_machine, data, seed)
    assert equals(machine.means, py_machine.means, 1e-8)
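# The kmeans_plus_plus reference used above is defined in the test module. A
# self-contained numpy sketch of standard k-means++ seeding, for illustration
# only (matching the C++ machine bit-for-bit would additionally require
# sharing its mt19937 draw sequence):
import numpy as np


def kmeans_plus_plus_init(data, n_clusters, seed):
    rng = np.random.default_rng(seed)
    n_samples = data.shape[0]
    # First centre: one uniformly drawn sample.
    centres = [data[rng.integers(n_samples)]]
    for _ in range(n_clusters - 1):
        # Squared distance of every point to its nearest centre so far.
        d2 = np.min(
            ((data[:, None, :] - np.asarray(centres)[None, :, :]) ** 2).sum(-1), axis=1
        )
        # Next centre: sampled proportionally to that squared distance.
        centres.append(data[rng.choice(n_samples, p=d2 / d2.sum())])
    return np.asarray(centres)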
def test_kmeans_machine():
    # Test a KMeansMachine
    means = numpy.array([[3, 70, 0], [4, 72, 0]], "float64")

    # Initializes a KMeansMachine
    kmeans_machine = KMeansMachine(2, 3)
    kmeans_machine.means = means

    # A pickle round-trip must preserve the means.
    kmeans_machine_after_pickle = pickle.loads(pickle.dumps(kmeans_machine))
    assert numpy.allclose(kmeans_machine_after_pickle.means, kmeans_machine.means, 10e-3)
def test_kmeans_fit():
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    print(data1.min(), data1.max())
    print(data2.min(), data2.max())
    data = np.concatenate([data1, data2], axis=0)

    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = KMeansMachine(2, random_state=0).fit(data)
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        expected = [
            [-1.07173464, -1.06200356, -1.00724920],
            [0.99479125, 0.99665564, 0.97689017],
        ]
        np.testing.assert_almost_equal(centroids, expected, decimal=7)

        # Early stop
        machine = KMeansMachine(2, max_iter=2)
        machine.fit(data)
def _voice_activity_detection(self, energy_array: np.ndarray) -> np.ndarray:
    """Fits a 2-Gaussian GMM on the energy that splits between voice and silence."""
    n_samples = len(energy_array)
    # If the energy hardly changes, the signal is probably not audio.
    if np.std(energy_array) < 10e-5:
        return np.zeros(shape=n_samples)

    # Add a small epsilon of Gaussian noise to avoid numerical issues
    # (mainly due to artificial silence).
    energy_array = (1e-6 * np.random.randn(n_samples)) + energy_array

    # Normalize the energy array and make it an array of 1D samples.
    normalized_energy = utils.normalize_std_array(energy_array).reshape((-1, 1))

    # Note: self.max_iterations and self.convergence_threshold are used for both
    # k-means and GMM training.
    kmeans_trainer = KMeansMachine(
        n_clusters=2,
        convergence_threshold=self.convergence_threshold,
        max_iter=self.max_iterations,
        init_max_iter=self.max_iterations,
    )
    ubm_gmm = GMMMachine(
        n_gaussians=2,
        trainer="ml",
        update_means=True,
        update_variances=True,
        update_weights=True,
        convergence_threshold=self.convergence_threshold,
        max_fitting_steps=self.max_iterations,
        k_means_trainer=kmeans_trainer,
    )
    ubm_gmm.variance_thresholds = self.variance_threshold

    ubm_gmm.fit(normalized_energy)

    if np.isnan(ubm_gmm.means).any():
        logger.warn("Annotation aborted: File contains NaN's")
        return np.zeros(shape=n_samples, dtype=int)

    # Classify. The behavior depends on which Gaussian mean represents high
    # energy (the higher value); label 1 must always mean "voice".
    labels = ubm_gmm.log_weighted_likelihood(normalized_energy)
    if ubm_gmm.means.argmax() == 0:
        # High energy in means[0]
        labels = labels.argmin(axis=0)
    else:
        # High energy in means[1]
        labels = labels.argmax(axis=0)
    return labels
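# The argmin/argmax switch at the end is easy to misread: log_weighted_likelihood
# returns one row per Gaussian, and the intent is that label 1 always means
# "voice", whichever component captured the high-energy frames. A toy numpy
# illustration of that convention (the likelihood values here are made up):
import numpy as np

# Two rows (one per Gaussian), four frames; here Gaussian 0 models high energy.
log_likelihood = np.array(
    [[-1.0, -9.0, -1.5, -8.0],  # Gaussian 0: favours frames 0 and 2 (voice)
     [-7.0, -0.5, -6.0, -0.8]]  # Gaussian 1: favours frames 1 and 3 (silence)
)

# With high energy in component 0, argmin flips the component index so that
# voiced frames still come out as label 1.
print(log_likelihood.argmin(axis=0))  # [1 0 1 0]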
def test_kmeans_b():
    # Trains a KMeansMachine
    (arStd, std) = NormalizeStdArray(datafile("faithful.torch3.hdf5", __name__, path="../data/"))

    machine = KMeansMachine(2, 2)
    trainer = KMeansTrainer()
    # trainer.seed = 1337
    bob.learn.em.train(trainer, machine, arStd, convergence_threshold=0.001)

    [variances, weights] = machine.get_variances_and_weights_for_each_cluster(arStd)
    means = numpy.array(machine.means)
    variances = numpy.array(variances)

    multiplyVectorsByFactors(means, std)
    multiplyVectorsByFactors(variances, std ** 2)

    gmmWeights = bob.io.base.load(datafile('gmm.init_weights.hdf5', __name__, path="../data/"))
    gmmMeans = bob.io.base.load(datafile('gmm.init_means.hdf5', __name__, path="../data/"))
    gmmVariances = bob.io.base.load(datafile('gmm.init_variances.hdf5', __name__, path="../data/"))

    if means[0, 0] < means[1, 0]:
        means = flipRows(means)
        variances = flipRows(variances)
        weights = flipRows(weights)

    assert equals(means, gmmMeans, 1e-3)
    assert equals(weights, gmmWeights, 1e-3)
    assert equals(variances, gmmVariances, 1e-3)

    # Check that there are no duplicate means during initialization
    machine = KMeansMachine(2, 1)
    trainer = KMeansTrainer()
    trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
    data = numpy.array([[1.], [1.], [1.], [1.], [1.], [1.], [2.], [3.]])
    bob.learn.em.train(trainer, machine, data)
    assert not numpy.isnan(machine.means).any()
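# test_kmeans_b leans on three helpers defined alongside it. A minimal sketch
# of what they are assumed to do:
import numpy
import bob.io.base


def NormalizeStdArray(path):
    # Load an array and scale each column to unit standard deviation, returning
    # both the normalized data and the per-column std for later rescaling.
    array = bob.io.base.load(path).astype('float64')
    std = array.std(axis=0)
    return array / std, std


def multiplyVectorsByFactors(matrix, vector):
    # In-place column-wise rescaling (undoes the normalization above).
    matrix *= vector


def flipRows(array):
    # Swap the two rows (or the two entries of a 1D array) so the cluster
    # order matches the reference files.
    return numpy.flipud(array).copy()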
def test_custom_trainer():
    # Custom python trainer
    ar = bob.io.base.load(datafile("faithful.torch3_f64.hdf5", __name__, path="../data/"))

    mytrainer = MyTrainer1()

    machine = KMeansMachine(2, 2)
    mytrainer.train(machine, ar)

    for i in range(0, 2):
        assert (ar[i + 1] == machine.means[i, :]).all()
def test_kmeans_noduplicate():
    # Data/dimensions
    dim_c = 2
    dim_d = 3
    seed = 0
    data = numpy.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [4, 5, 6.]])

    # Defines machine and trainer
    machine = KMeansMachine(dim_c, dim_d)
    trainer = KMeansTrainer()
    rng = bob.core.random.mt19937(seed)
    trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
    trainer.initialize(machine, data, rng)

    # Makes sure that the two initial mean vectors selected are different
    assert not equals(machine.get_mean(0), machine.get_mean(1), 1e-8)
def test_kmeans_fit_init_random():
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    data = np.concatenate([data1, data2], axis=0)

    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = KMeansMachine(2, init_method="random", random_state=0).fit(data)
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        expected = [
            [-1.07329460, -1.06207104, -1.00714365],
            [0.99529015, 0.99570570, 0.97580858],
        ]
        np.testing.assert_almost_equal(centroids, expected, decimal=7)
def test_KMeansMachine_var_and_weight():
    for transform in (to_numpy, to_dask_array):
        kmeans = KMeansMachine(2)
        kmeans.centroids_ = transform(np.array([[1.2, 1.3], [0.2, -0.3]]))

        data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]])
        data = transform(data)
        variances, weights = kmeans.get_variances_and_weights_for_each_cluster(data)

        variances_result = np.array([[0.01, 1.0], [0.01555556, 0.00888889]])
        weights_result = np.array([0.4, 0.6])

        np.testing.assert_almost_equal(variances, variances_result, decimal=7)
        np.testing.assert_equal(weights, weights_result)
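# The expected values in test_KMeansMachine_var_and_weight follow directly from
# assigning each sample to its nearest centroid and taking per-cluster
# statistics; a plain numpy check that reproduces them:
import numpy as np

centroids = np.array([[1.2, 1.3], [0.2, -0.3]])
data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]])

# Assign each sample to its closest centroid (squared Euclidean distance).
labels = ((data[:, None, :] - centroids[None, :, :]) ** 2).sum(-1).argmin(1)

# Per-cluster biased variance and fraction of samples per cluster.
variances = np.array([data[labels == k].var(axis=0) for k in range(2)])
weights = np.bincount(labels) / len(data)

print(variances)  # [[0.01       1.        ]
                  #  [0.01555556 0.00888889]]
print(weights)    # [0.4 0.6]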
def test_gmm_kmeans_plusplus_init():
    n_gaussians = 3
    machine = GMMMachine(
        n_gaussians,
        k_means_trainer=KMeansMachine(n_clusters=n_gaussians, init_method="k-means++"),
    )
    data = np.array([[1.5, 1], [1, 1.5], [-1, 0.5], [-1.5, 0], [2, 2], [2.5, 2.5]])

    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = machine.fit(data)
        expected_means = np.array([[2.25, 2.25], [-1.25, 0.25], [1.25, 1.25]])
        expected_variances = np.array(
            [[1 / 16, 1 / 16], [1 / 16, 1 / 16], [1 / 16, 1 / 16]]
        )
        np.testing.assert_almost_equal(machine.means, expected_means, decimal=3)
        np.testing.assert_almost_equal(machine.variances, expected_variances)
def test_kmeans_parameters():
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    data = np.concatenate([data1, data2], axis=0)

    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = KMeansMachine(
            n_clusters=2,
            init_method="k-means||",
            convergence_threshold=1e-5,
            max_iter=5,
            random_state=0,
            init_max_iter=5,
        ).fit(data)
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        expected = [
            [-1.07173464, -1.06200356, -1.00724920],
            [0.99479125, 0.99665564, 0.97689017],
        ]
        np.testing.assert_almost_equal(centroids, expected, decimal=7)
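# "k-means||" is the scalable initialization of Bahmani et al. (2012):
# init_max_iter bounds the number of sampling rounds, and oversampling_factor
# controls how many candidates each round draws. A loose numpy sketch of the
# idea, not the library's actual implementation (which, e.g., reduces the
# candidates with a weighted k-means++ pass rather than the weighted draw here):
import numpy as np


def kmeans_parallel_init(data, n_clusters, init_max_iter=5,
                         oversampling_factor=2, seed=0):
    rng = np.random.default_rng(seed)

    def closest_sq_dist(centres):
        return np.min(((data[:, None, :] - centres[None, :, :]) ** 2).sum(-1), axis=1)

    # Start from one uniformly drawn point.
    candidates = data[rng.integers(len(data))][None, :]
    for _ in range(init_max_iter):
        d2 = closest_sq_dist(candidates)
        # Each point joins the candidate set independently, with probability
        # proportional to its distance contribution times the oversampling factor.
        probs = np.minimum(oversampling_factor * d2 / d2.sum(), 1.0)
        picked = data[rng.random(len(data)) < probs]
        if len(picked):
            candidates = np.concatenate([candidates, picked])

    # Weight each candidate by how many points it attracts, then reduce the
    # candidate set to n_clusters (simplified here to a weighted draw).
    nearest = ((data[:, None, :] - candidates[None, :, :]) ** 2).sum(-1).argmin(1)
    weights = np.bincount(nearest, minlength=len(candidates)).astype(float)
    weights += 1e-12  # keep every candidate drawable in the sketch
    idx = rng.choice(len(candidates), size=n_clusters, replace=False,
                     p=weights / weights.sum())
    return candidates[idx]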
def fit(self, array, y=None, **kwargs):
    """Trains the UBM."""
    # Stack all the samples in a 2D array of features
    if isinstance(array, da.Array):
        array = array.persist()

    # If the input is a list (or SampleBatch) of 2-dimensional arrays, stack them
    if array[0].ndim == 2:
        array = np.vstack(array)

    logger.debug(
        f"Creating UBM machine with {self.number_of_gaussians} gaussians "
        f"and {len(array)} samples"
    )
    self.ubm = GMMMachine(
        n_gaussians=self.number_of_gaussians,
        trainer="ml",
        max_fitting_steps=self.ubm_training_iterations,
        convergence_threshold=self.training_threshold,
        update_means=self.update_means,
        update_variances=self.update_variances,
        update_weights=self.update_weights,
        mean_var_update_threshold=self.variance_threshold,
        # The k-means machine provides the initial GMM means.
        k_means_trainer=KMeansMachine(
            self.number_of_gaussians,
            convergence_threshold=self.training_threshold,
            max_iter=self.kmeans_training_iterations,
            init_method="k-means||",
            init_max_iter=self.kmeans_init_iterations,
            random_state=self.init_seed,
            oversampling_factor=self.kmeans_oversampling_factor,
        ),
    )

    # Train the GMM
    logger.info("Training UBM GMM")
    self.ubm.fit(array)

    return self
from bob.bio.spear.extractor import Cepstral
from bob.bio.spear.transformer import ReferenceIdEncoder
from bob.learn.em import KMeansMachine
from bob.pipelines import wrap

SEED = 0

ubm = GMM(
    n_gaussians=256,
    max_fitting_steps=2,
    convergence_threshold=1e-3,  # Maximum number of iterations as stopping criterion
    k_means_trainer=KMeansMachine(
        n_clusters=256,
        max_iter=2,
        random_state=SEED,
        init_max_iter=5,
        oversampling_factor=64,
    ),
    return_stats_in_transform=True,
)

bioalgorithm = ISV(
    # ISV parameters
    r_U=50,
    random_state=SEED,
    em_iterations=2,
    enroll_iterations=1,
    # GMM parameters
    ubm=ubm,
)
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

from bob.learn.em import KMeansMachine

iris_data = load_iris()
data = iris_data.data
setosa = data[iris_data.target == 0]
versicolor = data[iris_data.target == 1]
virginica = data[iris_data.target == 2]

# Training KMeans
# 3 clusters, trained on all four features (only the first two are plotted)
machine = KMeansMachine(n_clusters=3, init_method="k-means++").fit(data)
predictions = machine.predict(data)

# Plotting
figure, ax = plt.subplots()
plt.scatter(setosa[:, 0], setosa[:, 1], c="darkcyan", label="setosa")
plt.scatter(versicolor[:, 0], versicolor[:, 1], c="goldenrod", label="versicolor")
plt.scatter(virginica[:, 0], virginica[:, 1], c="dimgrey", label="virginica")
plt.scatter(
    machine.centroids_[:, 0],
    machine.centroids_[:, 1],
    c="blue",
    marker="x",
    label="centroids",
)
plt.legend()
plt.show()
This pipeline is composed of the following steps:
- annotator: Energy_2Gauss (VAD on 2 Gaussians)
- extractor: Cepstral (MFCC, 60 features)
- algorithm: GMM (trained in the pipeline as a Transformer, and used as
  BioAlgorithm for enrollment and scoring)
"""

# Number of Gaussians for the UBM (used by k-means and GMM)
n_gaussians = 256

# K-means machine used for GMM initialization
kmeans_trainer = KMeansMachine(
    n_clusters=n_gaussians,
    max_iter=25,
    convergence_threshold=0.0,
    init_max_iter=5,
    oversampling_factor=64,
)

# Algorithm used for enrollment and scoring, trained first as a Transformer.
bioalgorithm = GMM(
    n_gaussians=n_gaussians,
    max_fitting_steps=25,
    enroll_iterations=1,
    convergence_threshold=0.0,  # Maximum number of iterations as stopping criterion
    k_means_trainer=kmeans_trainer,
    random_state=2,
)