Example #1
def test_KMeansMachine():
    # Test a KMeansMachine

    means = np.array([[3, 70, 0], [4, 72, 0]], "float64")
    test_val = np.array([3, 70, 1], "float64")
    test_arr = np.array([[3, 70, 1], [5, 72, 0]], "float64")

    for transform in (to_numpy, to_dask_array):
        means, test_val, test_arr = transform(means, test_val, test_arr)

        # Initializes a KMeansMachine
        km = KMeansMachine(2)
        km.centroids_ = means

        # Distance and closest mean
        np.testing.assert_equal(km.transform(test_val)[0], np.array([1]))
        np.testing.assert_equal(km.transform(test_val)[1], np.array([6]))

        index = km.predict(test_val)
        assert index == 0

        indices = km.predict(test_arr)
        np.testing.assert_equal(indices, np.array([0, 1]))

        # Check __eq__ and is_similar_to
        km2 = KMeansMachine(2)
        assert km != km2
        assert not km.is_similar_to(km2)
        km2 = copy.deepcopy(km)
        assert km == km2
        assert km.is_similar_to(km2)
        km2.centroids_[0, 0] += 1
        assert km != km2
        assert not km.is_similar_to(km2)
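Example #1 runs the same assertions on numpy and dask arrays via to_numpy and to_dask_array helpers that the snippet does not show. A minimal sketch of equivalent stand-ins (names and behavior assumed here, not taken verbatim from the test suite):

import dask.array as da
import numpy as np

def to_numpy(*args):
    # Convert every argument to a plain numpy array.
    out = [np.asarray(a) for a in args]
    return out[0] if len(out) == 1 else out

def to_dask_array(*args):
    # Wrap every argument in a dask array to exercise the lazy code path.
    out = [da.from_array(np.asarray(a)) for a in args]
    return out[0] if len(out) == 1 else out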
Example #2
def test_KMeansMachine():
    # Test a KMeansMachine

    means = numpy.array([[3, 70, 0], [4, 72, 0]], 'float64')
    mean = numpy.array([3, 70, 1], 'float64')

    # Initializes a KMeansMachine
    km = KMeansMachine(2, 3)
    km.means = means
    assert km.shape == (2, 3)

    # Sets and gets
    assert (km.means == means).all()
    assert (km.get_mean(0) == means[0, :]).all()
    assert (km.get_mean(1) == means[1, :]).all()
    km.set_mean(0, mean)
    assert (km.get_mean(0) == mean).all()

    # Distance and closest mean
    eps = 1e-10

    assert equals(km.get_distance_from_mean(mean, 0), 0, eps)
    assert equals(km.get_distance_from_mean(mean, 1), 6, eps)

    (index, dist) = km.get_closest_mean(mean)

    assert index == 0
    assert equals(dist, 0, eps)
    assert equals(km.get_min_distance(mean), 0, eps)

    # Loads and saves
    filename = str(tempfile.mkstemp(".hdf5")[1])
    km.save(bob.io.base.HDF5File(filename, 'w'))
    km_loaded = KMeansMachine(bob.io.base.HDF5File(filename))
    assert km == km_loaded

    # Resize
    km.resize(4, 5)
    assert km.shape == (4, 5)

    # Copy constructor and comparison operators
    km.resize(2, 3)
    km2 = KMeansMachine(km)
    assert km2 == km
    assert (km2 != km) is False
    assert km2.is_similar_to(km)
    means2 = numpy.array([[3, 70, 0], [4, 72, 2]], 'float64')
    km2.means = means2
    assert (km2 == km) is False
    assert km2 != km
    assert (km2.is_similar_to(km)) is False

    # Clean-up
    os.unlink(filename)
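Example #2 exercises the older C++-backed API. In the rewritten bob.learn.em used in Examples #1 and #7, the same checks map roughly as follows (a sketch based only on the attribute names visible in those examples):

import numpy as np
from bob.learn.em import KMeansMachine

km = KMeansMachine(2)
km.centroids_ = np.array([[3, 70, 0], [4, 72, 0]], "float64")  # was: km.means = means
mean = np.array([3, 70, 1], "float64")
print(km.centroids_[0])          # was: km.get_mean(0)
print(km.predict(mean))          # was: km.get_closest_mean(mean)[0]
print(km.transform(mean).min())  # was: km.get_min_distance(mean)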
Example #3
def test_trainer_exception():
    from nose.tools import assert_raises

    # Testing Inf
    machine = KMeansMachine(2, 2)
    data = numpy.array([[1.0, 2.0], [2, 3.], [1, 1.], [2, 5.], [numpy.inf, 1.0]])
    trainer = KMeansTrainer()
    assert_raises(ValueError, bob.learn.em.train, trainer, machine, data, 10)

    # Testing Nan
    machine = KMeansMachine(2, 2)
    data = numpy.array([[1.0, 2.0], [2, 3.], [1, numpy.nan], [2, 5.], [2.0, 1.0]])
    trainer = KMeansTrainer()
    assert_raises(ValueError, bob.learn.em.train, trainer, machine, data, 10)
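The test above expects training to reject non-finite inputs with a ValueError. A sketch of the kind of validation that produces this behavior (illustrative only, not the library's actual code):

import numpy as np

def check_finite(data: np.ndarray) -> None:
    # Cluster statistics are meaningless on inf/NaN inputs, so fail fast.
    if not np.isfinite(data).all():
        raise ValueError("Input data contains inf or NaN values")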
Example #4
def test_kmeans_a():
    # Trains a KMeansMachine
    # This file contains draws from two 1D Gaussian distributions:
    #   * 100 samples from N(-10,1)
    #   * 100 samples from N(10,1)
    data = bob.io.base.load(datafile("samplesFrom2G_f64.hdf5", __name__, path="../data/"))

    machine = KMeansMachine(2, 1)

    trainer = KMeansTrainer()
    # trainer.train(machine, data)
    bob.learn.em.train(trainer, machine, data)

    [variances, weights] = machine.get_variances_and_weights_for_each_cluster(data)
    variances_b = numpy.ndarray(shape=(2, 1), dtype=numpy.float64)
    weights_b = numpy.ndarray(shape=(2,), dtype=numpy.float64)
    machine.__get_variances_and_weights_for_each_cluster_init__(variances_b, weights_b)
    machine.__get_variances_and_weights_for_each_cluster_acc__(data, variances_b, weights_b)
    machine.__get_variances_and_weights_for_each_cluster_fin__(variances_b, weights_b)
    m1 = machine.get_mean(0)
    m2 = machine.get_mean(1)

    # Check means [-10,10] / variances [1,1] / weights [0.5,0.5]
    if (m1 < m2):
        means = numpy.array(([m1[0], m2[0]]), 'float64')
    else:
        means = numpy.array(([m2[0], m1[0]]), 'float64')
    assert equals(means, numpy.array([-10., 10.]), 2e-1)
    assert equals(variances, numpy.array([1., 1.]), 2e-1)
    assert equals(weights, numpy.array([0.5, 0.5]), 1e-3)

    assert equals(variances, variances_b, 1e-8)
    assert equals(weights, weights_b, 1e-8)
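The samplesFrom2G_f64.hdf5 file is not reproduced here; per the comment at the top of the test, an equivalent dataset can be generated as follows (a hypothetical stand-in, not the original draws):

import numpy as np

rng = np.random.default_rng(0)
# 100 samples from N(-10, 1) followed by 100 samples from N(10, 1).
data = np.concatenate([
    rng.normal(-10.0, 1.0, size=(100, 1)),
    rng.normal(10.0, 1.0, size=(100, 1)),
])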
Example #5
def test_kmeans_plus_plus():
    # Tests the K-Means++ initialization
    dim_c = 5
    dim_d = 7
    n_samples = 150
    data = numpy.random.randn(n_samples, dim_d)
    seed = 0

    # C++ implementation
    machine = KMeansMachine(dim_c, dim_d)
    trainer = KMeansTrainer()
    trainer.rng = bob.core.random.mt19937(seed)
    trainer.initialization_method = 'KMEANS_PLUS_PLUS'
    trainer.initialize(machine, data)

    # Python implementation
    py_machine = KMeansMachine(dim_c, dim_d)
    kmeans_plus_plus(py_machine, data, seed)
    assert equals(machine.means, py_machine.means, 1e-8)
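The kmeans_plus_plus Python helper that the test compares against is not shown. A minimal NumPy sketch of k-means++ seeding (signature assumed; this sketch will not reproduce the C++ RNG draws bit-for-bit, which the real helper must do for the assertion to hold):

import numpy as np

def kmeans_plus_plus(machine, data, seed):
    # First center uniformly at random; each subsequent center drawn with
    # probability proportional to the squared distance to the closest
    # center chosen so far (Arthur & Vassilvitskii, 2007).
    rng = np.random.RandomState(seed)
    n_clusters = machine.means.shape[0]
    centers = [data[rng.randint(len(data))]]
    for _ in range(n_clusters - 1):
        d2 = np.min([((data - c) ** 2).sum(axis=1) for c in centers], axis=0)
        centers.append(data[rng.choice(len(data), p=d2 / d2.sum())])
    machine.means = np.array(centers)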
Example #6
def test_kmeans_machine():
    # Test a KMeansMachine

    means = numpy.array([[3, 70, 0], [4, 72, 0]], "float64")

    # Initializes a KMeansMachine
    kmeans_machine = KMeansMachine(2, 3)
    kmeans_machine.means = means

    kmeans_machine_after_pickle = pickle.loads(pickle.dumps(kmeans_machine))
    assert numpy.allclose(kmeans_machine_after_pickle.means,
                          kmeans_machine.means, 10e-3)
Example #7
def test_kmeans_fit():
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    print(data1.min(), data1.max())
    print(data2.min(), data2.max())
    data = np.concatenate([data1, data2], axis=0)

    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = KMeansMachine(2, random_state=0).fit(data)
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        expected = [
            [-1.07173464, -1.06200356, -1.00724920],
            [0.99479125, 0.99665564, 0.97689017],
        ]
        np.testing.assert_almost_equal(centroids, expected, decimal=7)

        # Early stop
        machine = KMeansMachine(2, max_iter=2)
        machine.fit(data)
Example #8
    def _voice_activity_detection(self,
                                  energy_array: np.ndarray) -> np.ndarray:
        """Fits a 2 Gaussian GMM on the energy that splits between voice and silence."""
        n_samples = len(energy_array)
        # If the energy hardly varies, the signal is probably not audio.
        if np.std(energy_array) < 10e-5:
            return np.zeros(shape=n_samples)

        # Add an epsilon small Gaussian noise to avoid numerical issues (mainly due to artificial silence).
        energy_array = (1e-6 * np.random.randn(n_samples)) + energy_array

        # Normalize the energy array, make it an array of 1D samples
        normalized_energy = utils.normalize_std_array(energy_array).reshape(
            (-1, 1))

        # Note: self.max_iterations and self.convergence_threshold are used for both
        # k-means and GMM training.
        kmeans_trainer = KMeansMachine(
            n_clusters=2,
            convergence_threshold=self.convergence_threshold,
            max_iter=self.max_iterations,
            init_max_iter=self.max_iterations,
        )
        ubm_gmm = GMMMachine(
            n_gaussians=2,
            trainer="ml",
            update_means=True,
            update_variances=True,
            update_weights=True,
            convergence_threshold=self.convergence_threshold,
            max_fitting_steps=self.max_iterations,
            k_means_trainer=kmeans_trainer,
        )
        ubm_gmm.variance_thresholds = self.variance_threshold

        ubm_gmm.fit(normalized_energy)

        if np.isnan(ubm_gmm.means).any():
            logger.warning("Annotation aborted: File contains NaN's")
            return np.zeros(shape=n_samples, dtype=int)

        # Classify

        # Behavior depends on which mean represents high energy (the higher value)
        labels = ubm_gmm.log_weighted_likelihood(normalized_energy)
        if ubm_gmm.means.argmax() == 0:  # High energy in means[0]
            labels = labels.argmin(axis=0)
        else:  # High energy in means[1]
            labels = labels.argmax(axis=0)

        return labels
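A standalone sketch of the same voice/silence split outside the class, assuming only bob.learn.em's GMMMachine and synthetic energies (values illustrative):

import numpy as np
from bob.learn.em import GMMMachine

np.random.seed(0)
# Synthetic energies: low-energy "silence" frames, then high-energy "voice" frames.
energy = np.concatenate(
    [np.random.normal(0.0, 0.1, 100), np.random.normal(5.0, 0.1, 100)]
)
normalized = ((energy - energy.mean()) / energy.std()).reshape(-1, 1)

gmm = GMMMachine(n_gaussians=2, trainer="ml").fit(normalized)
ll = gmm.log_weighted_likelihood(normalized)  # shape (n_gaussians, n_samples)
# Make label 1 mean "voice" regardless of which Gaussian got the higher mean.
labels = ll.argmin(axis=0) if gmm.means.argmax() == 0 else ll.argmax(axis=0)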
Example #9
def test_kmeans_b():
    # Trains a KMeansMachine
    (arStd, std) = NormalizeStdArray(datafile("faithful.torch3.hdf5", __name__, path="../data/"))

    machine = KMeansMachine(2, 2)

    trainer = KMeansTrainer()
    # trainer.seed = 1337
    bob.learn.em.train(trainer, machine, arStd, convergence_threshold=0.001)

    [variances, weights] = machine.get_variances_and_weights_for_each_cluster(arStd)

    means = numpy.array(machine.means)
    variances = numpy.array(variances)

    multiplyVectorsByFactors(means, std)
    multiplyVectorsByFactors(variances, std ** 2)

    gmmWeights = bob.io.base.load(datafile('gmm.init_weights.hdf5', __name__, path="../data/"))
    gmmMeans = bob.io.base.load(datafile('gmm.init_means.hdf5', __name__, path="../data/"))
    gmmVariances = bob.io.base.load(datafile('gmm.init_variances.hdf5', __name__, path="../data/"))

    if (means[0, 0] < means[1, 0]):
        means = flipRows(means)
        variances = flipRows(variances)
        weights = flipRows(weights)

    assert equals(means, gmmMeans, 1e-3)
    assert equals(weights, gmmWeights, 1e-3)
    assert equals(variances, gmmVariances, 1e-3)

    # Check that there is no duplicate means during initialization
    machine = KMeansMachine(2, 1)
    trainer = KMeansTrainer()
    trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
    data = numpy.array([[1.], [1.], [1.], [1.], [1.], [1.], [2.], [3.]])
    bob.learn.em.train(trainer, machine, data)
    assert not numpy.isnan(machine.means).any()
Example #10
def test_custom_trainer():

    # Custom python trainer

    ar = bob.io.base.load(
        datafile("faithful.torch3_f64.hdf5", __name__, path="../data/"))

    mytrainer = MyTrainer1()

    machine = KMeansMachine(2, 2)
    mytrainer.train(machine, ar)

    for i in range(0, 2):
        assert (ar[i + 1] == machine.means[i, :]).all()
Example #11
def test_kmeans_noduplicate():
    # Data/dimensions
    dim_c = 2
    dim_d = 3
    seed = 0
    data = numpy.array([[1, 2, 3], [1, 2, 3], [1, 2, 3], [4, 5, 6.]])
    # Defines machine and trainer
    machine = KMeansMachine(dim_c, dim_d)
    trainer = KMeansTrainer()
    rng = bob.core.random.mt19937(seed)
    trainer.initialization_method = 'RANDOM_NO_DUPLICATE'
    trainer.initialize(machine, data, rng)
    # Makes sure that the two initial mean vectors selected are different
    assert not equals(machine.get_mean(0), machine.get_mean(1), 1e-8)
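A sketch of the guarantee RANDOM_NO_DUPLICATE provides (an illustrative helper, not the library's implementation):

import numpy as np

def random_no_duplicate_init(data, n_clusters, seed=0):
    # Draw initial means from the *unique* rows so no two initial
    # centroids coincide, even when the data contains duplicates.
    rng = np.random.RandomState(seed)
    unique_rows = np.unique(data, axis=0)
    idx = rng.choice(len(unique_rows), size=n_clusters, replace=False)
    return unique_rows[idx]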
Example #12
def test_kmeans_fit_init_random():
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    data = np.concatenate([data1, data2], axis=0)
    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = KMeansMachine(2, init_method="random",
                                random_state=0).fit(data)
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        expected = [
            [-1.07329460, -1.06207104, -1.00714365],
            [0.99529015, 0.99570570, 0.97580858],
        ]
        np.testing.assert_almost_equal(centroids, expected, decimal=7)
Example #13
def test_KMeansMachine_var_and_weight():
    for transform in (to_numpy, to_dask_array):
        kmeans = KMeansMachine(2)
        kmeans.centroids_ = transform(np.array([[1.2, 1.3], [0.2, -0.3]]))

        data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]])
        data = transform(data)
        variances, weights = kmeans.get_variances_and_weights_for_each_cluster(
            data)

        variances_result = np.array([[0.01, 1.0], [0.01555556, 0.00888889]])
        weights_result = np.array([0.4, 0.6])

        np.testing.assert_almost_equal(variances, variances_result, decimal=7)
        np.testing.assert_equal(weights, weights_result)
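The expected values can be verified by hand: the first two samples are closest to centroid 0 and the last three to centroid 1, and the biased (population) per-cluster variances and sample fractions follow directly:

import numpy as np

data = np.array([[1.0, 1], [1.2, 3], [0, 0], [0.3, 0.2], [0.2, 0]])
cluster0, cluster1 = data[:2], data[2:]

print(cluster0.var(axis=0))  # [0.01, 1.0]
print(cluster1.var(axis=0))  # [0.01555556, 0.00888889]
print(len(cluster0) / len(data), len(cluster1) / len(data))  # 0.4 0.6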
Example #14
def test_gmm_kmeans_plusplus_init():
    n_gaussians = 3
    machine = GMMMachine(
        n_gaussians,
        k_means_trainer=KMeansMachine(n_clusters=n_gaussians,
                                      init_method="k-means++"),
    )
    data = np.array([[1.5, 1], [1, 1.5], [-1, 0.5], [-1.5, 0], [2, 2],
                     [2.5, 2.5]])
    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = machine.fit(data)
        expected_means = np.array([[2.25, 2.25], [-1.25, 0.25], [1.25, 1.25]])
        expected_variances = np.array([[1 / 16, 1 / 16], [1 / 16, 1 / 16],
                                       [1 / 16, 1 / 16]])
        np.testing.assert_almost_equal(machine.means,
                                       expected_means,
                                       decimal=3)
        np.testing.assert_almost_equal(machine.variances, expected_variances)
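The expected values follow from the hard assignments: each cluster holds two points, so its mean is their midpoint and its biased per-dimension variance is the squared half-distance, here 0.25 ** 2 = 1/16:

import numpy as np

clusters = [
    np.array([[2, 2], [2.5, 2.5]]),    # mean [2.25, 2.25]
    np.array([[-1, 0.5], [-1.5, 0]]),  # mean [-1.25, 0.25]
    np.array([[1.5, 1], [1, 1.5]]),    # mean [1.25, 1.25]
]
for c in clusters:
    print(c.mean(axis=0), c.var(axis=0))  # every variance is [0.0625, 0.0625]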
Example #15
def test_kmeans_parameters():
    np.random.seed(0)
    data1 = np.random.normal(loc=1, size=(2000, 3))
    data2 = np.random.normal(loc=-1, size=(2000, 3))
    data = np.concatenate([data1, data2], axis=0)
    for transform in (to_numpy, to_dask_array):
        data = transform(data)
        machine = KMeansMachine(
            n_clusters=2,
            init_method="k-means||",
            convergence_threshold=1e-5,
            max_iter=5,
            random_state=0,
            init_max_iter=5,
        ).fit(data)
        centroids = machine.centroids_[np.argsort(machine.centroids_[:, 0])]
        expected = [
            [-1.07173464, -1.06200356, -1.00724920],
            [0.99479125, 0.99665564, 0.97689017],
        ]
        np.testing.assert_almost_equal(centroids, expected, decimal=7)
Example #16
    def fit(self, array, y=None, **kwargs):
        """Trains the UBM."""
        # Stack all the samples in a 2D array of features
        if isinstance(array, da.Array):
            array = array.persist()

        # if input is a list (or SampleBatch) of 2 dimensional arrays, stack them
        if array[0].ndim == 2:
            array = np.vstack(array)

        logger.debug(
            f"Creating UBM machine with {self.number_of_gaussians} gaussians and {len(array)} samples"
        )

        self.ubm = GMMMachine(
            n_gaussians=self.number_of_gaussians,
            trainer="ml",
            max_fitting_steps=self.ubm_training_iterations,
            convergence_threshold=self.training_threshold,
            update_means=self.update_means,
            update_variances=self.update_variances,
            update_weights=self.update_weights,
            mean_var_update_threshold=self.variance_threshold,
            k_means_trainer=KMeansMachine(
                self.number_of_gaussians,
                convergence_threshold=self.training_threshold,
                max_iter=self.kmeans_training_iterations,
                init_method="k-means||",
                init_max_iter=self.kmeans_init_iterations,
                random_state=self.init_seed,
                oversampling_factor=self.kmeans_oversampling_factor,
            ),
        )

        # Train the GMM
        logger.info("Training UBM GMM")

        self.ubm.fit(array)

        return self
Example #17
from bob.bio.spear.extractor import Cepstral
from bob.bio.spear.transformer import ReferenceIdEncoder
from bob.learn.em import KMeansMachine
from bob.pipelines import wrap

# Note: the GMM and ISV classes used below are not imported in this excerpt;
# in the original config they presumably come from bob.bio.gmm.

SEED = 0

ubm = GMM(
    n_gaussians=256,
    max_fitting_steps=2,
    convergence_threshold=1e-3,  # maximum number of iterations acts as the stopping criterion
    k_means_trainer=KMeansMachine(
        n_clusters=256,
        max_iter=2,
        random_state=SEED,
        init_max_iter=5,
        oversampling_factor=64,
    ),
    return_stats_in_transform=True,
)

bioalgorithm = ISV(
    # ISV parameters
    r_U=50,
    random_state=SEED,
    em_iterations=2,
    enroll_iterations=1,
    # GMM parameters
    ubm=ubm,
)
Example #18
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from bob.learn.em import KMeansMachine

iris_data = load_iris()
data = iris_data.data
setosa = data[iris_data.target == 0]
versicolor = data[iris_data.target == 1]
virginica = data[iris_data.target == 2]

# Training KMeans with 3 clusters on the 4-dimensional iris data
# (only the first two feature dimensions are plotted below)
machine = KMeansMachine(n_clusters=3, init_method="k-means++").fit(data)

predictions = machine.predict(data)

# Plotting
figure, ax = plt.subplots()
plt.scatter(setosa[:, 0], setosa[:, 1], c="darkcyan", label="setosa")
plt.scatter(versicolor[:, 0],
            versicolor[:, 1],
            c="goldenrod",
            label="versicolor")
plt.scatter(virginica[:, 0], virginica[:, 1], c="dimgrey", label="virginica")
plt.scatter(
    machine.centroids_[:, 0],
    machine.centroids_[:, 1],
    c="blue",
    marker="x",
    label="centroids",
)
plt.legend()
plt.show()
Example #19
This pipeline is composed of the following steps:
    - annotator: Energy_2Gauss (VAD on 2 Gaussians)
    - extractor: Cepstral (MFCC, 60 features)
    - algorithm: GMM (trained in the pipeline as a Transformer, and used as BioAlgorithm
        for enrollment and scoring)
"""

# Number of Gaussians for the UBM (used by kmeans and GMM)
n_gaussians = 256

# Kmeans machine used for GMM initialization
kmeans_trainer = KMeansMachine(
    n_clusters=n_gaussians,
    max_iter=25,
    convergence_threshold=0.0,
    init_max_iter=5,
    oversampling_factor=64,
)

# Algorithm used for enrollment and scoring, trained first as a Transformer.
bioalgorithm = GMM(
    n_gaussians=n_gaussians,
    max_fitting_steps=25,
    enroll_iterations=1,
    convergence_threshold=0.0,  # maximum number of iterations acts as the stopping criterion
    k_means_trainer=kmeans_trainer,
    random_state=2,
)