def kmeans_initialize(algorithm, extractor, limit_data=None, force=False, allow_missing_files=False):
    """Initializes the K-Means training (non-parallel)."""
    fs = FileSelector.instance()

    output_file = fs.kmeans_intermediate_file(0)
    if utils.check_file(output_file, force, 1000):
        logger.info("UBM training: Skipping KMeans initialization since the file '%s' already exists", output_file)
    else:
        # read data
        logger.info("UBM training: initializing kmeans")
        training_list = utils.selected_elements(fs.training_list('extracted', 'train_projector'), limit_data)

        # read the features
        reader = functools.partial(read_feature, extractor)
        data = utils.vstack_features(reader, training_list, allow_missing_files=allow_missing_files)

        # Perform KMeans initialization
        kmeans_machine = bob.learn.em.KMeansMachine(algorithm.gaussians, data.shape[1])
        # Creates the KMeansTrainer and calls the initialization procedure.
        # Resetting the pseudo random number generator so that serial and
        # parallel execution share the same initialization.
        algorithm.rng = bob.core.random.mt19937(algorithm.init_seed)
        algorithm.kmeans_trainer.initialize(kmeans_machine, data, algorithm.rng)

        bob.io.base.create_directories_safe(os.path.dirname(output_file))
        kmeans_machine.save(bob.io.base.HDF5File(output_file, 'w'))
        logger.info("UBM training: saved initial KMeans machine to '%s'", output_file)

def lda_project(algorithm, indices, force=False, allow_missing_files=False):
    """Performs LDA projection of the whitened i-vectors."""
    fs = FileSelector.instance()
    algorithm.load_lda(fs.lda_file)

    whitened_files = fs.training_list('whitened', 'train_projector')
    lda_projected_files = fs.training_list('lda_projected', 'train_projector')

    logger.info(
        "IVector training: LDA projection range (%d, %d) from '%s' to '%s'",
        indices[0], indices[1],
        fs.directories['whitened'], fs.directories['lda_projected'])

    # project the features
    for i in range(indices[0], indices[1]):
        ivector_file = whitened_files[i]
        lda_projected_file = lda_projected_files[i]
        if not utils.check_file(lda_projected_file, force):
            if len(utils.filter_missing_files(
                    [ivector_file], split_by_client=False,
                    allow_missing_files=allow_missing_files)) > 0:
                # load feature
                ivector = algorithm.read_feature(ivector_file)
                # project feature
                lda_projected = algorithm.project_lda(ivector)
                # write it
                bob.io.base.create_directories_safe(os.path.dirname(lda_projected_file))
                bob.bio.base.save(lda_projected, lda_projected_file)

def train_isv(algorithm, force=False, allow_missing_files=False):
    """Finally, the UBM is used to train the ISV projector/enroller."""
    fs = FileSelector.instance()

    if utils.check_file(fs.projector_file, force, 800):
        logger.info("ISV training: Skipping ISV training since '%s' already exists", fs.projector_file)
    else:
        # read UBM into the ISV class
        algorithm.load_ubm(fs.ubm_file)

        # read training data
        training_list = fs.training_list('projected_gmm', 'train_projector', arrange_by_client=True)
        training_list = utils.filter_missing_files(training_list, split_by_client=True, allow_missing_files=allow_missing_files)
        train_gmm_stats = [[algorithm.read_gmm_stats(filename) for filename in client_files] for client_files in training_list]

        # perform ISV training
        logger.info("ISV training: training ISV with %d clients", len(train_gmm_stats))
        algorithm.train_isv(train_gmm_stats)
        # save result
        bob.io.base.create_directories_safe(os.path.dirname(fs.projector_file))
        algorithm.save_projector(fs.projector_file)

def gmm_initialize(algorithm, extractor, limit_data=None, force=False, allow_missing_files=False):
    """Initializes the GMM calculation with the result of the K-Means algorithm (non-parallel).
    This might require a lot of memory."""
    fs = FileSelector.instance()

    output_file = fs.gmm_intermediate_file(0)
    if utils.check_file(output_file, force, 800):
        logger.info("UBM Training: Skipping GMM initialization since '%s' already exists", output_file)
    else:
        logger.info("UBM Training: Initializing GMM")
        training_list = utils.selected_elements(fs.training_list('extracted', 'train_projector'), limit_data)

        # read the features
        reader = functools.partial(read_feature, extractor)
        data = utils.vstack_features(reader, training_list, allow_missing_files=allow_missing_files)

        # get means and variances of kmeans result
        kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(fs.kmeans_file))
        [variances, weights] = kmeans_machine.get_variances_and_weights_for_each_cluster(data)

        # Create initial GMM machine
        gmm_machine = bob.learn.em.GMMMachine(algorithm.gaussians, data.shape[1])

        # Initialize the GMM
        gmm_machine.means = kmeans_machine.means
        gmm_machine.variances = variances
        gmm_machine.weights = weights
        gmm_machine.set_variance_thresholds(algorithm.variance_threshold)

        # write gmm machine to file
        bob.io.base.create_directories_safe(os.path.dirname(output_file))
        gmm_machine.save(bob.io.base.HDF5File(output_file, 'w'))
        logger.info("UBM Training: Wrote GMM file '%s'", output_file)

def wccn_project(algorithm, indices, force=False):
    """Performs WCCN projection of the i-vectors."""
    fs = FileSelector.instance()
    algorithm.load_wccn(fs.wccn_file)
    if algorithm.use_lda:
        input_label = 'lda_projected'
    else:
        input_label = 'whitened'

    input_files = fs.training_list(input_label, 'train_projector')
    wccn_projected_files = fs.training_list('wccn_projected', 'train_projector')

    logger.info(
        "IVector training: WCCN projection range (%d, %d) from '%s' to '%s'",
        indices[0], indices[1],
        fs.directories[input_label], fs.directories['wccn_projected'])

    # project the features
    for i in range(indices[0], indices[1]):
        ivector_file = input_files[i]
        wccn_projected_file = wccn_projected_files[i]
        if not utils.check_file(wccn_projected_file, force):
            # load feature
            ivector = algorithm.read_feature(ivector_file)
            # project feature
            wccn_projected = algorithm.project_wccn(ivector)
            # write it
            bob.io.base.create_directories_safe(os.path.dirname(wccn_projected_file))
            bob.bio.base.save(wccn_projected, wccn_projected_file)

def train_plda(algorithm, force=False, allow_missing_files=False):
    """Trains the PLDA model on the projected features of the world group."""
    fs = FileSelector.instance()

    if utils.check_file(fs.plda_file, force, 1000):
        logger.info("- PLDA projector '%s' already exists.", fs.plda_file)
    else:
        # select the last projection stage as input: WCCN > LDA > whitened
        if algorithm.use_wccn:
            input_label = 'wccn_projected'
        elif algorithm.use_lda:
            input_label = 'lda_projected'
        else:
            input_label = 'whitened'

        train_files = fs.training_list(input_label, 'train_projector', arrange_by_client=True)
        train_files = utils.filter_missing_files(train_files, split_by_client=True, allow_missing_files=allow_missing_files)
        train_features = [[bob.bio.base.load(filename) for filename in client_files] for client_files in train_files]

        # perform training
        algorithm.train_plda(train_features)
        bob.io.base.create_directories_safe(os.path.dirname(fs.plda_file))
        bob.bio.base.save(algorithm.plda_base, fs.plda_file)

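# The PLDA stage above sits at the end of the i-vector post-processing chain.
# Below is a hedged sketch (not part of the original sources) of the order in
# which a serial driver would chain the helpers in this file, for `n` training
# files; train_wccn is assumed to be defined elsewhere in the original sources,
# analogous to train_lda.
def _ivector_postprocessing_sketch(algorithm, n):
    indices = (0, n)
    ivector_project(algorithm, indices)        # projected_gmm -> projected_ivector
    train_whitener(algorithm)
    whitening_project(algorithm, indices)      # projected_ivector -> whitened
    if algorithm.use_lda:
        train_lda(algorithm)
        lda_project(algorithm, indices)        # whitened -> lda_projected
    if algorithm.use_wccn:
        train_wccn(algorithm)                  # assumed helper, not shown in this file
        wccn_project(algorithm, indices)       # (lda_projected|whitened) -> wccn_projected
    if algorithm.use_plda:
        train_plda(algorithm)
    save_projector(algorithm)                  # bundle everything into the projector file
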
def gmm_project(algorithm, extractor, indices, force=False):
    """Performs GMM projection"""
    fs = FileSelector.instance()
    algorithm.load_ubm(fs.ubm_file)

    feature_files = fs.training_list('extracted', 'train_projector')
    projected_files = fs.training_list('projected_gmm', 'train_projector')

    logger.info(
        "ISV training: Project features range (%d, %d) from '%s' to '%s'",
        indices[0], indices[1],
        fs.directories['extracted'], fs.directories['projected_gmm'])

    # project the features
    for i in range(indices[0], indices[1]):
        feature_file = feature_files[i]
        projected_file = projected_files[i]
        if not utils.check_file(projected_file, force):
            # load feature
            feature = read_feature(extractor, feature_file)
            # project feature
            projected = algorithm.project_ubm(feature)
            # write it
            bob.io.base.create_directories_safe(os.path.dirname(projected_file))
            bob.bio.base.save(projected, projected_file)

def ivector_estep(algorithm, iteration, indices, force=False, allow_missing_files=False):
    """Performs a single E-step of the IVector algorithm (parallel)"""
    fs = FileSelector.instance()
    stats_file = fs.ivector_stats_file(iteration, indices[0], indices[1])

    if utils.check_file(stats_file, force, 1000):
        logger.info("IVector training: Skipping IVector E-Step since the file '%s' already exists", stats_file)
    else:
        logger.info("IVector training: E-Step from range(%d, %d)", *indices)

        # Temporary machine used for initialization
        algorithm.load_ubm(fs.ubm_file)

        # get the IVectorTrainer and call the initialization procedure
        trainer = algorithm.ivector_trainer

        # Load machine
        if iteration:
            # load last TV file
            tv = bob.learn.em.IVectorMachine(bob.io.base.HDF5File(fs.ivector_intermediate_file(iteration)))
            tv.ubm = algorithm.ubm
        else:
            # create new TV machine
            tv = bob.learn.em.IVectorMachine(algorithm.ubm, algorithm.subspace_dimension_of_t, algorithm.variance_threshold)
        trainer.initialize(tv)

        # Load data
        training_list = fs.training_list('projected_gmm', 'train_projector')
        training_list = [training_list[i] for i in range(indices[0], indices[1])]
        training_list = utils.filter_missing_files(training_list, split_by_client=False, allow_missing_files=allow_missing_files)
        data = [algorithm.read_gmm_stats(f) for f in training_list]

        # Perform the E-step
        trainer.e_step(tv, data)

        # write results to file
        bob.io.base.create_directories_safe(os.path.dirname(stats_file))
        hdf5 = bob.io.base.HDF5File(stats_file, 'w')
        hdf5.set('acc_nij_wij2', trainer.acc_nij_wij2)
        hdf5.set('acc_fnormij_wij', trainer.acc_fnormij_wij)
        hdf5.set('acc_nij', trainer.acc_nij)
        hdf5.set('acc_snormij', trainer.acc_snormij)
        hdf5.set('nsamples', indices[1] - indices[0])
        logger.info("IVector training: Wrote Stats file '%s'", stats_file)

def ivector_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
    """Performs a single M-step of the IVector algorithm (non-parallel)"""
    fs = FileSelector.instance()
    old_machine_file = fs.ivector_intermediate_file(iteration)
    new_machine_file = fs.ivector_intermediate_file(iteration + 1)

    if utils.check_file(new_machine_file, force, 1000):
        logger.info("IVector training: Skipping IVector M-Step since the file '%s' already exists", new_machine_file)
    else:
        # get the files from the e-step
        training_list = fs.training_list('projected_gmm', 'train_projector')

        # check whether there is one file containing all data
        if os.path.exists(fs.ivector_stats_file(iteration, 0, len(training_list))):
            # load single stats file (note: this is a module-level function, so no 'self' here)
            statistics = _read_stats(fs.ivector_stats_file(iteration, 0, len(training_list)))
        else:
            # load several files
            stats_files = []
            for job in range(number_of_parallel_jobs):
                job_indices = tools.indices(training_list, number_of_parallel_jobs, job + 1)
                if job_indices[-1] >= job_indices[0]:
                    stats_files.append(fs.ivector_stats_file(iteration, job_indices[0], job_indices[-1]))
            # read and accumulate all stats files
            statistics = _accumulate(stats_files)

        # Load machine
        algorithm.load_ubm(fs.ubm_file)
        if iteration:
            tv = bob.learn.em.IVectorMachine(bob.io.base.HDF5File(old_machine_file))
            tv.ubm = algorithm.ubm
        else:
            tv = bob.learn.em.IVectorMachine(algorithm.ubm, algorithm.subspace_dimension_of_t, algorithm.variance_threshold)

        # Gets the IVectorTrainer and initializes its accumulators
        trainer = algorithm.ivector_trainer
        trainer.reset_accumulators(tv)
        trainer.acc_nij_wij2 = statistics[0]
        trainer.acc_fnormij_wij = statistics[1]
        trainer.acc_nij = statistics[2]
        trainer.acc_snormij = statistics[3]
        trainer.m_step(tv)  # data is not used in the M-step
        logger.info("IVector training: Performed M step %d", iteration)

        # Save the IVector model
        bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
        tv.save(bob.io.base.HDF5File(new_machine_file, 'w'))
        logger.info("IVector training: Wrote new IVector machine '%s'", new_machine_file)

    if iteration == algorithm.tv_training_iterations - 1:
        shutil.copy(new_machine_file, fs.tv_file)
        logger.info("IVector training: Wrote new TV matrix '%s'", fs.tv_file)
    if clean and iteration > 0:
        old_dir = os.path.dirname(fs.ivector_intermediate_file(iteration - 1))
        logger.info("Removing old intermediate directory '%s'", old_dir)
        shutil.rmtree(old_dir)

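# The helpers below are referenced by the M-steps in this file but defined
# elsewhere in the original sources; each algorithm module keeps its own
# variant.  This is a hedged reconstruction for the i-vector layout, derived
# from the keys written by ivector_estep and the tuple order consumed by
# ivector_mstep; the K-Means and ISV variants read their own key sets
# ('zeros'/'first'/'nsamples'/'dist' and 'acc_u_a1'/'acc_u_a2') analogously.
def _read_stats(filename):
    """Reads the accumulated statistics of one E-step job from file."""
    hdf5 = bob.io.base.HDF5File(filename)
    acc_nij_wij2 = hdf5.read('acc_nij_wij2')
    acc_fnormij_wij = hdf5.read('acc_fnormij_wij')
    acc_nij = hdf5.read('acc_nij')
    acc_snormij = hdf5.read('acc_snormij')
    # the 'nsamples' entry is not needed by the M-step and is skipped here
    return (acc_nij_wij2, acc_fnormij_wij, acc_nij, acc_snormij)


def _accumulate(filenames):
    """Sums the statistics of several parallel E-step jobs element-wise."""
    statistics = None
    for filename in filenames:
        stats = _read_stats(filename)
        if statistics is None:
            statistics = list(stats)
        else:
            statistics = [s + t for s, t in zip(statistics, stats)]
    return statistics
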
def gmm_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
    """Performs a single M-step of the GMM training (non-parallel)"""
    fs = FileSelector.instance()
    old_machine_file = fs.gmm_intermediate_file(iteration)
    new_machine_file = fs.gmm_intermediate_file(iteration + 1)

    if utils.check_file(new_machine_file, force, 1000):
        logger.info("UBM training: Skipping GMM M-Step since the file '%s' already exists", new_machine_file)
    else:
        # get the files from the e-step
        training_list = fs.training_list('extracted', 'train_projector')

        # check whether there is one file containing all data
        if os.path.exists(fs.gmm_stats_file(iteration, 0, len(training_list))):
            stats_file = fs.gmm_stats_file(iteration, 0, len(training_list))
            # load single stats file
            gmm_stats = bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_file))
        else:
            # load several files
            stats_files = []
            for job in range(number_of_parallel_jobs):
                job_indices = tools.indices(training_list, number_of_parallel_jobs, job + 1)
                if job_indices[-1] > job_indices[0]:
                    stats_files.append(fs.gmm_stats_file(iteration, job_indices[0], job_indices[-1]))
            # read and accumulate all stats files
            gmm_stats = bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_files[0]))
            for stats_file in stats_files[1:]:
                gmm_stats += bob.learn.em.GMMStats(bob.io.base.HDF5File(stats_file))

        # load the old gmm machine
        gmm_machine = bob.learn.em.GMMMachine(bob.io.base.HDF5File(old_machine_file))
        # initialize the trainer
        trainer = algorithm.ubm_trainer
        trainer.initialize(gmm_machine)
        trainer.gmm_statistics = gmm_stats

        # Calls M-step (no data required)
        trainer.m_step(gmm_machine)

        # Saves the new GMM machine to file
        bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
        gmm_machine.save(bob.io.base.HDF5File(new_machine_file, 'w'))

    # Write the final UBM file after the last iteration
    # TODO: implement other stopping criteria
    if iteration == algorithm.gmm_training_iterations - 1:
        shutil.copy(new_machine_file, fs.ubm_file)
        logger.info("UBM training: Wrote new UBM '%s'", fs.ubm_file)
    if clean and iteration > 0:
        old_dir = os.path.dirname(fs.gmm_intermediate_file(iteration - 1))
        logger.info("Removing old intermediate directory '%s'", old_dir)
        shutil.rmtree(old_dir)

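# A hedged sketch (not from the original sources) of how the UBM training
# pieces above fit together when everything runs in a single process: one
# E-step over the full data range per iteration, followed by one M-step.
# `n` is the number of training files and `extractor` the feature extractor;
# the iteration counts are the algorithm attributes used by the M-steps.
def _train_ubm_sketch(algorithm, extractor, n):
    kmeans_initialize(algorithm, extractor)
    for iteration in range(algorithm.kmeans_training_iterations):
        kmeans_estep(algorithm, extractor, iteration, (0, n))
        kmeans_mstep(algorithm, iteration, number_of_parallel_jobs=1)
    gmm_initialize(algorithm, extractor)
    for iteration in range(algorithm.gmm_training_iterations):
        gmm_estep(algorithm, extractor, iteration, (0, n))
        gmm_mstep(algorithm, iteration, number_of_parallel_jobs=1)
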
def isv_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
    """Performs a single M-step of the ISV algorithm (non-parallel)"""
    fs = FileSelector.instance()
    old_machine_file = fs.isv_intermediate_file(iteration)
    new_machine_file = fs.isv_intermediate_file(iteration + 1)

    if utils.check_file(new_machine_file, force, 1000):
        logger.info("ISV training: Skipping ISV M-Step since the file '%s' already exists", new_machine_file)
    else:
        # get the files from the e-step
        training_list = fs.training_list('projected_gmm', 'train_projector', arrange_by_client=True)

        # check whether there is one file containing all data
        if os.path.exists(fs.isv_stats_file(iteration, 0, len(training_list))):
            # load single stats file
            statistics = _read_stats(fs.isv_stats_file(iteration, 0, len(training_list)))
        else:
            # load several files
            stats_files = []
            for job in range(number_of_parallel_jobs):
                job_indices = tools.indices(training_list, number_of_parallel_jobs, job + 1)
                if job_indices[-1] >= job_indices[0]:
                    stats_files.append(fs.isv_stats_file(iteration, job_indices[0], job_indices[-1]))
            # read and accumulate all stats files
            statistics = _accumulate(stats_files)

        # Load machine
        algorithm.load_ubm(fs.ubm_file)
        if iteration:
            isv_base = bob.learn.em.ISVBase(bob.io.base.HDF5File(old_machine_file))
            isv_base.ubm = algorithm.ubm
        else:
            isv_base = bob.learn.em.ISVBase(algorithm.ubm, algorithm.subspace_dimension_of_u)

        # Gets the ISVTrainer and initializes its values
        trainer = algorithm.isv_trainer
        data = [algorithm.read_gmm_stats(training_list[0])]  # loading data just to allocate memory
        trainer.initialize(isv_base, data)  # just to allocate memory
        trainer.acc_u_a1 = statistics[0]
        trainer.acc_u_a2 = statistics[1]
        trainer.m_step(isv_base)  # data is not used in the M-step
        logger.info("ISV training: Performed M step %d", iteration)

        # Save the ISV model
        bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
        isv_base.save(bob.io.base.HDF5File(new_machine_file, 'w'))
        logger.info("ISV training: Wrote new ISV Base '%s'", new_machine_file)

    if iteration == algorithm.isv_training_iterations - 1:
        shutil.copy(new_machine_file, fs.isv_file)
        logger.info("ISV training: Wrote new ISV matrix '%s'", fs.isv_file)
    if clean and iteration > 0:
        old_dir = os.path.dirname(fs.isv_intermediate_file(iteration - 1))
        logger.info("Removing old intermediate directory '%s'", old_dir)
        shutil.rmtree(old_dir)

def save_isv_projector(algorithm, force=False):
    """Saves the UBM and the ISV model into a single projector file."""
    fs = FileSelector.instance()
    if utils.check_file(fs.projector_file, force, 1000):
        logger.info("- Projector '%s' already exists.", fs.projector_file)
    else:
        # save the projector into one file
        algorithm.load_ubm(fs.ubm_file)
        algorithm.load_isv(fs.isv_file)
        logger.info("Writing projector into file %s", fs.projector_file)
        algorithm.save_projector(fs.projector_file)

def kmeans_mstep(algorithm, iteration, number_of_parallel_jobs, force=False, clean=False):
    """Performs a single M-step of the K-Means algorithm (non-parallel)"""
    fs = FileSelector.instance()
    old_machine_file = fs.kmeans_intermediate_file(iteration)
    new_machine_file = fs.kmeans_intermediate_file(iteration + 1)

    if utils.check_file(new_machine_file, force, 1000):
        logger.info("UBM training: Skipping KMeans M-Step since the file '%s' already exists", new_machine_file)
    else:
        # get the files from the e-step
        training_list = fs.training_list('extracted', 'train_projector')

        # check whether there is one file containing all data
        if os.path.exists(fs.kmeans_stats_file(iteration, 0, len(training_list))):
            stats_file = fs.kmeans_stats_file(iteration, 0, len(training_list))
            # load single stats file
            statistics = _read_stats(stats_file)
        else:
            # load several files
            filenames = []
            for job in range(number_of_parallel_jobs):
                job_indices = tools.indices(training_list, number_of_parallel_jobs, job + 1)
                if job_indices[-1] > job_indices[0]:
                    filenames.append(fs.kmeans_stats_file(iteration, job_indices[0], job_indices[-1]))
            statistics = _accumulate(filenames)

        # Creates the KMeansMachine
        kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(old_machine_file))
        trainer = algorithm.kmeans_trainer
        trainer.reset_accumulators(kmeans_machine)

        trainer.zeroeth_order_statistics = statistics[0]
        trainer.first_order_statistics = statistics[1]
        trainer.average_min_distance = statistics[3]
        # statistics[3] holds the summed minimum distances and statistics[2]
        # the number of samples, so their ratio is the global average distance
        error = statistics[3] / statistics[2]

        # Performs the M-step
        trainer.m_step(kmeans_machine, None)  # data is not used in the M-step
        logger.info("UBM training: Performed M step %d with result %f", iteration, error)

        # Save the K-Means model
        bob.io.base.create_directories_safe(os.path.dirname(new_machine_file))
        kmeans_machine.save(bob.io.base.HDF5File(new_machine_file, 'w'))

    # copy the k_means file after the last iteration
    # TODO: implement other stopping criteria
    if iteration == algorithm.kmeans_training_iterations - 1:
        shutil.copy(new_machine_file, fs.kmeans_file)
        logger.info("UBM training: Wrote new KMeans machine '%s'", fs.kmeans_file)
    if clean and iteration > 0:
        old_dir = os.path.dirname(fs.kmeans_intermediate_file(iteration - 1))
        logger.info("Removing old intermediate directory '%s'", old_dir)
        shutil.rmtree(old_dir)

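# tools.indices is used by all M-steps above but not defined in this file.
# A hedged reconstruction of the contract relied upon here: it returns a
# (start, end) pair delimiting the slice of `files` that the 1-based parallel
# job `job_id` should process; empty slices yield start == end, which the
# guards above filter out.
def _indices_sketch(files, number_of_parallel_jobs, job_id):
    files_per_job = (len(files) + number_of_parallel_jobs - 1) // number_of_parallel_jobs
    start = min((job_id - 1) * files_per_job, len(files))
    end = min(job_id * files_per_job, len(files))
    return (start, end)
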
def kmeans_estep(algorithm, extractor, iteration, indices, force=False, allow_missing_files=False):
    """Performs a single E-step of the K-Means algorithm (parallel)"""
    if indices[0] >= indices[1]:
        return

    fs = FileSelector.instance()

    # check if we need to compute this step
    stats_file = fs.kmeans_stats_file(iteration, indices[0], indices[1])
    new_machine_file = fs.kmeans_intermediate_file(iteration + 1)

    if utils.check_file(stats_file, force, 1000) or utils.check_file(new_machine_file, force, 1000):
        logger.info("UBM training: Skipping KMeans E-Step since the file '%s' or '%s' already exists", stats_file, new_machine_file)
    else:
        training_list = fs.training_list('extracted', 'train_projector')
        last_machine_file = fs.kmeans_intermediate_file(iteration)
        kmeans_machine = bob.learn.em.KMeansMachine(bob.io.base.HDF5File(last_machine_file))

        logger.info("UBM training: KMeans E-Step round %d from range(%d, %d)", iteration, *indices)

        # read the features
        reader = functools.partial(read_feature, extractor)
        data = utils.vstack_features(
            reader,
            (training_list[index] for index in range(indices[0], indices[1])),
            allow_missing_files=allow_missing_files)

        # Performs the E-step
        trainer = algorithm.kmeans_trainer
        trainer.e_step(kmeans_machine, data)

        # write results to file; the average distance is multiplied by the
        # number of samples, so the M-step can recompute a global average
        dist = numpy.array(trainer.average_min_distance)
        nsamples = numpy.array([indices[1] - indices[0]], dtype=numpy.float64)

        # write statistics
        bob.io.base.create_directories_safe(os.path.dirname(stats_file))
        hdf5 = bob.io.base.HDF5File(stats_file, 'w')
        hdf5.set('zeros', trainer.zeroeth_order_statistics)
        hdf5.set('first', trainer.first_order_statistics)
        hdf5.set('dist', dist * nsamples)
        hdf5.set('nsamples', nsamples)
        logger.info("UBM training: Wrote Stats file '%s'", stats_file)

def train_lda(algorithm, force=False):
    """Trains the LDA projector on the whitened i-vectors of the world group."""
    fs = FileSelector.instance()
    if utils.check_file(fs.lda_file, force, 1000):
        logger.info("- LDA projector '%s' already exists.", fs.lda_file)
    else:
        train_files = fs.training_list('whitened', 'train_projector', arrange_by_client=True)
        train_features = [[bob.bio.base.load(filename) for filename in client_files] for client_files in train_files]
        # perform training
        algorithm.train_lda(train_features)
        bob.io.base.create_directories_safe(os.path.dirname(fs.lda_file))
        bob.bio.base.save(algorithm.lda, fs.lda_file)

def train_whitener(algorithm, force=False):
    """Trains the whitening projector on the projected i-vectors of the world group."""
    fs = FileSelector.instance()
    if utils.check_file(fs.whitener_file, force, 1000):
        logger.info("- Whitening projector '%s' already exists.", fs.whitener_file)
    else:
        train_files = fs.training_list('projected_ivector', 'train_projector')
        train_features = [bob.bio.base.load(f) for f in train_files]
        # perform training
        algorithm.train_whitener(train_features)
        bob.io.base.create_directories_safe(os.path.dirname(fs.whitener_file))
        bob.bio.base.save(algorithm.whitener, fs.whitener_file)

def isv_estep(algorithm, iteration, indices, force=False):
    """Performs a single E-step of the ISV U matrix training algorithm (parallel)"""
    fs = FileSelector.instance()
    stats_file = fs.isv_stats_file(iteration, indices[0], indices[1])

    if utils.check_file(stats_file, force, 1000):
        logger.info("ISV training: Skipping ISV E-Step since the file '%s' already exists", stats_file)
    else:
        logger.info("ISV training: E-Step from range(%d, %d)", *indices)

        # Temporary machine used for initialization
        algorithm.load_ubm(fs.ubm_file)

        # get the ISVTrainer and call the initialization procedure
        trainer = algorithm.isv_trainer

        # Load data
        training_list = fs.training_list('projected_gmm', 'train_projector', arrange_by_client=True)
        data = [algorithm.read_gmm_stats(training_list[i]) for i in range(indices[0], indices[1])]

        # Load machine
        if iteration:
            # load last ISV file
            isv_base = bob.learn.em.ISVBase(bob.io.base.HDF5File(fs.isv_intermediate_file(iteration)))
            isv_base.ubm = algorithm.ubm
        else:
            # create new ISV Base
            isv_base = bob.learn.em.ISVBase(algorithm.ubm, algorithm.subspace_dimension_of_u)

        # Perform the E-step
        trainer.initialize(isv_base, data, rng=algorithm.rng)  # just to reset the accumulators
        trainer.e_step(isv_base, data)

        # write results to file
        bob.io.base.create_directories_safe(os.path.dirname(stats_file))
        hdf5 = bob.io.base.HDF5File(stats_file, 'w')
        hdf5.set('acc_u_a1', trainer.acc_u_a1)
        hdf5.set('acc_u_a2', trainer.acc_u_a2)
        logger.info("ISV training: Wrote Stats file '%s'", stats_file)

def save_projector(algorithm, force=False):
    """Saves all trained IVector models into a single projector file."""
    fs = FileSelector.instance()
    if utils.check_file(fs.projector_file, force, 1000):
        logger.info("- Projector '%s' already exists.", fs.projector_file)
    else:
        # save the projector into one file
        algorithm.load_ubm(fs.ubm_file)
        algorithm.load_tv(fs.tv_file)
        algorithm.load_whitener(fs.whitener_file)
        if algorithm.use_lda:
            algorithm.load_lda(fs.lda_file)
        if algorithm.use_wccn:
            algorithm.load_wccn(fs.wccn_file)
        if algorithm.use_plda:
            algorithm.load_plda(fs.plda_file)
        logger.info("Writing projector into file %s", fs.projector_file)
        algorithm.save_projector(fs.projector_file)

def gmm_estep(algorithm, extractor, iteration, indices, force=False, allow_missing_files=False):
    """Performs a single E-step of the GMM training (parallel)."""
    if indices[0] >= indices[1]:
        return

    fs = FileSelector.instance()
    stats_file = fs.gmm_stats_file(iteration, indices[0], indices[1])
    new_machine_file = fs.gmm_intermediate_file(iteration + 1)

    if utils.check_file(stats_file, force, 1000) or utils.check_file(new_machine_file, force, 1000):
        logger.info("UBM training: Skipping GMM E-Step since the file '%s' or '%s' already exists", stats_file, new_machine_file)
    else:
        training_list = fs.training_list('extracted', 'train_projector')
        last_machine_file = fs.gmm_intermediate_file(iteration)
        gmm_machine = bob.learn.em.GMMMachine(bob.io.base.HDF5File(last_machine_file))

        logger.info("UBM training: GMM E-Step from range(%d, %d)", *indices)

        # read the features
        reader = functools.partial(read_feature, extractor)
        data = utils.vstack_features(
            reader,
            (training_list[index] for index in range(indices[0], indices[1])),
            allow_missing_files=allow_missing_files)

        trainer = algorithm.ubm_trainer
        trainer.initialize(gmm_machine, None)

        # Calls the E-step and extracts the GMM statistics
        trainer.e_step(gmm_machine, data)
        gmm_stats = trainer.gmm_statistics

        # Saves the GMM statistics to the file
        bob.io.base.create_directories_safe(os.path.dirname(stats_file))
        gmm_stats.save(bob.io.base.HDF5File(stats_file, 'w'))
        logger.info("UBM training: Wrote GMM stats '%s'", stats_file)

def whitening_project(algorithm, indices, force=False):
    """Performs whitening of the projected i-vectors."""
    fs = FileSelector.instance()
    algorithm.load_whitener(fs.whitener_file)

    ivector_files = fs.training_list('projected_ivector', 'train_projector')
    whitened_files = fs.training_list('whitened', 'train_projector')

    logger.info(
        "IVector training: whitening ivectors range (%d, %d) from '%s' to '%s'",
        indices[0], indices[1],
        fs.directories['projected_ivector'], fs.directories['whitened'])

    # project the features
    for i in range(indices[0], indices[1]):
        ivector_file = ivector_files[i]
        whitened_file = whitened_files[i]
        if not utils.check_file(whitened_file, force):
            # load feature
            ivector = algorithm.read_feature(ivector_file)
            # project feature
            whitened = algorithm.project_whitening(ivector)
            # write it
            bob.io.base.create_directories_safe(os.path.dirname(whitened_file))
            bob.bio.base.save(whitened, whitened_file)

def ivector_project(algorithm, indices, force=False):
    """Performs IVector projection"""
    # read UBM and TV into the IVector class
    fs = FileSelector.instance()
    algorithm.load_ubm(fs.ubm_file)
    algorithm.load_tv(fs.tv_file)

    gmm_stats_files = fs.training_list('projected_gmm', 'train_projector')
    ivector_files = fs.training_list('projected_ivector', 'train_projector')

    logger.info(
        "IVector training: Project features range (%d, %d) from '%s' to '%s'",
        indices[0], indices[1],
        fs.directories['projected_gmm'], fs.directories['projected_ivector'])

    # project the features
    for i in range(indices[0], indices[1]):
        gmm_stats_file = gmm_stats_files[i]
        ivector_file = ivector_files[i]
        if not utils.check_file(ivector_file, force):
            # load feature
            feature = algorithm.read_gmm_stats(gmm_stats_file)
            # project feature
            projected = algorithm.project_ivector(feature)
            # write it
            bob.io.base.create_directories_safe(os.path.dirname(ivector_file))
            bob.bio.base.save(projected, ivector_file)

def train_enroller(self, train_features, enroller_file, metadata=None):
    """Trains the UBM from the training ("world") data and stores the MAP-adapted
    mean supervector of each training client in the enroller file."""
    ######################################
    # TODO: This is a critical moment.
    # With the next two lines of code we are breaking completely the isolation concept
    # implemented in bob.bio.base by introducing database knowledge inside of the algorithm.
    # This is a total HACK.
    # In short, we just opened the gates from hell.
    # Some demons may come out and they might terrorize innocent people.
    # Do your prayers, you will need them.
    # Only faith can save your soul.
    # God forgive us
    fs = FileSelector.instance()
    # train_files = fs.training_objects('extracted', 'train_projector', arrange_by_client=True)
    train_files = fs.database.training_files('train_projector', True)

    #####
    # stacking all the features. TODO: This is super sub-optimal
    train_features_flatten = numpy.vstack(
        [feature for client in train_features for feature in client])

    # training UBM (it's on self.ubm)
    self.train_ubm(train_features_flatten)

    # Now comes the hack.
    # We would need to stack the features from all classes

    # Setting the MAP Trainer
    self.enroll_trainer = bob.learn.em.MAP_GMMTrainer(
        self.ubm,
        relevance_factor=self.relevance_factor,
        update_means=True,
        update_variances=False)

    # Efficiency tip: let's pre-allocate the supervector arrays
    mean_supervectors = []
    for client in train_features:
        shape = (len(client), self.ubm.mean_supervector.shape[0])
        mean_supervectors.append(numpy.zeros(shape))

    # Now let's compute the supervectors
    for i, client in enumerate(train_features):
        for j, feature in enumerate(client):
            # Running MAP adaptation
            map_feature = self.enroll_gmm(feature)
            mean_supervectors[i][j] = map_feature.mean_supervector

    # The enroller is composed of the UBM and all the training supervector samples

    # saving ubm
    hdf5 = bob.io.base.HDF5File(enroller_file, "w")
    hdf5.create_group("/UBM")
    hdf5.cd("/UBM")
    self.ubm.save(hdf5)

    # saving supervectors
    hdf5.create_group("/train_supervectors")
    hdf5.cd("/train_supervectors")
    for i in range(len(mean_supervectors)):
        # Fetching and memorizing the client id, so we can use it during enrollment
        class_id = train_files[i][0].client_id
        hdf5.set("{0}".format(class_id), mean_supervectors[i])

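# A hedged sketch (not part of the original sources) of the matching loader
# for the enroller file written by train_enroller above: it restores the UBM
# from the "/UBM" group and the per-client mean supervectors from
# "/train_supervectors".  The key-listing call is an assumption about the
# bob.io.base.HDF5File API; in the original layout this would be a method of
# the same algorithm class.
def load_enroller_sketch(self, enroller_file):
    hdf5 = bob.io.base.HDF5File(enroller_file)
    # restore the UBM
    hdf5.cd("/UBM")
    self.ubm = bob.learn.em.GMMMachine(hdf5)
    # restore one supervector array per client id (assumed keys() signature)
    hdf5.cd("/train_supervectors")
    self.train_supervectors = {}
    for class_id in hdf5.keys(relative=True):
        self.train_supervectors[class_id] = hdf5.read(class_id)
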