def initial_models(self, nb_threads=1):
    """
    Build the per-segment statistics and the initial speaker models.

    The segmentation is sorted by show first, statistics are then
    accumulated against the UBM, and the result of the multi-session MAP
    mean adaptation is stored in ``self.stat_speaker``.

    :param nb_threads: accepted for interface compatibility; not used here
    """
    # Sorting by show limits how often the StatServer has to re-read the
    # features of a given show.
    self.diar.sort(['show'])

    # Statistics per segment
    self.stat_seg = seg_stats = StatServer(self.diar.id_map())
    seg_stats.accumulate_stat(self.ubm, self.features_server)

    # Models obtained by multi-session MAP adaptation of the UBM means
    self.stat_speaker = seg_stats.adapt_mean_MAP_multisession(self.ubm)
def extract_ivectors_single(self, ubm, stat_server, uncertainty=False):
    """
    Extract i-vectors from a StatServer sequentially, in the calling process.

    Note that ``stat_server`` is whitened in place (``whiten_stat1`` is
    called on it) before the extraction.

    :param ubm: Mixture object (the UBM)
    :param stat_server: sufficient statistics stored in a StatServer
    :param uncertainty: boolean, if True, also return the matrix holding,
        for each session, the diagonal of its uncertainty matrix

    :return: a StatServer with the i-vectors in its stat1 attribute and,
        when ``uncertainty`` is True, the matrix of diagonals as well
    """
    assert(isinstance(stat_server, StatServer) and stat_server.validate()), \
        "First argument must be a proper StatServer"
    assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"

    # A 2-D invcov is handled as diagonal covariance, anything else as full
    diagonal_gmm = ubm.invcov.ndim == 2

    # Dimensions used below
    loading_matrix = self.F
    tv_rank = loading_matrix.shape[1]
    feature_size = ubm.mu.shape[1]
    nb_distrib = ubm.w.shape[0]

    # Whitening of the first-order statistics
    if diagonal_gmm:
        whitening = 1. / ubm.get_invcov_super_vector()
    else:
        whitening = ubm.invchol
    stat_server.whiten_stat1(ubm.get_mean_super_vector(), whitening)

    # Output container: session description copied from the input
    nb_rows = stat_server.modelset.shape[0]
    iv_stat_server = StatServer()
    for attribute in ("modelset", "segset", "start", "stop"):
        setattr(iv_stat_server, attribute, copy.deepcopy(getattr(stat_server, attribute)))
    iv_stat_server.stat0 = numpy.ones((nb_rows, 1))
    iv_stat_server.stat1 = numpy.ones((nb_rows, tv_rank))
    iv_sigma = numpy.ones((nb_rows, tv_rank))

    # Each zero-order statistic is repeated feature_size times so that it
    # lines up with the columns of F.T
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)

    for sess in tqdm(range(stat_server.segset.shape[0]), desc="Processing"):
        weighted = loading_matrix.T * stat_server.stat0[sess, index_map]
        inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank) + weighted.dot(loading_matrix))

        projection = loading_matrix.T.dot(stat_server.stat1[sess, :])
        iv_stat_server.stat1[sess, :] = projection.dot(inv_lambda)

        # Read the i-vector back from the container before the outer product
        ivector = iv_stat_server.stat1[sess, :]
        iv_sigma[sess, :] = numpy.diag(inv_lambda + numpy.outer(ivector, ivector))

    return (iv_stat_server, iv_sigma) if uncertainty else iv_stat_server
def _gaussian_backend_train(data, label):
    """
    Wrap training examples in a StatServer and train the Gaussian backend.

    ``label`` is used both as modelset and as segset, ``data`` becomes the
    first-order statistics and every zero-order statistic is set to one.

    :param data: training examples, one per row (only ``data.shape[0]`` is
        read here)
    :param label: class label of each example

    :return: whatever ``gaussian_backend_train`` returns for that StatServer
    """
    train_ss = StatServer()
    nb_examples = data.shape[0]

    attributes = (
        ("segset", label),
        ("modelset", label),
        ("stat1", data),
        ("stat0", numpy.ones((nb_examples, 1))),
        # start / stop are left as uninitialised object arrays
        ("start", numpy.empty(nb_examples, dtype="object")),
        ("stop", numpy.empty(nb_examples, dtype="object")),
    )
    for attribute, value in attributes:
        setattr(train_ss, attribute, value)

    return gaussian_backend_train(train_ss)
def extract_ivectors(self, ubm, stat_server_filename, prefix='', batch_size=300, uncertainty=False, num_thread=1):
    """
    Parallel extraction of i-vectors using the multiprocessing module.

    The session description is read from the HDF5 file, the statistics are
    then loaded batch by batch and handed to worker processes
    (``iv_extract_on_batch``) while one listener process (``iv_collect``)
    gathers the results through a shared queue.

    :param ubm: Mixture object (the UBM)
    :param stat_server_filename: name of the file from which the input
        StatServer is read
    :param prefix: prefix of the datasets to read in the HDF5 file; it is
        applied to every dataset, including stat0 and stat1
    :param batch_size: number of sessions to process in a batch
    :param uncertainty: a boolean, if True, return the diagonal of the
        uncertainty matrices
    :param num_thread: number of processes to run in parallel (two more are
        started in the pool, one of which runs the listener)

    :return: a StatServer with i-vectors in the stat1 attribute and a matrix
        of uncertainty matrices (optional)
    """
    assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"

    tv_rank = self.F.shape[1]

    with h5py.File(stat_server_filename, 'r') as fh:
        nb_sessions = fh[prefix + "modelset"].shape[0]

        # Session description. ``dataset[()]`` reads the whole dataset and
        # replaces ``Dataset.value`` which no longer exists in h5py >= 3.
        iv_server = StatServer()
        iv_server.modelset = fh[prefix + 'modelset'][()]
        iv_server.segset = fh[prefix + 'segset'][()]

        # -1 marks a missing boundary in the file: those entries are left
        # untouched in the object arrays
        tmpstart = fh[prefix + "start"][()]
        tmpstop = fh[prefix + "stop"][()]
        has_start = tmpstart != -1
        has_stop = tmpstop != -1
        iv_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
        iv_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
        iv_server.start[has_start] = tmpstart[has_start]
        iv_server.stop[has_stop] = tmpstop[has_stop]
        iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE)

        with warnings.catch_warnings():
            iv_server.stat1 = serialize(numpy.zeros((nb_sessions, tv_rank)))
            iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank)))

        # Ceiling division, never less than one batch: the former
        # floor(x + 0.999) gave 0 for very small ratios and made
        # array_split fail.
        batch_nb = max(1, int(numpy.ceil(nb_sessions / float(batch_size))))
        batch_indices = [indices
                         for indices in numpy.array_split(numpy.arange(nb_sessions), batch_nb)
                         if indices.size > 0]

        # Statistics live under the same prefix as the session description
        stat0 = fh[prefix + "stat0"]
        stat1 = fh[prefix + "stat1"]

        manager = multiprocessing.Manager()
        q = manager.Queue()
        pool = multiprocessing.Pool(num_thread + 2)
        try:
            # put listener to work first
            watcher = pool.apply_async(iv_collect, ((iv_server.stat1, iv_sigma), q))

            # fire off workers; data is loaded per batch to reduce the
            # memory footprint
            jobs = []
            for batch_idx in batch_indices:
                arg = batch_idx, stat0[batch_idx, :], stat1[batch_idx, :], ubm, self.F
                jobs.append(pool.apply_async(iv_extract_on_batch, (arg, q)))

            # wait for every worker (re-raises a worker exception here)
            for job in jobs:
                job.get()

            # now we are done, tell the listener to stop
            q.put((None, None, None))
            pool.close()

            iv_server.stat1, iv_sigma = watcher.get()
        finally:
            # Release the processes even when a worker failed; in that case
            # the listener is still blocked on the queue and has to be
            # terminated rather than joined.
            pool.terminate()
            pool.join()
            manager.shutdown()

    if uncertainty:
        return iv_server, iv_sigma
    return iv_server
class HAC_CLR:
    """
    CLR Hierarchical Agglomerative Clustering (HAC) with GMM trained by MAP.

    Typical call order, as far as this class shows: ``initial_models``,
    ``initial_distances``, then ``perform``.

    Attributes set by the constructor:
      - features_server: source of the acoustic features
      - diar: private deep copy of the input segmentation
      - ubm: the universal background model
      - ce: selects the normalisation of the log-likelihood rows (see
        ``_score_row``)
      - ntop: number of top Gaussians kept per frame, or None to keep all
      - merge: list of ``[merge number, name i, name j, value]`` records
    """

    def __init__(self, features_server, diar, ubm, ce=False, ntop=5):
        assert isinstance(
            features_server, FeaturesServer), 'First parameter has to be a FeatureServer'
        assert isinstance(
            diar, Diar), '2sd parameter has to be a Diar (segmentationContener)'
        assert isinstance(ubm, Mixture), '3rd parameter has to be a Mixture'

        self.features_server = features_server
        self.diar = copy.deepcopy(diar)
        self.merge = []
        self.nb_merge = 0
        self.ubm = ubm
        self.ce = ce
        self.stat_speaker = None
        self.stat_seg = None
        self.llr = None
        self.dist = None
        self.ntop = ntop

    def _needs_ubm_pass(self):
        """Tell whether a row needs a UBM pass (top Gaussians and/or UBM normalisation)."""
        return self.ntop is not None or not self.ce

    def _get_cep(self, map, cluster):
        """
        Gather the feature frames of a cluster over all its shows.

        :param map: index returned by ``diar.make_index(['cluster', 'show'])``
            (the name shadows the builtin; kept for compatibility)
        :param cluster: name of the cluster

        :return: the selected frames of every show, concatenated on axis 0
        """
        cep_list = []
        for show in map[cluster]:
            idx = self.diar.features_by_cluster(show)[cluster]
            if len(idx) > 0:
                tmp, _ = self.features_server.load(show)
                cep_list.append(tmp[0][idx])
        return np.concatenate(cep_list, axis=0)

    def _ll(self, ubm, cep, mu=None, name='ubm', argtop=None):
        """
        Mean per-frame log-likelihood of ``cep``.

        :param ubm: model providing ``compute_log_posterior_probabilities``
        :param cep: the frames to score
        :param mu: optional means forwarded to the model
        :param name: model name, only used in the log message
        :param argtop: per-frame indices of the Gaussians to keep; when None
            and ``self.ntop`` is set, they are selected from this scoring

        :return: ``(mean log-likelihood, argtop)``; ``argtop`` stays None
            when ``self.ntop`` is None (all Gaussians are used)
        """
        lp = ubm.compute_log_posterior_probabilities(cep, mu=mu)

        if self.ntop is not None:
            if argtop is None:
                # ntop largest values per frame (partition of the negated scores)
                argtop = argpartition(lp * -1.0, self.ntop, axis=1)[:, :self.ntop]
            lp = lp[np.arange(argtop.shape[0])[:, np.newaxis], argtop]

        ll = np.log(np.sum(np.exp(lp), axis=1))

        # Frames whose likelihood is not finite get a fixed placeholder
        not_finite = np.logical_not(np.isfinite(ll))
        cpt = np.count_nonzero(not_finite)
        ll[not_finite] = 1.0e-200
        m = np.mean(ll)
        if cpt > 0:
            logging.info(
                'model %s), nb trame with llk problem: %d/%d \t %f',
                name, cpt, cep.shape[0], m)
        return m, argtop

    def _score_row(self, i, cep_i):
        """Fill row ``i`` of ``self.llr`` with normalised scores of ``cep_i`` against every model."""
        argtop = None
        ll_ubm = None
        if self._needs_ubm_pass():
            ll_ubm, argtop = self._ll(self.ubm, cep_i)

        for k, name_k in enumerate(self.stat_speaker.modelset):
            mu = self.stat_speaker.get_model_stat1_by_index(k)
            # Every model is scored on the Gaussians selected with the UBM
            self.llr[i, k], _ = self._ll(self.ubm, cep_i, mu=mu,
                                         name=name_k, argtop=argtop)

        if self.ce:
            self.llr[i, :] -= self.llr[i, i]
        else:
            self.llr[i, :] -= ll_ubm

    def _refresh_distances(self):
        """Rebuild ``self.dist`` from ``self.llr``; the diagonal is set to the largest double."""
        self.dist = (self.llr + self.llr.T) * -1.0
        np.fill_diagonal(self.dist, np.finfo('d').max)

    def initial_models(self, nb_threads=1):
        """
        Compute the statistics by segment and the MAP adapted models.

        :param nb_threads: not used here
        """
        # sort by show to minimize the reading of mfcc by the statServer
        self.diar.sort(['show'])
        # Compute statistics by segments
        self.stat_seg = StatServer(self.diar.id_map())
        self.stat_seg.accumulate_stat(self.ubm, self.features_server)
        self.stat_speaker = self.stat_seg.adapt_mean_MAP_multisession(self.ubm)

    def initial_distances(self, nb_threads=1):
        """
        Compute the full score matrix ``self.llr`` and the distances ``self.dist``.

        :param nb_threads: not used here
        """
        cluster_index = self.diar.make_index(['cluster', 'show'])
        models = self.stat_speaker.modelset
        nb = models.shape[0]
        self.llr = np.full((nb, nb), np.nan)
        self.dist = np.full((nb, nb), np.nan)

        for i, name_i in enumerate(models):
            self._score_row(i, self._get_cep(cluster_index, name_i))

        self._refresh_distances()

    def update(self, i, j, nb_threads=1):
        """
        Merge model ``j`` into model ``i`` and refresh scores and distances.

        :param i: index of the model that is kept
        :param j: index of the model that is merged into ``i``
        :param nb_threads: not used here
        """
        name_i = self.stat_speaker.modelset[i]
        name_j = self.stat_speaker.modelset[j]

        # Segments of j now belong to i, then the models are re-adapted
        for k, seg_model in enumerate(self.stat_seg.modelset):
            if seg_model == name_j:
                self.stat_seg.modelset[k] = name_i
        self.stat_speaker = self.stat_seg.adapt_mean_MAP_multisession(self.ubm)

        self.llr = roll(self.llr, j)
        self.diar.rename('cluster', [name_j], name_i)
        cluster_index = self.diar.make_index(['cluster', 'show'])

        # NOTE(review): only row i is rescored here; the other entries of
        # self.llr are whatever roll() left in place - confirm this is
        # intended.
        self._score_row(i, self._get_cep(cluster_index, name_i))
        self._refresh_distances()

    def information(self, i, j, value):
        """Append ``[merge number, name of i, name of j, value]`` to ``self.merge``."""
        models = self.stat_speaker.modelset
        self.merge.append([self.nb_merge, models[i], models[j], value])

    def _merge_step(self, i, j, v, nb):
        """Record and apply one merge; return the new count and the next candidate ``(nb, i, j, v)``."""
        # Re-read the names: update() replaces self.stat_speaker at every merge
        models = self.stat_speaker.modelset
        self.information(i, j, v)
        self.nb_merge += 1
        logging.debug('merge: %d c1: %s (%d) c2: %s (%d) dist: %f',
                      self.nb_merge, models[i], i, models[j], j, v)
        # update model and distance
        self.update(i, j)
        nb -= 1
        i, j, v = argmin(self.dist, nb)
        return nb, i, j, v

    def perform(self, thr=0.0, to_the_end=False):
        """
        Run the agglomerative clustering.

        :param thr: merging goes on while the smallest distance is below
            this threshold
        :param to_the_end: if True, keep merging (and recording the merges)
            after the threshold is reached until one model is left

        :return: a deep copy of the segmentation taken when the threshold
            was reached, i.e. before any ``to_the_end`` merge
        """
        nb = len(self.stat_speaker.modelset)

        # Each initial model is recorded with merge number -1
        self.nb_merge = -1
        for i in range(nb):
            self.information(i, i, 0)

        i, j, v = argmin(self.dist, nb)
        self.nb_merge = 0
        while v < thr and nb > 1:
            nb, i, j, v = self._merge_step(i, j, v, nb)

        end_diar = copy.deepcopy(self.diar)

        if to_the_end:
            while nb > 1:
                nb, i, j, v = self._merge_step(i, j, v, nb)

        return end_diar
def extract_ivector(tv, stat_server_file_name, ubm, output_file_name, uncertainty=False, prefix=''):
    """
    Estimate i-vectors for a given StatServer using multiple process on
    multiple nodes (MPI).

    Every rank of ``MPI.COMM_WORLD`` processes its own slice of the
    sessions; the results are gathered on rank 0, which writes the output.

    :param tv: object whose ``F`` attribute is the matrix used for the
        extraction
    :param stat_server_file_name: file name of the sufficient statistics
        StatServer HDF5 file
    :param ubm: Mixture object (the UBM)
    :param output_file_name: name of the file to save the i-vectors
        StatServer in HDF5 format
    :param uncertainty: boolean, if True, saves a matrix with uncertainty
        matrices (diagonal of the matrices) next to the output file, with
        ``_uncertainty`` inserted before the extension
    :param prefix: prefix of the dataset to read from in HDF5 file
    """
    assert (isinstance(ubm, Mixture) and ubm.validate()), "Second argument must be a proper Mixture"

    comm = MPI.COMM_WORLD
    comm.Barrier()

    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

    # Set useful variables
    tv_rank = tv.F.shape[1]
    feature_size = ubm.mu.shape[1]
    nb_distrib = ubm.w.shape[0]

    # Get the number of sessions to process
    # NOTE(review): this dataset and the read_subset call below do not use
    # ``prefix`` whereas the session description written at the end does -
    # confirm against StatServer.read_subset before relying on a prefix.
    with h5py.File(stat_server_file_name, 'r') as fh:
        nb_sessions = fh["segset"].shape[0]

    # Work on each node with different data
    indices = numpy.array_split(numpy.arange(nb_sessions), comm.size, axis=0)
    sendcounts = numpy.array([idx.shape[0] * tv.F.shape[1] for idx in indices])
    displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))

    stat_server = StatServer.read_subset(stat_server_file_name, indices[comm.rank])

    # Whiten the statistics for diagonal or full models
    if gmm_covariance == "diag":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), 1. / ubm.get_invcov_super_vector())
    elif gmm_covariance == "full":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)

    # Only the root allocates the receive buffers
    if comm.rank == 0:
        iv = numpy.zeros((nb_sessions, tv_rank))
        iv_sigma = numpy.zeros((nb_sessions, tv_rank))
    else:
        iv = None
        iv_sigma = None

    local_iv = numpy.zeros((stat_server.modelset.shape[0], tv_rank))
    local_iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))

    # Replicate stat0
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
    for sess in range(stat_server.segset.shape[0]):
        inv_lambda = scipy.linalg.inv(
            numpy.eye(tv_rank) + (tv.F.T * stat_server.stat0[sess, index_map]).dot(tv.F))
        Aux = tv.F.T.dot(stat_server.stat1[sess, :])
        local_iv[sess, :] = Aux.dot(inv_lambda)
        local_iv_sigma[sess, :] = numpy.diag(
            inv_lambda + numpy.outer(local_iv[sess, :], local_iv[sess, :]))
    comm.Barrier()

    comm.Gatherv(local_iv, [iv, sendcounts, displacements, MPI.DOUBLE], root=0)
    comm.Gatherv(local_iv_sigma, [iv_sigma, sendcounts, displacements, MPI.DOUBLE], root=0)

    if comm.rank == 0:
        with h5py.File(stat_server_file_name, 'r') as fh:
            iv_stat_server = StatServer()
            # ``dataset[()]`` reads the whole dataset; it replaces
            # ``Dataset.value`` which no longer exists in h5py >= 3.
            iv_stat_server.modelset = fh[prefix + "modelset"][()]
            iv_stat_server.segset = fh[prefix + "segset"][()]

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                iv_stat_server.modelset = iv_stat_server.modelset.astype('U', copy=False)
                iv_stat_server.segset = iv_stat_server.segset.astype('U', copy=False)

            # -1 marks a missing boundary: those entries are left untouched
            tmpstart = fh[prefix + "start"][()]
            tmpstop = fh[prefix + "stop"][()]
            iv_stat_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
            iv_stat_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
            iv_stat_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_stat_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
            iv_stat_server.stat0 = numpy.ones((nb_sessions, 1))
            iv_stat_server.stat1 = iv

        iv_stat_server.write(output_file_name)
        if uncertainty:
            path = os.path.splitext(output_file_name)
            write_matrix_hdf5(iv_sigma, path[0] + "_uncertainty" + path[1])
def extract_parallel(args, fs_params):
    """
    Extract six sets of embeddings for every session of an IdMap, in parallel.

    The sessions are split into ``args.num_processes`` contiguous groups,
    one ``extract_idmap`` process is started per group and each result read
    from the queue is copied into six StatServers.

    :param args: object providing ``idmap`` (passed to ``IdMap`` and to the
        workers) and ``num_processes``
    :param fs_params: forwarded unchanged to ``extract_idmap``

    :return: a tuple of six StatServers, in the order of the six arrays
        returned by the workers
    """
    emb_a_size = 512
    emb_b_size = 512

    idmap = IdMap(args.idmap)

    # First server uses emb_a_size, the five others emb_b_size
    x_servers = [StatServer(idmap, 1, size)
                 for size in [emb_a_size] + [emb_b_size] * 5]
    for server in x_servers:
        server.stat0 = numpy.ones(server.stat0.shape)

    # Split the indices. array_split spreads the remainder over the first
    # groups, so no session is left out when the number of sessions is not
    # a multiple of the number of processes.
    nb_sessions = idmap.leftids.shape[0]
    logging.critical("Number of sessions to process: {}".format(nb_sessions))
    segment_idx = numpy.array_split(numpy.arange(nb_sessions), args.num_processes)

    for idx, si in enumerate(segment_idx):
        logging.critical("Number of session on process {}: {}".format(idx, len(si)))

    # Extract x-vectors in parallel
    output_queue = mp.Queue()

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=extract_idmap,
                       args=(args, rank, segment_idx[rank], fs_params, args.idmap, output_queue))
        p.start()
        processes.append(p)

    # Get the x-vectors and fill the StatServers. The queue is drained
    # before joining so that workers are not blocked on a full queue.
    for _ in range(args.num_processes):
        indices, seg_1, seg_2, seg_3, seg_4, seg_5, seg_6 = output_queue.get()
        segments = (seg_1, seg_2, seg_3, seg_4, seg_5, seg_6)
        for server, segment in zip(x_servers, segments):
            server.stat1[indices, :] = segment

    for p in processes:
        p.join()

    print("Process parallel fini")

    return tuple(x_servers)