def extract_ivectors_single(self, ubm, stat_server, uncertainty=False):
    """
    Estimate i-vectors for a given StatServer using a single process on a single node.

    :param stat_server: sufficient statistics stored in a StatServer
    :param ubm: Mixture object (the UBM)
    :param uncertainty: boolean, if True, additionally return a matrix holding
        the diagonals of the per-session uncertainty (posterior covariance) matrices
    :return: a StatServer with i-vectors in the stat1 attribute, and optionally
        the matrix of uncertainty diagonals
    """
    assert isinstance(stat_server, StatServer) and stat_server.validate(), \
        "First argument must be a proper StatServer"
    assert isinstance(ubm, Mixture) and ubm.validate(), \
        "Second argument must be a proper Mixture"

    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

    # Set useful variables
    tv_rank = self.F.shape[1]
    feature_size = ubm.mu.shape[1]
    nb_distrib = ubm.w.shape[0]

    # Whiten the statistics for diagonal or full covariance models
    if gmm_covariance == "diag":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(),
                                 1. / ubm.get_invcov_super_vector())
    elif gmm_covariance == "full":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)

    # Extract i-vectors
    iv_stat_server = StatServer()
    iv_stat_server.modelset = copy.deepcopy(stat_server.modelset)
    iv_stat_server.segset = copy.deepcopy(stat_server.segset)
    iv_stat_server.start = copy.deepcopy(stat_server.start)
    iv_stat_server.stop = copy.deepcopy(stat_server.stop)
    iv_stat_server.stat0 = numpy.ones((stat_server.modelset.shape[0], 1))
    iv_stat_server.stat1 = numpy.ones((stat_server.modelset.shape[0], tv_rank))

    iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))

    # Replicate stat0 so that each supervector dimension carries the
    # occupation count of its Gaussian
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)

    for sess in tqdm(range(stat_server.segset.shape[0]), desc="Processing"):
        # Posterior covariance of the i-vector: (I + F' N F)^-1
        inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank)
                                      + (self.F.T * stat_server.stat0[sess, index_map]).dot(self.F))
        # Posterior mean; the dot order is valid because inv_lambda is symmetric
        Aux = self.F.T.dot(stat_server.stat1[sess, :])
        iv_stat_server.stat1[sess, :] = Aux.dot(inv_lambda)
        iv_sigma[sess, :] = numpy.diag(inv_lambda
                                       + numpy.outer(iv_stat_server.stat1[sess, :],
                                                     iv_stat_server.stat1[sess, :]))

    if uncertainty:
        return iv_stat_server, iv_sigma
    else:
        return iv_stat_server
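# A minimal standalone sketch of the per-session computation inside the loop
# above, using synthetic statistics so it runs without a UBM or StatServer.
# The names (F, stat0, stat1, index_map) mirror the attributes used by
# extract_ivectors_single; all sizes below are illustrative assumptions.
def _demo_single_session_posterior():
    import numpy
    import scipy.linalg

    rng = numpy.random.RandomState(0)
    nb_distrib, feature_size, tv_rank = 8, 4, 5
    F = rng.randn(nb_distrib * feature_size, tv_rank)   # total variability matrix
    stat0 = rng.rand(nb_distrib)                        # zero-order stats, one per Gaussian
    stat1 = rng.randn(nb_distrib * feature_size)        # whitened first-order stats

    # Replicate stat0 so each supervector dimension carries the occupation
    # count of its Gaussian
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)

    # Posterior covariance (I + F' N F)^-1 and posterior mean of the i-vector;
    # the dot order below is valid because inv_lambda is symmetric
    inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank)
                                  + (F.T * stat0[index_map]).dot(F))
    ivector = F.T.dot(stat1).dot(inv_lambda)
    sigma_diag = numpy.diag(inv_lambda + numpy.outer(ivector, ivector))
    return ivector, sigma_diag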
def _gaussian_backend_train(data, label):
    """
    Wrap a matrix of training examples and their labels into a StatServer,
    then delegate to gaussian_backend_train, which returns one mean per
    class and a tied covariance matrix.
    """
    train_ss = StatServer()
    train_ss.segset = label
    train_ss.modelset = label
    train_ss.stat1 = data
    train_ss.stat0 = numpy.ones((data.shape[0], 1))
    train_ss.start = numpy.empty(data.shape[0], dtype="object")
    train_ss.stop = numpy.empty(data.shape[0], dtype="object")

    return gaussian_backend_train(train_ss)
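# Hypothetical usage sketch for _gaussian_backend_train: the embedding
# dimension and speaker labels below are made up for illustration, and
# gaussian_backend_train is assumed to be available from this module's
# dependencies.
def _demo_gaussian_backend_train():
    import numpy

    data = numpy.random.randn(6, 128)    # 6 training examples, 128-dim embeddings
    label = numpy.array(["spk1", "spk1", "spk2", "spk2", "spk3", "spk3"])
    # Returns one mean per class and a tied covariance matrix
    return _gaussian_backend_train(data, label)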
def extract_ivectors(self,
                     ubm,
                     stat_server_filename,
                     prefix='',
                     batch_size=300,
                     uncertainty=False,
                     num_thread=1):
    """
    Parallel extraction of i-vectors using the multiprocessing module.

    :param ubm: Mixture object (the UBM)
    :param stat_server_filename: name of the file from which the input StatServer is read
    :param prefix: prefix used to store the StatServer in its file
    :param batch_size: number of sessions to process in a batch
    :param uncertainty: a boolean, if True, return the diagonal of the uncertainty matrices
    :param num_thread: number of processes to run in parallel
    :return: a StatServer with i-vectors in the stat1 attribute, and optionally
        the matrix of uncertainty diagonals
    """
    assert isinstance(ubm, Mixture) and ubm.validate(), \
        "Second argument must be a proper Mixture"

    tv_rank = self.F.shape[1]

    # Set useful variables
    with h5py.File(stat_server_filename, 'r') as fh:  # open the StatServer file to get sizes
        _, sv_size = fh[prefix + 'stat1'].shape
        nb_sessions = fh[prefix + "modelset"].shape[0]

        iv_server = StatServer()
        iv_server.modelset = fh.get(prefix + 'modelset').value
        iv_server.segset = fh.get(prefix + 'segset').value

        tmpstart = fh.get(prefix + "start").value
        tmpstop = fh.get(prefix + "stop").value
        iv_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
        iv_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
        iv_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
        iv_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]

        iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE)
        with warnings.catch_warnings():
            iv_server.stat1 = serialize(numpy.zeros((nb_sessions, tv_rank)))
            iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank)))

        nb_sessions = iv_server.modelset.shape[0]
        # Approximate ceil(nb_sessions / batch_size)
        batch_nb = int(numpy.floor(nb_sessions / float(batch_size) + 0.999))
        batch_indices = numpy.array_split(numpy.arange(nb_sessions), batch_nb)

        manager = multiprocessing.Manager()
        q = manager.Queue()
        pool = multiprocessing.Pool(num_thread + 2)  # workers plus the listener

        # put the listener to work first
        watcher = pool.apply_async(iv_collect, ((iv_server.stat1, iv_sigma), q))
        # fire off workers
        jobs = []

        # Load data per batch to reduce the memory footprint
        for batch_idx in batch_indices:
            # Create the list of arguments for one process
            arg = batch_idx, fh[prefix + "stat0"][batch_idx, :], \
                fh[prefix + "stat1"][batch_idx, :], ubm, self.F
            job = pool.apply_async(iv_extract_on_batch, (arg, q))
            jobs.append(job)

        # collect results from the workers through the pool result queue
        for job in jobs:
            job.get()

        # now we are done, kill the listener
        q.put((None, None, None))
        pool.close()

        iv_server.stat1, iv_sigma = watcher.get()

    if uncertainty:
        return iv_server, iv_sigma
    else:
        return iv_server
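# A self-contained sketch of the listener/worker pattern used above, with toy
# payloads in place of iv_extract_on_batch and iv_collect (both defined
# elsewhere in this module). One "watcher" process fills a shared result array
# from a Manager queue while pool workers push finished batches into it. The
# helpers are module-level so the Pool can pickle them (numpy is assumed
# imported at module level, as throughout this file); on spawn-based
# platforms, call the demo from under an `if __name__ == "__main__":` guard.
def _demo_collect(result, q):
    while True:
        idx, values = q.get()
        if idx is None:                  # sentinel: stop listening
            return result
        result[idx] = values

def _demo_work(batch_idx, q):
    # Stand-in for iv_extract_on_batch: push (indices, values) to the listener
    q.put((batch_idx, numpy.asarray(batch_idx, dtype=float) * 2.0))

def _demo_listener_worker_pattern(nb_sessions=10, batch_size=5):
    import multiprocessing

    manager = multiprocessing.Manager()
    q = manager.Queue()
    pool = multiprocessing.Pool(3)       # two workers plus the listener

    watcher = pool.apply_async(_demo_collect, (numpy.zeros(nb_sessions), q))
    batch_nb = int(numpy.floor(nb_sessions / float(batch_size) + 0.999))
    jobs = [pool.apply_async(_demo_work, (idx, q))
            for idx in numpy.array_split(numpy.arange(nb_sessions), batch_nb)]
    for job in jobs:
        job.get()

    q.put((None, None))                  # kill the listener
    pool.close()
    return watcher.get()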
def extract_ivector(tv,
                    stat_server_file_name,
                    ubm,
                    output_file_name,
                    uncertainty=False,
                    prefix=''):
    """
    Estimate i-vectors for a given StatServer using multiple processes on
    multiple nodes via MPI. Rank 0 writes the resulting i-vector StatServer
    to output_file_name.

    :param tv: FactorAnalyser object holding the total variability matrix
    :param stat_server_file_name: file name of the sufficient statistics StatServer HDF5 file
    :param ubm: Mixture object (the UBM)
    :param output_file_name: name of the file to save the i-vector StatServer in HDF5 format
    :param uncertainty: boolean, if True, also save a matrix holding the
        diagonals of the uncertainty matrices
    :param prefix: prefix of the datasets to read from the HDF5 file
    """
    assert isinstance(ubm, Mixture) and ubm.validate(), \
        "Second argument must be a proper Mixture"

    comm = MPI.COMM_WORLD
    comm.Barrier()

    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

    # Set useful variables
    tv_rank = tv.F.shape[1]
    feature_size = ubm.mu.shape[1]
    nb_distrib = ubm.w.shape[0]

    # Get the number of sessions to process
    with h5py.File(stat_server_file_name, 'r') as fh:
        nb_sessions = fh["segset"].shape[0]

    # Work on each node with different data; counts and displacements are in
    # scalars (rows * tv_rank) as required by Gatherv
    indices = numpy.array_split(numpy.arange(nb_sessions), comm.size, axis=0)
    sendcounts = numpy.array([idx.shape[0] * tv.F.shape[1] for idx in indices])
    displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))

    stat_server = StatServer.read_subset(stat_server_file_name, indices[comm.rank])

    # Whiten the statistics for diagonal or full covariance models
    if gmm_covariance == "diag":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(),
                                 1. / ubm.get_invcov_super_vector())
    elif gmm_covariance == "full":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)

    # Estimate i-vectors
    if comm.rank == 0:
        iv = numpy.zeros((nb_sessions, tv_rank))
        iv_sigma = numpy.zeros((nb_sessions, tv_rank))
    else:
        iv = None
        iv_sigma = None

    local_iv = numpy.zeros((stat_server.modelset.shape[0], tv_rank))
    local_iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))

    # Replicate stat0
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
    for sess in range(stat_server.segset.shape[0]):
        # Posterior covariance (I + F' N F)^-1 and posterior mean
        inv_lambda = scipy.linalg.inv(numpy.eye(tv_rank)
                                      + (tv.F.T * stat_server.stat0[sess, index_map]).dot(tv.F))
        Aux = tv.F.T.dot(stat_server.stat1[sess, :])
        local_iv[sess, :] = Aux.dot(inv_lambda)
        local_iv_sigma[sess, :] = numpy.diag(inv_lambda
                                             + numpy.outer(local_iv[sess, :], local_iv[sess, :]))
    comm.Barrier()

    comm.Gatherv(local_iv, [iv, sendcounts, displacements, MPI.DOUBLE], root=0)
    comm.Gatherv(local_iv_sigma, [iv_sigma, sendcounts, displacements, MPI.DOUBLE], root=0)

    if comm.rank == 0:
        with h5py.File(stat_server_file_name, 'r') as fh:
            iv_stat_server = StatServer()
            iv_stat_server.modelset = fh.get(prefix + "modelset").value
            iv_stat_server.segset = fh.get(prefix + "segset").value

            # if running Python 3, convert the string arrays to unicode
            if sys.version_info[0] == 3:
                iv_stat_server.modelset = iv_stat_server.modelset.astype('U', copy=False)
                iv_stat_server.segset = iv_stat_server.segset.astype('U', copy=False)

            tmpstart = fh.get(prefix + "start").value
            tmpstop = fh.get(prefix + "stop").value
            iv_stat_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
            iv_stat_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
            iv_stat_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_stat_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
            iv_stat_server.stat0 = numpy.ones((nb_sessions, 1))
            iv_stat_server.stat1 = iv

        iv_stat_server.write(output_file_name)
        if uncertainty:
            path = os.path.splitext(output_file_name)
            write_matrix_hdf5(iv_sigma, path[0] + "_uncertainty" + path[1])
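# A runnable sketch of the Gatherv pattern used above (requires mpi4py; launch
# with e.g. `mpirun -n 4 python <script>`). Each rank owns a different number
# of rows and rank 0 gathers them into one matrix; sendcounts and
# displacements are expressed in scalars (rows * tv_rank), not rows, which is
# why both are scaled by tv_rank. All sizes are illustrative.
def _demo_gatherv(nb_sessions=10, tv_rank=3):
    import numpy
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    indices = numpy.array_split(numpy.arange(nb_sessions), comm.size, axis=0)
    sendcounts = numpy.array([idx.shape[0] * tv_rank for idx in indices])
    displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))

    # Fill the local slice with the rank id so the gathered result is easy to check
    local = numpy.full((indices[comm.rank].shape[0], tv_rank), float(comm.rank))
    gathered = numpy.zeros((nb_sessions, tv_rank)) if comm.rank == 0 else None
    comm.Gatherv(local, [gathered, sendcounts, displacements, MPI.DOUBLE], root=0)
    return gathered                      # filled on rank 0, None elsewhere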
def extract_parallel(args, fs_params):
    """
    Extract six sets of x-vectors in parallel, one process per chunk of the IdMap.

    :param args: namespace holding at least `idmap` (path to an IdMap file)
        and `num_processes` (number of extraction processes)
    :param fs_params: feature-server parameters, passed through to extract_idmap
    :return: six StatServers filled with the extracted x-vectors
    """
    emb_a_size = 512
    emb_b_size = 512

    idmap = IdMap(args.idmap)

    x_server_1 = StatServer(idmap, 1, emb_a_size)
    x_server_2 = StatServer(idmap, 1, emb_b_size)
    x_server_3 = StatServer(idmap, 1, emb_b_size)
    x_server_4 = StatServer(idmap, 1, emb_b_size)
    x_server_5 = StatServer(idmap, 1, emb_b_size)
    x_server_6 = StatServer(idmap, 1, emb_b_size)

    for server in (x_server_1, x_server_2, x_server_3,
                   x_server_4, x_server_5, x_server_6):
        server.stat0 = numpy.ones(server.stat0.shape)

    # Split the indices across processes; the last chunk absorbs the remainder
    # so that no session is dropped when the split is uneven
    mega_batch_size = idmap.leftids.shape[0] // args.num_processes

    logging.critical("Number of sessions to process: {}".format(idmap.leftids.shape[0]))

    segment_idx = []
    for ii in range(args.num_processes):
        start = ii * mega_batch_size
        stop = idmap.leftids.shape[0] if ii == args.num_processes - 1 \
            else (ii + 1) * mega_batch_size
        segment_idx.append(numpy.arange(start, stop))

    for idx, si in enumerate(segment_idx):
        logging.critical("Number of sessions on process {}: {}".format(idx, len(si)))

    # Extract x-vectors in parallel: one process per segment of the IdMap
    output_queue = mp.Queue()
    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=extract_idmap,
                       args=(args,
                             rank,
                             segment_idx[rank],
                             fs_params,
                             args.idmap,
                             output_queue))
        p.start()
        processes.append(p)

    # Get the x-vectors from the queue and fill the StatServers
    for ii in range(args.num_processes):
        indices, seg_1, seg_2, seg_3, seg_4, seg_5, seg_6 = output_queue.get()
        x_server_1.stat1[indices, :] = seg_1
        x_server_2.stat1[indices, :] = seg_2
        x_server_3.stat1[indices, :] = seg_3
        x_server_4.stat1[indices, :] = seg_4
        x_server_5.stat1[indices, :] = seg_5
        x_server_6.stat1[indices, :] = seg_6

    for p in processes:
        p.join()

    print("Parallel extraction done")

    return x_server_1, x_server_2, x_server_3, x_server_4, x_server_5, x_server_6
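# Hypothetical driver for extract_parallel, shown only to make the expected
# inputs concrete. The attribute names on `args` (idmap, num_processes) are
# inferred from their use above; the idmap path and fs_params content are
# placeholders, and extract_idmap is assumed to be defined elsewhere in this
# module.
def _demo_extract_parallel():
    import argparse

    args = argparse.Namespace(idmap="idmap.h5", num_processes=4)   # placeholder values
    fs_params = {}                                                 # placeholder feature-server config
    # Returns six StatServers, one per embedding produced by extract_idmap
    return extract_parallel(args, fs_params)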