    def extract_ivectors_single(self, ubm, stat_server, uncertainty=False):
        """
        Estimate i-vectors for a given StatServer using a single process on a single node.

        :param ubm: Mixture object (the UBM)
        :param stat_server: sufficient statistics stored in a StatServer
        :param uncertainty: boolean; if True, also return a matrix containing the diagonal of each uncertainty (posterior covariance) matrix

        :return: a StatServer with i-vectors in the stat1 attribute, and optionally the matrix of uncertainty diagonals
        """
        assert (isinstance(ubm, Mixture)
                and ubm.validate()), "First argument must be a proper Mixture"
        assert (isinstance(stat_server, StatServer) and stat_server.validate()), \
            "Second argument must be a proper StatServer"

        gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

        # Set useful variables
        tv_rank = self.F.shape[1]
        feature_size = ubm.mu.shape[1]
        nb_distrib = ubm.w.shape[0]

        # Whiten the statistics for diagonal or full models
        if gmm_covariance == "diag":
            stat_server.whiten_stat1(ubm.get_mean_super_vector(),
                                     1. / ubm.get_invcov_super_vector())
        elif gmm_covariance == "full":
            stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)

        # Extract i-vectors
        iv_stat_server = StatServer()
        iv_stat_server.modelset = copy.deepcopy(stat_server.modelset)
        iv_stat_server.segset = copy.deepcopy(stat_server.segset)
        iv_stat_server.start = copy.deepcopy(stat_server.start)
        iv_stat_server.stop = copy.deepcopy(stat_server.stop)
        iv_stat_server.stat0 = numpy.ones((stat_server.modelset.shape[0], 1))
        iv_stat_server.stat1 = numpy.ones(
            (stat_server.modelset.shape[0], tv_rank))

        iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))

        # Replicate self.stat0
        index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)

        for sess in tqdm(range(stat_server.segset.shape[0]),
                         desc="Processing"):

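            # The i-vector posterior covariance is (I + F' N F)^-1, where N is the
            # whitened zero-order statistic replicated over the feature dimensions;
            # the posterior mean is this covariance applied to F' times the whitened
            # first-order statistics, and iv_sigma keeps diag(E[w w']).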
            inv_lambda = scipy.linalg.inv(
                numpy.eye(tv_rank) +
                (self.F.T * stat_server.stat0[sess, index_map]).dot(self.F))
            Aux = self.F.T.dot(stat_server.stat1[sess, :])
            iv_stat_server.stat1[sess, :] = Aux.dot(inv_lambda)
            iv_sigma[sess, :] = numpy.diag(inv_lambda + numpy.outer(
                iv_stat_server.stat1[sess, :], iv_stat_server.stat1[sess, :]))

        if uncertainty:
            return iv_stat_server, iv_sigma
        else:
            return iv_stat_server
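A minimal usage sketch for the single-process extractor above; it is not part of the original snippet, and the file names and constructor calls are assumptions.

# Hypothetical usage; "ubm.h5", "total_variability.h5" and "stats.h5" are placeholder paths.
from sidekit import FactorAnalyser, Mixture, StatServer

ubm = Mixture("ubm.h5")                      # previously trained UBM
fa = FactorAnalyser("total_variability.h5")  # trained model, TV matrix in fa.F
stats = StatServer("stats.h5")               # accumulated zero/first-order statistics

iv_server, iv_sigma = fa.extract_ivectors_single(ubm, stats, uncertainty=True)
iv_server.write("ivectors.h5")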
Example #3
def _gaussian_backend_train(data, label):
    """
    Take a StatServer of training examples as input
    output a StatServer mean for each class and a tied co-variance matrix
    """
    train_ss = StatServer()
    train_ss.segset = label
    train_ss.modelset = label
    train_ss.stat1 = data
    train_ss.stat0 = numpy.ones((data.shape[0], 1))
    train_ss.start = numpy.empty(data.shape[0], dtype="object")
    train_ss.stop = numpy.empty(data.shape[0], dtype="object")

    return gaussian_backend_train(train_ss)
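A hedged sketch of how this wrapper might be invoked on raw embeddings; the array shapes and labels below are illustrative assumptions, not values from the original code.

# Illustrative call; `data` holds one embedding per row, `label` the matching class names.
import numpy

data = numpy.random.randn(100, 512)
label = numpy.array(["spk%02d" % (i % 10) for i in range(100)], dtype="object")

backend = _gaussian_backend_train(data, label)  # per-class means and tied covariance, per the docstring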
    def extract_ivectors(self,
                         ubm,
                         stat_server_filename,
                         prefix='',
                         batch_size=300,
                         uncertainty=False,
                         num_thread=1):
        """
        Parallel extraction of i-vectors using the multiprocessing module.

        :param ubm: Mixture object (the UBM)
        :param stat_server_filename: name of the file from which the input StatServer is read
        :param prefix: prefix used to store the StatServer in its file
        :param batch_size: number of sessions to process in a batch
        :param uncertainty: a boolean, if True, return the diagonal of the uncertainty matrices
        :param num_thread: number of processes to run in parallel
        :return: a StatServer with i-vectors in the stat1 attribute and a matrix of uncertainty matrices (optional)
        """
        assert (isinstance(ubm, Mixture)
                and ubm.validate()), "First argument must be a proper Mixture"

        tv_rank = self.F.shape[1]

        # Set useful variables
        with h5py.File(stat_server_filename,
                       'r') as fh:  # open the first statserver to get size
            _, sv_size = fh[prefix + 'stat1'].shape
            nb_sessions = fh[prefix + "modelset"].shape[0]

            iv_server = StatServer()
            iv_server.modelset = fh.get(prefix + 'modelset').value
            iv_server.segset = fh.get(prefix + 'segset').value

            tmpstart = fh.get(prefix + "start").value
            tmpstop = fh.get(prefix + "stop").value
            iv_server.start = numpy.empty(fh[prefix + "start"].shape, '|O')
            iv_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
            iv_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]

            iv_server.stat0 = numpy.ones((nb_sessions, 1), dtype=STAT_TYPE)
            with warnings.catch_warnings():
                iv_server.stat1 = serialize(numpy.zeros(
                    (nb_sessions, tv_rank)))
                iv_sigma = serialize(numpy.zeros((nb_sessions, tv_rank)))

            nb_sessions = iv_server.modelset.shape[0]
            batch_nb = int(numpy.floor(nb_sessions / float(batch_size) +
                                       0.999))
            batch_indices = numpy.array_split(numpy.arange(nb_sessions),
                                              batch_nb)

            manager = multiprocessing.Manager()
            q = manager.Queue()
            pool = multiprocessing.Pool(num_thread + 2)

            # put listener to work first
            watcher = pool.apply_async(iv_collect,
                                       ((iv_server.stat1, iv_sigma), q))
            # fire off workers
            jobs = []

            # Load data per batch to reduce the memory footprint
            for batch_idx in batch_indices:

                # Create the list of arguments for a process (honouring the dataset prefix)
                arg = batch_idx, fh[prefix + "stat0"][batch_idx, :], \
                    fh[prefix + "stat1"][batch_idx, :], ubm, self.F
                job = pool.apply_async(iv_extract_on_batch, (arg, q))
                jobs.append(job)

            # collect results from the workers through the pool result queue
            for job in jobs:
                job.get()

            # now we are done, kill the listener
            q.put((None, None, None))
            pool.close()

            iv_server.stat1, iv_sigma = watcher.get()
        if uncertainty:
            return iv_server, iv_sigma
        else:
            return iv_server
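A hedged sketch of calling the multi-process extractor above; file names are placeholders and the assumption is that this method belongs to sidekit's FactorAnalyser.

# Hypothetical call; "stats.h5" must contain the sufficient statistics written by a StatServer.
from sidekit import FactorAnalyser, Mixture

fa = FactorAnalyser("total_variability.h5")
ubm = Mixture("ubm.h5")
iv_server = fa.extract_ivectors(ubm, "stats.h5", batch_size=300, num_thread=4)
iv_server.write("ivectors.h5")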
class HAC_CLR:
    """
    CLR Hierarchical Agglomerative Clustering (HAC) with GMM trained by MAP
    """
    def __init__(self, features_server, diar, ubm, ce=False, ntop=5):
        assert isinstance(
            features_server,
            FeaturesServer), 'First parameter has to be a FeaturesServer'
        assert isinstance(
            diar,
            Diar), '2nd parameter has to be a Diar (segmentation container)'
        assert isinstance(ubm, Mixture), '3rd parameter has to be a Mixture'

        self.features_server = features_server
        self.diar = copy.deepcopy(diar)
        self.merge = []
        self.nb_merge = 0
        self.ubm = ubm
        self.ce = ce
        self.stat_speaker = None
        self.stat_seg = None
        self.llr = None
        self.ntop = ntop
        #self.init_train()
        #self._init_distance()

    def _get_cep(self, map, cluster):
        cep_list = list()
        for show in map[cluster]:
            idx = self.diar.features_by_cluster(show)[cluster]
            if len(idx) > 0:
                tmp, vad = self.features_server.load(show)
                cep_list.append(tmp[0][idx])
        cep = np.concatenate(cep_list, axis=0)
        return cep

    def _ll(self, ubm, cep, mu=None, name='ubm', argtop=None):
        # add top-Gaussian selection
        lp = ubm.compute_log_posterior_probabilities(cep, mu=mu)

        if argtop is None:
            #logging.info('compute argtop '+speaker)
            argtop = argpartition(lp * -1.0, self.ntop, axis=1)[:, :self.ntop]
            #logging.info(argtop.shape)
        if self.ntop is not None:
            #logging.info('use ntop '+speaker)
            #logging.info(argtop.shape)
            #logging.info(lp.shape)
            lp = lp[np.arange(argtop.shape[0])[:, np.newaxis], argtop]

        # ppMax = numpy.max(lp, axis=1)

        ll = np.log(np.sum(np.exp(lp), axis=1))
        # ll = ppMax + numpy.log(numpy.sum(numpy.exp((lp.transpose() - ppMax).transpose()),
        #                    axis=1))
        not_finite = np.logical_not(np.isfinite(ll))
        cpt = np.count_nonzero(not_finite)
        # ll[finite] = numpy.finfo('d').min
        ll[not_finite] = 1.0e-200
        m = np.mean(ll)
        if cpt > 0:
            logging.info(
                'model %s, nb frames with llk problem: %d/%d \t %f',
                name, cpt, cep.shape[0], m)
        return m, argtop

    def initial_models(self, nb_threads=1):
        # sort by show to minimize the reading of mfcc by the statServer
        self.diar.sort(['show'])
        # Compute statistics by segments
        self.stat_seg = StatServer(self.diar.id_map())
        self.stat_seg.accumulate_stat(self.ubm, self.features_server)
        self.stat_speaker = self.stat_seg.adapt_mean_MAP_multisession(self.ubm)

    def initial_distances(self, nb_threads=1):
        map = self.diar.make_index(['cluster', 'show'])
        nb = self.stat_speaker.modelset.shape[0]

        self.llr = np.full((nb, nb), np.nan)
        self.dist = np.full((nb, nb), np.nan)
        for i, name_i in enumerate(self.stat_speaker.modelset):
            cep_i = self._get_cep(map, name_i)
            argtop = None
            ll_ubm = None
            if self.ntop is not None or not self.ce:
                ll_ubm, argtop = self._ll(self.ubm, cep_i, argtop=argtop)

            # self.merge.append([])
            for j, name_j in enumerate(self.stat_speaker.modelset):
                mu = self.stat_speaker.get_model_stat1_by_index(j)
                # if i == 0:
                #    logging.debug(mu)
                self.llr[i, j], _ = self._ll(self.ubm,
                                             cep_i,
                                             mu=mu,
                                             name=name_j,
                                             argtop=argtop)
            if self.ce:
                self.llr[i, :] -= self.llr[i, i]
            else:
                self.llr[i, :] -= ll_ubm

        # logging.debug(self.llr)
        self.dist = (self.llr + self.llr.T) * -1.0
        np.fill_diagonal(self.dist, np.finfo('d').max)

    def update(self, i, j, nb_threads=1):
        name_i = self.stat_speaker.modelset[i]
        name_j = self.stat_speaker.modelset[j]
        # logging.debug('%d %d / %s %s', i, j, name_i, name_j)

        for k in range(len(self.stat_seg.modelset)):
            if self.stat_seg.modelset[k] == name_j:
                self.stat_seg.modelset[k] = name_i

        self.stat_speaker = self.stat_seg.adapt_mean_MAP_multisession(self.ubm)

        self.llr = roll(self.llr, j)

        self.diar.rename('cluster', [name_j], name_i)
        map = self.diar.make_index(['cluster', 'show'])
        cep_i = self._get_cep(map, name_i)
        argtop = None
        ll_ubm = None
        if self.ntop is not None or not self.ce:
            ll_ubm, argtop = self._ll(self.ubm, cep_i, argtop=argtop)
        for k, name_k in enumerate(self.stat_speaker.modelset):
            mu = self.stat_speaker.get_model_stat1_by_index(k)
            self.llr[i, k], _ = self._ll(self.ubm, cep_i, mu=mu, name=name_k)
        if self.ce:
            self.llr[i, :] -= self.llr[i, i]
        else:
            self.llr[i, :] -= ll_ubm

        self.dist = (self.llr + self.llr.T) * -1.0
        np.fill_diagonal(self.dist, np.finfo('d').max)

    def information(self, i, j, value):
        models = self.stat_speaker.modelset
        self.merge.append([self.nb_merge, models[i], models[j], value])

    def perform(self, thr=0.0, to_the_end=False):
        models = self.stat_speaker.modelset
        nb = len(models)
        self.nb_merge = -1
        for i in range(nb):
            self.information(i, i, 0)

        i, j, v = argmin(self.dist, nb)
        self.nb_merge = 0
        while v < thr and nb > 1:
            self.information(i, j, v)
            self.nb_merge += 1
            logging.debug('merge: %d c1: %s (%d) c2: %s (%d) dist: %f',
                          self.nb_merge, models[i], i, models[j], j, v)
            # update merge
            # update model and distance
            self.update(i, j)
            nb -= 1
            i, j, v = argmin(self.dist, nb)

        end_diar = copy.deepcopy(self.diar)
        if to_the_end:
            while nb > 1:
                self.information(i, j, v)
                self.nb_merge += 1
                logging.debug('merge: %d c1: %s (%d) c2: %s (%d) dist: %f',
                              self.nb_merge, models[i], i, models[j], j, v)
                # update merge
                # update model and distance
                self.update(i, j)
                nb -= 1
                i, j, v = argmin(self.dist, nb)

        return end_diar
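A hedged end-to-end sketch of driving the clustering class above; the file paths and the FeaturesServer/Diar constructor arguments are assumptions.

# Hypothetical clustering run; every path and option below is a placeholder.
from sidekit import FeaturesServer, Mixture
from s4d.diar import Diar

ubm = Mixture("ubm.h5")
features_server = FeaturesServer(feature_filename_structure="./mfcc/{}.h5")
diar = Diar.read_seg("input.seg")

hac = HAC_CLR(features_server, diar, ubm, ce=True, ntop=5)
hac.initial_models()
hac.initial_distances()
out_diar = hac.perform(thr=0.0)
Diar.write_seg("output.seg", out_diar)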
Example #6
def extract_ivector(tv,
                    stat_server_file_name,
                    ubm,
                    output_file_name,
                    uncertainty=False,
                    prefix=''):
    """
    Estimate i-vectors for a given StatServer using multiple process on multiple nodes.

    :param comm: MPI.comm object defining the group of nodes to use
    :param stat_server_file_name: file name of the sufficient statistics StatServer HDF5 file
    :param ubm: Mixture object (the UBM)
    :param output_file_name: name of the file to save the i-vectors StatServer in HDF5 format
    :param uncertainty: boolean, if True, saves a matrix with uncertainty matrices (diagonal of the matrices)
    :param prefix: prefixe of the dataset to read from in HDF5 file
    """
    assert (isinstance(ubm, Mixture)
            and ubm.validate()), "Third argument must be a proper Mixture"

    comm = MPI.COMM_WORLD

    comm.Barrier()

    gmm_covariance = "diag" if ubm.invcov.ndim == 2 else "full"

    # Set useful variables
    tv_rank = tv.F.shape[1]
    feature_size = ubm.mu.shape[1]
    nb_distrib = ubm.w.shape[0]

    # Get the number of sessions to process
    with h5py.File(stat_server_file_name, 'r') as fh:
        nb_sessions = fh["segset"].shape[0]

    # Work on each node with different data
    indices = numpy.array_split(numpy.arange(nb_sessions), comm.size, axis=0)
    sendcounts = numpy.array([idx.shape[0] * tv.F.shape[1] for idx in indices])
    displacements = numpy.hstack((0, numpy.cumsum(sendcounts)[:-1]))

    stat_server = StatServer.read_subset(stat_server_file_name,
                                         indices[comm.rank])

    # Whiten the statistics for diagonal or full models
    if gmm_covariance == "diag":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(),
                                 1. / ubm.get_invcov_super_vector())
    elif gmm_covariance == "full":
        stat_server.whiten_stat1(ubm.get_mean_super_vector(), ubm.invchol)

    # Estimate i-vectors
    if comm.rank == 0:
        iv = numpy.zeros((nb_sessions, tv_rank))
        iv_sigma = numpy.zeros((nb_sessions, tv_rank))
    else:
        iv = None
        iv_sigma = None

    local_iv = numpy.zeros((stat_server.modelset.shape[0], tv_rank))
    local_iv_sigma = numpy.ones((stat_server.modelset.shape[0], tv_rank))

    # Replicate stat0
    index_map = numpy.repeat(numpy.arange(nb_distrib), feature_size)
    for sess in range(stat_server.segset.shape[0]):

        inv_lambda = scipy.linalg.inv(
            numpy.eye(tv_rank) +
            (tv.F.T * stat_server.stat0[sess, index_map]).dot(tv.F))

        Aux = tv.F.T.dot(stat_server.stat1[sess, :])
        local_iv[sess, :] = Aux.dot(inv_lambda)
        local_iv_sigma[sess, :] = numpy.diag(
            inv_lambda + numpy.outer(local_iv[sess, :], local_iv[sess, :]))
    comm.Barrier()

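    # Gather the per-rank blocks on the root process; sendcounts and displacements
    # are expressed in scalar elements (rows * tv_rank) because Gatherv operates on
    # the flattened, row-major buffers.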
    comm.Gatherv(local_iv, [iv, sendcounts, displacements, MPI.DOUBLE], root=0)
    comm.Gatherv(local_iv_sigma,
                 [iv_sigma, sendcounts, displacements, MPI.DOUBLE],
                 root=0)

    if comm.rank == 0:

        with h5py.File(stat_server_file_name, 'r') as fh:
            iv_stat_server = StatServer()
            iv_stat_server.modelset = fh.get(prefix + "modelset").value
            iv_stat_server.segset = fh.get(prefix + "segset").value

            # if running python 3, need a conversion to unicode
            if sys.version_info[0] == 3:
                iv_stat_server.modelset = iv_stat_server.modelset.astype(
                    'U', copy=False)
                iv_stat_server.segset = iv_stat_server.segset.astype(
                    'U', copy=False)

            tmpstart = fh.get(prefix + "start").value
            tmpstop = fh.get(prefix + "stop").value
            iv_stat_server.start = numpy.empty(fh[prefix + "start"].shape,
                                               '|O')
            iv_stat_server.stop = numpy.empty(fh[prefix + "stop"].shape, '|O')
            iv_stat_server.start[tmpstart != -1] = tmpstart[tmpstart != -1]
            iv_stat_server.stop[tmpstop != -1] = tmpstop[tmpstop != -1]
            iv_stat_server.stat0 = numpy.ones((nb_sessions, 1))
            iv_stat_server.stat1 = iv

        iv_stat_server.write(output_file_name)
        if uncertainty:
            path = os.path.splitext(output_file_name)
            write_matrix_hdf5(iv_sigma, path[0] + "_uncertainty" + path[1])
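A hedged sketch of launching the MPI-parallel extractor above; the driver script name, paths and mpirun invocation are assumptions.

# extract_iv_mpi.py: hypothetical driver, launched e.g. with: mpirun -n 4 python extract_iv_mpi.py
from sidekit import FactorAnalyser, Mixture

tv = FactorAnalyser("total_variability.h5")
ubm = Mixture("ubm.h5")
extract_ivector(tv, "stats.h5", ubm, "ivectors.h5", uncertainty=True, prefix='')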
Example #7
def extract_parallel(args, fs_params):
    emb_a_size = 512
    emb_b_size = 512

    idmap = IdMap(args.idmap)

    x_server_1 = StatServer(idmap, 1, emb_a_size)
    x_server_2 = StatServer(idmap, 1, emb_b_size)
    x_server_3 = StatServer(idmap, 1, emb_b_size)
    x_server_4 = StatServer(idmap, 1, emb_b_size)
    x_server_5 = StatServer(idmap, 1, emb_b_size)
    x_server_6 = StatServer(idmap, 1, emb_b_size)

    x_server_1.stat0 = numpy.ones(x_server_1.stat0.shape)
    x_server_2.stat0 = numpy.ones(x_server_2.stat0.shape)
    x_server_3.stat0 = numpy.ones(x_server_3.stat0.shape)
    x_server_4.stat0 = numpy.ones(x_server_4.stat0.shape)
    x_server_5.stat0 = numpy.ones(x_server_5.stat0.shape)
    x_server_6.stat0 = numpy.ones(x_server_6.stat0.shape)

    # Split the indices (ceiling division so the trailing sessions are not dropped)
    mega_batch_size = int(numpy.ceil(idmap.leftids.shape[0] / args.num_processes))

    logging.critical("Number of sessions to process: {}".format(idmap.leftids.shape[0]))

    segment_idx = []
    for ii in range(args.num_processes):
        segment_idx.append(
            numpy.arange(ii * mega_batch_size, numpy.min([(ii + 1) * mega_batch_size, idmap.leftids.shape[0]])))

    for idx, si in enumerate(segment_idx):
        logging.critical("Number of session on process {}: {}".format(idx, len(si)))

    # Extract x-vectors in parallel
    output_queue = mp.Queue()

    processes = []
    for rank in range(args.num_processes):
        p = mp.Process(target=extract_idmap,
                       args=(args, rank, segment_idx[rank], fs_params, args.idmap, output_queue)
                       )
        # Launch one extraction process per rank
        p.start()
        processes.append(p)

    # Get the x-vectors and fill the StatServer
    for ii in range(args.num_processes):
        indices, seg_1, seg_2, seg_3, seg_4, seg_5, seg_6 = output_queue.get()
        x_server_1.stat1[indices, :] = seg_1
        x_server_2.stat1[indices, :] = seg_2
        x_server_3.stat1[indices, :] = seg_3
        x_server_4.stat1[indices, :] = seg_4
        x_server_5.stat1[indices, :] = seg_5
        x_server_6.stat1[indices, :] = seg_6

    for p in processes:
        p.join()

    print("Process parallel fini")

    return x_server_1, x_server_2, x_server_3, x_server_4, x_server_5, x_server_6
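A hedged sketch of driving this extractor from a command-line entry point; the argparse flags, file names and fs_params content are assumptions.

# Hypothetical driver; every argument below is illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--idmap", default="test_idmap.h5")
parser.add_argument("--num_processes", type=int, default=4)
args = parser.parse_args()

fs_params = {"feature_filename_structure": "./features/{}.h5"}  # assumed FeaturesServer options

xv1, xv2, xv3, xv4, xv5, xv6 = extract_parallel(args, fs_params)
xv1.write("xvectors_emb_a.h5")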