コード例 #1
0
    def extract_ivecs(self):
        """ Extract normalization i-vectors using averaging.

            Reads file names from ``self.norm_list`` (one per line; an
            optional second column is ignored here), accumulates i-vectors
            per speaker via ``self.process_file`` and averages them per
            speaker. When ``self.out_ivec_dir`` is set, each averaged
            i-vector is also dumped to an h5 file named after the speaker.

            Returns:
                np.array: averaged i-vectors, one row per speaker
        """
        speakers_dict = {}

        with open(self.norm_list, 'r') as f:
            for line in f:
                if len(line.split()) > 1:  # number of speakers is defined
                    line = line.split()[0]
                speakers_dict = self.process_file(line, speakers_dict)

        # collapse all i-vectors collected for a speaker into their mean
        for dict_key in speakers_dict:
            speakers_dict[dict_key] = np.mean(speakers_dict[dict_key], axis=0)

        if self.out_ivec_dir:
            for speaker in speakers_dict:
                Utils.mkdir_p(
                    os.path.join(self.out_ivec_dir, os.path.dirname(speaker)))
                # context manager guarantees the h5 file is closed even if
                # create_dataset raises (the original leaked the handle)
                with h5py.File(
                        '{}.{}'.format(
                            os.path.join(self.out_ivec_dir, speaker), 'h5'),
                        'w') as h5file:
                    h5file.create_dataset(speaker,
                                          data=speakers_dict[speaker])
        # dict.values() is a view on Python 3; materialize it so np.array
        # builds a proper 2D numeric array instead of a 0-d object array
        return np.array(list(speakers_dict.values()))
コード例 #2
0
    def dump_rttm(self, scores, out_dir):
        """ Write one RTTM file per i-vector set.

        Args:
            scores: per-file score matrices; the speaker label of each
                segment is the argmax over its score column
            out_dir: directory under which the .rttm files are created

        Returns:
            None
        """
        for ivecset in self.ivecs:
            if ivecset.size() <= 0:
                logwarning(
                    '[Diarization.dump_rttm] No i-vectors to dump in {}.'.
                    format(ivecset.name))
                continue
            name = ivecset.name
            # drop everything from the first '/' to get the recording name
            reg_name = re.sub('/.*', '', ivecset.name)
            Utils.mkdir_p(os.path.join(out_dir, os.path.dirname(name)))
            rttm_path = os.path.join(out_dir, name + '.rttm')
            with open(rttm_path, 'w') as f:
                for seg_idx, ivec in enumerate(ivecset.ivecs):
                    start = ivec.window_start
                    end = ivec.window_end
                    idx = np.argmax(scores[name].T[seg_idx])
                    line = ('SPEAKER {} 1 {} {} <NA> <NA> {}_spkr_{} <NA>\n'
                            .format(reg_name, float(start / 1000.0),
                                    float((end - start) / 1000.0), reg_name,
                                    idx))
                    f.write(line)
コード例 #3
0
File: plda.py  Project: nayanhalder/VBDiarization
    def score(self, test, enroll):
        """ Score i-vectors against each other based on the PLDA model.

            Both sets are mean-subtracted (``self.mu``) and l2-normalized
            before scoring. Scoring is done once per unique combination of
            session counts so ``score_with_constant_n`` can work on whole
            same-N groups at once.

            Args:
                test: test i-vectors, one per row
                enroll: enrollment i-vectors, one per row

            Returns:
                np.array: 2D array of scores of all possibilities
                    (transposed so enrollment stats index the rows)
        """
        test = test - self.mu
        enroll = enroll - self.mu
        test = Utils.l2_norm(test)
        enroll = Utils.l2_norm(enroll)
        Tstats = self.prepare_stats(test)
        tstats = self.prepare_stats(enroll)
        # Create scores; float32, indexed [test-stat, enroll-stat]
        scores = np.zeros((len(Tstats.N), len(tstats.N)), 'f')

        # Score for each uniq combination of N enroll and M test sessions (only enroll for now)
        for n_enroll_sessions in np.unique(Tstats.N):
            idxs_T = np.where(Tstats.N == n_enroll_sessions)[0]
            for n_test_sessions in np.unique(tstats.N):
                idxs_t = np.where(tstats.N == n_test_sessions)[0]
                scores[np.ix_(idxs_T, idxs_t)] = self.score_with_constant_n(
                    n_enroll_sessions, Tstats.F[idxs_T, :].T, n_test_sessions,
                    tstats.F[idxs_t, :].T)
        return scores.T
コード例 #4
0
def process_file(wav_dir,
                 vad_dir,
                 out_dir,
                 file_name,
                 fea2ivec_obj,
                 max_size,
                 tolerance,
                 wav_suffix='wav',
                 vad_suffix='lab.gz'):
    """ Process single audio file.

    Args:
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str): output directory; when None, the i-vector set is
            returned instead of being pickled to disk
        file_name (str): name of the file, optionally followed by the
            number of speakers
        fea2ivec_obj (Fea2Ivec): input models for i-vector extraction
        max_size (int): maximal size of window in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files

    Returns:
        IvecSet|None: the extracted i-vector set when ``out_dir`` is None,
            otherwise None (the set is saved as a pickle)

    Raises:
        ValueError: on non-mono audio or VAD/feature dimension mismatch
    """
    loginfo('Processing file {} ...'.format(file_name.split()[0]))
    num_speakers = None
    if len(file_name.split()) > 1:  # number of speakers is defined
        file_name, num_speakers = file_name.split()[0], int(
            file_name.split()[1])
    wav = '{}.{}'.format(os.path.join(wav_dir, file_name), wav_suffix)
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise ValueError('Expected mono as input audio.')
    if rate != RATE:
        logwarning(
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # Resample to the target rate while preserving duration. The
        # original code resampled to exactly RATE samples total, which is
        # only correct for one second of audio.
        sig = signal.resample(sig, int(round(len(sig) * RATE / float(rate))))

    fea_extractor = Features()
    fea = fea_extractor(sig)
    vad, n_regions, n_frames = get_vad(
        '{}.{}'.format(os.path.join(vad_dir, file_name), vad_suffix), len(fea))

    ivec_set = IvecSet()
    ivec_set.name = file_name
    ivec_set.num_speakers = num_speakers
    for seg in get_segments(vad, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        if seg[0] > fea.shape[0] - 1 or seg[1] > fea.shape[0] - 1:
            raise ValueError(
                'Unexpected features dimensionality - check VAD input or audio.'
            )
        w = fea2ivec_obj.get_ivec(fea[seg[0]:seg[1]])
        ivec_set.add(w, start, end, mfccs=fea)
    if out_dir is not None:
        Utils.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
    else:
        return ivec_set
コード例 #5
0
    def score_ivec(self, min_length, max_num_speakers, num_threads):
        """ Score i-vectors.

        Args:
            min_length (int): minimal length of segment used for clustering in miliseconds
            max_num_speakers (int): maximal number of speakers
            num_threads (int): number of threads to use

        Returns:
            dict: dictionary with scores for each file
        """
        scores_dict = {}
        for ivecset in self.ivecs:
            name = os.path.normpath(ivecset.name)
            ivecs_all = ivecset.get_all()
            ivecs_long = ivecset.get_longer(min_length)
            loginfo('Scoring {} ...'.format(name))
            if ivecset.size() <= 0:
                logwarning('No i-vectors to score in {}.'.format(ivecset.name))
                continue
            if ivecset.num_speakers is not None:
                # the number of speakers is known in advance
                num_speakers = ivecset.num_speakers
            else:
                # estimate the number of speakers with x-means
                xm = xmeans(ivecs_long, kmax=max_num_speakers)
                xm.process()
                num_speakers = len(xm.get_clusters())
            # k-means initialization was duplicated in both branches of the
            # original; it is identical, so run it once here
            sklearnkmeans = sklearnKMeans(
                n_clusters=num_speakers, n_init=100,
                n_jobs=num_threads).fit(ivecs_long)
            centroids = sklearnkmeans.cluster_centers_
            if ivecset.num_speakers is not None and self.plda is not None:
                # refine the k-means centroids with PLDA-based clustering
                # (only done when the speaker count was given, as before)
                centroids = PLDAKMeans(sklearnkmeans.cluster_centers_,
                                       num_speakers,
                                       self.plda).fit(ivecs_long)
            if self.norm is None:
                if self.plda is None:
                    ivecs_all = Utils.l2_norm(ivecs_all)
                    centroids = Utils.l2_norm(centroids)
                    scores_dict[name] = cosine_similarity(
                        ivecs_all, centroids).T
                else:
                    scores_dict[name] = self.plda.score(ivecs_all, centroids)
            else:
                ivecs_all = Utils.l2_norm(ivecs_all)
                centroids = Utils.l2_norm(centroids)
                scores_dict[name] = self.norm.s_norm(ivecs_all, centroids)
        return scores_dict
コード例 #6
0
    def __init__(self,
                 input_list,
                 embeddings,
                 embeddings_mean=None,
                 lda=None,
                 use_l2_norm=True,
                 norm=None,
                 plda=None):
        """ Initialize diarization class.

        Args:
            input_list (string_types): path to list of input files
            embeddings (string_types|List[EmbeddingSet]): path to directory containing embeddings or list
                of EmbeddingSet instances
            embeddings_mean (np.ndarray): mean subtracted from every embedding
            lda (np.ndarray): linear discriminant analysis - dimensionality reduction
            use_l2_norm (bool): do l2 normalization
            norm (Normalization): instance of class Normalization
            plda (GPLDA): instance of class GPLDA
        """
        self.input_list = input_list
        if isinstance(embeddings, str):
            # a directory path was passed - load embedding sets from disk
            self.embeddings_dir = embeddings
            self.embeddings = list(self.load_embeddings())
        else:
            self.embeddings = embeddings
        self.lda = lda
        self.use_l2_norm = use_l2_norm
        self.norm = norm
        self.plda = plda

        # apply mean subtraction, LDA projection and l2 normalization
        # (in that order) to every embedding of every set
        for emb_set in self.embeddings:
            for emb in emb_set:
                if embeddings_mean is not None:
                    emb.data = emb.data - embeddings_mean
                if lda is not None:
                    emb.data = emb.data.dot(lda)
                if use_l2_norm:
                    normalized = Utils.l2_norm(emb.data[np.newaxis, :])
                    emb.data = normalized.flatten()
        if self.norm:
            assert embeddings_mean is not None, 'Expecting usage of mean from normalization set.'
            # transform the normalization embeddings the same way as inputs
            self.norm.embeddings = self.norm.embeddings - embeddings_mean
            if lda is not None:
                self.norm.embeddings = self.norm.embeddings.dot(lda)
            if use_l2_norm:
                self.norm.embeddings = Utils.l2_norm(self.norm.embeddings)
コード例 #7
0
def process_files(fns,
                  wav_dir,
                  vad_dir,
                  out_dir,
                  features_extractor,
                  embedding_extractor,
                  min_size,
                  max_size,
                  overlap,
                  tolerance,
                  wav_suffix='.wav',
                  vad_suffix='.lab.gz',
                  n_jobs=1):
    """ Process all files from list.

    Args:
        fns (list): name of files to process
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str|None): output directory
        features_extractor (Any): intialized object for feature extraction
        embedding_extractor (Any): initialized object for embedding extraction
        max_size (int): maximal size of window in ms
        min_size (int): minimal size of window in ms
        overlap (int): size of window overlap in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files
        n_jobs (int): number of jobs to run in parallel

    Returns:
        List[EmbeddingSet]
    """
    kwargs = dict(wav_dir=wav_dir,
                  vad_dir=vad_dir,
                  out_dir=out_dir,
                  features_extractor=features_extractor,
                  embedding_extractor=embedding_extractor,
                  tolerance=tolerance,
                  min_size=min_size,
                  max_size=max_size,
                  overlap=overlap,
                  wav_suffix=wav_suffix,
                  vad_suffix=vad_suffix)
    if n_jobs == 1:
        ret = _process_files((fns, kwargs))
    else:
        # use the pool as a context manager so worker processes are
        # reaped even when a job raises (the original never closed it)
        with multiprocessing.Pool(n_jobs) as pool:
            ret = pool.map(_process_files,
                           ((part, kwargs)
                            for part in Utils.partition(fns, n_jobs)))
    # flatten the per-partition result lists into one list
    return [item for sublist in ret for item in sublist]
コード例 #8
0
    def load_ivecs(self):
        """ Load normalization i-vectors.

            Scans ``self.in_ivec_dir`` for ``.h5`` files and collects every
            dataset in each file as one flattened vector.

            :returns: i-vectors, one flattened vector per h5 dataset
            :rtype: numpy.array
        """
        ivecs_list = []
        for f in Utils.list_directory_by_suffix(self.in_ivec_dir, 'h5'):
            loginfo('Loading h5 normalization file {} ...'.format(f))
            # context manager closes the h5 file even if reading fails
            # (the original never closed the handle)
            with h5py.File(os.path.join(self.in_ivec_dir, f), 'r') as h5file:
                for h5_key in h5file.keys():
                    ivecs_list.append(h5file[h5_key][:].flatten())
        return np.array(ivecs_list)
コード例 #9
0
def process_files(fns,
                  speakers_dict,
                  features_extractor,
                  embedding_extractor,
                  audio_dir,
                  wav_suffix,
                  in_rttm_dir,
                  rttm_suffix,
                  min_length,
                  n_jobs=1):
    """ Process all files from list, optionally in parallel.

    Args:
        fns (list): names of files to process
        speakers_dict (dict): speaker mapping passed through to the workers
        features_extractor (Any): initialized object for feature extraction
        embedding_extractor (Any): initialized object for embedding extraction
        audio_dir (str): directory with audio files
        wav_suffix (str): suffix of wav files
        in_rttm_dir (str): directory with input rttm files
        rttm_suffix (str): suffix of rttm files
        min_length (int): minimal segment length -- presumably in ms; confirm
            against `_process_files`
        n_jobs (int): number of jobs to run in parallel

    Returns:
        result of ``_process_files`` when ``n_jobs == 1``; otherwise a list
        with one entry per partition (NOT flattened)
    """
    kwargs = dict(speakers_dict=speakers_dict,
                  features_extractor=features_extractor,
                  embedding_extractor=embedding_extractor,
                  audio_dir=audio_dir,
                  wav_suffix=wav_suffix,
                  in_rttm_dir=in_rttm_dir,
                  rttm_suffix=rttm_suffix,
                  min_length=min_length)
    if n_jobs == 1:
        ret = _process_files((fns, kwargs))
    else:
        # use the pool as a context manager so worker processes are
        # reaped even when a job raises (the original never closed it)
        with multiprocessing.Pool(n_jobs) as pool:
            ret = pool.map(_process_files,
                           ((part, kwargs)
                            for part in Utils.partition(fns, n_jobs)))
    return ret
コード例 #10
0
def process_files(fns,
                  wav_dir,
                  vad_dir,
                  out_dir,
                  fea2ivec_obj,
                  max_size,
                  tolerance,
                  wav_suffix='.wav',
                  vad_suffix='.lab.gz',
                  n_jobs=1):
    """ Process all files from list.

    Args:
        fns (list): name of files to process
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str): output directory
        fea2ivec_obj (Fea2Ivec): input models for i-vector extraction
        max_size (int): maximal size of window in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files
        n_jobs (int): number of jobs to run in parallel

    Returns:
        result of ``_process_files`` when ``n_jobs == 1``; otherwise a list
        with one entry per partition.
        NOTE(review): with ``n_jobs > 1`` the result is nested, while
        callers appear to iterate i-vector sets directly -- confirm whether
        flattening (as the embedding variant does) is intended.
    """
    kwargs = dict(wav_dir=wav_dir,
                  vad_dir=vad_dir,
                  out_dir=out_dir,
                  fea2ivec_obj=fea2ivec_obj,
                  max_size=max_size,
                  wav_suffix=wav_suffix,
                  vad_suffix=vad_suffix,
                  tolerance=tolerance)
    if n_jobs == 1:
        ret = _process_files((fns, kwargs))
    else:
        # use the pool as a context manager so worker processes are
        # reaped even when a job raises (the original never closed it)
        with multiprocessing.Pool(n_jobs) as pool:
            ret = pool.map(_process_files,
                           ((part, kwargs)
                            for part in Utils.partition(fns, n_jobs)))
    return ret
コード例 #11
0
 def save(self, path):
     """ Pickle this object to *path*, creating parent directories.

     Args:
         path (str): destination file path
     """
     parent_dir = os.path.dirname(path)
     Utils.mkdir_p(parent_dir)
     with open(path, 'wb') as out_file:
         pickle.dump(self, out_file, pickle.HIGHEST_PROTOCOL)
コード例 #12
0
                        type=int,
                        required=False)
    # default CLI values, used when the options are not supplied
    parser.set_defaults(num_cores=1)
    parser.set_defaults(max_num_speakers=10)
    parser.set_defaults(wav_suffix='wav')
    parser.set_defaults(vad_suffix='lab.gz')
    parser.set_defaults(rttm_suffix='rttm')
    parser.set_defaults(min_window_size=2500)
    parser.set_defaults(max_window_size=3000)
    parser.set_defaults(vad_tolerance=0)
    args = parser.parse_args()

    # presumably caps MKL at a single thread so parallelism stays at the
    # process level -- confirm against set_mkl's implementation
    set_mkl(1)

    # initialize extractor
    config = Utils.read_config(args.configuration)
    fea2ivec = Fea2Ivec(config['GMM']['model_path'],
                        config['Extractor']['model_path'])
    files = [line.rstrip('\n') for line in open(args.input_list)]

    # extract i-vectors
    if args.in_ivec_dir is None:
        ivec = process_files(files, args.audio_dir, args.vad_dir,
                             args.in_ivec_dir, fea2ivec, args.max_window_size,
                             args.vad_tolerance, args.wav_suffix,
                             args.vad_suffix, args.num_threads)
        if args.out_ivec_dir:
            for ivecset in ivec:
                ivecset.save('{}.{}'.format(
                    os.path.join(args.out_ivec_dir, ivecset.name), 'pkl'))
    else: