Example 1
    def save(self, path):
        """ Save embedding set as pickled file.

        Args:
            path (string_types): output path
        """
        mkdir_p(os.path.dirname(path))
        with open(path, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
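A save like this is typically paired with a loader. The counterpart below is a hypothetical sketch (the original class is not shown here); it only illustrates the pickle round-trip:

import pickle

def load(path):
    """ Load an embedding set saved by the `save` method above (hypothetical counterpart). """
    with open(path, 'rb') as f:
        return pickle.load(f)

# round trip (hypothetical path):
# embedding_set.save('out/embeddings/file1.pkl')
# embedding_set = load('out/embeddings/file1.pkl')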
Example 2
def install_scripts(directory):
    """ Call cmd commands to install extra software/repositories.

    Args:
        directory (str): path

    """
    if KALDI_ROOT_PATH is None or not os.path.isdir(KALDI_ROOT_PATH):
        raise ValueError('Please set the path to a correct Kaldi installation.')
    nnet_copy_binary = os.path.join(KALDI_ROOT_PATH, 'src', 'nnet3bin',
                                    'nnet3-copy')
    if not os.path.isfile(nnet_copy_binary):
        raise ValueError('nnet3-copy binary not found in `{}`.'.format(
            os.path.dirname(nnet_copy_binary)))
    copy_matrix_binary = os.path.join(KALDI_ROOT_PATH, 'src', 'bin',
                                      'copy-matrix')
    if not os.path.isfile(copy_matrix_binary):
        raise ValueError('copy-matrix binary not found in `{}`.'.format(
            os.path.dirname(copy_matrix_binary)))
    mkdir_p(XVEC_MODELS_DIR)
    with tempfile.NamedTemporaryFile() as f:
        urllib.request.urlretrieve(
            'http://kaldi-asr.org/models/0003_sre16_v2_1a.tar.gz', f.name)
        tar = tarfile.open(f.name, 'r:gz')
        tar.extractall(XVEC_MODELS_DIR)
        tar.close()

    # replace input of the last layer, so we can easily extract xvectors
    nnet_raw_path = os.path.join(XVEC_MODELS_DIR, '0003_sre16_v2_1a', 'exp',
                                 'xvector_nnet_1a', 'final.raw')
    old_line = 'output-node name=output input=output.log-softmax objective=linear'
    new_line = 'output-node name=output input=tdnn6.affine objective=linear'
    check_call([
        'sed', '-i', '-e', 's@{}@{}@g'.format(old_line, new_line),
        nnet_raw_path
    ])

    # convert LDA matrix to text format
    lda_path = os.path.join(os.path.dirname(nnet_raw_path), '..',
                            'xvectors_sre_combined', 'transform.mat')
    check_call([
        copy_matrix_binary, '--binary=false', lda_path,
        lda_path.replace('.mat', '.txt')
    ])
Example 3
    def extract_embeddings(self):
        """ Extract normalization embeddings using averaging.

        Returns:
            np.array: averaged embedding for each individual speaker
        """
        speakers_dict, fns = {}, []
        with open(self.norm_list) as f:
            for line in f:
                if len(line.split()) > 1:  # number of speakers is defined
                    line = line.split()[0]
                else:
                    line = line.replace(os.linesep, '')
                fns.append(line)

        speakers_dict = process_files(
            fns,
            speakers_dict=speakers_dict,
            features_extractor=self.features_extractor,
            embedding_extractor=self.embedding_extractor,
            audio_dir=self.audio_dir,
            wav_suffix=self.wav_suffix,
            in_rttm_dir=self.in_rttm_dir,
            rttm_suffix=self.rttm_suffix,
            min_length=self.min_length,
            n_jobs=self.n_jobs)
        assert len(speakers_dict) == len(fns)
        # every entry holds the same merged dict, so take the first one
        merged_speakers_dict = speakers_dict[0]

        if self.out_emb_dir:
            for speaker in merged_speakers_dict:
                out_path = os.path.join(self.out_emb_dir, f'{speaker}.pkl')
                mkdir_p(os.path.dirname(out_path))
                with open(out_path, 'wb') as f:
                    pickle.dump(merged_speakers_dict[speaker], f,
                                pickle.HIGHEST_PROTOCOL)

        for speaker in merged_speakers_dict:
            merged_speakers_dict[speaker] = np.mean(
                merged_speakers_dict[speaker], axis=0)

        return np.array(list(merged_speakers_dict.values()))
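For reference, the averaging step collapses each speaker's stack of segment embeddings into one vector. A self-contained sketch with made-up data, also computing the global mean that the original docstring mentioned:

import numpy as np

# hypothetical input: a (num_segments x dim) embedding matrix per speaker
speakers = {'spk1': np.random.randn(3, 4), 'spk2': np.random.randn(5, 4)}

speaker_means = {spk: emb.mean(axis=0) for spk, emb in speakers.items()}
global_mean = np.mean(np.stack(list(speaker_means.values())), axis=0)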
Example 4
    def dump_rttm(self, scores, out_dir):
        """ Dump rttm files to output directory. This function requires initialized embeddings.

        Args:
            scores (Dict): dictionary containing scores
            out_dir (string_types): path to output directory
        """
        for embedding_set in self.embeddings:
            if len(embedding_set) > 0:
                name = embedding_set.name
                reg_name = re.sub('/.*', '', embedding_set.name)
                mkdir_p(os.path.join(out_dir, os.path.dirname(name)))
                with open(os.path.join(out_dir, name + '.rttm'), 'w') as f:
                    for i, ivec in enumerate(embedding_set.embeddings):
                        start, end = ivec.window_start, ivec.window_end
                        idx = np.argmax(scores[name].T[i])
                        f.write('SPEAKER {} 1 {} {} <NA> <NA> {}_spkr_{} <NA>\n'.format(
                            reg_name, start / 1000.0, (end - start) / 1000.0, reg_name, idx))
            else:
                logger.warning('No embedding to dump in {}.'.format(embedding_set.name))
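Each line written above follows the nine-field RTTM layout SPEAKER <file> <channel> <onset> <duration> <NA> <NA> <speaker> <NA>. A minimal reader for files produced this way (an illustrative sketch, not part of the project):

def read_rttm(path):
    """ Parse RTTM lines written by dump_rttm into (onset, duration, speaker) tuples. """
    segments = []
    with open(path) as f:
        for line in f:
            fields = line.split()
            # fields: SPEAKER <file> <chan> <onset> <dur> <NA> <NA> <speaker> <NA>
            segments.append((float(fields[3]), float(fields[4]), fields[7]))
    return segments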
Example 5
def process_file(wav_dir,
                 vad_dir,
                 out_dir,
                 file_name,
                 features_extractor,
                 embedding_extractor,
                 min_size,
                 max_size,
                 overlap,
                 tolerance,
                 wav_suffix='.wav',
                 vad_suffix='.lab.gz'):
    """ Process single audio file.

    Args:
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str): output directory
        file_name (str): name of the file
        features_extractor (Any): initialized object for feature extraction
        embedding_extractor (Any): initialized object for embedding extraction
        min_size (int): minimal size of window in ms
        max_size (int): maximal size of window in ms
        overlap (int): size of window overlap in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files

    Returns:
        EmbeddingSet
    """
    logger.info('Processing file {}.'.format(file_name.split()[0]))
    num_speakers = None
    if len(file_name.split()) > 1:  # number of speakers is defined
        file_name, num_speakers = file_name.split()[0], int(
            file_name.split()[1])

    wav_dir, vad_dir = os.path.abspath(wav_dir), os.path.abspath(vad_dir)
    if out_dir:
        out_dir = os.path.abspath(out_dir)

    # extract features
    features = features_extractor.audio2features(
        os.path.join(wav_dir, f'{file_name}{wav_suffix}'))

    # load voice activity detection from file
    vad, _, _ = get_vad(f'{os.path.join(vad_dir, file_name)}{vad_suffix}',
                        features.shape[0])

    # parse segments and split features
    features_dict = {}
    for seg in get_segments(vad, max_size, tolerance):
        seg_start, seg_end = seg
        start, end = get_time_from_frames(seg_start), get_time_from_frames(
            seg_end)
        if start >= overlap:
            seg_start = get_frames_from_time(start - overlap)
        if seg_start > features.shape[0] - 1 or seg_end > features.shape[0] - 1:
            logger.warning(
                f'Frames not aligned: {features.shape[0]} feature frames, but segment ends at frame {seg_end}.'
            )
            seg_end = features.shape[0]
        features_dict[(start, end)] = features[seg_start:seg_end]

    # extract embedding for each segment
    embedding_set = extract_embeddings(features_dict, embedding_extractor)
    embedding_set.name = file_name
    embedding_set.num_speakers = num_speakers

    # save embeddings if required
    if out_dir is not None:
        mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        embedding_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))

    return embedding_set
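The helpers get_time_from_frames and get_frames_from_time are not shown in this example. Assuming the usual 10 ms frame shift of Kaldi-style features (an assumption, not confirmed by the source), they likely reduce to:

FRAME_SHIFT_MS = 10  # assumed frame shift of the feature extractor

def get_time_from_frames(n_frames):
    """ Convert a frame count to milliseconds (assumes 10 ms frames). """
    return n_frames * FRAME_SHIFT_MS

def get_frames_from_time(ms):
    """ Convert milliseconds to a frame count (assumes 10 ms frames). """
    return int(ms / FRAME_SHIFT_MS)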
Example 6
    else:
        mean = norm.mean

    # run diarization
    diar = Diarization(args.input_list,
                       embeddings,
                       embeddings_mean=mean,
                       lda=lda,
                       use_l2_norm=use_l2_norm,
                       plda=plda,
                       norm=norm)
    result = diar.score_embeddings(args.min_window_size, args.max_num_speakers,
                                   args.mode)

    if args.mode == 'diarization':
        if args.in_rttm_dir:
            diar.evaluate(scores=result,
                          in_rttm_dir=args.in_rttm_dir,
                          collar_size=0.25,
                          evaluate_overlaps=False)

        if args.out_rttm_dir is not None:
            diar.dump_rttm(result, args.out_rttm_dir)
    else:
        if args.out_clusters_dir:
            for name in result:
                mkdir_p(
                    os.path.join(args.out_clusters_dir, os.path.dirname(name)))
                np.save(os.path.join(args.out_clusters_dir, name),
                        result[name])
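In the non-diarization branch the per-file results are written with np.save, which appends a .npy suffix to the given name. Reloading them later looks like this (load_clusters is a hypothetical helper):

import os
import numpy as np

def load_clusters(out_clusters_dir, name):
    """ Reload a result saved by the snippet above; np.save added the .npy suffix. """
    return np.load(os.path.join(out_clusters_dir, name) + '.npy')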
Example 7
def process_file(wav_dir,
                 vad_dir,
                 out_dir,
                 file_name,
                 features_extractor,
                 embedding_extractor,
                 max_size,
                 tolerance,
                 wav_suffix='.wav',
                 vad_suffix='.lab.gz'):
    """ Process single audio file.

    Args:
        wav_dir (str): directory with wav files
        vad_dir (str): directory with vad files
        out_dir (str): output directory
        file_name (str): name of the file
        features_extractor (Any): initialized object for feature extraction
        embedding_extractor (Any): initialized object for embedding extraction
        max_size (int): maximal size of window in ms
        tolerance (int): accept given number of frames as speech even when it is marked as silence
        wav_suffix (str): suffix of wav files
        vad_suffix (str): suffix of vad files

    Returns:
        EmbeddingSet
    """
    logger.info('Processing file {}.'.format(file_name.split()[0]))
    num_speakers = None
    if len(file_name.split()) > 1:  # number of speakers is defined
        file_name, num_speakers = file_name.split()[0], int(
            file_name.split()[1])

    wav_dir, vad_dir = os.path.abspath(wav_dir), os.path.abspath(vad_dir)
    if out_dir:
        out_dir = os.path.abspath(out_dir)

    # extract features
    _, features = features_extractor.audio2features(
        os.path.join(wav_dir, '{}{}'.format(file_name, wav_suffix)))

    # load voice activity detection from file
    vad, _, _ = get_vad(
        '{}{}'.format(os.path.join(vad_dir, file_name), vad_suffix),
        features.shape[0])

    # parse segments and split features
    features_dict = {}
    for seg in get_segments(vad, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        if seg[0] > features.shape[0] - 1 or seg[1] > features.shape[0] - 1:
            raise ValueError(
                'Unexpected features dimensionality - check VAD input or audio.'
            )
        features_dict['{}_{}'.format(start, end)] = features[seg[0]:seg[1]]

    # extract embedding for each segment
    embedding_set = extract_embeddings(features_dict, embedding_extractor)
    embedding_set.name = file_name
    embedding_set.num_speakers = num_speakers

    # save embeddings if required
    if out_dir is not None:
        mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        embedding_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))

    return embedding_set
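The get_vad helper used in these examples is not shown. A plausible sketch, assuming a gzipped .lab file with `<start_sec> <end_sec> <label>` lines and the 10 ms frame shift assumed earlier; this is illustrative only, not the project's actual implementation:

import gzip

import numpy as np

def get_vad_sketch(path, n_frames, frame_shift_ms=10):
    """ Build a boolean per-frame speech mask from a gzipped .lab segment file. """
    mask = np.zeros(n_frames, dtype=bool)
    with gzip.open(path, 'rt') as f:
        for line in f:
            start_s, end_s, label = line.split()
            if label in ('speech', 'sp'):  # assumed speech labels
                start = int(float(start_s) * 1000 / frame_shift_ms)
                end = min(int(float(end_s) * 1000 / frame_shift_ms), n_frames)
                mask[start:end] = True
    return mask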