Esempio n. 1
0
def process(wav_dir, id_list, out_dir, calculate_normalisation,
            normalisation_of_deltas):
    """Extracts log-F0 and voiced/unvoiced flags for every wav file and saves them under `out_dir`.

    For each file id, `<wav_dir>/<file_id>.wav` is loaded and analysed, and the results are written to
    `<out_dir>/lf0/<file_id>.npy` and `<out_dir>/vuv/<file_id>.npy`.

    Args:
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process (resolved through `get_file_ids`).
        out_dir (str): Directory to save the output to.
        calculate_normalisation (bool): If True, `process_mvn` is run on the 'lf0' features afterwards.
        normalisation_of_deltas (bool): Forwarded to `process_mvn` as `deltas`.
    """
    file_ids = get_file_ids(wav_dir, id_list)

    lf0_dir = os.path.join(out_dir, 'lf0')
    vuv_dir = os.path.join(out_dir, 'vuv')

    # file_ids may contain sub-directories, so the output tree is created before anything is written.
    for feat_dir in (lf0_dir, vuv_dir):
        make_dirs(feat_dir, file_ids)

    for file_id in file_ids:
        wav, sample_rate = file_io.load_wav(
            os.path.join(wav_dir, f'{file_id}.wav'))

        f0, vuv = analysis(wav, sample_rate)
        log_f0 = np.log(f0)

        file_io.save_bin(log_f0, os.path.join(lf0_dir, f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(vuv_dir, f'{file_id}.npy'))

    if not calculate_normalisation:
        return

    process_mvn(out_dir, 'lf0', id_list=id_list,
                deltas=normalisation_of_deltas)
Esempio n. 2
0
def process_dir(festival_dir, txt_dir, id_list, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `id_list` and save them to `out_dir`.

    The first line of `<txt_dir>/<file_id>.txt` is used as the sentence for each file id. Double quotes in the
    sentence are backslash-escaped before it is handed to `create_utterances`.

    Args:
        festival_dir (str): Directory containing festival installation.
        txt_dir (str): Directory containing text transcriptions.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        custom_voice (str): Forwarded unchanged to `create_utterances`; None by default.
    """
    # Sort once and use the same order everywhere, so that the i-th sentence always belongs to the i-th file id
    # that is handed to `create_utterances` (previously only the sentences were loaded in sorted order).
    file_ids = sorted(utils.get_file_ids(id_list=id_list))

    sentences = []

    # For all file_ids load the sentence and escape it so it can be embedded in a quoted string.
    for file_id in file_ids:
        sentence = file_io.load_lines(os.path.join(txt_dir,
                                                   f'{file_id}.txt'))[0]
        sentences.append(sentence.replace('"', '\\"'))

    # If the file_ids are paths (e.g. for multi-speaker data), make sure the directory structure is already in place.
    utils.make_dirs(os.path.join(out_dir, 'utts'), file_ids)

    # Create and save the Utterance structures.
    create_utterances(festival_dir,
                      file_ids,
                      sentences,
                      out_dir,
                      custom_voice=custom_voice)
def process(data_dir, feat_name, id_list=None, is_npy=True, out_dir=None):
    """Computes min-max normalisation statistics for one feature and optionally saves them as JSON.

    Features are read from `<data_dir>/<feat_name>` and passed to `calculate_minmax_parameters`.

    Args:
        data_dir (str): Root directory containing folders of features.
        feat_name (str): Name of the feature to be normalised.
        id_list (str): List of file names to process.
        is_npy (bool): If True files are loaded with `file_io.load_bin` (extension 'npy'), otherwise with
            `file_io.load_txt` (extension 'txt').
        out_dir (str): Location to save the normalisation parameters to. Nothing is saved when None.
    """
    feat_dir = os.path.join(data_dir, feat_name)
    file_ids = utils.get_file_ids(id_list=id_list)

    # The loader and the file extension always go together.
    if is_npy:
        load_fn, feat_ext = file_io.load_bin, 'npy'
    else:
        load_fn, feat_ext = file_io.load_txt, 'txt'

    feature_list = file_io.load_dir(load_fn,
                                    feat_dir,
                                    file_ids,
                                    feat_ext=feat_ext)

    minmax_params = calculate_minmax_parameters(feature_list)

    if out_dir is None:
        return

    file_io.save_json(minmax_params,
                      os.path.join(out_dir, f'{feat_name}_minmax.json'))
Esempio n. 4
0
def process(lab_dir, id_list, out_dir, state_level):
    """Saves the phone sequence and the phone count of every label file in id_list as text files.

    Output is written to `<out_dir>/phones/<file_id>.txt` and `<out_dir>/n_phones/<file_id>.txt`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    n_phones_dir = os.path.join(out_dir, 'n_phones')

    for feat_dir in (phones_dir, n_phones_dir):
        utils.make_dirs(feat_dir, file_ids)

    for file_id in file_ids:
        # Label processing.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'),
                                  state_level)

        phone_list = label.phones
        phone_count = len(label.phones)

        out_name = f'{file_id}.txt'
        file_io.save_lines(phone_list, os.path.join(phones_dir, out_name))
        file_io.save_txt(phone_count, os.path.join(n_phones_dir, out_name))
Esempio n. 5
0
def process(festival_dir,
            utt_dir,
            id_list,
            out_dir,
            feature_level='Segment',
            extra_feats_scm='extra_feats.scm',
            label_feats='label.feats',
            label_full_awk='label-full.awk',
            label_mono_awk='label-mono.awk',
            custom_voice=None):
    """Create flat HTS-style full-context labels.

    Runs three stages: `utts_to_dumps` (Utterance structures to feature dumps), `dumps_to_labs` (dumps to
    full-context and mono labels), and `sanitise_labs` (labels without timestamps).

    Args:
        festival_dir (str): Directory containing festival installation.
        utt_dir (str): Directory containing Utterance structures.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        feature_level (str): Forwarded to `utts_to_dumps`.
        extra_feats_scm (str): Forwarded to `utts_to_dumps`.
        label_feats (str): Forwarded to `utts_to_dumps`.
        label_full_awk (str): Forwarded to `dumps_to_labs` when creating the full-context labels.
        label_mono_awk (str): Forwarded to `dumps_to_labs` when creating the mono labels.
        custom_voice (str): Forwarded to `utts_to_dumps`.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    dumpfeats_exe = os.path.join(festival_dir, 'examples', 'dumpfeats')

    # All time-aligned intermediate output lives below a common directory.
    phone_align_dir = os.path.join(out_dir, 'label_phone_align')
    label_dump_dir = os.path.join(phone_align_dir, 'dump')
    label_full_dir = os.path.join(phone_align_dir, 'full')
    label_mono_dir = os.path.join(phone_align_dir, 'mono')

    label_no_align_dir = os.path.join(out_dir, 'label_no_align')
    mono_no_align_dir = os.path.join(out_dir, 'mono_no_align')

    # Create the flattened features and format them according to `label_full_awk` and `label_mono_awk`.
    utts_to_dumps(dumpfeats_exe, utt_dir, file_ids, label_dump_dir,
                  feature_level, extra_feats_scm, label_feats, custom_voice)

    awk_targets = (
        (label_full_dir, label_full_awk),
        (label_mono_dir, label_mono_awk),
    )
    for lab_out_dir, awk_script in awk_targets:
        dumps_to_labs(label_dump_dir, file_ids, lab_out_dir, awk_script)

    # Clean up the label features: replace initial pauses with 'sil' and remove timestamps.
    shared_options = dict(include_times=False, state_level=False)
    sanitise_labs(label_full_dir, file_ids, label_no_align_dir,
                  **shared_options)
    sanitise_labs(label_mono_dir, file_ids, mono_no_align_dir,
                  is_mono=True, **shared_options)
Esempio n. 6
0
    def __init__(self, htk_dir, lab_dir, wav_dir, id_list, out_dir):
        """Stores the HTK executable paths, resolves the file ids and prepares the output layout.

        Args:
            htk_dir (str): HTK directory; executables are looked up in `<htk_dir>/bin`.
            lab_dir (str): Directory of label files (stored as `self.lab_dir`).
            wav_dir (str): Directory of wav files (stored as `self.wav_dir`).
            id_list (str): List of file basenames, resolved through `get_file_ids`.
            out_dir (str): Directory below which all configuration, model and feature files are placed.
        """
        # HTK executables, each stored as an attribute named after the tool.
        htk_bin = os.path.join(htk_dir, 'bin')
        for tool in ('HCompV', 'HCopy', 'HERest', 'HHEd', 'HVite'):
            setattr(self, tool, os.path.join(htk_bin, tool))

        self.wav_dir = wav_dir
        self.lab_dir = lab_dir

        self.file_ids = self.check_file_ids(get_file_ids(id_list=id_list))

        print('---preparing environment')

        # Directories
        # -----------

        self.cfg_dir = os.path.join(out_dir, 'config')
        self.model_dir = os.path.join(out_dir, 'model')
        self.cur_dir = os.path.join(self.model_dir, 'hmm0')
        self.mfc_dir = os.path.join(out_dir, 'mfc')
        self.mono_lab_dir = os.path.join(out_dir, 'mono_no_align')

        # `model_dir` itself is created implicitly as the parent of `cur_dir`.
        for directory in (self.cfg_dir, self.cur_dir, self.mfc_dir,
                          self.mono_lab_dir):
            os.makedirs(directory, exist_ok=True)

        # Paths
        # -----

        self.phonemes = os.path.join(out_dir, 'mono_phone.list')
        self.phoneme_map = os.path.join(out_dir, 'phoneme_map.dict')
        self.align_mlf = os.path.join(out_dir, 'mono_align.mlf')

        # Everything below lives in the configuration directory.
        cfg_dir = self.cfg_dir

        # HMMs
        self.proto = os.path.join(cfg_dir, 'proto')

        # SCP files
        self.copy_scp = os.path.join(cfg_dir, 'copy.scp')
        self.train_scp = os.path.join(cfg_dir, 'train.scp')
        self.phoneme_mlf = os.path.join(cfg_dir, 'mono_phone.mlf')

        # CFG
        self.cfg = os.path.join(cfg_dir, 'cfg')
Esempio n. 7
0
def process(embeddings_dir, n_clusters, id_list, out_dir):
    """Loads the embedding of every file in id_list and passes them to `cluster`.

    Args:
        embeddings_dir (str): Directory containing the embedding files (loaded as '.npy' files).
        n_clusters (int): Number of clusters for k-means.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
    """
    file_ids = get_file_ids(id_list=id_list)

    # Load the embeddings and stack them into one array, one entry per file id.
    loaded_embeddings = file_io.load_dir(
        file_io.load_bin, embeddings_dir, file_ids, feat_ext='npy')
    embeddings = np.array(list(loaded_embeddings))

    cluster(embeddings, n_clusters, names=file_ids, out_dir=out_dir)
Esempio n. 8
0
def process(data_dir,
            feat_name,
            id_list=None,
            is_npy=True,
            deltas=False,
            out_dir=None):
    """Computes mean-variance normalisation statistics for one feature and optionally saves them as JSON.

    Features are read from `<data_dir>/<feat_name>` and passed to `calculate_mvn_parameters`.

    Args:
        data_dir (str): Root directory containing folders of features.
        feat_name (str): Name of the feature to be normalised.
        id_list (str): List of file names to process.
        is_npy (bool): If True files are loaded with `file_io.load_bin` (extension 'npy'), otherwise with
            `file_io.load_txt` (extension 'txt').
        deltas (bool): Also calculate the MVN parameters for the delta and delta-delta features.
        out_dir (str): Location to save the normalisation parameters to. Nothing is saved when None.
    """
    feat_dir = os.path.join(data_dir, feat_name)
    file_ids = utils.get_file_ids(id_list=id_list)

    # The loader and the file extension always go together.
    if is_npy:
        load_fn, feat_ext = file_io.load_bin, 'npy'
    else:
        load_fn, feat_ext = file_io.load_txt, 'txt'

    feature_list = file_io.load_dir(load_fn,
                                    feat_dir,
                                    file_ids,
                                    feat_ext=feat_ext)

    mvn_params, delta_mvn_params = calculate_mvn_parameters(
        feature_list, deltas)

    # Saving is optional.
    if out_dir is None:
        return

    file_io.save_json(mvn_params,
                      os.path.join(out_dir, f'{feat_name}_mvn.json'))

    if deltas:
        file_io.save_json(
            delta_mvn_params,
            os.path.join(out_dir, f'{feat_name}_deltas_mvn.json'))
Esempio n. 9
0
    def load_params(self, data_dir, data_root='.', device='cpu'):
        r"""Loads the parameters of every speaker and stores a raw and a converted copy per speaker.

        For each speaker the file `<data_root>/<data_dir>/<file_pattern>` is read with `self._from_json` and the
        result is converted with `self._to_torch`. If `self.use_deltas` is set, the same is done for the file
        whose name is formatted with `self.name + '_deltas'`.

        Parameters
        ----------
        data_dir : str
            Directory containing all data for this dataset split.
        data_root : str
            Directory root for this dataset.
        device : str or torch.device
            Name of the device to place the parameters on.
        """
        def params_path(speaker_id, name):
            # `file_pattern` is formatted with the feature name and the speaker id.
            file_name = self.file_pattern.format(name=name,
                                                 speaker_id=speaker_id)
            return os.path.join(data_root, data_dir, file_name)

        # Speaker ids are only read from file if they have not been provided already.
        if self.speaker_ids is None:
            speaker_id_list_path = os.path.join(data_root,
                                                self.speaker_id_list)
            self.speaker_ids = get_file_ids(id_list=speaker_id_list_path)

        for speaker_id in self.speaker_ids:
            self.params[speaker_id] = self._from_json(
                params_path(speaker_id, self.name))
            self.params_torch[speaker_id] = self._to_torch(
                self.params[speaker_id], device=device)

            if not self.use_deltas:
                continue

            self.delta_params[speaker_id] = self._from_json(
                params_path(speaker_id, self.name + '_deltas'))
            self.delta_params_torch[speaker_id] = self._to_torch(
                self.delta_params[speaker_id], device=device)
Esempio n. 10
0
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file,
            upsample, subphone_feat_type, trim_silences,
            calculate_normalisation, normalisation_of_deltas):
    """Extracts linguistic (label) and acoustic (vocoder) features for every file in id_list and saves them.

    For each file the numerical labels, counter features, durations, phones, frame/phone counts, log-F0,
    voiced/unvoiced flags, mel-cepstrum and band aperiodicity are written to sub-directories of `out_dir`.
    Durations are cropped when the labels are a few frames longer than the vocoder features, and all
    frame-level features are cropped to the same frame range.

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to calculate min-max and MVN parameters after feature extraction.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.

    Raises:
        ValueError: If the labels have more excess frames (relative to the vocoder features) than there are phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    utils.make_dirs(os.path.join(out_dir, 'lab'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'counters'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'dur'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'phones'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'n_frames'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'n_phones'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'lf0'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'vuv'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'mcep'), file_ids)
    utils.make_dirs(os.path.join(out_dir, 'bap'), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]
            print(
                f'Cropped {diff} frames from durations for utterance {file_id}'
            )

        assert n_frames == np.sum(durations).item()

        trim_frame_slice = slice(0, n_frames)
        if trim_silences:

            start_phone_idx, end_phone_idx = 0, n_phones
            start_frame_idx, end_frame_idx = 0, n_frames
            # `durations` has one column, so each row is a size-1 array; `.item()` extracts the scalar
            # explicitly rather than relying on implicit (deprecated) array-to-int conversion.
            if phones[0] in ['sil', '#']:
                start_phone_idx += 1
                start_frame_idx += durations[0].item()
            if phones[-1] in ['sil', '#']:
                end_phone_idx -= 1
                end_frame_idx -= durations[-1].item()

            trim_phone_slice = slice(int(start_phone_idx), int(end_phone_idx))
            trim_frame_slice = slice(int(start_frame_idx), int(end_frame_idx))

            numerical_labels = numerical_labels[
                trim_frame_slice if upsample else trim_phone_slice]
            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start
        elif upsample:
            # Frame-level labels were extracted before the durations were cropped, so they must be cropped to
            # the same frame range as the other frame-level features even when silences are not trimmed.
            numerical_labels = numerical_labels[trim_frame_slice]

        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        file_io.save_bin(numerical_labels.astype(np.float32),
                         os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features.astype(np.float32),
                         os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones,
                           os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames,
                         os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones,
                         os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

        file_io.save_bin(lf0.astype(np.float32),
                         os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))
        file_io.save_bin(mcep.astype(np.float32),
                         os.path.join(out_dir, 'mcep', f'{file_id}.npy'))
        file_io.save_bin(bap.astype(np.float32),
                         os.path.join(out_dir, 'bap', f'{file_id}.npy'))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        process_mvn(out_dir,
                    'dur',
                    is_npy=False,
                    id_list=id_list,
                    deltas=False,
                    out_dir=out_dir)

        process_mvn(out_dir,
                    'lf0',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'mcep',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'bap',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
Esempio n. 11
0
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample,
            subphone_feat_type, calculate_normalisation):
    """Extracts numerical labels, counter features, durations and phones from the label files in id_list.

    Each feature is written to its own sub-directory of `out_dir` ('lab', 'counters', 'dur', 'phones',
    'n_frames' and 'n_phones').

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    # One output folder per feature.
    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames',
                      'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        label = Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        # Durations are saved as a single column.
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        npy_name = f'{file_id}.npy'
        txt_name = f'{file_id}.txt'

        file_io.save_bin(numerical_labels,
                         os.path.join(out_dir, 'lab', npy_name))
        file_io.save_bin(counter_features,
                         os.path.join(out_dir, 'counters', npy_name))
        # NOTE(review): durations use a '.dur' extension - confirm that `process_mvn(..., is_npy=False)`
        # below looks for files with this extension.
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.dur'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', txt_name))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', txt_name))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', txt_name))

    if not calculate_normalisation:
        return

    for feat_name in ('lab', 'counters'):
        process_minmax(out_dir, feat_name, id_list)

    process_mvn(out_dir, 'dur', is_npy=False, id_list=id_list, deltas=False)
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Splits every utterance in id_list into segments and saves the segment lengths to text files.

    Segment boundaries come from `segment_words`. Files for which `segment_words` raises a ValueError or
    IndexError are reported on stdout and skipped. For all other files the number of phones per segment, the
    number of frames per segment and the number of segments are saved to 'segment_n_phones',
    'segment_n_frames' and 'n_segments' below `out_dir`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing the label files that the POS tags are read from.
        wav_dir (str): Directory containing the wav files, used to match the frame count of the vocoder analysis.

    Raises:
        ValueError: If the labels have more excess frames (relative to the vocoder features) than there are phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    for feat_name in ('segment_n_phones', 'segment_n_frames', 'n_segments'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        lab_name = f'{file_id}.lab'

        # POS tags are taken from a separate set of labels, which use '@' as the index separator.
        pos_label_lines = file_io.load_lines(
            os.path.join(lab_dir_with_pos, lab_name))
        pos_word_start_idxs, _ = get_word_idxs(
            pos_label_lines, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(pos_label_lines, pos_word_start_idxs)

        label = lab_to_feat.Label(os.path.join(lab_dir, lab_name), state_level)

        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))

        try:
            segment_start_idxs, segment_end_idxs = segment_words(
                word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as e:
            # Report the failure and move on to the next file; nothing is saved for this one.
            print(f'{e}\n{file_id}')
            continue

        wav, sample_rate = file_io.load_wav(
            os.path.join(wav_dir, f'{file_id}.wav'))
        f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)
        n_vocoder_frames = f0.shape[0]

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - n_vocoder_frames
        if diff > n_phones:
            raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                             f'\tlabel frames {n_frames}\n'
                             f'\tvocoder frames {f0.shape[0]}\n'
                             f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = n_vocoder_frames
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        segment_bounds = list(zip(segment_start_idxs, segment_end_idxs))
        segment_phone_lens = [end - start for start, end in segment_bounds]
        segment_frame_lens = [sum(durations[start:end])
                              for start, end in segment_bounds]

        txt_name = f'{file_id}.txt'
        file_io.save_txt(segment_phone_lens, os.path.join(out_dir, 'segment_n_phones', txt_name))
        file_io.save_txt(segment_frame_lens, os.path.join(out_dir, 'segment_n_frames', txt_name))
        file_io.save_txt(len(segment_phone_lens), os.path.join(out_dir, 'n_segments', txt_name))