Ejemplo n.º 1
0
def process_dir(festival_dir, txt_dir, id_list, out_dir, custom_voice=None):
    """Create Utterance structures for all sentences in `id_list` and save them to `out_dir`.

    Args:
        festival_dir (str): Directory containing festival installation.
        txt_dir (str): Directory containing text transcriptions.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        custom_voice (str, optional): Name of the Festival voice to use. Passed through unchanged to
            `create_utterances`.
    """
    # Sort once and use the same ordering everywhere. `create_utterances` pairs `file_ids` with `sentences` by
    # position, so loading sentences in sorted order while passing on the unsorted ids would mismatch them.
    file_ids = sorted(utils.get_file_ids(id_list=id_list))

    sentences = []

    # For all file_ids load the sentence that will be synthesised.
    for file_id in file_ids:
        # Only the first line of each transcription file is used.
        sentence = file_io.load_lines(os.path.join(txt_dir,
                                                   f'{file_id}.txt'))[0]
        # Escape double quotes, as the sentence is later embedded in a double-quoted Scheme string.
        sentences.append(sentence.replace('"', '\\"'))

    # If the file_ids are paths (e.g. for multi-speaker data), make sure the directory structure is already in place.
    utils.make_dirs(os.path.join(out_dir, 'utts'), file_ids)

    # Create and save the Utterance structures.
    create_utterances(festival_dir,
                      file_ids,
                      sentences,
                      out_dir,
                      custom_voice=custom_voice)
Ejemplo n.º 2
0
def plot_repeated_batch_f0(features, *repeated_predictions, out_dir=None):
    """Plot the target F0 of every batch item together with one or more predicted F0 contours.

    Args:
        features (dict): Target features of the batch. The keys 'name', 'n_frames', 'lf0', and 'vuv' are read.
        *repeated_predictions (dict): Any number of prediction dictionaries, each containing an 'lf0' entry.
        out_dir (str, optional): If given, plots are written to `{out_dir}/plots/repeated_f0/{name}.pdf`. Otherwise
            `out_file=None` is passed to `plot_repeated_f0`.
    """
    plots_dir = None
    if out_dir is not None:
        plots_dir = os.path.join(out_dir, 'plots', 'repeated_f0')
        make_dirs(plots_dir, features['name'])

    n_frames = features['n_frames'].cpu().detach().numpy()

    # Features are stored as log-F0, exponentiate to get F0 before plotting.
    target_f0, target_vuv = utils.detach_batched_seqs(
        torch.exp(features['lf0']), features['vuv'], seq_len=n_frames)

    repeated_pred_f0 = utils.detach_batched_seqs(
        *(torch.exp(predicted['lf0']) for predicted in repeated_predictions),
        seq_len=n_frames)

    # NOTE(review): `utils.detach_batched_seqs` appears to return the bare per-item list (not a 1-element
    # collection) when given a single feature - confirm. Re-wrap so the loop below always iterates predictions.
    if len(repeated_predictions) == 1:
        repeated_pred_f0 = [repeated_pred_f0]

    for i, name in enumerate(features['name']):
        if plots_dir is None:
            out_file = None
        else:
            out_file = os.path.join(plots_dir, f'{name}.pdf')

        plot_repeated_f0(target_f0[i],
                         target_vuv[i],
                         *(pred_f0[i] for pred_f0 in repeated_pred_f0),
                         name=name,
                         out_file=out_file)
Ejemplo n.º 3
0
def process(wav_dir, id_list, out_dir, calculate_normalisation,
            normalisation_of_deltas):
    """Extract log-F0 and voicing flags from wav files, then optionally compute MVN parameters for log-F0.

    For every file id, `{out_dir}/lf0/{file_id}.npy` and `{out_dir}/vuv/{file_id}.npy` are written.

    Args:
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        calculate_normalisation (bool): Whether to automatically calculate MVN parameters after extracting F0.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features.
    """
    file_ids = get_file_ids(wav_dir, id_list)

    lf0_dir = os.path.join(out_dir, 'lf0')
    vuv_dir = os.path.join(out_dir, 'vuv')
    make_dirs(lf0_dir, file_ids)
    make_dirs(vuv_dir, file_ids)

    for file_id in file_ids:
        wav, sample_rate = file_io.load_wav(
            os.path.join(wav_dir, f'{file_id}.wav'))
        f0, vuv = analysis(wav, sample_rate)

        npy_name = f'{file_id}.npy'
        file_io.save_bin(np.log(f0), os.path.join(lf0_dir, npy_name))
        file_io.save_bin(vuv, os.path.join(vuv_dir, npy_name))

    if not calculate_normalisation:
        return

    process_mvn(out_dir,
                'lf0',
                id_list=id_list,
                deltas=normalisation_of_deltas)
Ejemplo n.º 4
0
def create_utterances(festival_dir,
                      file_ids,
                      sentences,
                      out_dir,
                      custom_voice=None):
    """Write a Festival Scheme script that synthesises and saves one Utterance per sentence, then run it.

    The script is saved to `{out_dir}/gen_utts.scm` and each Utterance to `{out_dir}/utts/{file_id}.utt`.

    Args:
        festival_dir (str): Directory containing festival installation, the executable is `bin/festival` within it.
        file_ids (list[str]): File basenames, paired by position with `sentences`.
        sentences (list[str]): Text to synthesise. Each sentence is embedded in a double-quoted Scheme string, so
            any double quotes must already be escaped by the caller.
        out_dir (str): Directory to save the output to.
        custom_voice (str, optional): If given, a `(voice_{custom_voice})` command is issued before synthesis.

    Raises:
        subprocess.CalledProcessError: If Festival exits with a non-zero return code.
    """
    # Materialise the ids, as they are iterated more than once below.
    file_ids = list(file_ids)

    festival_exe = os.path.join(festival_dir, 'bin', 'festival')
    utt_dir = os.path.join(out_dir, 'utts')

    # Create the directory structure before anything is written. If the file_ids are paths (e.g. for multi-speaker
    # data) this also puts the sub-directories in place. Presumably this creates `out_dir` itself as a parent.
    utils.make_dirs(utt_dir, file_ids)

    scm_commands = [f'#!{festival_exe}']

    if custom_voice is not None:
        # Run Festival with a particular voice.
        scm_commands.append(f'(voice_{custom_voice})')

    scm_command_str = '(utt.save (utt.synth (Utterance Text "{sentence}" )) "{utt_file}")'

    scm_commands.extend(
        scm_command_str.format(sentence=sentence,
                               utt_file=os.path.join(utt_dir, f'{file_id}.utt'))
        for file_id, sentence in zip(file_ids, sentences))

    # Save the commands.
    gen_utts_scm_file = os.path.join(out_dir, 'gen_utts.scm')
    file_io.save_lines(scm_commands, gen_utts_scm_file)

    # Run the commands.
    # Argument `check=True` ensures that an exception is raised if the process' return code is non-zero.
    subprocess.run([festival_exe, '-b', gen_utts_scm_file], check=True)
Ejemplo n.º 5
0
def process(lab_dir, id_list, out_dir, state_level):
    """Save the phone identities, and the number of phones, of each label file to text files.

    For every file id, `{out_dir}/phones/{file_id}.txt` and `{out_dir}/n_phones/{file_id}.txt` are written.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    phones_dir = os.path.join(out_dir, 'phones')
    n_phones_dir = os.path.join(out_dir, 'n_phones')
    for feat_dir in (phones_dir, n_phones_dir):
        utils.make_dirs(feat_dir, file_ids)

    for file_id in file_ids:
        # Label processing.
        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'),
                                  state_level)

        txt_name = f'{file_id}.txt'
        phone_identities = label.phones
        phone_count = len(label.phones)

        file_io.save_lines(phone_identities,
                           os.path.join(phones_dir, txt_name))
        file_io.save_txt(phone_count, os.path.join(n_phones_dir, txt_name))
Ejemplo n.º 6
0
def plot_batch_f0(features, predicted, use_vuv=None, out_dir=None):
    """Plot target and predicted F0 for every item in a batch.

    Args:
        features (dict): Target features of the batch. The keys 'name', 'n_frames', 'lf0', and 'vuv' are read.
        predicted (dict): Predicted features, must contain 'lf0' and may contain 'vuv'.
        use_vuv (bool, optional): Whether to pass the predicted voicing (thresholded at 0.5) to `plot_f0`.
            Defaults to whether 'vuv' is present in `predicted`.
        out_dir (str, optional): If given, plots are written to `{out_dir}/plots/f0/{name}.pdf`. Otherwise
            `out_file=None` is passed to `plot_f0`.
    """
    if use_vuv is None:
        use_vuv = 'vuv' in predicted

    save_plots = out_dir is not None
    if save_plots:
        plots_dir = os.path.join(out_dir, 'plots', 'f0')
        make_dirs(plots_dir, features['name'])

    n_frames = features['n_frames'].cpu().detach().numpy()

    # Features are stored as log-F0, exponentiate to get F0 before plotting.
    target_f0, target_vuv = utils.detach_batched_seqs(
        torch.exp(features['lf0']), features['vuv'], seq_len=n_frames)
    pred_f0 = utils.detach_batched_seqs(torch.exp(predicted['lf0']),
                                        seq_len=n_frames)

    if use_vuv:
        pred_vuv = utils.detach_batched_seqs(predicted['vuv'] > 0.5,
                                             seq_len=n_frames)
    else:
        # No predicted voicing, pass a placeholder for every item.
        pred_vuv = [None] * len(pred_f0)

    for idx, name in enumerate(features['name']):
        out_file = os.path.join(plots_dir,
                                f'{name}.pdf') if save_plots else None

        plot_f0(target_f0[idx],
                target_vuv[idx],
                pred_f0[idx],
                pred_vuv[idx],
                name=name,
                out_file=out_file)
Ejemplo n.º 7
0
def save_dir(save_fn, path, data, file_ids, feat_ext=None):
    """Save each item of `data` to its own file under `path`, using `save_fn`.

    Args:
        save_fn (callable): Called as `save_fn(datum, file_path)` for every item.
        path (str): Directory to save the files in.
        data (iterable): Items to save, paired by position with `file_ids`.
        file_ids (iterable of str): File names (relative to `path`) for the items.
        feat_ext (str, optional): If given, `.{feat_ext}` is appended to each file id.
    """
    utils.make_dirs(path, file_ids)

    for datum, file_id in zip(data, file_ids):
        file_name = file_id if feat_ext is None else f'{file_id}.{feat_ext}'
        save_fn(datum, os.path.join(path, file_name))
Ejemplo n.º 8
0
def cluster(embeddings, n_clusters, names=None, out_dir=None):
    """Clusters embeddings with k-means, optionally saving the centres, assignments, and cluster sizes to files.

    If `out_dir` is given, the following are saved under `{out_dir}/k_means`:
        clusters: One file per cluster centre, saved with `file_io.save_bin` using the names `cluster_{i}`.
        cluster_assignments: One file per embedding, saved with `file_io.save_txt` using `names`.
        cluster_assignments_counts.txt: Pairs of (cluster index, number of embeddings assigned to that cluster).

    Args:
        embeddings: Data to cluster, passed directly to `KMeans.fit`.
        n_clusters (int): Number of clusters for k-means.
        names (list[str]): Names of the individual embeddings, required if `out_dir` is given.
        out_dir (str): Directory to save the output to. If None, nothing is saved.

    Returns:
        tuple: The cluster centres (`cluster_centers_`) and the cluster assignments (`labels_`) of the fitted model.

    Raises:
        ValueError: If `out_dir` is given, but `names` is not.
    """
    if out_dir is not None:
        if names is None:
            raise ValueError(
                'If `out_dir` is given, then `names` of individual sentences must also be given'
            )

        centres_path = os.path.join(out_dir, 'k_means', 'clusters')
        make_dirs(centres_path, names)

        assignments_path = os.path.join(out_dir, 'k_means',
                                        'cluster_assignments')
        make_dirs(assignments_path, names)

    # Cluster with k-means.
    kmeans = KMeans(n_clusters=n_clusters).fit(embeddings)
    cluster_centres = kmeans.cluster_centers_
    cluster_assignments = kmeans.labels_

    # Save the cluster assignments and clusters to files.
    if out_dir is not None:
        cluster_names = [f'cluster_{i}' for i in range(n_clusters)]
        file_io.save_dir(file_io.save_bin,
                         centres_path,
                         cluster_centres,
                         cluster_names,
                         feat_ext='npy')
        file_io.save_dir(file_io.save_txt,
                         assignments_path,
                         cluster_assignments,
                         names,
                         feat_ext='txt')

        # Count the members of every cluster in a single pass. `minlength` keeps a row for empty clusters.
        cluster_sizes = np.bincount(cluster_assignments.reshape(-1),
                                    minlength=n_clusters)
        counts = np.column_stack((np.arange(n_clusters), cluster_sizes))
        file_io.save_txt(counts, f'{assignments_path}_counts.txt')

    return cluster_centres, cluster_assignments
Ejemplo n.º 9
0
    def _add_alignments_to_lab(self, mlf, lab_align_dir, lab_dir, file_ids):
        """Combine the state timings in an MLF file with un-aligned label files, saving state-aligned labels.

        The MLF is read sequentially: one header line, then for each file a name line, `STATES_PER_PHONE` timing
        lines for every line of the corresponding label file, and a closing '.' line.

        Args:
            mlf (str): Path of the MLF file containing the alignments.
            lab_align_dir (str): Directory to save the aligned label files to.
            lab_dir (str): Directory containing the label files without alignments.
            file_ids (list[str]): File basenames, which must be in the same order as the entries of the MLF.

        Raises:
            ValueError: If an MLF entry does not match the expected file id, or if an entry is not terminated by '.'
                where expected.
        """
        make_dirs(lab_align_dir, file_ids)

        with open(mlf, 'r') as mlf_file:
            # Skip the MLF #!header!# line.
            mlf_file.readline()

            for file_id in file_ids:
                # The next line names the file that the following alignments belong to.
                name_line = mlf_file.readline()

                mlf_base_name, _ = os.path.splitext(
                    os.path.basename(name_line))
                id_base_name = os.path.basename(file_id)

                if mlf_base_name != id_base_name:
                    raise ValueError(
                        f'The file order in the mlf ({mlf}) does not match file_ids)\n'
                        f'{mlf_base_name} {id_base_name}')

                lab_path = os.path.join(lab_dir, f'{file_id}.lab')

                label_state_align = []
                for raw_tag in file_io.load_lines(lab_path):
                    label_tag = raw_tag.strip()

                    for state_idx in range(STATES_PER_PHONE):
                        # Each state has its own line in the MLF, beginning with its start and end times.
                        start_time, end_time, *_unused = mlf_file.readline().split()

                        # States are numbered from 2 in the saved labels.
                        label_state_align.append(
                            f'{start_time} {end_time} {label_tag}[{state_idx + 2}]')

                file_io.save_lines(
                    label_state_align,
                    os.path.join(lab_align_dir, f'{file_id}.lab'))

                # Every entry of the MLF must be terminated by a line containing only '.'.
                if mlf_file.readline().strip() != '.':
                    raise ValueError('The two files are not matched!')
Ejemplo n.º 10
0
def sanitise_labs(lab_dir,
                  file_ids,
                  label_out_dir,
                  include_times=False,
                  state_level=False,
                  is_mono=False):
    """Sanitise the silences of label files and save them, optionally at state level and/or with timings.

    Args:
        lab_dir (str): Directory containing the label files, each line holding a start time, end time, and label.
        file_ids (list[str]): File basenames to process.
        label_out_dir (str): Directory to save the sanitised label files to.
        include_times (bool): Whether to prefix each saved label with its start and end time.
        state_level (bool): Whether to repeat each label `STATES_PER_PHONE` times with a `[state]` suffix. If times
            are included they are linearly interpolated between the phone boundaries.
        is_mono (bool): Passed through to `sanitise_silences`.
    """
    utils.make_dirs(label_out_dir, file_ids)

    for file_id in file_ids:
        lines = file_io.load_lines(os.path.join(lab_dir, f'{file_id}.lab'))
        n_phones = len(lines)

        # Transpose the whitespace-separated fields into three columns.
        start_times, end_times, label = (
            list(column) for column in zip(*map(str.split, lines)))
        start_times, end_times, label = sanitise_silences(start_times,
                                                          end_times,
                                                          label,
                                                          is_mono=is_mono)

        if state_level:
            if include_times:
                n_states = n_phones * STATES_PER_PHONE

                # Interpolate the phone boundaries to get evenly spaced state boundaries within each phone.
                state_boundaries = range(0, n_states + 1, 1)
                phone_boundaries = range(0, n_states + 1, STATES_PER_PHONE)
                times = np.interp(state_boundaries, phone_boundaries,
                                  start_times + end_times[-1:])

                start_times, end_times = times[:-1], times[1:]

            # Repeat every label once per state, and tag it with the state number (numbered from 2).
            repeated = np.repeat(label, STATES_PER_PHONE).tolist()
            label = [
                f'{tag}[{i % STATES_PER_PHONE + 2}]'
                for i, tag in enumerate(repeated)
            ]

        if include_times:
            start_times = list(map(_round_dur, start_times))
            end_times = list(map(_round_dur, end_times))

            label = [
                ' '.join(fields)
                for fields in zip(start_times, end_times, label)
            ]

        file_io.save_lines(label, os.path.join(label_out_dir,
                                               f'{file_id}.lab'))
Ejemplo n.º 11
0
def batch_synth(lf0,
                vuv,
                mcep,
                bap,
                seq_len=None,
                names=None,
                out_dir=None,
                sample_rate=16000):
    """Synthesise a waveform for every item in a batch of acoustic features, optionally saving them to file.

    Args:
        lf0: Batched log-F0 features.
        vuv: Batched voiced/unvoiced features.
        mcep: Batched mel-cepstral features.
        bap: Batched band aperiodicity features.
        seq_len: Sequence lengths, passed to `utils.detach_batched_seqs`.
        names (list[str], optional): Names of the individual sentences, required if `out_dir` is given. If given,
            one waveform is synthesised per name, otherwise one per item in the batch.
        out_dir (str, optional): If given, waveforms are saved to `{out_dir}/synth/{name}.wav`.
        sample_rate (int): Sample rate used for synthesis and for saving the wav files.

    Returns:
        list: The synthesised waveforms, as returned by `world.synthesis`.

    Raises:
        ValueError: If `out_dir` is given, but `names` is not.
    """
    if out_dir is not None:
        if names is None:
            raise ValueError(
                'If `out_dir` is given, then `names` of individual sentences must also be given'
            )

        synth_dir = os.path.join(out_dir, 'synth')
        make_dirs(synth_dir, names)

    lf0, vuv, mcep, bap = utils.detach_batched_seqs(lf0,
                                                    vuv,
                                                    mcep,
                                                    bap,
                                                    seq_len=seq_len)

    # `names` is optional when nothing is saved, in that case iterate over the items of the batch instead.
    # Assumes the detached `lf0` is a sequence with one entry per batch item - TODO confirm.
    item_names = names if names is not None else [None] * len(lf0)

    wavs = []
    for i, name in enumerate(item_names):
        f0_i = np.exp(lf0[i])
        # Smooth F0 with a 7-frame window, sequences shorter than the window are left unfiltered.
        if len(f0_i) >= 7:
            f0_i = savgol_filter(f0_i, 7, 1)

        wav = world.synthesis(f0_i,
                              vuv[i],
                              mcep[i],
                              bap[i],
                              sample_rate=sample_rate)
        wavs.append(wav)

        if out_dir is not None:
            wav_path = os.path.join(synth_dir, f'{name}.wav')
            file_io.save_wav(wav, wav_path, sample_rate=sample_rate)

    return wavs
Ejemplo n.º 12
0
def dumps_to_labs(dump_dir, file_ids, label_out_dir, awk='label-full.awk'):
    """Convert feature dumps to label files by running a gawk script on each dump.

    Args:
        dump_dir (str): Directory containing the dumps, one `{file_id}.txt` per file id.
        file_ids (list[str]): File basenames to process.
        label_out_dir (str): Directory to save the `{file_id}.lab` files to.
        awk (str): The gawk script to run. If it names a file in the `resources/festival` directory of the
            `tts_data_tools` package then that packaged file is used.

    Raises:
        subprocess.CalledProcessError: If gawk exits with a non-zero return code.
    """
    resource_dir = os.path.join('resources', 'festival')

    if awk in pkg_resources.resource_listdir('tts_data_tools', resource_dir):
        print(
            f'Using tts_data_tools resource from resources/festival for {awk}')
        awk = pkg_resources.resource_filename(
            'tts_data_tools', os.path.join(resource_dir, awk))

    utils.make_dirs(label_out_dir, file_ids)

    for file_id in file_ids:
        dump_path = os.path.join(dump_dir, f'{file_id}.txt')
        lab_path = os.path.join(label_out_dir, f'{file_id}.lab')

        # Capture gawk's output, `check=True` raises an exception if its return code is non-zero.
        completed = subprocess.run(['gawk', '-f', awk, dump_path],
                                   stdout=subprocess.PIPE,
                                   check=True)

        # The captured output is a binary string, so write it in binary mode.
        with open(lab_path, 'wb') as lab_file:
            lab_file.write(completed.stdout)
Ejemplo n.º 13
0
    def save_files(self, data, base_names, data_dir):
        """Save every item of `data` under `data_dir` using `self.save_file`.

        Args:
            data (iterable): Items to save, paired by position with `base_names`.
            base_names (iterable of str): Base names of the files to save.
            data_dir (str): Directory to save the files in.
        """
        # If the base names are paths, this puts the directory structure in place first.
        utils.make_dirs(data_dir, base_names)

        for item, name in zip(data, base_names):
            self.save_file(item, name, data_dir)
def process(lab_dir, id_list, out_dir, state_level, lab_dir_with_pos, wav_dir):
    """Splits each utterance into segments of words, saves the length of every segment in phones and in frames.

    For every file id that can be segmented, `segment_n_phones`, `segment_n_frames`, and `n_segments` text files
    are saved under `out_dir`. Files for which `segment_words` fails are reported with `print` and skipped.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        lab_dir_with_pos (str): Directory containing label files from which the part-of-speech tags are read.
        wav_dir (str): Directory containing the wav files, used to get the number of vocoder frames.

    Raises:
        ValueError: If the label has more excess frames, relative to the vocoder features, than it has phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    for feat_name in ('segment_n_phones', 'segment_n_frames', 'n_segments'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        # Part-of-speech tags come from the separate set of label files.
        label_with_pos = file_io.load_lines(
            os.path.join(lab_dir_with_pos, f'{file_id}.lab'))
        pos_word_start_idxs, _ = get_word_idxs(
            label_with_pos, word_idx_sep=(r'@', r'\+'), phrase_idx_sep=(r'@', r'='))
        pos_tags = get_pos_tags(label_with_pos, pos_word_start_idxs)

        label = lab_to_feat.Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        durations = label.phone_durations
        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        word_start_idxs, word_end_idxs = get_word_idxs(
            label.labels, word_idx_sep=(r':', r'\+'), phrase_idx_sep=(r':', r'='))

        try:
            segment_start_idxs, segment_end_idxs = segment_words(word_start_idxs, word_end_idxs, pos_tags)
        except (ValueError, IndexError) as e:
            # Report the failure and carry on with the next file.
            print(f'{e}\n{file_id}')
            continue

        wav, sample_rate = file_io.load_wav(os.path.join(wav_dir, f'{file_id}.wav'))
        f0, _, _, _ = world_with_reaper_f0.analysis(wav, sample_rate)
        n_vocoder_frames = f0.shape[0]

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - n_vocoder_frames
        if diff > n_phones:
            raise ValueError(f'Number of label frames and vocoder frames is too different for {file_id}\n'
                             f'\tlabel frames {n_frames}\n'
                             f'\tvocoder frames {n_vocoder_frames}\n'
                             f'\tnumber of phones {n_phones}')

        if diff > 0:
            # Take 1 frame off the duration of each of the last `diff` phones.
            durations[-diff:] -= 1
            n_frames = n_vocoder_frames
            print(f'Cropped {diff} frames from durations for utterance {file_id}')

        assert n_frames == np.sum(durations).item()

        segment_bounds = list(zip(segment_start_idxs, segment_end_idxs))
        segment_phone_lens = [end_idx - start_idx for start_idx, end_idx in segment_bounds]
        segment_frame_lens = [sum(durations[start_idx:end_idx]) for start_idx, end_idx in segment_bounds]

        txt_name = f'{file_id}.txt'
        file_io.save_txt(segment_phone_lens, os.path.join(out_dir, 'segment_n_phones', txt_name))
        file_io.save_txt(segment_frame_lens, os.path.join(out_dir, 'segment_n_frames', txt_name))
        file_io.save_txt(len(segment_phone_lens), os.path.join(out_dir, 'n_segments', txt_name))
Ejemplo n.º 15
0
def utts_to_dumps(dumpfeats_exe,
                  utt_dir,
                  file_ids,
                  dump_dir,
                  feature_level='Segment',
                  extra_feats_scm='extra_feats.scm',
                  label_feats='label.feats',
                  custom_voice=None):
    """Dump features from Utterance files using `dumpfeats`, then replace '#' pause symbols with 'pau'.

    Args:
        dumpfeats_exe (str): Path of the `dumpfeats` executable.
        utt_dir (str): Directory containing the Utterance files, one `{file_id}.utt` per file id.
        file_ids (list[str]): File basenames to process.
        dump_dir (str): Directory to save the dumps to, as `{file_id}.txt`.
        feature_level (str): Passed to `dumpfeats` as the `-relation` argument.
        extra_feats_scm (str): Scheme file passed to `dumpfeats` as the `-eval` argument. If it names a file in the
            `resources/festival` directory of the `tts_data_tools` package then that packaged file is used.
        label_feats (str): Feature list passed to `dumpfeats` as the `-feats` argument. Packaged files are resolved
            in the same way as for `extra_feats_scm`.
        custom_voice (str, optional): If given, a `(voice_{custom_voice})` command is prepended to a temporary copy
            of `extra_feats_scm`, and the copy is used in its place.

    Raises:
        subprocess.CalledProcessError: If `dumpfeats` or `sed` exits with a non-zero return code.
    """
    festival_resources = os.path.join('resources', 'festival')
    packaged_resources = pkg_resources.resource_listdir('tts_data_tools',
                                                        festival_resources)

    if extra_feats_scm in packaged_resources:
        print(
            f'Using tts_data_tools resource from resources/festival for {extra_feats_scm}'
        )
        extra_feats_scm = pkg_resources.resource_filename(
            'tts_data_tools',
            os.path.join(festival_resources, extra_feats_scm))

    if label_feats in packaged_resources:
        print(
            f'Using tts_data_tools resource from resources/festival for {label_feats}'
        )
        label_feats = pkg_resources.resource_filename(
            'tts_data_tools', os.path.join(festival_resources, label_feats))

    # Materialise the ids, as they are iterated more than once below.
    file_ids = list(file_ids)
    utils.make_dirs(dump_dir, file_ids)
    dump_files = [
        os.path.join(dump_dir, f'{file_id}.txt') for file_id in file_ids
    ]

    voice_scm_file = None
    try:
        if custom_voice is not None:
            # Create a temporary file, to which we can add a command to load the custom voice.
            # It must be opened in text mode, as the default mode ('w+b') only accepts bytes.
            voice_scm_file = tempfile.NamedTemporaryFile(mode='w',
                                                         suffix='.scm')

            # Write an initial line to load the custom voice.
            voice_scm_file.write(f'(voice_{custom_voice})\n')

            # Write the code from the original Scheme file.
            with open(extra_feats_scm, 'r') as f:
                voice_scm_file.write(f.read())

            # Flush so that the subprocess below sees the full contents when it reads the file by name.
            voice_scm_file.flush()

            # Replace the file name with the name of the temporary file.
            extra_feats_scm = voice_scm_file.name

        for file_id, dump_file in zip(file_ids, dump_files):
            # Argument `check=True` ensures that an exception is raised if the process' return code is non-zero.
            subprocess.run([
                dumpfeats_exe, '-eval', extra_feats_scm, '-relation',
                feature_level, '-feats', label_feats, '-output', dump_file,
                os.path.join(utt_dir, f'{file_id}.utt')
            ],
                           check=True)
    finally:
        if voice_scm_file is not None:
            # Closing the temporary file deletes it, this must happen even if `dumpfeats` failed.
            voice_scm_file.close()

    # Replace any '#' characters used for pauses with 'pau', in the dumps written above.
    # `sed` fails if it is given no input files, so skip it when there is nothing to process.
    if dump_files:
        subprocess.run(['sed', '-i', '-e', 's/#/pau/g', *dump_files],
                       check=True)
Ejemplo n.º 16
0
def process(lab_dir, wav_dir, id_list, out_dir, state_level, question_file,
            upsample, subphone_feat_type, trim_silences,
            calculate_normalisation, normalisation_of_deltas):
    """Processes label and wav files in id_list, saves the linguistic and acoustic features to files.

    For every file id the following are saved under `out_dir`: numerical labels (`lab`), counter features
    (`counters`), durations (`dur`), phone identities (`phones`), the number of frames and phones (`n_frames`,
    `n_phones`), and the acoustic features returned by `world_with_reaper_f0.analysis` (`lf0`, `vuv`, `mcep`, `bap`).

    Args:
        lab_dir (str): Directory containing the label files.
        wav_dir (str): Directory containing the wav files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        trim_silences (bool): Whether to trim start and end silences from all features.
        calculate_normalisation (bool): Whether to automatically calculate normalisation parameters (min-max for
            `lab` and `counters`, MVN for `dur`, `lf0`, `mcep`, and `bap`) after extracting the features.
        normalisation_of_deltas (bool): Also calculate the MVN parameters for the delta and delta delta features
            of `lf0`, `mcep`, and `bap`.

    Raises:
        ValueError: If the label has more excess frames, relative to the vocoder features, than it has phones.
    """
    file_ids = utils.get_file_ids(id_list=id_list)

    question_set = lab_to_feat.QuestionSet(question_file)
    subphone_feature_set = lab_to_feat.SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames',
                      'n_phones', 'lf0', 'vuv', 'mcep', 'bap'):
        utils.make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in tqdm(file_ids):
        # Label processing.
        lab_path = os.path.join(lab_dir, f'{file_id}.lab')
        label = lab_to_feat.Label(lab_path, state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        # Acoustic processing.
        wav_path = os.path.join(wav_dir, f'{file_id}.wav')
        wav, sample_rate = file_io.load_wav(wav_path)

        f0, vuv, mcep, bap = world_with_reaper_f0.analysis(wav, sample_rate)
        lf0 = np.log(f0)

        # Match the number of frames between label forced-alignment and vocoder analysis.
        # Often the durations from forced alignment are a few frames longer than the vocoder features.
        diff = n_frames - f0.shape[0]
        if diff > n_phones:
            raise ValueError(
                f'Number of label frames and vocoder frames is too different for {file_id}\n'
                f'\tlabel frames {n_frames}\n'
                f'\tvocoder frames {f0.shape[0]}\n'
                f'\tnumber of phones {n_phones}')

        # Remove excess durations if there is a shape mismatch.
        if diff > 0:
            # Remove 1 frame from each phone's duration starting at the end of the sequence.
            durations[-diff:] -= 1
            n_frames = f0.shape[0]
            print(
                f'Cropped {diff} frames from durations for utterance {file_id}'
            )

        assert n_frames == np.sum(durations).item()

        trim_frame_slice = slice(0, n_frames)
        if trim_silences:

            start_phone_idx, end_phone_idx = 0, n_phones
            start_frame_idx, end_frame_idx = 0, n_frames
            # `durations` has shape (n_phones, 1), so `durations[i]` is an array. Use `.item()` to get a scalar, as
            # converting an array with ndim > 0 using `int()` is deprecated in recent versions of NumPy.
            if phones[0] in ['sil', '#']:
                start_phone_idx += 1
                start_frame_idx += durations[0].item()
            if phones[-1] in ['sil', '#']:
                end_phone_idx -= 1
                end_frame_idx -= durations[-1].item()

            trim_phone_slice = slice(int(start_phone_idx), int(end_phone_idx))
            trim_frame_slice = slice(int(start_frame_idx), int(end_frame_idx))

            # NOTE(review): `numerical_labels` is only sliced within this branch. If `upsample` is set, frames were
            # cropped above, and `trim_silences` is False, it may keep more frames than the acoustic features -
            # confirm whether that is intended.
            numerical_labels = numerical_labels[
                trim_frame_slice if upsample else trim_phone_slice]
            durations = durations[trim_phone_slice]
            phones = phones[trim_phone_slice]

            n_frames = trim_frame_slice.stop - trim_frame_slice.start
            n_phones = trim_phone_slice.stop - trim_phone_slice.start

        # Frame-level features are always sliced. Without trimming this only crops them to `n_frames`.
        counter_features = counter_features[trim_frame_slice]
        lf0 = lf0[trim_frame_slice]
        vuv = vuv[trim_frame_slice]
        mcep = mcep[trim_frame_slice]
        bap = bap[trim_frame_slice]

        file_io.save_bin(numerical_labels.astype(np.float32),
                         os.path.join(out_dir, 'lab', f'{file_id}.npy'))
        file_io.save_bin(counter_features.astype(np.float32),
                         os.path.join(out_dir, 'counters', f'{file_id}.npy'))
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.txt'))
        file_io.save_lines(phones,
                           os.path.join(out_dir, 'phones', f'{file_id}.txt'))

        file_io.save_txt(n_frames,
                         os.path.join(out_dir, 'n_frames', f'{file_id}.txt'))
        file_io.save_txt(n_phones,
                         os.path.join(out_dir, 'n_phones', f'{file_id}.txt'))

        file_io.save_bin(lf0.astype(np.float32),
                         os.path.join(out_dir, 'lf0', f'{file_id}.npy'))
        file_io.save_bin(vuv, os.path.join(out_dir, 'vuv', f'{file_id}.npy'))
        file_io.save_bin(mcep.astype(np.float32),
                         os.path.join(out_dir, 'mcep', f'{file_id}.npy'))
        file_io.save_bin(bap.astype(np.float32),
                         os.path.join(out_dir, 'bap', f'{file_id}.npy'))

    if calculate_normalisation:
        process_minmax(out_dir, 'lab', id_list, out_dir=out_dir)
        process_minmax(out_dir, 'counters', id_list, out_dir=out_dir)
        process_mvn(out_dir,
                    'dur',
                    is_npy=False,
                    id_list=id_list,
                    deltas=False,
                    out_dir=out_dir)

        process_mvn(out_dir,
                    'lf0',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'mcep',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
        process_mvn(out_dir,
                    'bap',
                    id_list=id_list,
                    deltas=normalisation_of_deltas,
                    out_dir=out_dir)
Ejemplo n.º 17
0
def process(lab_dir, id_list, out_dir, state_level, question_file, upsample,
            subphone_feat_type, calculate_normalisation):
    """Extracts linguistic features from the label files in id_list and saves them to file.

    For every file id the numerical labels (`lab`), counter features (`counters`), durations (`dur`), phone
    identities (`phones`), and the number of frames and phones (`n_frames`, `n_phones`) are saved under `out_dir`.

    Args:
        lab_dir (str): Directory containing the label files.
        id_list (str): List of file basenames to process.
        out_dir (str): Directory to save the output to.
        state_level (bool): Indicates that the label files are state level if True, otherwise they are frame level.
        question_file (str): Question set to be loaded. Can be one of the provided question sets;
                questions-unilex_dnn_600.hed
                questions-unilex_phones_69.hed
                questions-radio_dnn_416.hed
                questions-radio_phones_48.hed
                questions-mandarin.hed
                questions-japanese.hed
        upsample (bool): Whether to upsample phone-level numerical labels to frame-level.
        subphone_feat_type (str): Subphone features to be extracted from the durations.
        calculate_normalisation (bool): Calculate mean-variance and min-max normalisation for duration and labels.
    """
    file_ids = get_file_ids(id_list=id_list)
    question_set = QuestionSet(question_file)
    subphone_feature_set = SubphoneFeatureSet(subphone_feat_type)

    for feat_name in ('lab', 'counters', 'dur', 'phones', 'n_frames',
                      'n_phones'):
        make_dirs(os.path.join(out_dir, feat_name), file_ids)

    for file_id in file_ids:
        label = Label(os.path.join(lab_dir, f'{file_id}.lab'), state_level)

        numerical_labels = label.extract_numerical_labels(
            question_set, upsample_to_frame_level=upsample)
        counter_features = label.extract_counter_features(subphone_feature_set)
        # Durations are saved as a single column, with one row per phone.
        durations = label.phone_durations.reshape((-1, 1))
        phones = label.phones

        n_frames = np.sum(durations).item()
        n_phones = len(label.phones)

        npy_name = f'{file_id}.npy'
        txt_name = f'{file_id}.txt'

        file_io.save_bin(numerical_labels,
                         os.path.join(out_dir, 'lab', npy_name))
        file_io.save_bin(counter_features,
                         os.path.join(out_dir, 'counters', npy_name))
        file_io.save_txt(durations,
                         os.path.join(out_dir, 'dur', f'{file_id}.dur'))
        file_io.save_lines(phones, os.path.join(out_dir, 'phones', txt_name))

        file_io.save_txt(n_frames, os.path.join(out_dir, 'n_frames', txt_name))
        file_io.save_txt(n_phones, os.path.join(out_dir, 'n_phones', txt_name))

    if not calculate_normalisation:
        return

    process_minmax(out_dir, 'lab', id_list)
    process_minmax(out_dir, 'counters', id_list)
    process_mvn(out_dir,
                'dur',
                is_npy=False,
                id_list=id_list,
                deltas=False)