def get_spec_aug_info(self) -> None:
    """Load the spectrogram or augmentation parameters for self.train_dataset."""
    logg = logging.getLogger(f"c.{__name__}.get_spec_aug_info")
    # logg.setLevel("INFO")
    logg.debug("Start get_spec_aug_info")

    self.p2d_kwargs = {"ref": np.max}

    if self.train_dataset.startswith("me"):
        spec_dict = get_spec_dict()
        self.mel_kwargs: ty.Dict[str, ty.Any] = spec_dict[self.train_dataset]
        spec_shape_dict = get_spec_shape_dict()
        self.spec_shape: ty.Tuple[int, int] = spec_shape_dict[self.train_dataset]

    elif self.train_dataset.startswith("au"):
        aug_dict = get_aug_dict()
        self.mel_kwargs = aug_dict[self.train_dataset]["mel_kwargs"]
        self.spec_shape = aug_dict[self.train_dataset]["aug_shape"]
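
# NOTE (editor sketch, not part of the original module): wav2mel is imported from
# elsewhere in this repo, so its exact body is not shown here. The parameters loaded
# by get_spec_aug_info are consistent with a log-mel conversion along these lines,
# assuming mel_kwargs holds keyword arguments for librosa.feature.melspectrogram and
# p2d_kwargs ({"ref": np.max}) feeds librosa.power_to_db. Treat it as illustrative.
def _wav2mel_sketch(wav_path, mel_kwargs, p2d_kwargs):
    # load the audio at its native sample rate
    sig, sr = librosa.load(wav_path, sr=None)
    # mel power spectrogram, parametrized by the dataset-specific kwargs
    mel = librosa.feature.melspectrogram(y=sig, sr=sr, **mel_kwargs)
    # convert power to dB, referenced to the spectrogram maximum
    log_mel = librosa.power_to_db(mel, **p2d_kwargs)
    return log_mel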
def evaluate_stream(
    model: models.Model,
    datasets_type: str,
    train_words_type: str,
    architecture_type: str,
    model_name: str,
    orig_wav_path: Path,
    norm_tra: str,
    wav_ID: str,
) -> np.ndarray:
    """Predict the trained words along a full sentence, one chunk at a time.

    Interesting sentences found while streaming, filtered with:
    if not 4 < len(norm_tra.split()) < 15

    CNN_nf32_ks02_ps01_dw32_dr01_lr04_opa1_dsmeL04_bs32_en15_wLTnumLS

    sentence_index 10
        sentence_wav_paths[6241_61943_000011_000003]: 6241/61943/6241_61943_000011_000003.wav
        sentence_norm_tra[6241_61943_000011_000003]: As usual, the crew was small, five Danes doing the whole of the work.
    sentence_index 16
        sentence_wav_paths[2412_153947_000023_000000]: 2412/153947/2412_153947_000023_000000.wav
        sentence_norm_tra[2412_153947_000023_000000]: june ninth eighteen seventy two
    sentence_index 19
        sentence_wav_paths[174_168635_000014_000000]: 174/168635/174_168635_000014_000000.wav
        sentence_norm_tra[174_168635_000014_000000]: CHAPTER three-TWO MISFORTUNES MAKE ONE PIECE OF GOOD FORTUNE
    sentence_index 22
        sentence_wav_paths[3000_15664_000017_000003]: 3000/15664/3000_15664_000017_000003.wav
        sentence_norm_tra[3000_15664_000017_000003]: The full grown bucks weigh nearly three hundred and fifty pounds.
    sentence_index 26
        sentence_wav_paths[2277_149897_000035_000002]: 2277/149897/2277_149897_000035_000002.wav
        sentence_norm_tra[2277_149897_000035_000002]: Three o'clock came, four, five, six, and no letter.
    sentence_index 33
        sentence_wav_paths[8297_275154_000019_000000]: 8297/275154/8297_275154_000019_000000.wav
        sentence_norm_tra[8297_275154_000019_000000]: "I will do neither the one nor the other.
    sentence_index 36
        sentence_wav_paths[8297_275156_000017_000007]: 8297/275156/8297_275156_000017_000007.wav
        sentence_norm_tra[8297_275156_000017_000007]: In any event, her second marriage would lead to one disastrous result.
    sentence_index 42
        sentence_wav_paths[7976_110124_000012_000000]: 7976/110124/7976_110124_000012_000000.wav
        sentence_norm_tra[7976_110124_000012_000000]: "Be sure that you admit no one," commanded the merchant.
    sentence_index 46
        sentence_wav_paths[7976_105575_000008_000004]: 7976/105575/7976_105575_000008_000004.wav
        sentence_norm_tra[7976_105575_000008_000004]: Five of my eight messmates of the day before were shot.
    sentence_index 66
        sentence_wav_paths[251_118436_000005_000000]: 251/118436/251_118436_000005_000000.wav
        sentence_norm_tra[251_118436_000005_000000]: one Death Strikes a King
        (interesting because the two after the silence is identified, the four said quickly is missed)
    sentence_index 67
        sentence_wav_paths[251_137823_000033_000000]: 251/137823/251_137823_000033_000000.wav
        sentence_norm_tra[251_137823_000033_000000]: Of the four other company engineers, two were now stirring and partly conscious.
    sentence_index 100
        sentence_wav_paths[1993_147966_000015_000000]: 1993/147966/1993_147966_000015_000000.wav
        sentence_norm_tra[1993_147966_000015_000000]: We had three weeks of this mild, open weather.

    ATT_ct02_dr01_ks01_lu01_qt04_dw01_opa1_lr04_bs02_en01_dsmeLa3_wLTnumLS

    sentence_index 10
        sentence_wav_paths[6241_61943_000011_000003]: /home/pmn/audiodatasets/LibriTTS/dev-clean/6241/61943/6241_61943_000011_000003.wav
        sentence_norm_tra[6241_61943_000011_000003]: As usual, the crew was small, five Danes doing the whole of the work.
    sentence_index 40
        sentence_wav_paths[7976_110124_000015_000000]: /home/pmn/audiodatasets/LibriTTS/dev-clean/7976/110124/7976_110124_000015_000000.wav
        sentence_norm_tra[7976_110124_000015_000000]: "Have pity upon a poor unfortunate one!" he called out.
    """
    logg = logging.getLogger(f"c.{__name__}.evaluate_stream")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_stream")

    # a random number generator to use
    rng = np.random.default_rng(12345)

    # the sample rate to use
    new_sr = 16000

    # load the sentence and resample it
    sentence_sig, sentence_sr = librosa.load(orig_wav_path, sr=None)
    sentence_sig = librosa.resample(sentence_sig, orig_sr=sentence_sr, target_sr=new_sr)

    # split the sentence in chunks every sentence_hop_length
    sentence_hop_length = new_sr // 16

    # the words the model was trained on
    words = sorted(words_types[train_words_type])
    logg.debug(f"words: {words}")

    # the length of the split is chosen to match the training type
    if train_words_type.endswith("LS"):
        split_length = new_sr // 2
    else:
        split_length = new_sr

    splits = split_sentence(
        sentence_sig, datasets_type, sentence_hop_length, split_length
    )
    logg.debug(f"len(splits): {len(splits)}")

    # if there are no splits available we pretend to have predicted only background
    # or only random if there are only numbers
    if len(splits) == 0:
        num_words = len(words)
        if num_words == 10:
            fake_pred = np.ones((1, num_words), dtype=np.float32)
            fake_pred /= num_words
        else:
            fake_pred = np.zeros((1, num_words), dtype=np.float32)
            fake_pred[0][0] = 1
        return fake_pred

    # compute spectrograms / augment / compose
    if datasets_type.startswith("au"):
        specs = augment_signals(splits, datasets_type, rng, which_fold="testing")
        logg.debug(f"specs.shape: {specs.shape}")
        specs_img = np.expand_dims(specs, axis=-1)
        logg.debug(f"specs_img.shape: {specs_img.shape}")

    elif datasets_type.startswith("me"):
        spec_dict = get_spec_dict()
        mel_kwargs = spec_dict[datasets_type]
        logg.debug(f"mel_kwargs: {mel_kwargs}")
        spec_shape_dict = get_spec_shape_dict()
        spec_shape = spec_shape_dict[datasets_type]
        requested_length = spec_shape[1]
        logg.debug(f"requested_length: {requested_length}")
        p2d_kwargs = {"ref": np.max}
        specs_img = compute_spectrograms(
            splits, mel_kwargs, p2d_kwargs, requested_length
        )
        logg.debug(f"specs_img.shape: {specs_img.shape}")

    y_pred = model.predict(specs_img)
    # logg.debug(f"y_pred: {y_pred}")
    y_index = np.argmax(y_pred, axis=1)
    # logg.debug(f"y_index: {y_index}")
    y_pred_labels = [words[i] for i in y_index]
    # logg.debug(f"y_pred_labels: {y_pred_labels}")

    save_plots = False
    # save_plots = True
    if save_plots:
        clean_labels = []
        for yl in y_pred_labels:
            if yl.startswith("_other"):
                clean_labels.append(".")
            else:
                clean_labels.append(yl)
        logg.debug(f"Predictions {clean_labels}")

        # fig_name = f"{architecture_type}_{evaluation_type}_{datasets_type}_{train_words_type}_{norm_tra}.{{}}"
        # fig_name = f"{architecture_type}"
        fig_name = f"{model_name}"
        # fig_name += f"_{evaluation_type}"
        fig_name += f"_{datasets_type}"
        fig_name += f"_{train_words_type}"
        fig_name += f"_{wav_ID}"
        # fig_name += f"_{norm_tra}.{{}}"
        fig_name += ".{}"

        plot_sentence_pred(
            sentence_sig,
            y_pred,
            norm_tra,
            words,
            sentence_hop_length,
            split_length,
            fig_name,
        )
        # plt.show()

    return y_pred
def evaluate_audio_cnn(args) -> None:
    """Record a few words and plot the CNN predictions for each of them."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_cnn")
    logg.debug("Start evaluate_audio_cnn")

    # magic to fix the GPUs
    setup_gpus()

    # need to know on which dataset the model was trained to compute specs
    dataset_name = "mel01"

    # words that the dataset was trained on
    train_words_type = args.train_words_type
    train_words = words_types[train_words_type]

    # permutation from sorted to your wor(l)d order
    perm_pred = compute_permutation(train_words)

    rec_words_type = args.rec_words_type
    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    img_specs = []
    spec_dict = get_spec_dict()
    spec_kwargs = spec_dict[dataset_name]
    p2d_kwargs = {"ref": np.max}
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel
        log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
        img_spec = log_spec.reshape((*log_spec.shape, 1))
        logg.debug(f"img_spec.shape: {img_spec.shape}")  # img_spec.shape: (128, 32, 1)

        img_specs.append(img_spec)

    # the data needs to look like this data['testing'].shape: (735, 128, 32, 1)
    # data = log_spec.reshape((1, *log_spec.shape, 1))
    data = np.stack(img_specs)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, ty.Union[str, int]] = {}
    hypa["base_dense_width"] = 32
    hypa["base_filters"] = 20
    hypa["batch_size"] = 32
    hypa["dropout_type"] = "01"
    hypa["epoch_num"] = 16
    hypa["kernel_size_type"] = "02"
    hypa["pool_size_type"] = "02"
    hypa["learning_rate_type"] = "02"
    hypa["optimizer_type"] = "a1"
    hypa["dataset"] = dataset_name
    hypa["words"] = train_words_type

    # get the words
    train_words = words_types[train_words_type]

    model_name = build_cnn_name(hypa)
    logg.debug(f"model_name: {model_name}")

    # model_folder = Path("trained_models") / "cnn"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    if not model_path.exists():
        logg.error(f"Model not found at: {model_path}")
        raise FileNotFoundError(f"Model not found at: {model_path}")

    model = tf.keras.models.load_model(model_path)
    model.summary()

    pred = model.predict(data)
    # logg.debug(f"pred: {pred}")

    # plot the thing
    plot_size = 5
    fw = plot_size * 3
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=3, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        spec = img_specs[i][:, :, 0]
        plot_spec(spec, axes[i][1])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][2],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
def evaluate_attention_weights(
    train_words_type: str, rec_words_type: str, do_new_record: bool = False
) -> None:
    """Plot attention weights and predictions, for recorded audios or dataset samples."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_attention_weights")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_attention_weights")

    # magic to fix the GPUs
    setup_gpus()

    # ATT_ct02_dr02_ks02_lu01_as01_qt01_dw01_opa1_lr01_bs01_en01_dsmel04_wk1
    # hypa: ty.Dict[str, str] = {}
    # hypa["conv_size_type"] = "02"
    # hypa["dropout_type"] = "02"
    # hypa["kernel_size_type"] = "02"
    # hypa["lstm_units_type"] = "01"
    # hypa["query_style_type"] = "01"
    # hypa["dense_width_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["learning_rate_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # dataset_name = "mel04"
    # hypa["dataset_name"] = dataset_name
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # ATT_ct02_dr01_ks01_lu01_qt05_dw01_opa1_lr03_bs02_en02_dsaug07_wLTnum
    hypa = {
        "batch_size_type": "02",
        "conv_size_type": "02",
        "dataset_name": "aug07",
        "dense_width_type": "01",
        "dropout_type": "01",
        "epoch_num_type": "02",
        "kernel_size_type": "01",
        "learning_rate_type": "03",
        "lstm_units_type": "01",
        "optimizer_type": "a1",
        "query_style_type": "05",
        "words_type": "LTnum",
    }
    use_validation = True

    dataset_name = hypa["dataset_name"]

    model_name = build_attention_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # load the model
    model_folder = Path("trained_models") / "attention"
    model_path = model_folder / f"{model_name}.h5"

    # model = tf.keras.models.load_model(model_path)
    # https://github.com/keras-team/keras/issues/5088#issuecomment-401498334
    model = tf.keras.models.load_model(
        model_path, custom_objects={"backend": tf.keras.backend}
    )
    model.summary()
    logg.debug(f"ascii_model(model): {ascii_model(model)}")

    att_weight_model = tf.keras.models.Model(
        inputs=model.input,
        outputs=[
            model.get_layer("output").output,
            model.get_layer("att_softmax").output,
            model.get_layer("bidirectional_1").output,
        ],
    )
    att_weight_model.summary()
    # logg.debug(f"att_weight_model.outputs: {att_weight_model.outputs}")

    # get the training words
    train_words = words_types[train_words_type]
    # logg.debug(f"train_words: {train_words}")
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words[-3:]
        # rec_words = train_words[:]
        logg.debug(f"Using rec_words: {rec_words}")
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # record new audios
    if do_new_record:

        # where to save the audios
        audio_folder = Path("recorded_audio")
        if not audio_folder.exists():
            audio_folder.mkdir(parents=True, exist_ok=True)

        # record the audios and save them in audio_folder
        audio_path_fmt = "{}_02.wav"
        audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

        # compute the spectrograms and build the dataset of correct shape
        img_specs = []
        spec_dict = get_spec_dict()
        spec_kwargs = spec_dict[dataset_name]
        p2d_kwargs = {"ref": np.max}
        for word in rec_words:
            # get the name
            audio_path = audio_folder / audio_path_fmt.format(word)

            # convert it to mel
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            img_spec = log_spec.reshape((*log_spec.shape, 1))
            # logg.debug(f"img_spec.shape: {img_spec.shape}")
            # img_spec.shape: (128, 32, 1)

            img_specs.append(img_spec)

        # the data needs to look like this data['testing'].shape: (735, 128, 32, 1)
        rec_data = np.stack(img_specs)
        # logg.debug(f"rec_data.shape: {rec_data.shape}")

    # load data if you do not want to record new audios
    else:
        # input data
        processed_folder = Path("data_proc")
        processed_path = processed_folder / f"{dataset_name}"

        # which word in the dataset to plot
        word_id = 2

        # the loaded spectrograms
        rec_data_l: ty.List[np.ndarray] = []

        for i, word in enumerate(rec_words):
            data, labels = load_processed(processed_path, [word])

            # get one of the spectrograms
            word_data = data["testing"][word_id]
            rec_data_l.append(word_data)

        # turn the list into np array
        rec_data = np.stack(rec_data_l)

    # get prediction and attention weights
    pred, att_weights, LSTM_out = att_weight_model.predict(rec_data)
    # logg.debug(f"att_weights.shape: {att_weights.shape}")
    # logg.debug(f"att_weights[0].shape: {att_weights[0].shape}")

    # if we recorded fresh audios we also have the waveform to plot
    ax_add = 1 if do_new_record else 0

    # plot the wave, spectrogram, weights and predictions in each column
    plot_size = 5
    fw = plot_size * num_rec_words
    nrows = 3 + ax_add
    # nrows = 4 + ax_add
    fh = plot_size * nrows * 0.7
    fig, axes = plt.subplots(
        nrows=nrows, ncols=num_rec_words, figsize=(fw, fh), sharey="row"
    )
    fig.suptitle(f"Attention weights and predictions for {rec_words}", fontsize=20)

    for i, word in enumerate(rec_words):
        word_spec = rec_data[i][:, :, 0]
        # logg.debug(f"word_spec.shape: {word_spec.shape}")

        # plot the waveform
        if do_new_record:
            plot_waveform(audios[i], axes[0][i])

        # plot the spectrogram
        title = f"Spectrogram for {word}"
        plot_spec(word_spec, axes[0 + ax_add][i], title=title)

        # plot the weights
        word_att_weights = att_weights[i]
        # plot_att_weights(word_att_weights, axes[1 + ax_add][i], title)
        word_att_weights_img = np.expand_dims(word_att_weights, axis=-1).T
        axes[1 + ax_add][i].imshow(
            word_att_weights_img, origin="lower", aspect="auto"
        )
        title = f"Attention weights for {word}"
        axes[1 + ax_add][i].set_title(title)

        # plot the predictions
        word_pred = pred[i]
        # permute the prediction from sorted to the order you have
        word_pred = word_pred[perm_pred]
        pred_index = np.argmax(word_pred)
        title = f"Predictions for {word}"
        plot_pred(word_pred, train_words, axes[2 + ax_add][i], title, pred_index)

        # axes[3 + ax_add][i].imshow(LSTM_out[i], origin="lower")

    # fig.tight_layout()
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}"
    fig_name += f"_{train_words_type}"
    fig_name += f"_{rec_words_type}_img"
    if do_new_record:
        fig_name += "_new.{}"
    else:
        fig_name += "_data.{}"

    plot_folder = Path("plot_results")
    results_path = plot_folder / fig_name.format("png")
    fig.savefig(results_path)
    results_path = plot_folder / fig_name.format("pdf")
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
def evaluate_audio_transfer(train_words_type: str, rec_words_type: str) -> None:
    """Record a few words and plot the transfer model predictions for each of them."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_transfer")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_audio_transfer")

    # magic to fix the GPUs
    setup_gpus()

    datasets_type = "01"
    datasets_types = {
        "01": ["mel05", "mel09", "mel10"],
        "02": ["mel05", "mel10", "mfcc07"],
        "03": ["mfcc06", "mfcc07", "mfcc08"],
        "04": ["mel05", "mfcc06", "melc1"],
        "05": ["melc1", "melc2", "melc4"],
    }
    dataset_names = datasets_types[datasets_type]

    # we do not support composed datasets for now
    for dn in dataset_names:
        if dn.startswith("melc"):
            logg.error(f"not supported: {dataset_names}")
            return

    # words that the dataset was trained on
    train_words = words_types[train_words_type]

    # the model predicts sorted words
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    specs_3ch: ty.List[np.ndarray] = []
    # params for the mel conversion
    p2d_kwargs = {"ref": np.max}
    spec_dict = get_spec_dict()
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel for each type of dataset
        specs: ty.List[np.ndarray] = []
        for dataset_name in dataset_names:
            spec_kwargs = spec_dict[dataset_name]
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            specs.append(log_spec)
        img_spec = np.stack(specs, axis=2)
        # logg.debug(f"img_spec.shape: {img_spec.shape}")  # (128, 128, 3)

        specs_3ch.append(img_spec)

    data = np.stack(specs_3ch)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, str] = {}
    hypa["dense_width_type"] = "03"
    hypa["dropout_type"] = "01"
    hypa["batch_size_type"] = "02"
    hypa["epoch_num_type"] = "01"
    hypa["learning_rate_type"] = "01"
    hypa["optimizer_type"] = "a1"
    hypa["datasets_type"] = datasets_type
    hypa["words_type"] = train_words_type
    use_validation = False

    # hypa: ty.Dict[str, str] = {}
    # hypa["dense_width_type"] = "02"
    # hypa["dropout_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # hypa["learning_rate_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["datasets_type"] = datasets_type
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # get the model name
    model_name = build_transfer_name(hypa, use_validation)

    # load the model
    # model_folder = Path("trained_models") / "transfer"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    model = tf.keras.models.load_model(model_path)

    # predict!
    pred = model.predict(data)

    # plot everything
    plot_size = 5
    fw = plot_size * 5
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=5, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        img_spec = specs_3ch[i]
        plot_spec(img_spec[:, :, 0], axes[i][1])
        plot_spec(img_spec[:, :, 1], axes[i][2])
        plot_spec(img_spec[:, :, 2], axes[i][3])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][4],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
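
# NOTE (editor sketch, not part of the original module): the evaluation functions above
# are normally dispatched from a CLI entry point elsewhere in this repo. The snippet
# below only illustrates one possible wiring; every flag name and default value here
# is hypothetical, not the repo's actual argument parser.
def _evaluate_cli_sketch():
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate the trained models")
    parser.add_argument(
        "-et",
        "--evaluation_type",
        default="audio_cnn",
        choices=["audio_cnn", "attention_weights", "audio_transfer"],
    )
    parser.add_argument("-tw", "--train_words_type", default="LTnum")
    parser.add_argument("-rw", "--rec_words_type", default="train")
    args = parser.parse_args()

    # route to the requested evaluation
    if args.evaluation_type == "audio_cnn":
        evaluate_audio_cnn(args)
    elif args.evaluation_type == "attention_weights":
        evaluate_attention_weights(args.train_words_type, args.rec_words_type)
    elif args.evaluation_type == "audio_transfer":
        evaluate_audio_transfer(args.train_words_type, args.rec_words_type)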