def evaluate_audio_cnn(args):
    """MAKEDOC: what is evaluate_audio_cnn doing?"""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_cnn")
    logg.debug("Start evaluate_audio_cnn")

    # magic to fix the GPUs
    setup_gpus()

    # need to know on which dataset the model was trained to compute specs
    dataset_name = "mel01"

    # words that the model was trained on
    train_words_type = args.train_words_type
    train_words = words_types[train_words_type]

    # permutation from the model's sorted label order to the training word order
    perm_pred = compute_permutation(train_words)

    rec_words_type = args.rec_words_type
    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    img_specs = []
    spec_dict = get_spec_dict()
    spec_kwargs = spec_dict[dataset_name]
    p2d_kwargs = {"ref": np.max}
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel
        log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
        img_spec = log_spec.reshape((*log_spec.shape, 1))
        logg.debug(f"img_spec.shape: {img_spec.shape}"
                   )  # img_spec.shape: (128, 32, 1)

        img_specs.append(img_spec)

    # the data must match the training shape, e.g. data['testing'].shape: (735, 128, 32, 1)
    # data = log_spec.reshape((1, *log_spec.shape, 1))
    data = np.stack(img_specs)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, ty.Union[str, int]] = {}
    hypa["base_dense_width"] = 32
    hypa["base_filters"] = 20
    hypa["batch_size"] = 32
    hypa["dropout_type"] = "01"
    hypa["epoch_num"] = 16
    hypa["kernel_size_type"] = "02"
    hypa["pool_size_type"] = "02"
    hypa["learning_rate_type"] = "02"
    hypa["optimizer_type"] = "a1"
    hypa["dataset"] = dataset_name
    hypa["words"] = train_words_type

    model_name = build_cnn_name(hypa)
    logg.debug(f"model_name: {model_name}")

    # model_folder = Path("trained_models") / "cnn"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    if not model_path.exists():
        logg.error(f"Model not found at: {model_path}")
        raise FileNotFoundError(f"Model not found at: {model_path}")

    model = tf.keras.models.load_model(model_path)
    model.summary()

    pred = model.predict(data)
    # logg.debug(f"pred: {pred}")

    # plot waveform, spectrogram and prediction for each recorded word
    plot_size = 5
    fw = plot_size * 3
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=3, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        spec = img_specs[i][:, :, 0]
        plot_spec(spec, axes[i][1])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][2],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
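

# wav2mel is a project helper defined elsewhere; a minimal sketch of what it is
# assumed to do, following the same librosa pipeline used in visualize_spec
# below (hypothetical implementation, for illustration only):
def _wav2mel_sketch(audio_path, spec_kwargs, p2d_kwargs):
    """Load a wav file and return its log-mel spectrogram in dB."""
    # local import so the sketch is self-contained
    import librosa

    sig, sr = librosa.load(audio_path, sr=None)
    # spec_kwargs is assumed to hold melspectrogram parameters (n_mels, hop_length, ...)
    mel = librosa.feature.melspectrogram(y=sig, sr=sr, **spec_kwargs)
    # p2d_kwargs is {"ref": np.max} above, matching librosa.power_to_db
    return librosa.power_to_db(mel, **p2d_kwargs)
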
def visualize_datasets(word_index):
    """MAKEDOC: what is visualize_datasets doing?"""
    logg = logging.getLogger(f"c.{__name__}.visualize_datasets")
    logg.debug("Start visualize_datasets")

    # magic to fix the GPUs
    setup_gpus()

    # show different datasets

    # datasets = [ "mfcc01", "mfcc02", "mfcc03", "mfcc04", "mfcc05", "mfcc06", "mfcc07", "mfcc08"]
    # datasets = [ "mel01", "mel02", "mel03", "mel04", "mel05", "mel06", "mel07", "mel08",
    #     "mel09", "mel10", "mel11", "mel12", "mel13", "mel14", "mel15", "melc1", "melc2",
    #     "melc3", "melc4", "mela1", "meL04", "meLa1", "auL18", "aug18", ]
    # datasets = [ "mel01", "mel04", "mel06", "melc1" ]
    # datasets = ["mel09", "mel10", "mel11", "melc1"]
    datasets = ["mel04", "mel04a", "mel04b", "melc1"]

    # words = words_types["f1"]
    # a_word = words[0]
    # a_word = "loudest_one"
    a_word = "happy"
    # a_word = "_other_ltts_loud"

    # datasets = []
    # datasets.extend(["meL04", "meLa1", "meLa2", "meLa3", "meLa4"])
    # datasets.extend(["auL06", "auL07", "auL08", "auL09"])
    # datasets.extend(["auL18", "auL19", "auL20", "auL21"])
    # a_word = "loudest_two"

    # datasets = []
    # datasets.extend(["mel04", "mela1"])
    # datasets.extend(["aug14", "aug15"])
    # a_word = "forward"
    # datasets.extend(["aug14", "aug07"])
    # a_word = "one"
    # a_word = "_other_ltts"

    # which word in the dataset to plot
    iw = word_index

    processed_folder = Path("data_proc")

    # fig, axes = plt.subplots(4, 5, figsize=(12, 15))
    nrows, ncols = find_rowcol(len(datasets))
    base_figsize = 5
    figsize = (ncols * base_figsize * 1.5, nrows * base_figsize)
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
    if nrows * ncols > 1:
        axes_flat = axes.flat
    else:
        axes_flat = [axes]

    fig.suptitle(f"Various spectrograms for {a_word}", fontsize=20)
    for i, ax in enumerate(axes_flat[: len(datasets)]):

        # the current dataset being plotted
        dataset_name = datasets[i]
        processed_path = processed_folder / f"{dataset_name}"
        word_path = processed_path / f"{a_word}_training.npy"
        logg.debug(f"word_path: {word_path}")

        # FIXME this is shaky as hell
        if not word_path.exists():
            if dataset_name.startswith("me"):
                preprocess_spec(dataset_name, f"_{a_word}")
            elif dataset_name.startswith("au"):
                do_augmentation(dataset_name, f"_{a_word}")

        word_data = np.load(word_path, allow_pickle=True)
        logg.debug(f"{dataset_name} {a_word} shape: {word_data[iw].shape}")
        title = f"{dataset_name}: shape {word_data[iw].shape}"
        plot_spec(word_data[iw], ax, title=title)
    fig.tight_layout()

    plot_folder = Path("plot_models")
    dt_names = "_".join(datasets)
    fig.savefig(plot_folder / f"{a_word}_{dt_names}_specs.pdf")
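

# find_rowcol is a project helper; a minimal sketch of the assumed behaviour,
# choosing a near-square grid for the requested number of plots (hypothetical
# implementation, for illustration only):
def _find_rowcol_sketch(num_plots: int):
    """Return (nrows, ncols) with nrows * ncols >= num_plots."""
    import math

    ncols = math.ceil(math.sqrt(num_plots))
    nrows = math.ceil(num_plots / ncols)
    return nrows, ncols
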
def evaluate_attention_weights(train_words_type: str,
                               rec_words_type: str,
                               do_new_record: bool = False) -> None:
    """MAKEDOC: what is evaluate_attention_weights doing?"""
    logg = logging.getLogger(f"c.{__name__}.evaluate_attention_weights")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_attention_weights")

    # magic to fix the GPUs
    setup_gpus()

    # ATT_ct02_dr02_ks02_lu01_as01_qt01_dw01_opa1_lr01_bs01_en01_dsmel04_wk1
    # hypa: ty.Dict[str, str] = {}
    # hypa["conv_size_type"] = "02"
    # hypa["dropout_type"] = "02"
    # hypa["kernel_size_type"] = "02"
    # hypa["lstm_units_type"] = "01"
    # hypa["query_style_type"] = "01"
    # hypa["dense_width_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["learning_rate_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # dataset_name = "mel04"
    # hypa["dataset_name"] = dataset_name
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # ATT_ct02_dr01_ks01_lu01_qt05_dw01_opa1_lr03_bs02_en02_dsaug07_wLTnum
    hypa = {
        "batch_size_type": "02",
        "conv_size_type": "02",
        "dataset_name": "aug07",
        "dense_width_type": "01",
        "dropout_type": "01",
        "epoch_num_type": "02",
        "kernel_size_type": "01",
        "learning_rate_type": "03",
        "lstm_units_type": "01",
        "optimizer_type": "a1",
        "query_style_type": "05",
        "words_type": "LTnum",
    }
    use_validation = True

    dataset_name = hypa["dataset_name"]

    model_name = build_attention_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # load the model
    model_folder = Path("trained_models") / "attention"
    model_path = model_folder / f"{model_name}.h5"

    # model = tf.keras.models.load_model(model_path)
    # https://github.com/keras-team/keras/issues/5088#issuecomment-401498334
    model = tf.keras.models.load_model(
        model_path, custom_objects={"backend": tf.keras.backend})
    model.summary()
    logg.debug(f"ascii_model(model): {ascii_model(model)}")

    att_weight_model = tf.keras.models.Model(
        inputs=model.input,
        outputs=[
            model.get_layer("output").output,
            model.get_layer("att_softmax").output,
            model.get_layer("bidirectional_1").output,
        ],
    )
    att_weight_model.summary()
    # logg.debug(f"att_weight_model.outputs: {att_weight_model.outputs}")

    # get the training words
    train_words = words_types[train_words_type]
    # logg.debug(f"train_words: {train_words}")
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words[-3:]
        # rec_words = train_words[:]
        logg.debug(f"Using rec_words: {rec_words}")
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # record new audios
    if do_new_record:

        # where to save the audios
        audio_folder = Path("recorded_audio")
        if not audio_folder.exists():
            audio_folder.mkdir(parents=True, exist_ok=True)

        # record the audios and save them in audio_folder
        audio_path_fmt = "{}_02.wav"
        audios = record_audios(rec_words,
                               audio_folder,
                               audio_path_fmt,
                               timeout=0)

        # compute the spectrograms and build the dataset of correct shape
        img_specs = []
        spec_dict = get_spec_dict()
        spec_kwargs = spec_dict[dataset_name]
        p2d_kwargs = {"ref": np.max}
        for word in rec_words:
            # get the name
            audio_path = audio_folder / audio_path_fmt.format(word)

            # convert it to mel
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            img_spec = log_spec.reshape((*log_spec.shape, 1))
            # logg.debug(f"img_spec.shape: {img_spec.shape}")
            # img_spec.shape: (128, 32, 1)

            img_specs.append(img_spec)

        # the data must match the training shape, e.g. data['testing'].shape: (735, 128, 32, 1)
        rec_data = np.stack(img_specs)
        # logg.debug(f"rec_data.shape: {rec_data.shape}")

    # load data if you do not want to record new audios
    else:

        # input data
        processed_folder = Path("data_proc")
        processed_path = processed_folder / f"{dataset_name}"

        # which sample of each word to pick from the testing split
        word_id = 2

        # the loaded spectrograms
        rec_data_l: ty.List[np.ndarray] = []

        for word in rec_words:
            data, _labels = load_processed(processed_path, [word])

            # get one of the spectrograms
            word_data = data["testing"][word_id]
            rec_data_l.append(word_data)

        # turn the list into np array
        rec_data = np.stack(rec_data_l)

    # get prediction and attention weights
    pred, att_weights, LSTM_out = att_weight_model.predict(rec_data)
    # logg.debug(f"att_weights.shape: {att_weights.shape}")
    # logg.debug(f"att_weights[0].shape: {att_weights[0].shape}")

    # if we recorded fresh audios we also have the waveform to plot
    ax_add = 1 if do_new_record else 0

    # plot the wave, spectrogram, weights and predictions in each column
    plot_size = 5
    fw = plot_size * num_rec_words
    nrows = 3 + ax_add
    # nrows = 4 + ax_add
    fh = plot_size * nrows * 0.7
    fig, axes = plt.subplots(nrows=nrows,
                             ncols=num_rec_words,
                             figsize=(fw, fh),
                             sharey="row")
    fig.suptitle(f"Attention weights and predictions for {rec_words}",
                 fontsize=20)

    for i, word in enumerate(rec_words):
        word_spec = rec_data[i][:, :, 0]
        # logg.debug(f"word_spec.shape: {word_spec.shape}")

        # plot the waveform
        if do_new_record:
            plot_waveform(audios[i], axes[0][i])

        # plot the spectrogram
        title = f"Spectrogram for {word}"
        plot_spec(word_spec, axes[0 + ax_add][i], title=title)

        # plot the weights
        word_att_weights = att_weights[i]
        # plot_att_weights(word_att_weights, axes[1 + ax_add][i], title)

        word_att_weights_img = np.expand_dims(word_att_weights, axis=-1).T
        axes[1 + ax_add][i].imshow(word_att_weights_img,
                                   origin="lower",
                                   aspect="auto")
        title = f"Attention weights for {word}"
        axes[1 + ax_add][i].set_title(title)

        # plot the predictions
        word_pred = pred[i]
        # permute the prediction from sorted to the order you have
        word_pred = word_pred[perm_pred]
        pred_index = np.argmax(word_pred)
        title = f"Predictions for {word}"
        plot_pred(word_pred, train_words, axes[2 + ax_add][i], title,
                  pred_index)

        # axes[3 + ax_add][i].imshow(LSTM_out[i], origin="lower")

    # fig.tight_layout()
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}"
    fig_name += f"_{train_words_type}"
    fig_name += f"_{rec_words_type}_img"
    if do_new_record:
        fig_name += "_new.{}"
    else:
        fig_name += "_data.{}"

    plot_folder = Path("plot_results")
    results_path = plot_folder / fig_name.format("png")
    fig.savefig(results_path)
    results_path = plot_folder / fig_name.format("pdf")
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
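

# compute_permutation is a project helper; the usage above (pred[perm_pred]
# labelled with train_words) suggests it maps the model's sorted label order
# back to the original train_words order. A hypothetical sketch of that idea:
def _compute_permutation_sketch(train_words):
    """Return indices so that sorted_predictions[perm][k] belongs to train_words[k]."""
    sorted_words = sorted(train_words)
    return [sorted_words.index(word) for word in train_words]
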
def visualize_spec():
    """MAKEDOC: what is visualize_spec doing?"""
    logg = logging.getLogger(f"c.{__name__}.visualize_spec")
    logg.debug("Start visualize_spec")

    plot_folder = Path("plot_models")

    dataset_path = Path("data_raw")
    logg.debug(f"dataset_path: {dataset_path}")

    # word = "happy"
    # word = "wow"
    # word = "six"
    # word = "eight"  # 3
    word = "loudest_eight"
    word_folder = dataset_path / word
    # sample_path = word_folder / "0a2b400e_nohash_0.wav"
    sample_path = list(word_folder.iterdir())[0]

    # sample_path = "/home/pmn/free_spoken_digit_dataset/recordings/3_theo_10.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_fsdd_raw/five/5_yweweler_30.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_fsdd_raw/fsdd_five/5_yweweler_30.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_fsdd_raw/fsdd_five/5_yweweler_33.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_raw/_other/ljs_LJ001-0018_005.wav"
    logg.debug(f"sample_path: {sample_path}")

    # fig, ax = plt.subplots(3, 1, figsize=(12, 12))
    fig, ax = plt.subplots(4, 1, figsize=(10, 15))

    sample_sig, sr = librosa.load(sample_path, sr=None)
    logg.debug(f"sample_sig.shape: {sample_sig.shape}")
    plot_waveform(sample_sig, ax[0], sample_rate=sr, title=f"Waveform for {word}")
    plot_waveform(
        sample_sig ** 2, ax[1], sample_rate=sr, title=f"Waveform**2 for {word}"
    )

    sample_melspec = librosa.feature.melspectrogram(y=sample_sig, sr=sr)
    logg.debug(f"sample_melspec.shape: {sample_melspec.shape}")
    sample_log_melspec = librosa.power_to_db(sample_melspec, ref=np.max)
    logg.debug(f"sample_log_melspec.shape: {sample_log_melspec.shape}")
    plot_spec(sample_log_melspec, ax[2], title=f"Mel spectrogram for {word}")

    sample_mfcc = librosa.feature.mfcc(y=sample_sig, sr=sr)
    logg.debug(f"sample_mfcc.shape: {sample_mfcc.shape}")
    sample_log_mfcc = librosa.power_to_db(sample_mfcc, ref=np.max)
    logg.debug(f"sample_log_mfcc.shape: {sample_log_mfcc.shape}")
    plot_spec(sample_log_mfcc, ax[3], title=f"MFCCs for {word}")

    fig.tight_layout()
    fig.savefig(plot_folder / f"{word}_specs.pdf")

    sr = 16000
    n_fft = 2048
    hop_length = 512
    mel_10 = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=10)

    # fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    fig, ax = plt.subplots(1, 1)
    librosa.display.specshow(
        mel_10,
        sr=sr,
        hop_length=hop_length,
        x_axis="linear",
        y_axis="linear",
        cmap="viridis",
        ax=ax,
    )
    ax.set_ylabel("Mel filter")
    ax.set_xlabel("Hz")
    fig.tight_layout()
    fig.savefig(plot_folder / "mel10_bins.pdf")

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    for i, m in enumerate(mel_10):
        ax.plot(m, label=i)
    ax.legend()
    fig.tight_layout()
    fig.savefig(plot_folder / "mel10_filterbank.pdf")
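

# librosa.filters.mel returns a (n_mels, 1 + n_fft // 2) weight matrix, so the
# mel_10 plotted above has shape (10, 1025); a small sanity-check sketch
# (hypothetical helper, not part of the original script):
def _mel_filterbank_shape_check() -> None:
    import librosa

    mel_10 = librosa.filters.mel(sr=16000, n_fft=2048, n_mels=10)
    assert mel_10.shape == (10, 1025)
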
def evaluate_audio_transfer(train_words_type: str, rec_words_type: str) -> None:
    """MAKEDOC: what is evaluate_audio_transfer doing?"""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_transfer")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_audio_transfer")

    # magic to fix the GPUs
    setup_gpus()

    datasets_type = "01"
    datasets_types = {
        "01": ["mel05", "mel09", "mel10"],
        "02": ["mel05", "mel10", "mfcc07"],
        "03": ["mfcc06", "mfcc07", "mfcc08"],
        "04": ["mel05", "mfcc06", "melc1"],
        "05": ["melc1", "melc2", "melc4"],
    }
    dataset_names = datasets_types[datasets_type]

    # we do not support composed datasets for now
    for dn in dataset_names:
        if dn.startswith("melc"):
            logg.error(f"not supported: {dataset_names}")
            return

    # words that the model was trained on
    train_words = words_types[train_words_type]

    # the model predicts sorted words
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    specs_3ch: ty.List[np.ndarray] = []
    # params for the mel conversion
    p2d_kwargs = {"ref": np.max}
    spec_dict = get_spec_dict()
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel for each type of dataset
        specs: ty.List[np.ndarray] = []
        for dataset_name in dataset_names:
            spec_kwargs = spec_dict[dataset_name]
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            specs.append(log_spec)
        img_spec = np.stack(specs, axis=2)
        # logg.debug(f"img_spec.shape: {img_spec.shape}")  # (128, 128, 3)

        specs_3ch.append(img_spec)

    data = np.stack(specs_3ch)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, str] = {}
    hypa["dense_width_type"] = "03"
    hypa["dropout_type"] = "01"
    hypa["batch_size_type"] = "02"
    hypa["epoch_num_type"] = "01"
    hypa["learning_rate_type"] = "01"
    hypa["optimizer_type"] = "a1"
    hypa["datasets_type"] = datasets_type
    hypa["words_type"] = train_words_type
    use_validation = False

    # hypa: Dict[str, str] = {}
    # hypa["dense_width_type"] = "02"
    # hypa["dropout_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # hypa["learning_rate_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["datasets_type"] = datasets_type
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # get the model name
    model_name = build_transfer_name(hypa, use_validation)

    # load the model
    # model_folder = Path("trained_models") / "transfer"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    model = tf.keras.models.load_model(model_path)

    # predict!
    pred = model.predict(data)

    # plot everything
    plot_size = 5
    fw = plot_size * 5
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=5, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        img_spec = specs_3ch[i]
        plot_spec(img_spec[:, :, 0], axes[i][1])
        plot_spec(img_spec[:, :, 1], axes[i][2])
        plot_spec(img_spec[:, :, 2], axes[i][3])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][4],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
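

# Hypothetical direct invocation of the evaluation above (the real entry point
# parses these flags with argparse elsewhere in the project); stacking the
# three dataset spectrograms gives an image-like (128, 128, 3) tensor,
# presumably so that a pretrained image backbone can be reused:
def _evaluate_audio_transfer_demo() -> None:
    # "f1" is one of the words_types keys used elsewhere in this module;
    # "train" reuses the training words as the recording list.
    evaluate_audio_transfer(train_words_type="f1", rec_words_type="train")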