def evaluate_audio_cnn(args):
    """Record audios for the requested words and evaluate a trained CNN on their spectrograms."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_cnn")
    logg.debug("Start evaluate_audio_cnn")

    # magic to fix the GPUs
    setup_gpus()

    # need to know on which dataset the model was trained to compute specs
    dataset_name = "mel01"

    # words that the dataset was trained on
    train_words_type = args.train_words_type
    train_words = words_types[train_words_type]

    # permutation from sorted to your wor(l)d order
    perm_pred = compute_permutation(train_words)

    rec_words_type = args.rec_words_type
    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    img_specs = []
    spec_dict = get_spec_dict()
    spec_kwargs = spec_dict[dataset_name]
    p2d_kwargs = {"ref": np.max}
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)
        # convert it to mel
        log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
        img_spec = log_spec.reshape((*log_spec.shape, 1))
        logg.debug(f"img_spec.shape: {img_spec.shape}")
        # img_spec.shape: (128, 32, 1)
        img_specs.append(img_spec)

    # the data needs to look like this data['testing'].shape: (735, 128, 32, 1)
    # data = log_spec.reshape((1, *log_spec.shape, 1))
    data = np.stack(img_specs)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, ty.Union[str, int]] = {}
    hypa["base_dense_width"] = 32
    hypa["base_filters"] = 20
    hypa["batch_size"] = 32
    hypa["dropout_type"] = "01"
    hypa["epoch_num"] = 16
    hypa["kernel_size_type"] = "02"
    hypa["pool_size_type"] = "02"
    hypa["learning_rate_type"] = "02"
    hypa["optimizer_type"] = "a1"
    hypa["dataset"] = dataset_name
    hypa["words"] = train_words_type

    # get the words
    train_words = words_types[train_words_type]

    model_name = build_cnn_name(hypa)
    logg.debug(f"model_name: {model_name}")

    # model_folder = Path("trained_models") / "cnn"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    if not model_path.exists():
        logg.error(f"Model not found at: {model_path}")
        raise FileNotFoundError(f"Model not found at: {model_path}")

    model = tf.keras.models.load_model(model_path)
    model.summary()

    pred = model.predict(data)
    # logg.debug(f"pred: {pred}")

    # plot the thing
    plot_size = 5
    fw = plot_size * 3
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=3, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        spec = img_specs[i][:, :, 0]
        plot_spec(spec, axes[i][1])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][2],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
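# A minimal sketch of the wav -> log-mel conversion that wav2mel is assumed to
# perform above: load the clip, compute a mel spectrogram with the dataset's
# spec_kwargs, then convert power to dB with p2d_kwargs. The name
# _wav2mel_sketch and its defaults are assumptions for illustration, not the
# repo's actual wav2mel implementation.
def _wav2mel_sketch(audio_path, spec_kwargs, p2d_kwargs):
    # load at the native sample rate of the recording
    sig, sr = librosa.load(audio_path, sr=None)
    # mel spectrogram with the per-dataset parameters (n_mels, hop_length, ...)
    mel = librosa.feature.melspectrogram(y=sig, sr=sr, **spec_kwargs)
    # convert the power spectrogram to dB, typically with ref=np.max
    log_mel = librosa.power_to_db(mel, **p2d_kwargs)
    return log_mel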
def visualize_datasets(word_index):
    """Plot the spectrogram of one word as it appears in several processed datasets."""
    logg = logging.getLogger(f"c.{__name__}.visualize_datasets")
    logg.debug("Start visualize_datasets")

    # magic to fix the GPUs
    setup_gpus()

    # show different datasets
    # datasets = [ "mfcc01", "mfcc02", "mfcc03", "mfcc04", "mfcc05", "mfcc06", "mfcc07", "mfcc08"]
    # datasets = [ "mel01", "mel02", "mel03", "mel04", "mel05", "mel06", "mel07", "mel08",
    #     "mel09", "mel10", "mel11", "mel12", "mel13", "mel14", "mel15", "melc1", "melc2",
    #     "melc3", "melc4", "mela1", "meL04", "meLa1", "auL18", "aug18", ]
    # datasets = [ "mel01", "mel04", "mel06", "melc1" ]
    # datasets = ["mel09", "mel10", "mel11", "melc1"]
    datasets = ["mel04", "mel04a", "mel04b", "melc1"]

    # words = words_types["f1"]
    # a_word = words[0]
    # a_word = "loudest_one"
    a_word = "happy"
    # a_word = "_other_ltts_loud"

    # datasets = []
    # datasets.extend(["meL04", "meLa1", "meLa2", "meLa3", "meLa4"])
    # datasets.extend(["auL06", "auL07", "auL08", "auL09"])
    # datasets.extend(["auL18", "auL19", "auL20", "auL21"])
    # a_word = "loudest_two"

    # datasets = []
    # datasets.extend(["mel04", "mela1"])
    # datasets.extend(["aug14", "aug15"])
    # a_word = "forward"
    # datasets.extend(["aug14", "aug07"])
    # a_word = "one"
    # a_word = "_other_ltts"

    # which word in the dataset to plot
    iw = word_index

    processed_folder = Path("data_proc")

    # fig, axes = plt.subplots(4, 5, figsize=(12, 15))
    nrows, ncols = find_rowcol(len(datasets))
    base_figsize = 5
    figsize = (ncols * base_figsize * 1.5, nrows * base_figsize)
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
    if nrows * ncols > 1:
        axes_flat = axes.flat
    else:
        axes_flat = [axes]

    fig.suptitle(f"Various spectrograms for {a_word}", fontsize=20)

    for i, ax in enumerate(axes_flat[: len(datasets)]):
        # the current dataset being plotted
        dataset_name = datasets[i]

        processed_path = processed_folder / f"{dataset_name}"
        word_path = processed_path / f"{a_word}_training.npy"
        logg.debug(f"word_path: {word_path}")

        # FIXME this is shaky as hell
        if not word_path.exists():
            if dataset_name.startswith("me"):
                preprocess_spec(dataset_name, f"_{a_word}")
            elif dataset_name.startswith("au"):
                do_augmentation(dataset_name, f"_{a_word}")

        word_data = np.load(word_path, allow_pickle=True)
        logg.debug(f"{dataset_name} {a_word} shape: {word_data[iw].shape}")

        title = f"{dataset_name}: shape {word_data[iw].shape}"
        plot_spec(word_data[iw], ax, title=title)

    fig.tight_layout()

    plot_folder = Path("plot_models")
    dt_names = "_".join(datasets)
    fig.savefig(plot_folder / f"{a_word}_{dt_names}_specs.pdf")
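# Hypothetical sketch of a find_rowcol-style helper, included only to make the
# grid layout above easier to follow: pick a near-square grid with enough cells
# for n panels. The real find_rowcol in this repo may use a different policy.
def _find_rowcol_sketch(n: int) -> ty.Tuple[int, int]:
    import math

    # smallest near-square grid that fits n panels
    ncols = math.ceil(math.sqrt(n))
    nrows = math.ceil(n / ncols)
    return nrows, ncols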
def evaluate_attention_weights(
    train_words_type: str, rec_words_type: str, do_new_record: bool = False
) -> None:
    """Visualize attention weights and predictions of a trained attention model."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_attention_weights")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_attention_weights")

    # magic to fix the GPUs
    setup_gpus()

    # ATT_ct02_dr02_ks02_lu01_as01_qt01_dw01_opa1_lr01_bs01_en01_dsmel04_wk1
    # hypa: ty.Dict[str, str] = {}
    # hypa["conv_size_type"] = "02"
    # hypa["dropout_type"] = "02"
    # hypa["kernel_size_type"] = "02"
    # hypa["lstm_units_type"] = "01"
    # hypa["query_style_type"] = "01"
    # hypa["dense_width_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["learning_rate_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # dataset_name = "mel04"
    # hypa["dataset_name"] = dataset_name
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # ATT_ct02_dr01_ks01_lu01_qt05_dw01_opa1_lr03_bs02_en02_dsaug07_wLTnum
    hypa = {
        "batch_size_type": "02",
        "conv_size_type": "02",
        "dataset_name": "aug07",
        "dense_width_type": "01",
        "dropout_type": "01",
        "epoch_num_type": "02",
        "kernel_size_type": "01",
        "learning_rate_type": "03",
        "lstm_units_type": "01",
        "optimizer_type": "a1",
        "query_style_type": "05",
        "words_type": "LTnum",
    }
    use_validation = True

    dataset_name = hypa["dataset_name"]

    model_name = build_attention_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # load the model
    model_folder = Path("trained_models") / "attention"
    model_path = model_folder / f"{model_name}.h5"

    # model = tf.keras.models.load_model(model_path)
    # https://github.com/keras-team/keras/issues/5088#issuecomment-401498334
    model = tf.keras.models.load_model(
        model_path, custom_objects={"backend": tf.keras.backend}
    )
    model.summary()
    logg.debug(f"ascii_model(model): {ascii_model(model)}")

    att_weight_model = tf.keras.models.Model(
        inputs=model.input,
        outputs=[
            model.get_layer("output").output,
            model.get_layer("att_softmax").output,
            model.get_layer("bidirectional_1").output,
        ],
    )
    att_weight_model.summary()
    # logg.debug(f"att_weight_model.outputs: {att_weight_model.outputs}")

    # get the training words
    train_words = words_types[train_words_type]
    # logg.debug(f"train_words: {train_words}")
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words[-3:]
        # rec_words = train_words[:]
        logg.debug(f"Using rec_words: {rec_words}")
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # record new audios
    if do_new_record:

        # where to save the audios
        audio_folder = Path("recorded_audio")
        if not audio_folder.exists():
            audio_folder.mkdir(parents=True, exist_ok=True)

        # record the audios and save them in audio_folder
        audio_path_fmt = "{}_02.wav"
        audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

        # compute the spectrograms and build the dataset of correct shape
        img_specs = []
        spec_dict = get_spec_dict()
        spec_kwargs = spec_dict[dataset_name]
        p2d_kwargs = {"ref": np.max}
        for word in rec_words:
            # get the name
            audio_path = audio_folder / audio_path_fmt.format(word)

            # convert it to mel
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            img_spec = log_spec.reshape((*log_spec.shape, 1))
            # logg.debug(f"img_spec.shape: {img_spec.shape}")
            # img_spec.shape: (128, 32, 1)
            img_specs.append(img_spec)

        # the data needs to look like this data['testing'].shape: (735, 128, 32, 1)
        rec_data = np.stack(img_specs)
        # logg.debug(f"rec_data.shape: {rec_data.shape}")

    # load data if you do not want to record new audios
    else:
        # input data
        processed_folder = Path("data_proc")
        processed_path = processed_folder / f"{dataset_name}"

        # which word in the dataset to plot
        word_id = 2

        # the loaded spectrograms
        rec_data_l: ty.List[np.ndarray] = []
        for i, word in enumerate(rec_words):
            data, labels = load_processed(processed_path, [word])

            # get one of the spectrograms
            word_data = data["testing"][word_id]
            rec_data_l.append(word_data)

        # turn the list into np array
        rec_data = np.stack(rec_data_l)

    # get prediction and attention weights
    pred, att_weights, LSTM_out = att_weight_model.predict(rec_data)
    # logg.debug(f"att_weights.shape: {att_weights.shape}")
    # logg.debug(f"att_weights[0].shape: {att_weights[0].shape}")

    # if we recorded fresh audios we also have the waveform to plot
    ax_add = 1 if do_new_record else 0

    # plot the wave, spectrogram, weights and predictions in each column
    plot_size = 5
    fw = plot_size * num_rec_words
    nrows = 3 + ax_add
    # nrows = 4 + ax_add
    fh = plot_size * nrows * 0.7
    fig, axes = plt.subplots(
        nrows=nrows, ncols=num_rec_words, figsize=(fw, fh), sharey="row"
    )
    fig.suptitle(f"Attention weights and predictions for {rec_words}", fontsize=20)

    for i, word in enumerate(rec_words):
        word_spec = rec_data[i][:, :, 0]
        # logg.debug(f"word_spec.shape: {word_spec.shape}")

        # plot the waveform
        if do_new_record:
            plot_waveform(audios[i], axes[0][i])

        # plot the spectrogram
        title = f"Spectrogram for {word}"
        plot_spec(word_spec, axes[0 + ax_add][i], title=title)

        # plot the weights
        word_att_weights = att_weights[i]
        # plot_att_weights(word_att_weights, axes[1 + ax_add][i], title)
        word_att_weights_img = np.expand_dims(word_att_weights, axis=-1).T
        axes[1 + ax_add][i].imshow(word_att_weights_img, origin="lower", aspect="auto")
        title = f"Attention weights for {word}"
        axes[1 + ax_add][i].set_title(title)

        # plot the predictions
        word_pred = pred[i]
        # permute the prediction from sorted to the order you have
        word_pred = word_pred[perm_pred]
        pred_index = np.argmax(word_pred)
        title = f"Predictions for {word}"
        plot_pred(word_pred, train_words, axes[2 + ax_add][i], title, pred_index)

        # axes[3 + ax_add][i].imshow(LSTM_out[i], origin="lower")

    # fig.tight_layout()
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}"
    fig_name += f"_{train_words_type}"
    fig_name += f"_{rec_words_type}_img"
    if do_new_record:
        fig_name += "_new.{}"
    else:
        fig_name += "_data.{}"

    plot_folder = Path("plot_results")
    results_path = plot_folder / fig_name.format("png")
    fig.savefig(results_path)
    results_path = plot_folder / fig_name.format("pdf")
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
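# Minimal sketch of what compute_permutation is assumed to do in the evaluation
# functions above: the models emit class scores in sorted-label order, and
# pred[perm_pred] reorders them back to the order of train_words. This is an
# assumption inferred from how perm_pred is used, not the repo's actual helper.
def _compute_permutation_sketch(train_words: ty.List[str]) -> ty.List[int]:
    sorted_words = sorted(train_words)
    # index i of the result tells where train_words[i] sits in the sorted order
    return [sorted_words.index(w) for w in train_words]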
def visualize_spec():
    """Plot waveform, mel spectrogram and MFCCs for one sample, plus the mel filterbank."""
    logg = logging.getLogger(f"c.{__name__}.visualize_spec")
    logg.debug("Start visualize_spec")

    plot_folder = Path("plot_models")

    dataset_path = Path("data_raw")
    logg.debug(f"dataset_path: {dataset_path}")

    # word = "happy"
    # word = "wow"
    # word = "six"
    # word = "eight"  # 3
    word = "loudest_eight"
    word_folder = dataset_path / word

    # sample_path = word_folder / "0a2b400e_nohash_0.wav"
    sample_path = list(word_folder.iterdir())[0]
    # sample_path = "/home/pmn/free_spoken_digit_dataset/recordings/3_theo_10.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_fsdd_raw/five/5_yweweler_30.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_fsdd_raw/fsdd_five/5_yweweler_30.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_fsdd_raw/fsdd_five/5_yweweler_33.wav"
    # sample_path = "/home/pmn/uni/human_data/progetto2020/src/data_raw/_other/ljs_LJ001-0018_005.wav"
    logg.debug(f"sample_path: {sample_path}")

    # fig, ax = plt.subplots(3, 1, figsize=(12, 12))
    fig, ax = plt.subplots(4, 1, figsize=(10, 15))

    sample_sig, sr = librosa.load(sample_path, sr=None)
    logg.debug(f"sample_sig.shape: {sample_sig.shape}")

    plot_waveform(sample_sig, ax[0], sample_rate=sr, title=f"Waveform for {word}")
    plot_waveform(
        sample_sig ** 2, ax[1], sample_rate=sr, title=f"Waveform**2 for {word}"
    )

    sample_melspec = librosa.feature.melspectrogram(y=sample_sig, sr=sr)
    logg.debug(f"sample_melspec.shape: {sample_melspec.shape}")
    sample_log_melspec = librosa.power_to_db(sample_melspec, ref=np.max)
    logg.debug(f"sample_log_melspec.shape: {sample_log_melspec.shape}")
    plot_spec(sample_log_melspec, ax[2], title=f"Mel spectrogram for {word}")

    sample_mfcc = librosa.feature.mfcc(y=sample_sig, sr=sr)
    logg.debug(f"sample_mfcc.shape: {sample_mfcc.shape}")
    sample_log_mfcc = librosa.power_to_db(sample_mfcc, ref=np.max)
    logg.debug(f"sample_log_mfcc.shape: {sample_log_mfcc.shape}")
    plot_spec(sample_log_mfcc, ax[3], title=f"MFCCs for {word}")

    fig.tight_layout()
    fig.savefig(plot_folder / f"{word}_specs.pdf")

    sr = 16000
    n_fft = 2048
    hop_length = 512
    mel_10 = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=10)

    # fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    fig, ax = plt.subplots(1, 1)
    librosa.display.specshow(
        mel_10,
        sr=sr,
        hop_length=hop_length,
        x_axis="linear",
        y_axis="linear",
        cmap="viridis",
        ax=ax,
    )
    ax.set_ylabel("Mel filter")
    ax.set_xlabel("Hz")
    fig.tight_layout()
    fig.savefig(plot_folder / "mel10_bins.pdf")

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    for i, m in enumerate(mel_10):
        ax.plot(m, label=i)
    ax.legend()
    fig.tight_layout()
    fig.savefig(plot_folder / "mel10_filterbank.pdf")
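# Small companion check for the filterbank plotted above: librosa.filters.mel
# returns an array of shape (n_mels, 1 + n_fft // 2), i.e. one triangular
# filter per row over the FFT bins. This helper only restates that documented
# shape; it is illustrative and not part of the original script.
def _check_mel_filterbank_shape(
    sr: int = 16000, n_fft: int = 2048, n_mels: int = 10
) -> None:
    fb = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    assert fb.shape == (n_mels, 1 + n_fft // 2)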
def evaluate_audio_transfer(train_words_type: str, rec_words_type: str) -> None:
    """Record audios and evaluate a trained transfer-learning model on their 3-channel spectrograms."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_transfer")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_audio_transfer")

    # magic to fix the GPUs
    setup_gpus()

    datasets_type = "01"
    datasets_types = {
        "01": ["mel05", "mel09", "mel10"],
        "02": ["mel05", "mel10", "mfcc07"],
        "03": ["mfcc06", "mfcc07", "mfcc08"],
        "04": ["mel05", "mfcc06", "melc1"],
        "05": ["melc1", "melc2", "melc4"],
    }
    dataset_names = datasets_types[datasets_type]

    # we do not support composed datasets for now
    for dn in dataset_names:
        if dn.startswith("melc"):
            logg.error(f"not supported: {dataset_names}")
            return

    # words that the dataset was trained on
    train_words = words_types[train_words_type]

    # the model predicts sorted words
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    specs_3ch: ty.List[np.ndarray] = []
    # params for the mel conversion
    p2d_kwargs = {"ref": np.max}
    spec_dict = get_spec_dict()
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel for each type of dataset
        specs: ty.List[np.ndarray] = []
        for dataset_name in dataset_names:
            spec_kwargs = spec_dict[dataset_name]
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            specs.append(log_spec)
        img_spec = np.stack(specs, axis=2)
        # logg.debug(f"img_spec.shape: {img_spec.shape}")  # (128, 128, 3)

        specs_3ch.append(img_spec)

    data = np.stack(specs_3ch)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, str] = {}
    hypa["dense_width_type"] = "03"
    hypa["dropout_type"] = "01"
    hypa["batch_size_type"] = "02"
    hypa["epoch_num_type"] = "01"
    hypa["learning_rate_type"] = "01"
    hypa["optimizer_type"] = "a1"
    hypa["datasets_type"] = datasets_type
    hypa["words_type"] = train_words_type
    use_validation = False

    # hypa: Dict[str, str] = {}
    # hypa["dense_width_type"] = "02"
    # hypa["dropout_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # hypa["learning_rate_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["datasets_type"] = datasets_type
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # get the model name
    model_name = build_transfer_name(hypa, use_validation)

    # load the model
    # model_folder = Path("trained_models") / "transfer"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    model = tf.keras.models.load_model(model_path)

    # predict!
    pred = model.predict(data)

    # plot everything
    plot_size = 5
    fw = plot_size * 5
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=5, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        img_spec = specs_3ch[i]
        plot_spec(img_spec[:, :, 0], axes[i][1])
        plot_spec(img_spec[:, :, 1], axes[i][2])
        plot_spec(img_spec[:, :, 2], axes[i][3])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][4],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
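# Hypothetical usage sketch (not part of the original module): how these
# evaluation entry points could be dispatched from a simple argparse CLI.
# The flag names --train_words_type / --rec_words_type mirror the attributes
# read from `args` in evaluate_audio_cnn; the flags, defaults and --mode
# choices are assumptions, not the repo's actual command-line interface.
def _cli_sketch() -> None:
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--train_words_type", default="f1")
    parser.add_argument("--rec_words_type", default="train")
    parser.add_argument(
        "--mode", default="transfer", choices=["cnn", "transfer", "attention"]
    )
    args = parser.parse_args()

    if args.mode == "cnn":
        evaluate_audio_cnn(args)
    elif args.mode == "transfer":
        evaluate_audio_transfer(args.train_words_type, args.rec_words_type)
    else:
        evaluate_attention_weights(args.train_words_type, args.rec_words_type)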