def get_spec_aug_info(self) -> None:
    """Load the spectrogram or augmentation parameters for self.train_dataset."""
    logg = logging.getLogger(f"c.{__name__}.get_spec_aug_info")
    # logg.setLevel("INFO")
    logg.debug("Start get_spec_aug_info")

    self.p2d_kwargs = {"ref": np.max}

    if self.train_dataset.startswith("me"):
        spec_dict = get_spec_dict()
        self.mel_kwargs: ty.Dict[str, ty.Any] = spec_dict[self.train_dataset]
        spec_shape_dict = get_spec_shape_dict()
        self.spec_shape: ty.Tuple[int, int] = spec_shape_dict[self.train_dataset]

    elif self.train_dataset.startswith("au"):
        aug_dict = get_aug_dict()
        self.mel_kwargs = aug_dict[self.train_dataset]["mel_kwargs"]
        self.spec_shape = aug_dict[self.train_dataset]["aug_shape"]
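
# NOTE (editor sketch, not part of the original module): wav2mel is imported from
# elsewhere in this repo, so its exact body is not shown here. The parameters loaded
# by get_spec_aug_info are consistent with a log-mel conversion along these lines,
# assuming mel_kwargs holds keyword arguments for librosa.feature.melspectrogram and
# p2d_kwargs ({"ref": np.max}) feeds librosa.power_to_db. Treat it as illustrative.
def _wav2mel_sketch(wav_path, mel_kwargs, p2d_kwargs):
    # load the audio at its native sample rate
    sig, sr = librosa.load(wav_path, sr=None)
    # mel power spectrogram, parametrized by the dataset-specific kwargs
    mel = librosa.feature.melspectrogram(y=sig, sr=sr, **mel_kwargs)
    # convert power to dB, referenced to the spectrogram maximum
    log_mel = librosa.power_to_db(mel, **p2d_kwargs)
    return log_mel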
def evaluate_stream(
    model: models.Model,
    datasets_type: str,
    train_words_type: str,
    architecture_type: str,
    model_name: str,
    orig_wav_path: Path,
    norm_tra: str,
    wav_ID: str,
) -> np.ndarray:
    """Predict the trained words along a full sentence, one chunk at a time.

    Interesting sentences found while streaming, filtered with:
    if not 4 < len(norm_tra.split()) < 15

    CNN_nf32_ks02_ps01_dw32_dr01_lr04_opa1_dsmeL04_bs32_en15_wLTnumLS

    sentence_index 10
        sentence_wav_paths[6241_61943_000011_000003]: 6241/61943/6241_61943_000011_000003.wav
        sentence_norm_tra[6241_61943_000011_000003]: As usual, the crew was small, five Danes doing the whole of the work.
    sentence_index 16
        sentence_wav_paths[2412_153947_000023_000000]: 2412/153947/2412_153947_000023_000000.wav
        sentence_norm_tra[2412_153947_000023_000000]: june ninth eighteen seventy two
    sentence_index 19
        sentence_wav_paths[174_168635_000014_000000]: 174/168635/174_168635_000014_000000.wav
        sentence_norm_tra[174_168635_000014_000000]: CHAPTER three-TWO MISFORTUNES MAKE ONE PIECE OF GOOD FORTUNE
    sentence_index 22
        sentence_wav_paths[3000_15664_000017_000003]: 3000/15664/3000_15664_000017_000003.wav
        sentence_norm_tra[3000_15664_000017_000003]: The full grown bucks weigh nearly three hundred and fifty pounds.
    sentence_index 26
        sentence_wav_paths[2277_149897_000035_000002]: 2277/149897/2277_149897_000035_000002.wav
        sentence_norm_tra[2277_149897_000035_000002]: Three o'clock came, four, five, six, and no letter.
    sentence_index 33
        sentence_wav_paths[8297_275154_000019_000000]: 8297/275154/8297_275154_000019_000000.wav
        sentence_norm_tra[8297_275154_000019_000000]: "I will do neither the one nor the other.
    sentence_index 36
        sentence_wav_paths[8297_275156_000017_000007]: 8297/275156/8297_275156_000017_000007.wav
        sentence_norm_tra[8297_275156_000017_000007]: In any event, her second marriage would lead to one disastrous result.
    sentence_index 42
        sentence_wav_paths[7976_110124_000012_000000]: 7976/110124/7976_110124_000012_000000.wav
        sentence_norm_tra[7976_110124_000012_000000]: "Be sure that you admit no one," commanded the merchant.
    sentence_index 46
        sentence_wav_paths[7976_105575_000008_000004]: 7976/105575/7976_105575_000008_000004.wav
        sentence_norm_tra[7976_105575_000008_000004]: Five of my eight messmates of the day before were shot.
    sentence_index 66
        sentence_wav_paths[251_118436_000005_000000]: 251/118436/251_118436_000005_000000.wav
        sentence_norm_tra[251_118436_000005_000000]: one Death Strikes a King
        (interesting because the two after the silence is identified, the four said quickly is missed)
    sentence_index 67
        sentence_wav_paths[251_137823_000033_000000]: 251/137823/251_137823_000033_000000.wav
        sentence_norm_tra[251_137823_000033_000000]: Of the four other company engineers, two were now stirring and partly conscious.
    sentence_index 100
        sentence_wav_paths[1993_147966_000015_000000]: 1993/147966/1993_147966_000015_000000.wav
        sentence_norm_tra[1993_147966_000015_000000]: We had three weeks of this mild, open weather.

    ATT_ct02_dr01_ks01_lu01_qt04_dw01_opa1_lr04_bs02_en01_dsmeLa3_wLTnumLS

    sentence_index 10
        sentence_wav_paths[6241_61943_000011_000003]: /home/pmn/audiodatasets/LibriTTS/dev-clean/6241/61943/6241_61943_000011_000003.wav
        sentence_norm_tra[6241_61943_000011_000003]: As usual, the crew was small, five Danes doing the whole of the work.
    sentence_index 40
        sentence_wav_paths[7976_110124_000015_000000]: /home/pmn/audiodatasets/LibriTTS/dev-clean/7976/110124/7976_110124_000015_000000.wav
        sentence_norm_tra[7976_110124_000015_000000]: "Have pity upon a poor unfortunate one!" he called out.
    """
    logg = logging.getLogger(f"c.{__name__}.evaluate_stream")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_stream")

    # a random number generator to use
    rng = np.random.default_rng(12345)

    # the sample rate to use
    new_sr = 16000

    # load the sentence and resample it
    sentence_sig, sentence_sr = librosa.load(orig_wav_path, sr=None)
    sentence_sig = librosa.resample(sentence_sig, orig_sr=sentence_sr, target_sr=new_sr)

    # split the sentence in chunks every sentence_hop_length
    sentence_hop_length = new_sr // 16

    # the words the model was trained on
    words = sorted(words_types[train_words_type])
    logg.debug(f"words: {words}")

    # the length of the split is chosen to match the training type
    if train_words_type.endswith("LS"):
        split_length = new_sr // 2
    else:
        split_length = new_sr

    splits = split_sentence(
        sentence_sig, datasets_type, sentence_hop_length, split_length
    )
    logg.debug(f"len(splits): {len(splits)}")

    # if there are no splits available we pretend to have predicted only background
    # or only random if there are only numbers
    if len(splits) == 0:
        num_words = len(words)
        if num_words == 10:
            fake_pred = np.ones((1, num_words), dtype=np.float32)
            fake_pred /= num_words
        else:
            fake_pred = np.zeros((1, num_words), dtype=np.float32)
            fake_pred[0][0] = 1
        return fake_pred

    # compute spectrograms / augment / compose
    if datasets_type.startswith("au"):
        specs = augment_signals(splits, datasets_type, rng, which_fold="testing")
        logg.debug(f"specs.shape: {specs.shape}")
        specs_img = np.expand_dims(specs, axis=-1)
        logg.debug(f"specs_img.shape: {specs_img.shape}")

    elif datasets_type.startswith("me"):
        spec_dict = get_spec_dict()
        mel_kwargs = spec_dict[datasets_type]
        logg.debug(f"mel_kwargs: {mel_kwargs}")
        spec_shape_dict = get_spec_shape_dict()
        spec_shape = spec_shape_dict[datasets_type]
        requested_length = spec_shape[1]
        logg.debug(f"requested_length: {requested_length}")
        p2d_kwargs = {"ref": np.max}
        specs_img = compute_spectrograms(
            splits, mel_kwargs, p2d_kwargs, requested_length
        )
        logg.debug(f"specs_img.shape: {specs_img.shape}")

    y_pred = model.predict(specs_img)
    # logg.debug(f"y_pred: {y_pred}")
    y_index = np.argmax(y_pred, axis=1)
    # logg.debug(f"y_index: {y_index}")
    y_pred_labels = [words[i] for i in y_index]
    # logg.debug(f"y_pred_labels: {y_pred_labels}")

    save_plots = False
    # save_plots = True
    if save_plots:
        clean_labels = []
        for yl in y_pred_labels:
            if yl.startswith("_other"):
                clean_labels.append(".")
            else:
                clean_labels.append(yl)
        logg.debug(f"Predictions {clean_labels}")

        # fig_name = f"{architecture_type}_{evaluation_type}_{datasets_type}_{train_words_type}_{norm_tra}.{{}}"
        # fig_name = f"{architecture_type}"
        fig_name = f"{model_name}"
        # fig_name += f"_{evaluation_type}"
        fig_name += f"_{datasets_type}"
        fig_name += f"_{train_words_type}"
        fig_name += f"_{wav_ID}"
        # fig_name += f"_{norm_tra}.{{}}"
        fig_name += ".{}"

        plot_sentence_pred(
            sentence_sig,
            y_pred,
            norm_tra,
            words,
            sentence_hop_length,
            split_length,
            fig_name,
        )
        # plt.show()

    return y_pred
def evaluate_audio_cnn(args) -> None:
    """Record a few words and plot the CNN predictions for each of them."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_cnn")
    logg.debug("Start evaluate_audio_cnn")

    # magic to fix the GPUs
    setup_gpus()

    # need to know on which dataset the model was trained to compute specs
    dataset_name = "mel01"

    # words that the dataset was trained on
    train_words_type = args.train_words_type
    train_words = words_types[train_words_type]

    # permutation from sorted to your wor(l)d order
    perm_pred = compute_permutation(train_words)

    rec_words_type = args.rec_words_type
    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    img_specs = []
    spec_dict = get_spec_dict()
    spec_kwargs = spec_dict[dataset_name]
    p2d_kwargs = {"ref": np.max}
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel
        log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
        img_spec = log_spec.reshape((*log_spec.shape, 1))
        logg.debug(f"img_spec.shape: {img_spec.shape}")  # img_spec.shape: (128, 32, 1)

        img_specs.append(img_spec)

    # the data needs to look like this data['testing'].shape: (735, 128, 32, 1)
    # data = log_spec.reshape((1, *log_spec.shape, 1))
    data = np.stack(img_specs)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, ty.Union[str, int]] = {}
    hypa["base_dense_width"] = 32
    hypa["base_filters"] = 20
    hypa["batch_size"] = 32
    hypa["dropout_type"] = "01"
    hypa["epoch_num"] = 16
    hypa["kernel_size_type"] = "02"
    hypa["pool_size_type"] = "02"
    hypa["learning_rate_type"] = "02"
    hypa["optimizer_type"] = "a1"
    hypa["dataset"] = dataset_name
    hypa["words"] = train_words_type

    # get the words
    train_words = words_types[train_words_type]

    model_name = build_cnn_name(hypa)
    logg.debug(f"model_name: {model_name}")

    # model_folder = Path("trained_models") / "cnn"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    if not model_path.exists():
        logg.error(f"Model not found at: {model_path}")
        raise FileNotFoundError(f"Model not found at: {model_path}")

    model = tf.keras.models.load_model(model_path)
    model.summary()

    pred = model.predict(data)
    # logg.debug(f"pred: {pred}")

    # plot the thing
    plot_size = 5
    fw = plot_size * 3
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=3, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        spec = img_specs[i][:, :, 0]
        plot_spec(spec, axes[i][1])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][2],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
def evaluate_attention_weights(
    train_words_type: str, rec_words_type: str, do_new_record: bool = False
) -> None:
    """Plot attention weights and predictions, for recorded audios or dataset samples."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_attention_weights")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_attention_weights")

    # magic to fix the GPUs
    setup_gpus()

    # ATT_ct02_dr02_ks02_lu01_as01_qt01_dw01_opa1_lr01_bs01_en01_dsmel04_wk1
    # hypa: ty.Dict[str, str] = {}
    # hypa["conv_size_type"] = "02"
    # hypa["dropout_type"] = "02"
    # hypa["kernel_size_type"] = "02"
    # hypa["lstm_units_type"] = "01"
    # hypa["query_style_type"] = "01"
    # hypa["dense_width_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["learning_rate_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # dataset_name = "mel04"
    # hypa["dataset_name"] = dataset_name
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # ATT_ct02_dr01_ks01_lu01_qt05_dw01_opa1_lr03_bs02_en02_dsaug07_wLTnum
    hypa = {
        "batch_size_type": "02",
        "conv_size_type": "02",
        "dataset_name": "aug07",
        "dense_width_type": "01",
        "dropout_type": "01",
        "epoch_num_type": "02",
        "kernel_size_type": "01",
        "learning_rate_type": "03",
        "lstm_units_type": "01",
        "optimizer_type": "a1",
        "query_style_type": "05",
        "words_type": "LTnum",
    }
    use_validation = True

    dataset_name = hypa["dataset_name"]

    model_name = build_attention_name(hypa, use_validation)
    logg.debug(f"model_name: {model_name}")

    # load the model
    model_folder = Path("trained_models") / "attention"
    model_path = model_folder / f"{model_name}.h5"

    # model = tf.keras.models.load_model(model_path)
    # https://github.com/keras-team/keras/issues/5088#issuecomment-401498334
    model = tf.keras.models.load_model(
        model_path, custom_objects={"backend": tf.keras.backend}
    )
    model.summary()
    logg.debug(f"ascii_model(model): {ascii_model(model)}")

    att_weight_model = tf.keras.models.Model(
        inputs=model.input,
        outputs=[
            model.get_layer("output").output,
            model.get_layer("att_softmax").output,
            model.get_layer("bidirectional_1").output,
        ],
    )
    att_weight_model.summary()
    # logg.debug(f"att_weight_model.outputs: {att_weight_model.outputs}")

    # get the training words
    train_words = words_types[train_words_type]
    # logg.debug(f"train_words: {train_words}")
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words[-3:]
        # rec_words = train_words[:]
        logg.debug(f"Using rec_words: {rec_words}")
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # record new audios
    if do_new_record:

        # where to save the audios
        audio_folder = Path("recorded_audio")
        if not audio_folder.exists():
            audio_folder.mkdir(parents=True, exist_ok=True)

        # record the audios and save them in audio_folder
        audio_path_fmt = "{}_02.wav"
        audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

        # compute the spectrograms and build the dataset of correct shape
        img_specs = []
        spec_dict = get_spec_dict()
        spec_kwargs = spec_dict[dataset_name]
        p2d_kwargs = {"ref": np.max}
        for word in rec_words:
            # get the name
            audio_path = audio_folder / audio_path_fmt.format(word)

            # convert it to mel
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            img_spec = log_spec.reshape((*log_spec.shape, 1))
            # logg.debug(f"img_spec.shape: {img_spec.shape}")
            # img_spec.shape: (128, 32, 1)

            img_specs.append(img_spec)

        # the data needs to look like this data['testing'].shape: (735, 128, 32, 1)
        rec_data = np.stack(img_specs)
        # logg.debug(f"rec_data.shape: {rec_data.shape}")

    # load data if you do not want to record new audios
    else:
        # input data
        processed_folder = Path("data_proc")
        processed_path = processed_folder / f"{dataset_name}"

        # which word in the dataset to plot
        word_id = 2

        # the loaded spectrograms
        rec_data_l: ty.List[np.ndarray] = []

        for i, word in enumerate(rec_words):
            data, labels = load_processed(processed_path, [word])

            # get one of the spectrograms
            word_data = data["testing"][word_id]
            rec_data_l.append(word_data)

        # turn the list into np array
        rec_data = np.stack(rec_data_l)

    # get prediction and attention weights
    pred, att_weights, LSTM_out = att_weight_model.predict(rec_data)
    # logg.debug(f"att_weights.shape: {att_weights.shape}")
    # logg.debug(f"att_weights[0].shape: {att_weights[0].shape}")

    # if we recorded fresh audios we also have the waveform to plot
    ax_add = 1 if do_new_record else 0

    # plot the wave, spectrogram, weights and predictions in each column
    plot_size = 5
    fw = plot_size * num_rec_words
    nrows = 3 + ax_add
    # nrows = 4 + ax_add
    fh = plot_size * nrows * 0.7
    fig, axes = plt.subplots(
        nrows=nrows, ncols=num_rec_words, figsize=(fw, fh), sharey="row"
    )
    fig.suptitle(f"Attention weights and predictions for {rec_words}", fontsize=20)

    for i, word in enumerate(rec_words):
        word_spec = rec_data[i][:, :, 0]
        # logg.debug(f"word_spec.shape: {word_spec.shape}")

        # plot the waveform
        if do_new_record:
            plot_waveform(audios[i], axes[0][i])

        # plot the spectrogram
        title = f"Spectrogram for {word}"
        plot_spec(word_spec, axes[0 + ax_add][i], title=title)

        # plot the weights
        word_att_weights = att_weights[i]
        # plot_att_weights(word_att_weights, axes[1 + ax_add][i], title)
        word_att_weights_img = np.expand_dims(word_att_weights, axis=-1).T
        axes[1 + ax_add][i].imshow(
            word_att_weights_img, origin="lower", aspect="auto"
        )
        title = f"Attention weights for {word}"
        axes[1 + ax_add][i].set_title(title)

        # plot the predictions
        word_pred = pred[i]
        # permute the prediction from sorted to the order you have
        word_pred = word_pred[perm_pred]
        pred_index = np.argmax(word_pred)
        title = f"Predictions for {word}"
        plot_pred(word_pred, train_words, axes[2 + ax_add][i], title, pred_index)

        # axes[3 + ax_add][i].imshow(LSTM_out[i], origin="lower")

    # fig.tight_layout()
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}"
    fig_name += f"_{train_words_type}"
    fig_name += f"_{rec_words_type}_img"
    if do_new_record:
        fig_name += "_new.{}"
    else:
        fig_name += "_data.{}"

    plot_folder = Path("plot_results")
    results_path = plot_folder / fig_name.format("png")
    fig.savefig(results_path)
    results_path = plot_folder / fig_name.format("pdf")
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
def evaluate_audio_transfer(train_words_type: str, rec_words_type: str) -> None:
    """Record a few words and plot the transfer model predictions for each of them."""
    logg = logging.getLogger(f"c.{__name__}.evaluate_audio_transfer")
    # logg.setLevel("INFO")
    logg.debug("Start evaluate_audio_transfer")

    # magic to fix the GPUs
    setup_gpus()

    datasets_type = "01"
    datasets_types = {
        "01": ["mel05", "mel09", "mel10"],
        "02": ["mel05", "mel10", "mfcc07"],
        "03": ["mfcc06", "mfcc07", "mfcc08"],
        "04": ["mel05", "mfcc06", "melc1"],
        "05": ["melc1", "melc2", "melc4"],
    }
    dataset_names = datasets_types[datasets_type]

    # we do not support composed datasets for now
    for dn in dataset_names:
        if dn.startswith("melc"):
            logg.error(f"not supported: {dataset_names}")
            return

    # words that the dataset was trained on
    train_words = words_types[train_words_type]

    # the model predicts sorted words
    perm_pred = compute_permutation(train_words)

    if rec_words_type == "train":
        rec_words = train_words
    else:
        rec_words = words_types[rec_words_type]
    num_rec_words = len(rec_words)

    # where to save the audios
    audio_folder = Path("recorded_audio")
    if not audio_folder.exists():
        audio_folder.mkdir(parents=True, exist_ok=True)

    # record the audios and save them in audio_folder
    audio_path_fmt = "{}_02.wav"
    audios = record_audios(rec_words, audio_folder, audio_path_fmt, timeout=0)

    # compute the spectrograms and build the dataset of correct shape
    specs_3ch: ty.List[np.ndarray] = []
    # params for the mel conversion
    p2d_kwargs = {"ref": np.max}
    spec_dict = get_spec_dict()
    for word in rec_words:
        # get the name
        audio_path = audio_folder / audio_path_fmt.format(word)

        # convert it to mel for each type of dataset
        specs: ty.List[np.ndarray] = []
        for dataset_name in dataset_names:
            spec_kwargs = spec_dict[dataset_name]
            log_spec = wav2mel(audio_path, spec_kwargs, p2d_kwargs)
            specs.append(log_spec)
        img_spec = np.stack(specs, axis=2)
        # logg.debug(f"img_spec.shape: {img_spec.shape}")  # (128, 128, 3)

        specs_3ch.append(img_spec)

    data = np.stack(specs_3ch)
    logg.debug(f"data.shape: {data.shape}")

    hypa: ty.Dict[str, str] = {}
    hypa["dense_width_type"] = "03"
    hypa["dropout_type"] = "01"
    hypa["batch_size_type"] = "02"
    hypa["epoch_num_type"] = "01"
    hypa["learning_rate_type"] = "01"
    hypa["optimizer_type"] = "a1"
    hypa["datasets_type"] = datasets_type
    hypa["words_type"] = train_words_type
    use_validation = False

    # hypa: ty.Dict[str, str] = {}
    # hypa["dense_width_type"] = "02"
    # hypa["dropout_type"] = "01"
    # hypa["batch_size_type"] = "01"
    # hypa["epoch_num_type"] = "01"
    # hypa["learning_rate_type"] = "01"
    # hypa["optimizer_type"] = "a1"
    # hypa["datasets_type"] = datasets_type
    # hypa["words_type"] = train_words_type
    # use_validation = True

    # get the model name
    model_name = build_transfer_name(hypa, use_validation)

    # load the model
    # model_folder = Path("trained_models") / "transfer"
    model_folder = Path("saved_models")
    model_path = model_folder / f"{model_name}.h5"
    model = tf.keras.models.load_model(model_path)

    # predict!
    pred = model.predict(data)

    # plot everything
    plot_size = 5
    fw = plot_size * 5
    fh = plot_size * num_rec_words
    fig, axes = plt.subplots(nrows=num_rec_words, ncols=5, figsize=(fw, fh))
    fig.suptitle("Recorded audios", fontsize=18)

    for i, word in enumerate(rec_words):
        plot_waveform(audios[i], axes[i][0])
        img_spec = specs_3ch[i]
        plot_spec(img_spec[:, :, 0], axes[i][1])
        plot_spec(img_spec[:, :, 1], axes[i][2])
        plot_spec(img_spec[:, :, 2], axes[i][3])
        plot_pred(
            pred[i][perm_pred],
            train_words,
            axes[i][4],
            f"Prediction for {rec_words[i]}",
            train_words.index(word),
        )

    # https://stackoverflow.com/q/8248467
    # https://stackoverflow.com/q/2418125
    fig.tight_layout(h_pad=3, rect=[0, 0.03, 1, 0.97])

    fig_name = f"{model_name}_{train_words_type}_{rec_words_type}.png"
    results_path = audio_folder / fig_name
    fig.savefig(results_path)

    if num_rec_words <= 6:
        plt.show()
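
# NOTE (editor sketch, not part of the original module): the evaluation functions above
# are normally dispatched from a CLI entry point elsewhere in this repo. The snippet
# below only illustrates one possible wiring; every flag name and default value here
# is hypothetical, not the repo's actual argument parser.
def _evaluate_cli_sketch():
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate the trained models")
    parser.add_argument(
        "-et",
        "--evaluation_type",
        default="audio_cnn",
        choices=["audio_cnn", "attention_weights", "audio_transfer"],
    )
    parser.add_argument("-tw", "--train_words_type", default="LTnum")
    parser.add_argument("-rw", "--rec_words_type", default="train")
    args = parser.parse_args()

    # route to the requested evaluation
    if args.evaluation_type == "audio_cnn":
        evaluate_audio_cnn(args)
    elif args.evaluation_type == "attention_weights":
        evaluate_attention_weights(args.train_words_type, args.rec_words_type)
    elif args.evaluation_type == "audio_transfer":
        evaluate_audio_transfer(args.train_words_type, args.rec_words_type)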