def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction,
               audio_input_prediction, audio_output_prediction, sample_rate,
               min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Takes pretrained weights and a noisy voice recording, predicts the
    denoised sound and saves it to disk."""

    # Load json and create model
    json_file = open(weights_path + '/' + name_model + '.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # Load weights into new model
    loaded_model.load_weights(weights_path + '/' + name_model + '.h5')
    print("Loaded model from disk")

    # Extracting noise and voice from folder and convert to numpy
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)

    # Dimensions of squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1
    print(dim_square_spec)

    # Create amplitude and phase of the sounds
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Global scaling to have distribution -1/1
    X_in = scaled_in(m_amp_db_audio)
    # Reshape for prediction
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
    # Prediction using loaded network
    X_pred = loaded_model.predict(X_in)
    # Rescale back the noise model
    inv_sca_X_pred = inv_scaled_ou(X_pred)
    # Remove noise model from noisy speech
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Reconstruct audio from denoised spectrogram and phase
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of frames
    nb_samples = audio_denoise_recons.shape[0]
    # Save all frames in one file
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10

    # librosa no longer provides librosa.output; use sf.write instead
    # librosa.output.write_wav(dir_save_prediction + audio_output_prediction, denoise_long[0, :], sample_rate)
    import soundfile as sf
    sf.write(dir_save_prediction + audio_output_prediction,
             denoise_long[0, :], sample_rate, 'PCM_16')
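# Usage sketch for prediction() above. The helper below is hypothetical and the
# paths, file names and parameter values are illustrative assumptions, not taken
# from this repo; frame_length, n_fft and hop_length_fft mirror the
# squared-spectrogram setup used elsewhere in this file.
def example_prediction():
    prediction(weights_path='./weights',
               name_model='model_unet',
               audio_dir_prediction='./demo_data/test/',
               dir_save_prediction='./demo_data/save_predictions/',
               audio_input_prediction=['noisy_voice.wav'],
               audio_output_prediction='denoised_voice.wav',
               sample_rate=8000,
               min_duration=1.0,
               frame_length=8064,
               hop_length_frame=8064,
               n_fft=255,
               hop_length_fft=63)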
def prediction(weights_path, model, audio_input_prediction, sample_rate,
               min_duration, frame_length, hop_length_frame, n_fft,
               hop_length_fft):
    """Variant of prediction() that takes an already-built model and pretrained
    weights, denoises the given noisy voice sound and returns the denoised time
    series together with the sample rate instead of saving it to disk."""

    loaded_model = model
    # Load weights into the provided model
    loaded_model.load_weights(weights_path + '/' + 'model_best.h5')
    print("Loaded model from disk")

    audio_dir_prediction = ""
    # Extracting noise and voice from folder and convert to numpy
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)

    # Dimensions of squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1
    print(dim_square_spec)

    # Create amplitude and phase of the sounds
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Global scaling to have distribution -1/1
    X_in = scaled_in(m_amp_db_audio)
    # Reshape for prediction
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
    # Prediction using loaded network
    X_pred = loaded_model.predict(X_in)
    # Rescale back the noise model
    inv_sca_X_pred = inv_scaled_ou(X_pred)
    # Remove noise model from noisy speech
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Reconstruct audio from denoised spectrogram and phase
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of frames
    nb_samples = audio_denoise_recons.shape[0]
    # Concatenate all frames into one time series
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10
    return [denoise_long[0, :], sample_rate]
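# Usage sketch for the in-memory prediction() variant above: it returns the
# denoised time series instead of writing it, so the caller decides how to
# persist it. A hypothetical example, assuming `model` is an already-built
# Keras model matching the saved weights.
def example_prediction_in_memory(model):
    import soundfile as sf
    denoised, sr = prediction(weights_path='./weights', model=model,
                              audio_input_prediction=['noisy_voice.wav'],
                              sample_rate=8000, min_duration=1.0,
                              frame_length=8064, hop_length_frame=8064,
                              n_fft=255, hop_length_fft=63)
    sf.write('denoised_voice.wav', denoised, sr, 'PCM_16')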
def decode(weights_path=args.weights_folder, name_model=args.name_model,
           audio_dir_prediction=args.audio_dir_prediction,
           dir_save_prediction=args.dir_save_prediction,
           audio_input_prediction=args.audio_input_prediction,
           audio_output_prediction=args.audio_output_prediction,
           sample_rate=args.sample_rate, min_duration=args.min_duration,
           frame_length=args.frame_length,
           hop_length_frame=args.hop_length_frame, n_fft=args.n_fft,
           hop_length_fft=args.hop_length_fft):

    loaded_model = ConvAutoEncoder(weights_path=weights_path)
    loaded_model.load_weights()
    loaded_model.info()
    print("Loaded model from disk")

    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)

    # Dimensions of squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1

    # Create amplitude and phase of the sounds
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    data_compress = np.load('aaa.npy')
    print(data_compress.shape)
    decoded = loaded_model.decode(data_compress)

    # Rescale back the noise model
    inv_sca_X_pred = inv_scaled_ou(decoded)
    # Remove noise model from noisy speech
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Reconstruct audio from denoised spectrogram and phase
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of frames
    nb_samples = audio_denoise_recons.shape[0]
    # Save all frames in one file
    denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length) * 10
    # librosa no longer provides librosa.output; use sf.write instead
    import soundfile as sf
    sf.write(dir_save_prediction + audio_output_prediction,
             denoise_long[0, :], sample_rate)
    print('saved audio decoded file in:',
          dir_save_prediction + audio_output_prediction)
def encode(weights_path=args.weights_folder, name_model=args.name_model,
           audio_dir_prediction=args.audio_dir_prediction,
           dir_save_prediction=args.dir_save_prediction,
           audio_input_prediction=args.audio_input_prediction,
           audio_output_prediction=args.audio_output_prediction,
           sample_rate=args.sample_rate, min_duration=args.min_duration,
           frame_length=args.frame_length,
           hop_length_frame=args.hop_length_frame, n_fft=args.n_fft,
           hop_length_fft=args.hop_length_fft):

    loaded_model = ConvAutoEncoder(weights_path=weights_path)
    loaded_model.load_weights()
    loaded_model.info()
    print("Loaded model from:", weights_path)

    # Extracting noise and voice from folder and convert to numpy
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)

    # Dimensions of squared spectrogram
    dim_square_spec = int(n_fft / 2) + 1

    # Create amplitude and phase of the sounds
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Global scaling to have distribution -1/1
    X_in = scaled_in(m_amp_db_audio)
    # Reshape for prediction
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)

    encoded = loaded_model.encode(X_in)
    # print(encoded)
    print('encoded.shape:', encoded.shape)
    np.save('aaa', encoded)
    print('encoded file:', audio_dir_prediction + str(audio_input_prediction))
    print('save to: aaa.npy')
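# Round-trip sketch for encode()/decode() above (a hypothetical helper, not part
# of the original code): encode() writes the compressed latent to 'aaa.npy' and
# decode() reads that same file back, so the two calls pair up. Both pull their
# defaults from the module's args, so no arguments are needed once args is parsed.
def example_encode_decode():
    encode()   # spectrogram -> latent, saved to 'aaa.npy'
    decode()   # latent -> denoised audio, written to dir_save_prediction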
def create_data(noise_dir, voice_dir, path_save_time_serie, path_save_sound,
                path_save_spectrogram, sample_rate, min_duration, frame_length,
                hop_length_frame, hop_length_frame_noise, nb_samples, n_fft,
                hop_length_fft):
    """Randomly blends clean voices from voice_dir with noises from noise_dir
    and saves the spectrograms of noisy voice, noise and clean voice to disk,
    as well as the complex phases, time series and sounds. This prepares
    datasets for denoising training. It takes as inputs parameters defined in
    the args module."""

    list_noise_files = os.listdir(noise_dir)
    list_voice_files = os.listdir(voice_dir)

    def remove_ds_store(lst):
        """Remove mac-specific file if present"""
        if '.DS_Store' in lst:
            lst.remove('.DS_Store')
        return lst

    list_noise_files = remove_ds_store(list_noise_files)
    list_voice_files = remove_ds_store(list_voice_files)

    nb_voice_files = len(list_voice_files)
    nb_noise_files = len(list_noise_files)

    # Extracting noise and voice from folder and convert to numpy
    noise = audio_files_to_numpy(noise_dir, list_noise_files, sample_rate,
                                 frame_length, hop_length_frame_noise,
                                 min_duration)
    voice = audio_files_to_numpy(voice_dir, list_voice_files, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Blend some clean voices with randomly selected noises (at a random noise level)
    # N2N additions: prod_extra_noisy_voice, prod_extra_noise, prod_total_noise
    prod_voice, prod_noise, prod_noisy_voice, prod_extra_noisy_voice, \
        prod_extra_noise, prod_total_noise = blend_noise_randomly(
            voice, noise, nb_samples, frame_length)

    # Save the long generated audio to disk for QC
    # (librosa no longer provides librosa.output; use sf.write instead)
    import soundfile as sf
    noisy_voice_long = prod_noisy_voice.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'noisy_voice_long.wav', noisy_voice_long[0, :],
             sample_rate)
    voice_long = prod_voice.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'voice_long.wav', voice_long[0, :], sample_rate)
    noise_long = prod_noise.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'noise_long.wav', noise_long[0, :], sample_rate)

    # N2N additions
    extra_noisy_voice_long = prod_extra_noisy_voice.reshape(
        1, nb_samples * frame_length)
    sf.write(path_save_sound + 'extra_noisy_voice_long.wav',
             extra_noisy_voice_long[0, :], sample_rate)
    extra_noise_long = prod_extra_noise.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'extra_noise_long.wav', extra_noise_long[0, :],
             sample_rate)
    total_noise_long = prod_total_noise.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'total_noise_long.wav', total_noise_long[0, :],
             sample_rate)

    # Squared spectrogram dimensions
    dim_square_spec = int(n_fft / 2) + 1

    # Create amplitude and phase of the sounds
    m_amp_db_voice, m_pha_voice = numpy_audio_to_matrix_spectrogram(
        prod_voice, dim_square_spec, n_fft, hop_length_fft)
    m_amp_db_noise, m_pha_noise = numpy_audio_to_matrix_spectrogram(
        prod_noise, dim_square_spec, n_fft, hop_length_fft)
    m_amp_db_noisy_voice, m_pha_noisy_voice = numpy_audio_to_matrix_spectrogram(
        prod_noisy_voice, dim_square_spec, n_fft, hop_length_fft)

    # N2N additions
    m_amp_db_extra_noise, m_pha_extra_noise = numpy_audio_to_matrix_spectrogram(
        prod_extra_noise, dim_square_spec, n_fft, hop_length_fft)
    m_amp_db_total_noise, m_pha_total_noise = numpy_audio_to_matrix_spectrogram(
        prod_total_noise, dim_square_spec, n_fft, hop_length_fft)
    m_amp_db_extra_noisy_voice, m_pha_extra_noisy_voice = numpy_audio_to_matrix_spectrogram(
        prod_extra_noisy_voice, dim_square_spec, n_fft, hop_length_fft)

    # Save to disk for training / QC
    np.save(path_save_time_serie + 'voice_timeserie', prod_voice)
    np.save(path_save_time_serie + 'noise_timeserie', prod_noise)
    np.save(path_save_time_serie + 'noisy_voice_timeserie', prod_noisy_voice)

    np.save(path_save_spectrogram + 'voice_amp_db', m_amp_db_voice)
    np.save(path_save_spectrogram + 'noise_amp_db', m_amp_db_noise)
    np.save(path_save_spectrogram + 'noisy_voice_amp_db', m_amp_db_noisy_voice)

    np.save(path_save_spectrogram + 'voice_pha_db', m_pha_voice)
    np.save(path_save_spectrogram + 'noise_pha_db', m_pha_noise)
    np.save(path_save_spectrogram + 'noisy_voice_pha_db', m_pha_noisy_voice)

    # N2N additions
    np.save(path_save_time_serie + 'extra_noise_timeserie', prod_extra_noise)
    np.save(path_save_time_serie + 'total_noise_timeserie', prod_total_noise)
    np.save(path_save_time_serie + 'extra_noisy_voice_timeserie',
            prod_extra_noisy_voice)

    np.save(path_save_spectrogram + 'extra_noise_amp_db', m_amp_db_extra_noise)
    np.save(path_save_spectrogram + 'total_noise_amp_db', m_amp_db_total_noise)
    np.save(path_save_spectrogram + 'extra_noisy_voice_amp_db',
            m_amp_db_extra_noisy_voice)

    np.save(path_save_spectrogram + 'extra_noise_pha_db', m_pha_extra_noise)
    np.save(path_save_spectrogram + 'total_noise_pha_db', m_pha_total_noise)
    np.save(path_save_spectrogram + 'extra_noisy_voice_pha_db',
            m_pha_extra_noisy_voice)
def create_data(noise_dir, voice_dir, path_save_time_serie, path_save_sound,
                path_save_spectrogram, sample_rate, min_duration, frame_length,
                hop_length_frame, hop_length_frame_noise, nb_samples, n_fft,
                hop_length_fft):
    """Randomly blends clean voices from voice_dir with noises from noise_dir
    and saves the spectrograms of noisy voice, noise and clean voice to disk,
    as well as the complex phases, time series and sounds. This prepares
    datasets for denoising training. It takes as inputs parameters defined in
    the args module."""

    # Walk the directory trees instead of os.listdir(voice_dir)
    list_noise_files = []
    list_voice_files = []

    for root, dirs, files in os.walk(noise_dir):
        for file in files:
            if file.endswith(('.mp3', '.wav', '.flac')):
                # Keep only files whose second '-'-separated field is '3'
                # (a class filter specific to the noise dataset's naming scheme)
                splitted_text = file.split('-')
                if splitted_text[1] == '3':
                    list_noise_files.append(os.path.join(root, file))

    for root, dirs, files in os.walk(voice_dir):
        for file in files:
            if file.endswith(('.mp3', '.wav', '.flac')):
                list_voice_files.append(os.path.join(root, file))

    nb_voice_files = len(list_voice_files)
    nb_noise_files = len(list_noise_files)
    print(len(list_voice_files))
    print(len(list_noise_files))

    # Extracting noise and voice from folder and convert to numpy
    noise = audio_files_to_numpy(noise_dir, list_noise_files, sample_rate,
                                 frame_length, hop_length_frame_noise,
                                 min_duration)
    voice = audio_files_to_numpy(voice_dir, list_voice_files, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Blend some clean voices with randomly selected noises (at a random noise level)
    prod_voice, prod_noise, prod_noisy_voice = blend_noise_randomly(
        voice, noise, nb_samples, frame_length)

    # Save the long generated audio to disk for QC
    # (librosa no longer provides librosa.output; use sf.write instead)
    import soundfile as sf
    noisy_voice_long = prod_noisy_voice.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'noisy_voice_long.wav', noisy_voice_long[0, :],
             sample_rate)
    voice_long = prod_voice.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'voice_long.wav', voice_long[0, :], sample_rate)
    noise_long = prod_noise.reshape(1, nb_samples * frame_length)
    sf.write(path_save_sound + 'noise_long.wav', noise_long[0, :], sample_rate)

    # Squared spectrogram dimensions
    dim_square_spec = int(n_fft / 2) + 1

    # Create amplitude and phase of the sounds
    m_amp_db_voice, m_pha_voice = numpy_audio_to_matrix_spectrogram(
        prod_voice, dim_square_spec, n_fft, hop_length_fft)
    m_amp_db_noise, m_pha_noise = numpy_audio_to_matrix_spectrogram(
        prod_noise, dim_square_spec, n_fft, hop_length_fft)
    m_amp_db_noisy_voice, m_pha_noisy_voice = numpy_audio_to_matrix_spectrogram(
        prod_noisy_voice, dim_square_spec, n_fft, hop_length_fft)

    # Save to disk for training / QC
    np.save(path_save_time_serie + 'voice_timeserie', prod_voice)
    np.save(path_save_time_serie + 'noise_timeserie', prod_noise)
    np.save(path_save_time_serie + 'noisy_voice_timeserie', prod_noisy_voice)

    np.save(path_save_spectrogram + 'voice_amp_db', m_amp_db_voice)
    np.save(path_save_spectrogram + 'noise_amp_db', m_amp_db_noise)
    np.save(path_save_spectrogram + 'noisy_voice_amp_db', m_amp_db_noisy_voice)

    np.save(path_save_spectrogram + 'voice_pha_db', m_pha_voice)
    np.save(path_save_spectrogram + 'noise_pha_db', m_pha_noise)
    np.save(path_save_spectrogram + 'noisy_voice_pha_db', m_pha_noisy_voice)
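# Usage sketch for create_data() above (hypothetical helper; the directory
# names and nb_samples are illustrative assumptions, while the frame and STFT
# parameters repeat the squared-spectrogram choices used throughout this file).
def example_create_data():
    create_data(noise_dir='./data/noise/', voice_dir='./data/clean_voice/',
                path_save_time_serie='./data/time_serie/',
                path_save_sound='./data/sound/',
                path_save_spectrogram='./data/spectrogram/',
                sample_rate=8000, min_duration=1.0, frame_length=8064,
                hop_length_frame=8064, hop_length_frame_noise=5000,
                nb_samples=50, n_fft=255, hop_length_fft=63)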
def create_data(noise_dir, voice_dir, path_save_time_serie, path_save_sound,
                path_save_spectrogram, sample_rate, min_duration, frame_length,
                hop_length_frame, hop_length_frame_noise, nb_samples, n_fft,
                hop_length_fft, mode):
    """Randomly blends clean voices from voice_dir with noises from noise_dir
    and saves the spectrograms of noisy voice, noise and clean voice to disk,
    as well as the complex phases, time series and sounds. This prepares
    datasets for denoising training. It takes as inputs parameters defined in
    the args module."""

    alllist_noise_files = os.listdir(noise_dir)
    alllist_voice_files = os.listdir(voice_dir)

    def remove_ds_store(lst):
        """Remove mac-specific file if present"""
        if '.DS_Store' in lst:
            lst.remove('.DS_Store')
        return lst

    list_noise_files = remove_ds_store(alllist_noise_files)
    list_voice_files = remove_ds_store(alllist_voice_files)

    nb_voice_files = len(alllist_voice_files)
    nb_noise_files = len(alllist_noise_files)

    # Split the file indices into n chunks (with n = 1 everything stays in one chunk)
    n = 1
    list_list_voice_id = list()
    for k in range(0, n):
        list_voice_id = list()
        for id in range(nb_voice_files):
            if (k / n * nb_voice_files <= id) & (id <= (
                    (k + n) / n) * nb_voice_files):
                list_voice_id.append(id)
        list_list_voice_id.append(list_voice_id)

    list_list_noise_id = list()
    for k in range(0, n):
        list_noise_id = list()
        for id in range(nb_noise_files):
            if (k / n * nb_noise_files <= id) & (id <= (
                    (k + n) / n) * nb_noise_files):
                list_noise_id.append(id)
        list_list_noise_id.append(list_noise_id)

    ii = 0
    for list_noise_id in list_list_noise_id:
        list_noise_files = []
        for id in list_noise_id:
            list_noise_files.append(alllist_noise_files[id])

        for list_voice_id in list_list_voice_id:
            list_voice_files = []
            for id in list_voice_id:
                list_voice_files.append(alllist_voice_files[id])

            # Extracting noise and voice from folder and convert to numpy
            noise = audio_files_to_numpy(noise_dir, list_noise_files,
                                         sample_rate, frame_length,
                                         hop_length_frame_noise * 1,
                                         min_duration, mode)
            voice = audio_files_to_numpy(voice_dir, list_voice_files,
                                         sample_rate, frame_length,
                                         hop_length_frame * 10,
                                         min_duration, mode)

            # Blend some clean voices with randomly selected noises (at a random noise level)
            prod_voice, prod_noise, prod_noisy_voice = blend_noise_randomly(
                voice, noise, nb_samples, frame_length, 1)

            # Save the long generated audio to disk for QC
            noisy_voice_long = prod_noisy_voice.reshape(
                1, nb_samples * frame_length)
            sf.write(path_save_sound + '/noisy_voice_long' + str(ii) + '.wav',
                     noisy_voice_long[0, :], sample_rate)
            voice_long = prod_voice.reshape(1, nb_samples * frame_length)
            sf.write(path_save_sound + '/voice_long' + str(ii) + '.wav',
                     voice_long[0, :], sample_rate)
            noise_long = prod_noise.reshape(1, nb_samples * frame_length)
            sf.write(path_save_sound + '/noise_long' + str(ii) + '.wav',
                     noise_long[0, :], sample_rate)

            # Squared spectrogram dimensions
            dim_square_spec = int(n_fft / 2) + 1

            # Create amplitude and phase of the sounds
            m_amp_db_voice, m_pha_voice = numpy_audio_to_matrix_spectrogram(
                prod_voice, dim_square_spec, n_fft, hop_length_fft)
            m_amp_db_noise, m_pha_noise = numpy_audio_to_matrix_spectrogram(
                prod_noise, dim_square_spec, n_fft, hop_length_fft)
            m_amp_db_noisy_voice, m_pha_noisy_voice = numpy_audio_to_matrix_spectrogram(
                prod_noisy_voice, dim_square_spec, n_fft, hop_length_fft)

            # Save to disk for training / QC
            # np.save(path_save_time_serie + '/voice_timeserie' + str(ii), prod_voice)
            # np.save(path_save_time_serie + '/noise_timeserie' + str(ii), prod_noise)
            # np.save(path_save_time_serie + '/noisy_voice_timeserie' + str(ii), prod_noisy_voice)
            np.save(path_save_spectrogram + '/voice_amp_db' + str(ii),
                    m_amp_db_voice)
            np.save(path_save_spectrogram + '/noise_amp_db' + str(ii),
                    m_amp_db_noise)
            # np.save(path_save_spectrogram + '/noisy_voice_amp_db' + str(ii), m_amp_db_noisy_voice)
            np.save(path_save_spectrogram + '/voice_pha_db' + str(ii),
                    m_pha_voice)
            np.save(path_save_spectrogram + '/noise_pha_db' + str(ii),
                    m_pha_noise)
            # np.save(path_save_spectrogram + '/noisy_voice_pha_db' + str(ii), m_pha_noisy_voice)
            ii = ii + 1
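# Sanity check of the chunking logic in create_data() above (a hypothetical
# helper, not part of the original code): with n = 1 the inclusive bounds
# 0 <= id <= nb_files admit every index, so all files land in a single chunk
# and the outer loops execute exactly once (ii goes 0 -> 1).
def example_chunk_ids(nb_files, n=1):
    return [[id for id in range(nb_files)
             if (k / n * nb_files <= id) & (id <= ((k + n) / n) * nb_files)]
            for k in range(n)]

assert example_chunk_ids(5) == [[0, 1, 2, 3, 4]]  # one chunk containing all ids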
# Minimum duration of audio files to consider
min_duration = 1.0

# Our training data will be frames of slightly above 1 second
frame_length = 8064

# Hop length for clean voice files separation (no overlap)
hop_length_frame = 8064

# Hop length for noise files (we split noise into several windows)
hop_length_frame_noise = 5000

# Extracting noise and voice from folder and convert to numpy
audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                             sample_rate, frame_length, hop_length_frame,
                             min_duration)

# Choosing n_fft and hop_length_fft to have squared spectrograms
n_fft = 255
hop_length_fft = 63

dim_square_spec = int(n_fft / 2) + 1

# Create amplitude and phase of the sounds
m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
    audio, dim_square_spec, n_fft, hop_length_fft)

# Global scaling to have distribution -1/1
X_in = scaled_in(m_amp_db_audio)
# Reshape for prediction
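# Quick check of the "squared spectrogram" claim above, as a minimal sketch:
# with frame_length = 8064, n_fft = 255 and hop_length_fft = 63, librosa's
# centered STFT yields 1 + n_fft // 2 = 128 frequency bins and 128 time
# frames, i.e. a square 128x128 matrix per frame.
import numpy as np
import librosa

frame = np.zeros(8064, dtype=np.float32)            # one training frame
spec = librosa.stft(frame, n_fft=255, hop_length=63)
print(spec.shape)                                    # (128, 128)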