def prediction(weights_path, name_model, audio_dir_prediction,
               dir_save_prediction, audio_input_prediction,
               audio_output_prediction, sample_rate, min_duration,
               frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Denoise audio with a pretrained model and save the result as a WAV file.

    The model architecture is read from ``<weights_path>/<name_model>.json``
    and its weights from ``<weights_path>/<name_model>.h5``. The input audio is
    converted to amplitude/phase spectrograms, the network output is rescaled
    and subtracted from the input amplitude, and the signal rebuilt from the
    result is written to ``dir_save_prediction + audio_output_prediction``.

    Args:
        weights_path: Directory holding the ``.json`` and ``.h5`` model files.
        name_model: Base name (without extension) of the model files.
        audio_dir_prediction: Forwarded to ``audio_files_to_numpy`` as the
            audio directory.
        dir_save_prediction: Prefix of the output path. It is concatenated
            with ``audio_output_prediction`` without adding a separator.
        audio_input_prediction: Forwarded to ``audio_files_to_numpy`` as the
            file selection (presumably a list of file names -- confirm).
        audio_output_prediction: File name appended to ``dir_save_prediction``.
        sample_rate: Passed to the loader and used as the output sample rate.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Forwarded to the loader and the reconstruction helper;
            also used to flatten the reconstructed frames.
        hop_length_frame: Forwarded to ``audio_files_to_numpy``.
        n_fft: Forwarded to the spectrogram helper; also determines the
            spectrogram dimension ``int(n_fft / 2) + 1``.
        hop_length_fft: Forwarded to the spectrogram and reconstruction
            helpers.

    Returns:
        None. The denoised audio is written to disk.
    """

    # Load the serialized architecture. The context manager guarantees the
    # file handle is released even if reading raises.
    with open(weights_path + '/' + name_model + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # Load the trained weights into the freshly built model.
    loaded_model.load_weights(weights_path + '/' + name_model + '.h5')
    print("Loaded model from disk")

    # Load the audio to denoise and convert it to numpy.
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)

    # Dimension of the squared spectrogram.
    dim_square_spec = int(n_fft / 2) + 1
    print(dim_square_spec)

    # Amplitude (dB) and phase of the input audio.
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Global scaling of the network input.
    X_in = scaled_in(m_amp_db_audio)
    # Add a trailing channel axis for the network.
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
    # The network output is treated as a noise model.
    X_pred = loaded_model.predict(X_in)
    # Undo the output scaling.
    inv_sca_X_pred = inv_scaled_ou(X_pred)
    # Subtract the predicted noise model from the noisy amplitude.
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]
    # Diagnostics printed before reconstruction.
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)
    # Rebuild the signal from the denoised amplitude and the original phase.
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of reconstructed frames.
    nb_samples = audio_denoise_recons.shape[0]
    # Concatenate all frames into one signal; the factor 10 is a fixed gain.
    denoise_long = audio_denoise_recons.reshape(1,
                                                nb_samples * frame_length) * 10
    # librosa.output.write_wav no longer exists in recent librosa releases,
    # so the file is written with soundfile instead.
    import soundfile as sf
    sf.write(dir_save_prediction + audio_output_prediction, denoise_long[0, :],
             sample_rate, 'PCM_16')
# Example #2
# 0
def prediction(weights_path, model, audio_input_prediction, sample_rate,
               min_duration, frame_length, hop_length_frame, n_fft,
               hop_length_fft):
    """Denoise audio with ``model`` and return the reconstructed signal.

    Weights are loaded into ``model`` from ``<weights_path>/model_best.h5``.
    The audio is loaded with an empty directory prefix, turned into
    amplitude/phase spectrograms, and the rescaled network output is
    subtracted from the input amplitude before the signal is rebuilt.

    Returns:
        A two-element list ``[signal, sample_rate]`` where ``signal`` is the
        flattened reconstruction multiplied by 10.
    """

    # Restore the trained weights into the caller-supplied model.
    model.load_weights(weights_path + '/model_best.h5')
    print("Loaded model from disk")

    # Files are addressed without a directory prefix.
    noisy_audio = audio_files_to_numpy("", audio_input_prediction,
                                       sample_rate, frame_length,
                                       hop_length_frame, min_duration)

    # Side length of the squared spectrogram.
    spec_dim = int(n_fft / 2) + 1
    print(spec_dim)

    # Amplitude (dB) and phase of the input audio.
    amp_db, phase = numpy_audio_to_matrix_spectrogram(noisy_audio, spec_dim,
                                                      n_fft, hop_length_fft)

    # Scale the amplitude and append a trailing channel axis for the network.
    scaled = scaled_in(amp_db)
    net_input = scaled.reshape(scaled.shape[0], scaled.shape[1],
                               scaled.shape[2], 1)

    # The rescaled network output is treated as a noise model and removed
    # from the noisy amplitude.
    noise_estimate = inv_scaled_ou(model.predict(net_input))
    denoised_amp = amp_db - noise_estimate[:, :, :, 0]

    # Diagnostics printed before reconstruction.
    print(denoised_amp.shape)
    print(phase.shape)
    print(frame_length)
    print(hop_length_fft)

    # Rebuild the frames from the denoised amplitude and the original phase.
    frames = matrix_spectrogram_to_numpy_audio(denoised_amp, phase,
                                               frame_length, hop_length_fft)

    # Join every frame into one long row and apply the fixed gain of 10.
    total_length = frames.shape[0] * frame_length
    joined = frames.reshape(1, total_length) * 10
    return [joined[0, :], sample_rate]
def decode(weights_path=args.weights_folder,
           name_model=args.name_model,
           audio_dir_prediction=args.audio_dir_prediction,
           dir_save_prediction=args.dir_save_prediction,
           audio_input_prediction=args.audio_input_prediction,
           audio_output_prediction=args.audio_output_prediction,
           sample_rate=args.sample_rate,
           min_duration=args.min_duration,
           frame_length=args.frame_length,
           hop_length_frame=args.hop_length_frame,
           n_fft=args.n_fft,
           hop_length_fft=args.hop_length_fft):
    """Decode ``aaa.npy`` with the auto-encoder and write the denoised audio.

    The compressed representation is read from ``aaa.npy`` in the current
    working directory and decoded with ``ConvAutoEncoder``. The rescaled
    result is subtracted from the amplitude spectrogram of the original audio,
    and the signal rebuilt with the original phase is written to
    ``dir_save_prediction + audio_output_prediction``.

    All defaults are taken from the module-level ``args`` object when the
    function is defined. ``name_model`` is accepted but not used.

    Args:
        weights_path: Passed to ``ConvAutoEncoder`` as ``weights_path``.
        name_model: Unused.
        audio_dir_prediction: Forwarded to ``audio_files_to_numpy`` as the
            audio directory.
        dir_save_prediction: Prefix of the output path (no separator added).
        audio_input_prediction: Forwarded to ``audio_files_to_numpy`` as the
            file selection.
        audio_output_prediction: File name appended to ``dir_save_prediction``.
        sample_rate: Passed to the loader and used as the output sample rate.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Forwarded to the loader and the reconstruction helper;
            also used to flatten the reconstructed frames.
        hop_length_frame: Forwarded to ``audio_files_to_numpy``.
        n_fft: Forwarded to the spectrogram helper.
        hop_length_fft: Forwarded to the spectrogram and reconstruction
            helpers.

    Returns:
        None. The decoded audio is written to disk.
    """

    loaded_model = ConvAutoEncoder(weights_path=weights_path)
    loaded_model.load_weights()
    loaded_model.info()
    print("Loaded model from disk")
    # The original audio supplies the amplitude and phase spectrograms.
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)
    # Dimension of the squared spectrogram.
    dim_square_spec = int(n_fft / 2) + 1
    # Amplitude (dB) and phase of the original audio.
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Load the compressed representation and decode it.
    data_compress = np.load('aaa.npy')
    print(data_compress.shape)
    decoded = loaded_model.decode(data_compress)
    # Undo the output scaling.
    inv_sca_X_pred = inv_scaled_ou(decoded)
    # Subtract the decoded noise model from the noisy amplitude.
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]
    # Diagnostics printed before reconstruction.
    print(X_denoise.shape)
    print(m_pha_audio.shape)
    print(frame_length)
    print(hop_length_fft)
    # Rebuild the signal from the denoised amplitude and the original phase.
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(
        X_denoise, m_pha_audio, frame_length, hop_length_fft)
    # Number of reconstructed frames.
    nb_samples = audio_denoise_recons.shape[0]
    # Concatenate all frames into one signal; the factor 10 is a fixed gain.
    denoise_long = audio_denoise_recons.reshape(1,
                                                nb_samples * frame_length) * 10

    output_path = dir_save_prediction + audio_output_prediction
    if hasattr(librosa, 'output'):
        # librosa < 0.8 still ships the writer.
        librosa.output.write_wav(output_path, denoise_long[0, :], sample_rate)
    else:
        # librosa >= 0.8 removed librosa.output; soundfile is the documented
        # replacement. A float subtype avoids clipping the amplified signal,
        # which a 16-bit integer encoding would do.
        import soundfile as sf
        sf.write(output_path, denoise_long[0, :], sample_rate,
                 subtype='FLOAT')
    print('saved audio decoded file in:', output_path)
def encode(weights_path=args.weights_folder,
           name_model=args.name_model,
           audio_dir_prediction=args.audio_dir_prediction,
           dir_save_prediction=args.dir_save_prediction,
           audio_input_prediction=args.audio_input_prediction,
           audio_output_prediction=args.audio_output_prediction,
           sample_rate=args.sample_rate,
           min_duration=args.min_duration,
           frame_length=args.frame_length,
           hop_length_frame=args.hop_length_frame,
           n_fft=args.n_fft,
           hop_length_fft=args.hop_length_fft):
    """Encode audio with the auto-encoder and save the result to ``aaa.npy``.

    The audio is loaded, converted to an amplitude spectrogram, scaled,
    given a trailing channel axis and passed to ``ConvAutoEncoder.encode``.
    The encoded array is saved with ``np.save('aaa', ...)``, which writes
    ``aaa.npy`` in the current working directory.

    All defaults are taken from the module-level ``args`` object when the
    function is defined. ``name_model``, ``dir_save_prediction`` and
    ``audio_output_prediction`` are accepted but not used.

    Args:
        weights_path: Passed to ``ConvAutoEncoder`` as ``weights_path``.
        name_model: Unused.
        audio_dir_prediction: Forwarded to ``audio_files_to_numpy`` as the
            audio directory.
        dir_save_prediction: Unused.
        audio_input_prediction: Forwarded to ``audio_files_to_numpy`` as the
            file selection.
        audio_output_prediction: Unused.
        sample_rate: Forwarded to ``audio_files_to_numpy``.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Forwarded to ``audio_files_to_numpy``.
        hop_length_frame: Forwarded to ``audio_files_to_numpy``.
        n_fft: Forwarded to the spectrogram helper.
        hop_length_fft: Forwarded to the spectrogram helper.

    Returns:
        None. The encoded array is written to disk.
    """
    loaded_model = ConvAutoEncoder(weights_path=weights_path)
    loaded_model.load_weights()
    loaded_model.info()
    print("Loaded model from:", weights_path)
    # Load the audio to encode and convert it to numpy.
    audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                                 sample_rate, frame_length, hop_length_frame,
                                 min_duration)
    # Dimension of the squared spectrogram.
    dim_square_spec = int(n_fft / 2) + 1
    # Only the amplitude is encoded; the phase is not needed here.
    m_amp_db_audio, _ = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)
    # Global scaling of the network input.
    X_in = scaled_in(m_amp_db_audio)
    # Add a trailing channel axis for the network.
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)
    encoded = loaded_model.encode(X_in)

    # The label and the shape are two separate print arguments; attribute
    # access on the string literal would raise AttributeError.
    print('encoded.shape:', encoded.shape)
    np.save('aaa', encoded)
    print('encoded file:', audio_dir_prediction + str(audio_input_prediction))
    print('save to: aaa.npy')
def create_data(noise_dir, voice_dir, path_save_time_serie, path_save_sound,
                path_save_spectrogram, sample_rate, min_duration, frame_length,
                hop_length_frame, hop_length_frame_noise, nb_samples, n_fft,
                hop_length_fft):
    """Blend clean voices with noises and save the resulting training data.

    Clean voices from ``voice_dir`` are blended with noises from ``noise_dir``
    by ``blend_noise_randomly``. For each of the six signals it returns
    (voice, noise, noisy voice, and the N2N additions extra noise, total noise
    and extra noisy voice) this function writes:

    * a long WAV file under ``path_save_sound`` for QC,
    * the time series under ``path_save_time_serie``,
    * the amplitude (dB) and phase spectrograms under
      ``path_save_spectrogram``.

    All ``path_save_*`` arguments are used as plain string prefixes, so they
    must already end with a path separator.

    Args:
        noise_dir: Directory listed for noise files.
        voice_dir: Directory listed for clean voice files.
        path_save_time_serie: Prefix for the saved time-series ``.npy`` files.
        path_save_sound: Prefix for the saved QC ``.wav`` files.
        path_save_spectrogram: Prefix for the saved spectrogram ``.npy`` files.
        sample_rate: Passed to the loader and used when writing WAV files.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Forwarded to the loader and blender; also used to
            flatten the blended frames.
        hop_length_frame: Hop used when loading voice files.
        hop_length_frame_noise: Hop used when loading noise files.
        nb_samples: Forwarded to ``blend_noise_randomly``; also used to
            flatten the blended frames.
        n_fft: Forwarded to the spectrogram helper.
        hop_length_fft: Forwarded to the spectrogram helper.

    Returns:
        None. Everything is written to disk.
    """

    # Drop the macOS-specific metadata file if present.
    list_noise_files = [
        name for name in os.listdir(noise_dir) if name != '.DS_Store'
    ]
    list_voice_files = [
        name for name in os.listdir(voice_dir) if name != '.DS_Store'
    ]

    # Extract noise and voice from their folders and convert to numpy.
    noise = audio_files_to_numpy(noise_dir, list_noise_files, sample_rate,
                                 frame_length, hop_length_frame_noise,
                                 min_duration)

    voice = audio_files_to_numpy(voice_dir, list_voice_files, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Blend clean voices with randomly selected noises.
    # N2N additions: prod_extra_noisy_voice, prod_extra_noise, prod_total_noise
    (prod_voice, prod_noise, prod_noisy_voice, prod_extra_noisy_voice,
     prod_extra_noise, prod_total_noise) = blend_noise_randomly(
         voice, noise, nb_samples, frame_length)

    # Insertion order is the order used for spectrograms and .npy files.
    signals = {
        'voice': prod_voice,
        'noise': prod_noise,
        'noisy_voice': prod_noisy_voice,
        'extra_noise': prod_extra_noise,
        'total_noise': prod_total_noise,
        'extra_noisy_voice': prod_extra_noisy_voice,
    }

    def _write_long_wav(name):
        """Flatten one blended signal and write it as ``<name>_long.wav``."""
        long_signal = signals[name].reshape(1, nb_samples * frame_length)
        wav_path = path_save_sound + name + '_long.wav'
        if hasattr(librosa, 'output'):
            # librosa < 0.8 still ships the writer.
            librosa.output.write_wav(wav_path, long_signal[0, :], sample_rate)
        else:
            # librosa >= 0.8 removed librosa.output; soundfile replaces it.
            # A float subtype keeps the samples unclipped.
            import soundfile as sf
            sf.write(wav_path, long_signal[0, :], sample_rate,
                     subtype='FLOAT')

    # Save the long audio to disk for QC.
    for name in ('noisy_voice', 'voice', 'noise', 'extra_noisy_voice',
                 'extra_noise', 'total_noise'):
        _write_long_wav(name)

    # Squared spectrogram dimensions.
    dim_square_spec = int(n_fft / 2) + 1

    # Amplitude (dB) and phase of every signal.
    amplitudes = {}
    phases = {}
    for name, signal in signals.items():
        amplitudes[name], phases[name] = numpy_audio_to_matrix_spectrogram(
            signal, dim_square_spec, n_fft, hop_length_fft)

    # Save to disk for training / QC: first the base signals, then the N2N
    # additions, each as time series, amplitude and phase.
    for group in (('voice', 'noise', 'noisy_voice'),
                  ('extra_noise', 'total_noise', 'extra_noisy_voice')):
        for name in group:
            np.save(path_save_time_serie + name + '_timeserie', signals[name])
        for name in group:
            np.save(path_save_spectrogram + name + '_amp_db',
                    amplitudes[name])
        for name in group:
            np.save(path_save_spectrogram + name + '_pha_db', phases[name])
# Example #6
# 0
def create_data(noise_dir, voice_dir, path_save_time_serie, path_save_sound,
                path_save_spectrogram, sample_rate, min_duration, frame_length,
                hop_length_frame, hop_length_frame_noise, nb_samples, n_fft,
                hop_length_fft):
    """Blend clean voices with selected noises and save the training data.

    Audio files (``.mp3``, ``.wav``, ``.flac``) are collected recursively from
    ``voice_dir`` and ``noise_dir``. Noise files are kept only when the second
    dash-separated field of the file name equals ``'3'``; names without a dash
    are skipped. The blended voice, noise and noisy-voice signals are written
    as long WAV files, time series and amplitude/phase spectrograms.

    All ``path_save_*`` arguments are used as plain string prefixes, so they
    must already end with a path separator.

    Args:
        noise_dir: Root directory walked for noise files.
        voice_dir: Root directory walked for clean voice files.
        path_save_time_serie: Prefix for the saved time-series ``.npy`` files.
        path_save_sound: Prefix for the saved QC ``.wav`` files.
        path_save_spectrogram: Prefix for the saved spectrogram ``.npy`` files.
        sample_rate: Passed to the loader and used when writing WAV files.
        min_duration: Forwarded to ``audio_files_to_numpy``.
        frame_length: Forwarded to the loader and blender; also used to
            flatten the blended frames.
        hop_length_frame: Hop used when loading voice files.
        hop_length_frame_noise: Hop used when loading noise files.
        nb_samples: Forwarded to ``blend_noise_randomly``; also used to
            flatten the blended frames.
        n_fft: Forwarded to the spectrogram helper.
        hop_length_fft: Forwarded to the spectrogram helper.

    Returns:
        None. Everything is written to disk.
    """

    audio_extensions = ('.mp3', '.wav', '.flac')

    # Keep only noise files whose second dash-separated field is '3'
    # (presumably a class id encoded in the file name -- confirm against the
    # dataset). The length check avoids an IndexError on names without '-'.
    list_noise_files = []
    for root, _dirs, files in os.walk(noise_dir):
        for file_name in files:
            if not file_name.endswith(audio_extensions):
                continue
            fields = file_name.split('-')
            if len(fields) > 1 and fields[1] == '3':
                list_noise_files.append(os.path.join(root, file_name))

    # Every audio file below voice_dir is used.
    list_voice_files = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(voice_dir)
        for file_name in files
        if file_name.endswith(audio_extensions)
    ]

    print(len(list_voice_files))
    print(len(list_noise_files))

    # Extract noise and voice and convert to numpy.
    # NOTE(review): the lists already contain paths joined with the walked
    # root while the directory is passed as well -- confirm how
    # audio_files_to_numpy combines the two.
    noise = audio_files_to_numpy(noise_dir, list_noise_files, sample_rate,
                                 frame_length, hop_length_frame_noise,
                                 min_duration)

    voice = audio_files_to_numpy(voice_dir, list_voice_files, sample_rate,
                                 frame_length, hop_length_frame, min_duration)

    # Blend clean voices with randomly selected noises.
    prod_voice, prod_noise, prod_noisy_voice = blend_noise_randomly(
        voice, noise, nb_samples, frame_length)

    # Insertion order is the order used for spectrograms and .npy files.
    signals = {
        'voice': prod_voice,
        'noise': prod_noise,
        'noisy_voice': prod_noisy_voice,
    }

    def _write_long_wav(name):
        """Flatten one blended signal and write it as ``<name>_long.wav``."""
        long_signal = signals[name].reshape(1, nb_samples * frame_length)
        wav_path = path_save_sound + name + '_long.wav'
        if hasattr(librosa, 'output'):
            # librosa < 0.8 still ships the writer.
            librosa.output.write_wav(wav_path, long_signal[0, :], sample_rate)
        else:
            # librosa >= 0.8 removed librosa.output; soundfile replaces it.
            # A float subtype keeps the samples unclipped.
            import soundfile as sf
            sf.write(wav_path, long_signal[0, :], sample_rate,
                     subtype='FLOAT')

    # Save the long audio to disk for QC.
    for name in ('noisy_voice', 'voice', 'noise'):
        _write_long_wav(name)

    # Squared spectrogram dimensions.
    dim_square_spec = int(n_fft / 2) + 1

    # Amplitude (dB) and phase of every signal.
    amplitudes = {}
    phases = {}
    for name, signal in signals.items():
        amplitudes[name], phases[name] = numpy_audio_to_matrix_spectrogram(
            signal, dim_square_spec, n_fft, hop_length_fft)

    # Save time series, then amplitudes, then phases.
    for name, signal in signals.items():
        np.save(path_save_time_serie + name + '_timeserie', signal)
    for name in signals:
        np.save(path_save_spectrogram + name + '_amp_db', amplitudes[name])
    for name in signals:
        np.save(path_save_spectrogram + name + '_pha_db', phases[name])
# Example #7
# 0
def create_data(noise_dir, voice_dir, path_save_time_serie, path_save_sound,
                path_save_spectrogram, sample_rate, min_duration, frame_length,
                hop_length_frame, hop_length_frame_noise, nb_samples, n_fft,
                hop_length_fft, mode):
    """Blend clean voices with noises chunk by chunk and save the results.

    The voice and noise file lists are each split into ``n`` index groups
    (``n`` is fixed to 1 here). For every (noise group, voice group) pair the
    files are loaded and blended, three long WAV files are written under
    ``path_save_sound``, and the voice and noise amplitude/phase spectrograms
    are saved under ``path_save_spectrogram``. Output names carry a running
    index. Time series and noisy-voice spectrograms are not saved, and
    ``path_save_time_serie`` is not used.
    """

    # Directory listings without the macOS-specific metadata file.
    every_noise_file = [
        entry for entry in os.listdir(noise_dir) if entry != '.DS_Store'
    ]
    every_voice_file = [
        entry for entry in os.listdir(voice_dir) if entry != '.DS_Store'
    ]

    n = 1

    def _index_groups(total):
        """Return, for each k in range(n), the indices selected for chunk k."""
        groups = []
        for k in range(0, n):
            lower = k / n * total
            upper = ((k + n) / n) * total
            groups.append(
                [idx for idx in range(total) if lower <= idx and idx <= upper])
        return groups

    voice_groups = _index_groups(len(every_voice_file))
    noise_groups = _index_groups(len(every_noise_file))

    run_idx = 0
    for noise_ids in noise_groups:
        list_noise_files = [every_noise_file[idx] for idx in noise_ids]

        for voice_ids in voice_groups:
            list_voice_files = [every_voice_file[idx] for idx in voice_ids]
            suffix = str(run_idx)

            # Load the selected noise and voice files as numpy frames.
            noise = audio_files_to_numpy(noise_dir, list_noise_files,
                                         sample_rate, frame_length,
                                         hop_length_frame_noise * 1,
                                         min_duration, mode)
            voice = audio_files_to_numpy(voice_dir, list_voice_files,
                                         sample_rate, frame_length,
                                         hop_length_frame * 10, min_duration,
                                         mode)

            # Blend clean voices with randomly selected noises.
            prod_voice, prod_noise, prod_noisy_voice = blend_noise_randomly(
                voice, noise, nb_samples, frame_length, 1)

            # Long audio files for QC: noisy voice, voice, then noise.
            flat_length = nb_samples * frame_length
            noisy_voice_long = prod_noisy_voice.reshape(1, flat_length)
            sf.write(path_save_sound + '/noisy_voice_long' + suffix + '.wav',
                     noisy_voice_long[0, :], sample_rate)
            voice_long = prod_voice.reshape(1, flat_length)
            sf.write(path_save_sound + '/voice_long' + suffix + '.wav',
                     voice_long[0, :], sample_rate)
            noise_long = prod_noise.reshape(1, flat_length)
            sf.write(path_save_sound + '/noise_long' + suffix + '.wav',
                     noise_long[0, :], sample_rate)

            # Side length of the squared spectrogram.
            dim_square_spec = int(n_fft / 2) + 1

            # Amplitude (dB) and phase of each blended signal.
            m_amp_db_voice, m_pha_voice = numpy_audio_to_matrix_spectrogram(
                prod_voice, dim_square_spec, n_fft, hop_length_fft)
            m_amp_db_noise, m_pha_noise = numpy_audio_to_matrix_spectrogram(
                prod_noise, dim_square_spec, n_fft, hop_length_fft)
            m_amp_db_noisy_voice, m_pha_noisy_voice = (
                numpy_audio_to_matrix_spectrogram(prod_noisy_voice,
                                                  dim_square_spec, n_fft,
                                                  hop_length_fft))

            # Only voice and noise spectrograms are saved: amplitudes first,
            # then phases.
            np.save(path_save_spectrogram + '/voice_amp_db' + suffix,
                    m_amp_db_voice)
            np.save(path_save_spectrogram + '/noise_amp_db' + suffix,
                    m_amp_db_noise)
            np.save(path_save_spectrogram + '/voice_pha_db' + suffix,
                    m_pha_voice)
            np.save(path_save_spectrogram + '/noise_pha_db' + suffix,
                    m_pha_noise)

            run_idx += 1
# Script-style walkthrough: load audio, split it into frames and build the
# scaled spectrogram input for prediction. `audio_dir_prediction`,
# `audio_input_prediction` and `sample_rate` must already be defined.

# Minimum duration of audio files to consider
min_duration = 1.0

# Length of one frame in samples ("slightly above 1 second" per the original
# note -- presumably at an 8 kHz sample rate; confirm against sample_rate)
frame_length = 8064

# Hop length for clean voice files; equal to frame_length, so frames do not
# overlap
hop_length_frame = 8064

# Hop length for noise files (noise is split into several windows)
hop_length_frame_noise = 5000

# Extract the audio from the folder and convert it to numpy
audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction,
                             sample_rate, frame_length, hop_length_frame,
                             min_duration)

# Choosing n_fft and hop_length_fft to have squared spectrograms
n_fft = 255
hop_length_fft = 63

# int(255 / 2) + 1 == 128
dim_square_spec = int(n_fft / 2) + 1

# Create amplitude (dB) and phase of the sounds
m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
    audio, dim_square_spec, n_fft, hop_length_fft)

# Global scaling of the network input (intended range -1 to 1 per the
# original note)
X_in = scaled_in(m_amp_db_audio)
#Reshape for prediction