Example #1
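The three functions below are excerpts, so they are not self-contained: the numpy import and the project helpers they rely on are assumptions about the surrounding code, not part of the excerpt itself.

# Assumed context: numpy is available as np; AudioSignal, AudioMixer,
# MelConverter, FaceDetector, VideoFileReader, preprocess_audio_signal and
# Sample are helpers from the surrounding project and are not defined here.
import numpy as np
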
def preprocess_audio_pair(speech_file_path, noise_file_path, slice_duration_ms,
                          n_video_slices, video_frame_rate):
    print("preprocessing pair: %s, %s" % (speech_file_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    # Tile the noise until it is at least as long as the speech, then truncate
    # it to the exact speech length.
    while (noise_signal.get_number_of_samples() <
           speech_signal.get_number_of_samples()):
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    # Scale the noise so that the speech-to-noise ratio of the mix is 0 dB.
    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    # Mix the speech and the scaled noise with equal weights.
    mixed_signal = AudioMixer.mix([speech_signal, noise_signal],
                                  mixing_weights=[1, 1])

    # Compute per-slice spectrograms for the mixed, clean and noise signals,
    # aligned with the video slices.
    mixed_spectrograms = preprocess_audio_signal(mixed_signal,
                                                 slice_duration_ms,
                                                 n_video_slices,
                                                 video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal,
                                                  slice_duration_ms,
                                                  n_video_slices,
                                                  video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal,
                                                 slice_duration_ms,
                                                 n_video_slices,
                                                 video_frame_rate)

    return mixed_spectrograms, speech_spectrograms, noise_spectrograms, mixed_signal
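
A minimal call sketch for preprocess_audio_pair; the file names and parameter values below are illustrative placeholders, not values taken from the project.

# Hypothetical usage: mix clean speech and noise at 0 dB SNR and get back
# spectrogram slices aligned with 15 video slices of 200 ms each at 25 fps.
mixed_spectrograms, speech_spectrograms, noise_spectrograms, mixed_signal = \
    preprocess_audio_pair("speech.wav", "noise.wav",
                          slice_duration_ms=200,
                          n_video_slices=15,
                          video_frame_rate=25)
mixed_signal.save_to_wav_file("mixed.wav")
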
def enhance_speech(speaker_file_path, noise_file_path, speech_prediction_path,
                   speech_profile):
    print("enhancing mix of %s, %s" % (speaker_file_path, noise_file_path))

    speaker_source_signal = AudioSignal.from_wav_file(speaker_file_path)
    noise_source_signal = AudioSignal.from_wav_file(noise_file_path)

    # Tile the noise until it is at least as long as the speaker recording,
    # then cut it to exactly that length.
    while (noise_source_signal.get_number_of_samples() <
           speaker_source_signal.get_number_of_samples()):
        noise_source_signal = AudioSignal.concat(
            [noise_source_signal, noise_source_signal])

    noise_source_signal = noise_source_signal.slice(
        0, speaker_source_signal.get_number_of_samples())
    mixed_signal = AudioMixer.mix([speaker_source_signal, noise_source_signal])

    predicted_speech_signal = AudioSignal.from_wav_file(speech_prediction_path)

    # Zero-pad both signals to a common length so their spectrograms align.
    signals = [mixed_signal, predicted_speech_signal]
    max_length = max([signal.get_number_of_samples() for signal in signals])
    for signal in signals:
        signal.pad_with_zeros(max_length)

    # Convert both signals to mel spectrograms (128 mel bins, 0-4 kHz),
    # keeping the mixture's phase for reconstruction later.
    mel_converter = MelConverter(mixed_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(
        mixed_signal, get_phase=True)
    predicted_speech_spectrogram = mel_converter.signal_to_mel_spectrogram(
        predicted_speech_signal)

    # Build a binary mask that keeps only the time-frequency bins where the
    # predicted speech exceeds a per-frequency threshold taken as the 85th
    # percentile of the speaker's speech profile.
    speech_enhancement_mask = np.zeros(shape=mixed_spectrogram.shape)

    thresholds = np.zeros(shape=(speech_enhancement_mask.shape[0]))
    for f in range(speech_enhancement_mask.shape[0]):
        thresholds[f] = np.percentile(speech_profile[f, :], 85)

    for f in range(speech_enhancement_mask.shape[0]):
        for t in range(speech_enhancement_mask.shape[1]):
            if predicted_speech_spectrogram[f, t] > thresholds[f]:
                speech_enhancement_mask[f, t] = 1

    # Apply the mask to the mixture spectrogram and invert back to a waveform
    # using the mixture's original phase.
    enhanced_speech_spectrogram = mixed_spectrogram * speech_enhancement_mask
    enhanced_speech_signal = mel_converter.reconstruct_signal_from_mel_spectrogram(
        enhanced_speech_spectrogram, original_phase)

    return mixed_signal, enhanced_speech_signal
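
A hedged usage sketch for enhance_speech. The file names are placeholders, and building speech_profile as a mel spectrogram of a clean recording of the same speaker (with the same converter settings as above) is an assumption inferred from how the profile is thresholded per frequency, not something stated by the excerpt.

# Hypothetical speech profile: mel spectrogram of clean speech from the speaker,
# computed with the same converter settings used inside enhance_speech.
clean_signal = AudioSignal.from_wav_file("speaker_clean.wav")
profile_converter = MelConverter(clean_signal.get_sample_rate(),
                                 n_mel_freqs=128,
                                 freq_min_hz=0,
                                 freq_max_hz=4000)
speech_profile = profile_converter.signal_to_mel_spectrogram(clean_signal)

mixed_signal, enhanced_signal = enhance_speech("speaker.wav", "noise.wav",
                                               "predicted_speech.wav",
                                               speech_profile)
enhanced_signal.save_to_wav_file("enhanced.wav")
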
def preprocess_sample(speech_entry, noise_file_path, slice_duration_ms=200):
    print("preprocessing sample: %s, %s, %s..." %
          (speech_entry.video_path, speech_entry.audio_path, noise_file_path))

    mouth_height = 128
    mouth_width = 128

    print("preprocessing %s" % speech_entry.video_path)

    face_detector = FaceDetector()

    with VideoFileReader(speech_entry.video_path) as reader:
        frames = reader.read_all_frames(convert_to_gray_scale=True)

        # Crop the mouth region from each frame; the source videos are assumed
        # to contain exactly 75 frames.
        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width, 75),
                                        dtype=np.float32)
        for i in range(75):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        # Split the mouth frames into fixed-duration slices; for example, with
        # 200 ms slices and 25 fps video this gives 5 frames per slice and
        # 15 slices in total.
        frames_per_slice = int(slice_duration_ms / 1000 *
                               reader.get_frame_rate())

        slices = [
            mouth_cropped_frames[:, :, (i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(int(75 / frames_per_slice))
        ]

        video_samples = np.stack(slices)
        video_frame_rate = reader.get_frame_rate()

    print("preprocessing pair: %s, %s" %
          (speech_entry.audio_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_entry.audio_path)
    print(noise_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)
    print(noise_signal.get_data())
    print(noise_signal.get_sample_rate())
    noise_signal.save_to_wav_file('./noise.wav')
    while noise_signal.get_number_of_samples(
    ) < speech_signal.get_number_of_samples():
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    # print(factor)
    noise_signal.amplify_by_factor(factor)

    #noise_signal.save_to_wav_file('./noise.wav')
    mixed_signal = AudioMixer.mix([speech_signal, noise_signal],
                                  mixing_weights=[1, 1])
    mixed_signal.save_to_wav_file('./mixed.wav')
    mixed_spectrograms = preprocess_audio_signal(mixed_signal,
                                                 slice_duration_ms,
                                                 video_samples.shape[0],
                                                 video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal,
                                                  slice_duration_ms,
                                                  video_samples.shape[0],
                                                  video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal,
                                                 slice_duration_ms,
                                                 video_samples.shape[0],
                                                 video_frame_rate)

    # Keep only as many slices as both the video and the audio provide.
    n_slices = min(video_samples.shape[0], mixed_spectrograms.shape[0])

    return Sample(speaker_id=speech_entry.speaker_id,
                  video_file_path=speech_entry.video_path,
                  speech_file_path=speech_entry.audio_path,
                  noise_file_path=noise_file_path,
                  video_samples=video_samples[:n_slices],
                  mixed_spectrograms=mixed_spectrograms[:n_slices],
                  speech_spectrograms=speech_spectrograms[:n_slices],
                  noise_spectrograms=noise_spectrograms[:n_slices],
                  mixed_signal=mixed_signal,
                  video_frame_rate=video_frame_rate)
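
A hedged usage sketch for preprocess_sample. The project presumably has its own speech-entry type; the namedtuple below merely stands in for it, covering only the three fields the function actually reads (speaker_id, video_path, audio_path), and the paths are placeholders.

from collections import namedtuple

# Hypothetical stand-in for the project's speech entry record.
SpeechEntry = namedtuple("SpeechEntry", ["speaker_id", "video_path", "audio_path"])

entry = SpeechEntry(speaker_id="s1",
                    video_path="s1/video.mpg",
                    audio_path="s1/audio.wav")
sample = preprocess_sample(entry, "noise.wav", slice_duration_ms=200)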