def preprocess_audio_pair(speech_file_path, noise_file_path, slice_duration_ms, n_video_slices, video_frame_rate):
    print("preprocessing pair: %s, %s" % (speech_file_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    # tile the noise until it is at least as long as the speech, then trim it
    while noise_signal.get_number_of_samples() < speech_signal.get_number_of_samples():
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    # scale the noise so the mixture has a speech-to-noise ratio of 0 dB
    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal], mixing_weights=[1, 1])

    mixed_spectrograms = preprocess_audio_signal(mixed_signal, slice_duration_ms, n_video_slices, video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal, slice_duration_ms, n_video_slices, video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal, slice_duration_ms, n_video_slices, video_frame_rate)

    return mixed_spectrograms, speech_spectrograms, noise_spectrograms, mixed_signal
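# A minimal NumPy sketch of the SNR-matching step above: scale the noise so that
# the speech-to-noise power ratio of the mixture hits a target value in dB (0 dB
# here). `compute_snr_factor` is a hypothetical stand-in illustrating what
# AudioMixer.snr_factor likely computes; the repo's actual implementation may differ.
#
#     import numpy as np
#
#     def compute_snr_factor(speech, noise, snr_db=0):
#         # power = mean squared amplitude; scaling the noise amplitude by `factor`
#         # scales its power by factor**2, so choose factor such that
#         # 10 * log10(P_speech / P_noise_scaled) == snr_db
#         speech_power = np.mean(np.asarray(speech, dtype=np.float64) ** 2)
#         noise_power = np.mean(np.asarray(noise, dtype=np.float64) ** 2)
#         target_noise_power = speech_power / (10 ** (snr_db / 10.0))
#         return np.sqrt(target_noise_power / noise_power)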
def enhance_speech(speaker_file_path, noise_file_path, speech_prediction_path, speech_profile):
    print("enhancing mix of %s, %s" % (speaker_file_path, noise_file_path))

    speaker_source_signal = AudioSignal.from_wav_file(speaker_file_path)
    noise_source_signal = AudioSignal.from_wav_file(noise_file_path)

    # tile the noise until it covers the speech, then cut it to the same length
    while noise_source_signal.get_number_of_samples() < speaker_source_signal.get_number_of_samples():
        noise_source_signal = AudioSignal.concat([noise_source_signal, noise_source_signal])

    noise_source_signal = noise_source_signal.slice(0, speaker_source_signal.get_number_of_samples())
    mixed_signal = AudioMixer.mix([speaker_source_signal, noise_source_signal])

    predicted_speech_signal = AudioSignal.from_wav_file(speech_prediction_path)

    # pad both signals to a common length before converting to spectrograms
    signals = [mixed_signal, predicted_speech_signal]
    max_length = max([signal.get_number_of_samples() for signal in signals])
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mel_converter = MelConverter(mixed_signal.get_sample_rate(), n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(mixed_signal, get_phase=True)
    predicted_speech_spectrogram = mel_converter.signal_to_mel_spectrogram(predicted_speech_signal)

    # per-frequency thresholds: the 85th percentile of the clean speech profile
    thresholds = np.zeros(shape=(mixed_spectrogram.shape[0]))
    for f in range(mixed_spectrogram.shape[0]):
        thresholds[f] = np.percentile(speech_profile[f, :], 85)

    # binary mask: keep a time-frequency bin only if the predicted speech
    # spectrogram exceeds the threshold of its frequency bin
    speech_enhancement_mask = np.zeros(shape=mixed_spectrogram.shape)
    for f in range(speech_enhancement_mask.shape[0]):
        for t in range(speech_enhancement_mask.shape[1]):
            if predicted_speech_spectrogram[f, t] > thresholds[f]:
                speech_enhancement_mask[f, t] = 1

    # apply the mask to the noisy spectrogram and reconstruct with the original phase
    enhanced_speech_spectrogram = mixed_spectrogram * speech_enhancement_mask
    enhanced_speech_signal = mel_converter.reconstruct_signal_from_mel_spectrogram(enhanced_speech_spectrogram, original_phase)

    return mixed_signal, enhanced_speech_signal
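# Hedged sketch: the per-frequency percentile thresholding above can be written
# with NumPy broadcasting instead of the nested loops. It assumes speech_profile
# and predicted_spectrogram are (n_freqs, n_frames) arrays, as in enhance_speech;
# `build_enhancement_mask` is an illustrative helper, not part of the repo's API.
#
#     import numpy as np
#
#     def build_enhancement_mask(predicted_spectrogram, speech_profile, percentile=85):
#         # one threshold per mel frequency bin, taken over the speech-profile frames
#         thresholds = np.percentile(speech_profile, percentile, axis=1)  # (n_freqs,)
#         # keep a time-frequency bin only if the predicted speech energy exceeds
#         # the threshold of its frequency bin
#         return (predicted_spectrogram > thresholds[:, np.newaxis]).astype(np.float64)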
def preprocess_sample(speech_entry, noise_file_path, slice_duration_ms=200):
    print("preprocessing sample: %s, %s, %s..." % (speech_entry.video_path, speech_entry.audio_path, noise_file_path))

    mouth_height = 128
    mouth_width = 128

    print("preprocessing %s" % speech_entry.video_path)
    face_detector = FaceDetector()

    with VideoFileReader(speech_entry.video_path) as reader:
        frames = reader.read_all_frames(convert_to_gray_scale=True)

        # crop the mouth region from each of the 75 video frames
        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width, 75), dtype=np.float32)
        for i in range(75):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(frames[i], bounding_box_shape=(mouth_width, mouth_height))

        # split the mouth crops into fixed-duration slices matching the audio slices
        frames_per_slice = int(slice_duration_ms / 1000 * reader.get_frame_rate())
        slices = [
            mouth_cropped_frames[:, :, (i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(int(75 / frames_per_slice))
        ]

        video_samples = np.stack(slices)
        video_frame_rate = reader.get_frame_rate()

    print("preprocessing pair: %s, %s" % (speech_entry.audio_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_entry.audio_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)
    noise_signal.save_to_wav_file('./noise.wav')

    # tile the noise until it is at least as long as the speech, then trim it
    while noise_signal.get_number_of_samples() < speech_signal.get_number_of_samples():
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    # scale the noise so the mixture has a speech-to-noise ratio of 0 dB
    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal], mixing_weights=[1, 1])
    mixed_signal.save_to_wav_file('./mixed.wav')

    mixed_spectrograms = preprocess_audio_signal(mixed_signal, slice_duration_ms, video_samples.shape[0], video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal, slice_duration_ms, video_samples.shape[0], video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal, slice_duration_ms, video_samples.shape[0], video_frame_rate)

    # keep only as many slices as both the video and audio streams provide
    n_slices = min(video_samples.shape[0], mixed_spectrograms.shape[0])

    return Sample(
        speaker_id=speech_entry.speaker_id,
        video_file_path=speech_entry.video_path,
        speech_file_path=speech_entry.audio_path,
        noise_file_path=noise_file_path,
        video_samples=video_samples[:n_slices],
        mixed_spectrograms=mixed_spectrograms[:n_slices],
        speech_spectrograms=speech_spectrograms[:n_slices],
        noise_spectrograms=noise_spectrograms[:n_slices],
        mixed_signal=mixed_signal,
        video_frame_rate=video_frame_rate
    )
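# Hedged usage sketch for preprocess_sample. `SpeechEntry` below is a hypothetical
# container mirroring only the attributes the function reads (speaker_id, video_path,
# audio_path); the repo's actual dataset entry object and file paths may differ.
#
#     from collections import namedtuple
#
#     SpeechEntry = namedtuple('SpeechEntry', ['speaker_id', 'video_path', 'audio_path'])
#
#     entry = SpeechEntry(speaker_id='s1',
#                         video_path='/path/to/speaker_video.mpg',
#                         audio_path='/path/to/speaker_audio.wav')
#
#     sample = preprocess_sample(entry, noise_file_path='/path/to/noise.wav',
#                                slice_duration_ms=200)
#     print(sample.video_samples.shape, sample.mixed_spectrograms.shape)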