def preprocess_audio_pair(speech_file_path, noise_file_path, slice_duration_ms, n_video_slices, video_frame_rate):
    print("preprocessing pair: %s, %s" % (speech_file_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    # Tile the noise by self-concatenation until it is at least as long as the
    # speech, then truncate it to the exact speech length.
    while noise_signal.get_number_of_samples() < speech_signal.get_number_of_samples():
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    # Scale the noise so that the mixture has a speech-to-noise ratio of 0 dB.
    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal], mixing_weights=[1, 1])

    mixed_spectrograms = preprocess_audio_signal(mixed_signal, slice_duration_ms, n_video_slices, video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal, slice_duration_ms, n_video_slices, video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal, slice_duration_ms, n_video_slices, video_frame_rate)

    return mixed_spectrograms, speech_spectrograms, noise_spectrograms, mixed_signal
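
# A minimal usage sketch of preprocess_audio_pair; the file paths and slicing
# parameters below are hypothetical, chosen only to illustrate the expected
# argument types:
def example_preprocess_pair():
    mixed, speech, noise, mixed_signal = preprocess_audio_pair(
        "data/speech/sample.wav",
        "data/noise/babble.wav",
        slice_duration_ms=200,
        n_video_slices=15,
        video_frame_rate=25)
    # Each spectrogram array is sliced along axis 0 to align with the video slices.
    return mixed, speech, noise, mixed_signal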
def enhance_speech(speaker_file_path, noise_file_path, speech_prediction_path, speech_profile):
    print("enhancing mix of %s, %s" % (speaker_file_path, noise_file_path))

    speaker_source_signal = AudioSignal.from_wav_file(speaker_file_path)
    noise_source_signal = AudioSignal.from_wav_file(noise_file_path)

    # Tile the noise until it covers the speech, then slice it to the same length.
    while noise_source_signal.get_number_of_samples() < speaker_source_signal.get_number_of_samples():
        noise_source_signal = AudioSignal.concat([noise_source_signal, noise_source_signal])

    noise_source_signal = noise_source_signal.slice(0, speaker_source_signal.get_number_of_samples())

    mixed_signal = AudioMixer.mix([speaker_source_signal, noise_source_signal])
    predicted_speech_signal = AudioSignal.from_wav_file(speech_prediction_path)

    # Zero-pad both signals to a common length before converting to spectrograms.
    signals = [mixed_signal, predicted_speech_signal]
    max_length = max(signal.get_number_of_samples() for signal in signals)
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mel_converter = MelConverter(mixed_signal.get_sample_rate(), n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(mixed_signal, get_phase=True)
    predicted_speech_spectrogram = mel_converter.signal_to_mel_spectrogram(predicted_speech_signal)

    # Per-band threshold: the 85th percentile of the speaker's speech profile.
    thresholds = np.zeros(shape=(mixed_spectrogram.shape[0],))
    for f in range(thresholds.shape[0]):
        thresholds[f] = np.percentile(speech_profile[f, :], 85)

    # Binary mask: keep a time-frequency bin only where the predicted speech
    # spectrogram exceeds the threshold of its mel band.
    speech_enhancement_mask = np.zeros(shape=mixed_spectrogram.shape)
    for f in range(speech_enhancement_mask.shape[0]):
        for t in range(speech_enhancement_mask.shape[1]):
            if predicted_speech_spectrogram[f, t] > thresholds[f]:
                speech_enhancement_mask[f, t] = 1

    enhanced_speech_spectrogram = mixed_spectrogram * speech_enhancement_mask
    enhanced_speech_signal = mel_converter.reconstruct_signal_from_mel_spectrogram(
        enhanced_speech_spectrogram, original_phase)

    return mixed_signal, enhanced_speech_signal
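
# The per-bin loops above can be collapsed into an equivalent vectorized
# computation; a sketch of the same binary mask using numpy broadcasting:
def compute_enhancement_mask(predicted_spectrogram, speech_profile, percentile=85):
    # One threshold per mel band, computed across the profile's time axis.
    thresholds = np.percentile(speech_profile, percentile, axis=1)
    # Broadcasting compares every band against its own threshold in one pass.
    return (predicted_spectrogram > thresholds[:, np.newaxis]).astype(np.float64)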
def preprocess_audio_sample(audio_file_path, slice_duration_ms=330):
    print("preprocessing %s" % audio_file_path)

    audio_signal = AudioSignal.from_wav_file(audio_file_path)
    mel_converter = MelConverter(audio_signal.get_sample_rate(), n_mel_freqs=128, freq_min_hz=0, freq_max_hz=4000)

    # Pad the signal so its length is a whole multiple of the STFT hop length.
    new_signal_length = int(math.ceil(
        float(audio_signal.get_number_of_samples()) / mel_converter.get_hop_length()
    )) * mel_converter.get_hop_length()
    audio_signal.pad_with_zeros(new_signal_length)

    mel_spectrogram = mel_converter.signal_to_mel_spectrogram(audio_signal)

    # Convert the slice duration into a number of spectrogram frames.
    samples_per_slice = int((float(slice_duration_ms) / 1000) * audio_signal.get_sample_rate())
    spectrogram_samples_per_slice = int(samples_per_slice / mel_converter.get_hop_length())
    n_slices = int(mel_spectrogram.shape[1] / spectrogram_samples_per_slice)

    # Cut the spectrogram into fixed-length slices and flatten each one.
    slices = [
        mel_spectrogram[:, (i * spectrogram_samples_per_slice):((i + 1) * spectrogram_samples_per_slice)].flatten()
        for i in range(n_slices)
    ]

    return np.stack(slices)
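
# A small shape-check sketch (the wav path is hypothetical); each row of the
# returned matrix is one flattened slice of 128 mel bands:
def example_audio_sample_shapes():
    slices = preprocess_audio_sample("data/speech/sample.wav", slice_duration_ms=330)
    print(slices.shape)  # (n_slices, 128 * spectrogram_samples_per_slice)
    return slices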
def evaluate(source_file_paths, estimated_file_paths):
    source_signals = [AudioSignal.from_wav_file(f) for f in source_file_paths]
    estimated_signals = [AudioSignal.from_wav_file(f) for f in estimated_file_paths]

    # Zero-pad every signal to the length of the longest one.
    signals = source_signals + estimated_signals
    max_length = max(signal.get_number_of_samples() for signal in signals)
    for signal in signals:
        signal.pad_with_zeros(max_length)

    source_data = np.stack([signal.get_data(channel_index=0) for signal in source_signals])
    estimated_data = np.stack([signal.get_data(channel_index=0) for signal in estimated_signals])

    return mir_eval.separation.bss_eval_sources(source_data, estimated_data, compute_permutation=True)
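
# mir_eval.separation.bss_eval_sources returns per-source SDR, SIR and SAR
# arrays plus the best source-to-estimate permutation; a usage sketch with
# hypothetical file paths:
def example_evaluate():
    sdr, sir, sar, perm = evaluate(
        ["data/source1.wav", "data/source2.wav"],
        ["data/estimated1.wav", "data/estimated2.wav"])
    print("SDR: %s, SIR: %s, SAR: %s, permutation: %s" % (sdr, sir, sar, perm))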
def evaluate(enhancement_dir):
    noisy_snr_dbs = []
    snr_dbs = []

    speaker_ids = os.listdir(enhancement_dir)
    for speaker_id in speaker_ids:
        for sample_dir_name in os.listdir(os.path.join(enhancement_dir, speaker_id)):
            print('evaluating snr of %s' % sample_dir_name)

            source_path = os.path.join(enhancement_dir, speaker_id, sample_dir_name, 'source.wav')
            mixture_path = os.path.join(enhancement_dir, speaker_id, sample_dir_name, 'mixture.wav')
            enhanced_path = os.path.join(enhancement_dir, speaker_id, sample_dir_name, 'enhanced.wav')

            source_signal = AudioSignal.from_wav_file(source_path)
            mixture_signal = AudioSignal.from_wav_file(mixture_path)
            enhanced_signal = AudioSignal.from_wav_file(enhanced_path)

            # SNR of the noisy mixture: the noise is the mixture minus the clean source.
            truncate_longer_signal(mixture_signal, source_signal)
            s = source_signal.get_data()
            n = mixture_signal.get_data() - source_signal.get_data()

            noisy_snr_db = 10 * np.log10(np.var(s) / np.var(n))
            print('noisy snr db: %f' % noisy_snr_db)
            noisy_snr_dbs.append(noisy_snr_db)

            # SNR after enhancement: the residual noise is the enhanced signal minus the source.
            truncate_longer_signal(enhanced_signal, source_signal)
            s = source_signal.get_data()
            e = enhanced_signal.get_data()
            residual_noise = e - s

            snr_db = 10 * np.log10(np.var(s) / np.var(residual_noise))
            print('snr db: %f' % snr_db)
            snr_dbs.append(snr_db)

    print('mean noisy snr db: %f' % np.mean(noisy_snr_dbs))
    print('mean snr db: %f' % np.mean(snr_dbs))
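
# truncate_longer_signal is used above but not defined in this listing; a
# minimal sketch of what it is assumed to do, trimming whichever signal is
# longer so that both end up with the same number of samples:
def truncate_longer_signal_sketch(signal_a, signal_b):
    n = min(signal_a.get_number_of_samples(), signal_b.get_number_of_samples())
    signal_a.truncate(n)
    signal_b.truncate(n)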
def start(args):
    # Initialize the network.
    assets = AssetManager(args.prediction_dir)
    storage = PredictionStorage(args.prediction_dir)

    network = SpeechEnhancementNetwork.load(assets.get_model_cache_path(args.model_dir))
    network.start_prediction_mode()

    # Prime the prediction and reconstruction paths with dummy inputs (warm-up),
    # so the first real frame is not delayed by lazy initialization.
    network.predict(np.zeros((2, 80, 24)), np.zeros((2, 128, 128, 6)))
    predicted_speech_signal = reconstruct_speech_signal(
        AudioSignal.from_wav_file("/cs/engproj/322/real_time/raw_data/mixture.wav"),
        np.zeros((2, 80, 24)), 30)

    with open(assets.get_normalization_cache_path(args.model_dir), 'rb') as normalization_fd:
        video_normalizer = pickle.load(normalization_fd)

    lock = Lock()
    video_dir = assets.get_video_cache_path(args.video_audio_dir)
    predict_object = RunPredict(network, video_dir, storage.storage_dir)

    # Run the video capture, audio capture, preprocessing and playback processes.
    video_queue = Queue()
    audio_queue = Queue()
    predict_queue = Queue()
    play_queue = Queue()

    video_object = VideoProcess(video_dir)
    video_thread = Process(target=video_object.capture_frames, args=(video_queue, lock))

    audio_object = AudioProcess(assets.get_audio_cache_path(args.video_audio_dir))
    audio_thread = Process(target=audio_object.capture_frames, args=(audio_queue, lock))

    preprocess_thread = Process(target=predict_object.run_pre_process,
                                args=(video_queue, audio_queue, predict_queue, video_normalizer, lock))
    play_thread = Process(target=predict_object.play, args=(play_queue, lock))

    video_thread.start()
    audio_thread.start()
    preprocess_thread.start()
    play_thread.start()

    # Run prediction on the main process while the workers stream data.
    predict_object.predict(predict_queue, play_queue, lock)

    video_thread.join()
    audio_thread.join()
    preprocess_thread.join()
    play_thread.join()

    # Save the output files.
    predict_object.save_files(storage)
    print("*Finish All*")
def build_speech_profile(speaker_speech_dir, max_files=50):
    print("building speech profile...")

    # Use up to max_files clean speech recordings of the speaker.
    speech_file_paths = [os.path.join(speaker_speech_dir, f) for f in os.listdir(speaker_speech_dir)][:max_files]
    speech_signals = [AudioSignal.from_wav_file(f) for f in speech_file_paths]

    mel_converter = MelConverter(speech_signals[0].get_sample_rate(), n_mel_freqs=128, freq_min_hz=0, freq_max_hz=None)
    speech_spectrograms = [mel_converter.signal_to_mel_spectrogram(signal) for signal in speech_signals]

    # Concatenate along the time axis: one long per-band sample of the speaker's speech.
    speech_profile = np.concatenate(speech_spectrograms, axis=1)
    return speech_profile
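
# A usage sketch connecting the profile to enhance_speech above; the
# directories and file names are hypothetical:
def example_enhance_with_profile():
    profile = build_speech_profile("data/speaker1/clean", max_files=50)
    mixed, enhanced = enhance_speech(
        "data/speaker1/speech.wav",
        "data/noise/babble.wav",
        "data/speaker1/predicted_speech.wav",
        profile)
    enhanced.save_to_wav_file("enhanced.wav")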
def separate_sources(source_file_paths, prediction_file_paths, separation_function):
    print("separating mixture of %s" % str(source_file_paths))

    source_signals = [AudioSignal.from_wav_file(f) for f in source_file_paths]
    prediction_signals = [AudioSignal.from_wav_file(f) for f in prediction_file_paths]

    # Zero-pad all signals to a common length before mixing and analysis.
    signals = source_signals + prediction_signals
    max_length = max(signal.get_number_of_samples() for signal in signals)
    for signal in signals:
        signal.pad_with_zeros(max_length)

    mixed_signal = AudioMixer.mix(source_signals)

    mel_converter = MelConverter(mixed_signal.get_sample_rate(), n_mel_freqs=128, freq_min_hz=0, freq_max_hz=None)
    mixed_spectrogram, original_phase = mel_converter.signal_to_mel_spectrogram(mixed_signal, get_phase=True)
    prediction_spectrograms = [mel_converter.signal_to_mel_spectrogram(signal) for signal in prediction_signals]

    # Build one mask per source from the predicted spectrograms, apply each mask
    # to the mixture, and reconstruct the separated signals with the mixture's phase.
    masks = generate_separation_masks(mixed_spectrogram, prediction_spectrograms, separation_function)
    separated_spectrograms = [mixed_spectrogram * mask for mask in masks]
    separated_signals = [
        mel_converter.reconstruct_signal_from_mel_spectrogram(s, original_phase)
        for s in separated_spectrograms
    ]

    return mixed_signal, separated_signals
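
# generate_separation_masks is called above but not defined in this listing; a
# minimal sketch of one plausible implementation, building a soft ratio mask
# per source from the predicted spectrograms (the separation_function argument
# is assumed to map the mixture and predictions to per-source masks):
def generate_separation_masks_sketch(mixed_spectrogram, prediction_spectrograms, separation_function=None):
    if separation_function is not None:
        return separation_function(mixed_spectrogram, prediction_spectrograms)
    stacked = np.stack(prediction_spectrograms)  # (n_sources, n_freqs, n_frames)
    # Default: ratio masks that sum to one across sources at each bin.
    total = stacked.sum(axis=0) + 1e-8  # avoid division by zero
    return [prediction / total for prediction in stacked]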
def preprocess_sample(speech_entry, noise_file_path, slice_duration_ms=200):
    print("preprocessing sample: %s, %s, %s..." % (speech_entry.video_path, speech_entry.audio_path, noise_file_path))

    mouth_height = 128
    mouth_width = 128

    # Video: crop the mouth region from every frame and slice the clip.
    print("preprocessing %s" % speech_entry.video_path)
    face_detector = FaceDetector()

    with VideoFileReader(speech_entry.video_path) as reader:
        frames = reader.read_all_frames(convert_to_gray_scale=True)

        # The videos are assumed to contain exactly 75 frames.
        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width, 75), dtype=np.float32)
        for i in range(75):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(slice_duration_ms / 1000 * reader.get_frame_rate())
        slices = [
            mouth_cropped_frames[:, :, (i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(int(75 / frames_per_slice))
        ]

        video_samples = np.stack(slices)
        video_frame_rate = reader.get_frame_rate()

    # Audio: mix the speech with noise at 0 dB SNR and slice the spectrograms.
    print("preprocessing pair: %s, %s" % (speech_entry.audio_path, noise_file_path))
    speech_signal = AudioSignal.from_wav_file(speech_entry.audio_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    # Tile the noise until it covers the speech, then truncate to the speech length.
    while noise_signal.get_number_of_samples() < speech_signal.get_number_of_samples():
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    noise_signal.amplify_by_factor(factor)

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal], mixing_weights=[1, 1])

    mixed_spectrograms = preprocess_audio_signal(mixed_signal, slice_duration_ms, video_samples.shape[0], video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal, slice_duration_ms, video_samples.shape[0], video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal, slice_duration_ms, video_samples.shape[0], video_frame_rate)

    # Keep only as many slices as both modalities can supply.
    n_slices = min(video_samples.shape[0], mixed_spectrograms.shape[0])

    return Sample(
        speaker_id=speech_entry.speaker_id,
        video_file_path=speech_entry.video_path,
        speech_file_path=speech_entry.audio_path,
        noise_file_path=noise_file_path,
        video_samples=video_samples[:n_slices],
        mixed_spectrograms=mixed_spectrograms[:n_slices],
        speech_spectrograms=speech_spectrograms[:n_slices],
        noise_spectrograms=noise_spectrograms[:n_slices],
        mixed_signal=mixed_signal,
        video_frame_rate=video_frame_rate)