def preprocess_video_sample(video_file_path, slice_duration_ms=330):
    print("preprocessing %s" % video_file_path)

    face_detector = FaceDetector()

    with VideoFileReader(video_file_path) as reader:
        # One 512-dimensional face embedding per video frame (vgg_model is expected
        # to be a module-level VGG-Face model).
        features = np.zeros(shape=(reader.get_frame_count(), 512), dtype=np.float32)

        for i in range(reader.get_frame_count()):
            frame = reader.read_next_frame()
            face = face_detector.crop_face(frame)
            face = cv2.resize(face, (224, 224))

            x = np.expand_dims(face, axis=0)
            x = x.astype(np.float32)

            # Subtract the per-channel VGG-Face training means.
            x[:, :, :, 0] -= 93.5940
            x[:, :, :, 1] -= 104.7624
            x[:, :, :, 2] -= 129.1863

            features[i, :] = vgg_model.predict(x)

        frames_per_slice = int(
            math.ceil((float(slice_duration_ms) / 1000) * reader.get_frame_rate()))
        n_slices = int(float(reader.get_frame_count()) / frames_per_slice)

        slices = [
            features[(i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(n_slices)
        ]

        return np.stack(slices)
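# Usage sketch (illustrative, not part of the original source). It assumes this
# module is importable, that `vgg_model` has been loaded at module level, and
# that the clip path below is only a placeholder.
video_slices = preprocess_video_sample("data/speaker_01/clip.mp4", slice_duration_ms=330)
print(video_slices.shape)  # expected: (n_slices, frames_per_slice, 512)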
def preprocess_video_sample(video_file_path, slice_duration_ms=330,
                            mouth_height=50, mouth_width=100):
    print("preprocessing %s" % video_file_path)

    face_detector = FaceDetector()

    with VideoFileReader(video_file_path) as reader:
        cropped_frames = np.zeros(
            shape=(reader.get_frame_count(), mouth_height, mouth_width, 3),
            dtype=np.float32)

        for i in range(reader.get_frame_count()):
            frame = reader.read_next_frame()
            cropped_frames[i, :] = face_detector.crop_mouth(
                frame, bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(
            math.ceil((float(slice_duration_ms) / 1000) * reader.get_frame_rate()))
        n_slices = int(float(reader.get_frame_count()) / frames_per_slice)

        slices = [
            cropped_frames[(i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(n_slices)
        ]

        return np.stack(slices)
def preprocess_video_sample(video_file_path, slice_duration_ms,
                            mouth_height=128, mouth_width=128):
    print("preprocessing %s" % video_file_path)

    face_detector = FaceDetector()

    with VideoFileReader(video_file_path) as reader:
        frames = reader.read_all_frames(convert_to_gray_scale=True)

        mouth_cropped_frames = np.zeros(
            shape=(mouth_height, mouth_width, reader.get_frame_count()),
            dtype=np.float32)

        for i in range(reader.get_frame_count()):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(
            (float(slice_duration_ms) / 1000) * reader.get_frame_rate())
        n_slices = int(float(reader.get_frame_count()) / frames_per_slice)

        slices = [
            mouth_cropped_frames[:, :, (i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(n_slices)
        ]

        return np.stack(slices), reader.get_frame_rate()
def create_pkl(mp4path):
    tgtpklpath = (mp4path.parent / (mp4path.stem + ".pkl"))

    if not tgtpklpath.exists():
        try:
            # Read the whole clip, convert to grayscale and drop the channel axis.
            images = rgb2gray(vread(mp4path)).astype(np.uint8).squeeze()

            face_detector = FaceDetector()
            faces = np.stack([
                face_detector.crop_mouth(image, bounding_box_shape=(220, 150))
                for image in images
            ], 0)

            faces.dump(tgtpklpath.as_posix())
        except Exception as e:
            print("{}: {}".format(tgtpklpath, e))
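# Usage sketch (illustrative, not part of the original source): cache mouth
# crops for every .mp4 under a placeholder dataset root. `create_pkl` expects a
# pathlib.Path-like argument, so one is built here.
from pathlib import Path

for mp4path in sorted(Path("dataset_root").glob("**/*.mp4")):
    create_pkl(mp4path)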
class RunPredict:
    """This class runs real-time prediction."""

    FRAMES_PER_SLICE = 6
    NUMBER_OF_FRAMES = 6
    # NUMBER_OF_FRAMES = 12
    # NUMBER_OF_FRAMES = 30
    # NUMBER_OF_FRAMES = 150
    NUMBER_OF_SLICES = int(NUMBER_OF_FRAMES / FRAMES_PER_SLICE)

    def __init__(self, network, video_path, storage_dir, width=128, height=128):
        print("*Initialize RunPredict class*\n")

        self.face_detector = FaceDetector()
        self.path_video_writer_path = os.path.join(storage_dir, "video_input_realtime.avi")
        self.video_path = video_path
        self.frame_size = (width, height)
        self.video_writer_frame_size = (1280, 720)
        self.fps, self.bounding_box = self.detect_bounding_box()
        self.open = True

        # Rolling buffer of grayscale mouth crops for the current batch of slices.
        self.slice_of_frames = np.zeros(
            shape=(height, width, RunPredict.NUMBER_OF_FRAMES), dtype=np.float32)
        self.frames_counter = 0

        self.slice_duration_ms = 200
        self.n_video_slices = RunPredict.NUMBER_OF_SLICES
        self.network = network

        self.save_predicted = []
        self.save_original = []
        self.save_signal = []

        self.num_iteration = 0
        self.sample_type_info = np.iinfo(np.int16)

    def detect_bounding_box(self):
        print("*Detect bounding box*")

        video_cap = cv2.VideoCapture(self.video_path)
        fps = video_cap.get(cv2.CAP_PROP_FPS)
        success, image = video_cap.read()

        try:
            if success:
                # The mouth bounding box from the first frame is reused for the whole run.
                bounding_box = self.face_detector.detect_mouth(
                    image, bounding_box_shape=self.frame_size)
                print("*Finish to create bounding box*\n")
            else:
                raise Exception('*Error - Detecting bounding box*')
        except Exception as error:
            print('Caught this error: ' + repr(error))
            raise
        finally:
            video_cap.release()
            cv2.destroyAllWindows()

        return fps, bounding_box

    def run_pre_process(self, v_queue, a_queue, predict_queue, video_normalizer, lock):
        with lock:
            print("*Start pre-process*\n")

        video_out = cv2.VideoWriter(self.path_video_writer_path,
                                    cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                    self.fps, self.frame_size)

        while self.open:
            video_frames_list, video_slice_number = v_queue.get()
            audio_frames_list, audio_slice_number = a_queue.get()

            # Video pre-process
            for frame in video_frames_list:
                im_crop = self.face_detector.crop_mouth(frame, self.bounding_box)
                video_out.write(im_crop)
                im_gray = cv2.cvtColor(im_crop, cv2.COLOR_BGR2GRAY)
                self.slice_of_frames[:, :, self.frames_counter] = im_gray
                self.frames_counter += 1

            slices = [
                self.slice_of_frames[:, :, (i * RunPredict.FRAMES_PER_SLICE):((i + 1) * RunPredict.FRAMES_PER_SLICE)]
                for i in range(RunPredict.NUMBER_OF_SLICES)
            ]
            slices = np.stack(slices)
            video_normalizer.normalize(slices)

            # Audio pre-process
            # np.frombuffer replaces the deprecated np.fromstring; copy() keeps the array writable.
            data = np.frombuffer(audio_frames_list, dtype=np.int16).copy()
            mixed_signal = AudioSignal(data, 16000)
            self.num_iteration += 1
            mixed_spectrograms = preprocess_audio_signal(
                mixed_signal, self.slice_duration_ms, self.n_video_slices, self.fps)

            # Predict
            predict_queue.put((slices, mixed_signal, mixed_spectrograms,
                               int(video_slice_number), int(audio_slice_number)))

            # Reset the frame buffer for the next batch of slices.
            self.slice_of_frames = np.zeros(
                shape=(128, 128, RunPredict.NUMBER_OF_FRAMES), dtype=np.float32)
            self.frames_counter = 0

            if (v_queue.empty() and a_queue.empty()) or audio_slice_number == 0 \
                    or video_slice_number == 0:
                with lock:
                    print("****************************************************************")
                    print("Video - slice number: " + str(video_slice_number))
                    print("Audio - slice number: " + str(audio_slice_number))
                    print("Predict - number of iterations: " + str(self.num_iteration))
                    print("****************************************************************")

                if not v_queue.empty():
                    video_slice_number -= 1
                    v_queue.get()
                elif not a_queue.empty():
                    audio_slice_number -= 1
                    a_queue.get()

                v_queue.close()
                a_queue.close()

                with lock:
                    print("*Video queue and Audio queue are empty*\n")

                break

    def predict(self, predict_queue, play_queue, lock):
        with lock:
            print("*Start Predict*\n")

        counter = 0
        while True:
            video_data, mixed_signal, mixed_spectrograms, video_slice_number, audio_slice_number = predict_queue.get()

            # Spectrogram Test
            # self.collect_frames_for_saving(mixed_signal, self.save_original, object_flag=False)
            # after_spectrograms = reconstruct_speech_signal(mixed_signal, mixed_spectrograms, self.fps)
            # self.collect_frames_for_saving(after_spectrograms, self.save_signal, object_flag=True)

            counter += 1
            try:
                predicted_speech_spectrograms = self.network.predict(mixed_spectrograms, video_data)
                predicted_speech_signal = reconstruct_speech_signal(
                    mixed_signal, predicted_speech_spectrograms, self.fps)

                self.collect_frames_for_saving(predicted_speech_signal,
                                               self.save_predicted, object_flag=True)

                play_queue.put((predicted_speech_signal, video_slice_number, audio_slice_number))

                if (video_slice_number == 0 and predict_queue.empty()) or \
                        (audio_slice_number == 0 and predict_queue.empty()):
                    with lock:
                        print("*Predict queue is empty*\n")
                        print("Predict: " + str(counter))

                    predict_queue.close()
                    break
            except Exception:
                logging.exception("Failed to predict")

    def play(self, play_queue, lock):
        # Live audio stream
        counter = 0
        with lock:
            print("*Start Play*\n")

        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
        # sd._initialize()

        while True:
            counter += 1
            predicted_speech_signal, video_slice_number, audio_slice_number = play_queue.get()

            clean_audio = predicted_speech_signal.get_data() \
                .clip(self.sample_type_info.min, self.sample_type_info.max).astype(np.int16)

            if counter == 1:
                with lock:
                    print("First play time: " + str(datetime.now()))

            stream.write(clean_audio.tobytes(), exception_on_underflow=False)
            # sd.play(clean_audio, 16000, blocking=True)

            if (video_slice_number == 0 and play_queue.empty()) or \
                    (audio_slice_number == 0 and play_queue.empty()):
                with lock:
                    print("*play queue is empty*\n")
                    print("Play: " + str(counter))

                play_queue.close()
                stream.stop_stream()
                stream.close()
                p.terminate()
                break

    def collect_frames_for_saving(self, data, list_name, object_flag=False):
        if object_flag:
            data = data.get_data().clip(self.sample_type_info.min,
                                        self.sample_type_info.max).astype(np.int16)
            list_name.append(data)
        else:
            list_name.append(data.get_data())

    # TODO: Move to PredictionStorage class
    def save_files(self, storage):
        rate = 16000
        print("*Saving wav files*")
        prediction_dir = storage.storage_dir

        # audio_input_real_time = os.path.join(prediction_dir, "audio_input_real_time.wav")
        video_input_realtime = os.path.join(prediction_dir, "video_input_realtime.avi")
        # audio_reconstruct_real_time = os.path.join(prediction_dir, "audio_reconstruct_real_time.wav")
        enhanced_real_time = os.path.join(prediction_dir, "enhanced_real_time.wav")
        enhanced_video_output = os.path.join(prediction_dir, "enhanced_video_output.avi")

        # wav.write(audio_input_real_time, rate, np.hstack(self.save_original))
        # wav.write(audio_reconstruct_real_time, rate, np.hstack(self.save_signal))
        wav.write(enhanced_real_time, rate, np.hstack(self.save_predicted))

        print("*Mixing*")
        cmd = "ffmpeg -i {} -i {} -codec copy {}".format(
            video_input_realtime, enhanced_real_time, enhanced_video_output)
        subprocess.call(cmd, shell=True)
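# Wiring sketch (illustrative only; the capture code that fills `v_queue` and
# `a_queue` with (frames, slice_number) tuples is not shown in this section).
# `network` and `video_normalizer` are assumed to be loaded elsewhere, and the
# paths are placeholders. multiprocessing.Queue is used here because the class
# methods call queue.close().
from multiprocessing import Lock, Process, Queue

v_queue, a_queue = Queue(), Queue()
predict_queue, play_queue = Queue(), Queue()
lock = Lock()

runner = RunPredict(network, video_path="input.avi", storage_dir="out")

stages = [
    Process(target=runner.run_pre_process,
            args=(v_queue, a_queue, predict_queue, video_normalizer, lock)),
    Process(target=runner.predict, args=(predict_queue, play_queue, lock)),
    Process(target=runner.play, args=(play_queue, lock)),
]
for p in stages:
    p.start()
for p in stages:
    p.join()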
def preprocess_sample(speech_entry, noise_file_path, slice_duration_ms=200):
    print("preprocessing sample: %s, %s, %s..." %
          (speech_entry.video_path, speech_entry.audio_path, noise_file_path))

    mouth_height = 128
    mouth_width = 128

    print("preprocessing %s" % speech_entry.video_path)
    face_detector = FaceDetector()

    with VideoFileReader(speech_entry.video_path) as reader:
        frames = reader.read_all_frames(convert_to_gray_scale=True)

        # Assumes fixed-length clips of 75 video frames.
        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width, 75), dtype=np.float32)
        for i in range(75):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(float(slice_duration_ms) / 1000 * reader.get_frame_rate())

        slices = [
            mouth_cropped_frames[:, :, (i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(int(75 / frames_per_slice))
        ]

        video_samples = np.stack(slices)
        video_frame_rate = reader.get_frame_rate()

    print("preprocessing pair: %s, %s" % (speech_entry.audio_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_entry.audio_path)
    print(noise_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)

    # Debug output.
    print(noise_signal.get_data())
    print(noise_signal.get_sample_rate())
    noise_signal.save_to_wav_file('./noise.wav')

    # Loop the noise until it is at least as long as the speech, then truncate.
    while noise_signal.get_number_of_samples() < speech_signal.get_number_of_samples():
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    # Scale the noise so the mixture has the requested SNR (0 dB here).
    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    # print(factor)
    noise_signal.amplify_by_factor(factor)
    # noise_signal.save_to_wav_file('./noise.wav')

    mixed_signal = AudioMixer.mix([speech_signal, noise_signal], mixing_weights=[1, 1])
    mixed_signal.save_to_wav_file('./mixed.wav')

    mixed_spectrograms = preprocess_audio_signal(mixed_signal, slice_duration_ms,
                                                 video_samples.shape[0], video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal, slice_duration_ms,
                                                  video_samples.shape[0], video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal, slice_duration_ms,
                                                 video_samples.shape[0], video_frame_rate)

    n_slices = min(video_samples.shape[0], mixed_spectrograms.shape[0])

    return Sample(speaker_id=speech_entry.speaker_id,
                  video_file_path=speech_entry.video_path,
                  speech_file_path=speech_entry.audio_path,
                  noise_file_path=noise_file_path,
                  video_samples=video_samples[:n_slices],
                  mixed_spectrograms=mixed_spectrograms[:n_slices],
                  speech_spectrograms=speech_spectrograms[:n_slices],
                  noise_spectrograms=noise_spectrograms[:n_slices],
                  mixed_signal=mixed_signal,
                  video_frame_rate=video_frame_rate)
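# Usage sketch (illustrative, not part of the original source). `SpeechEntry`
# is a hypothetical stand-in for the dataset entry type, which only needs
# speaker_id, video_path and audio_path attributes; the paths are placeholders.
from collections import namedtuple

SpeechEntry = namedtuple("SpeechEntry", ["speaker_id", "video_path", "audio_path"])

entry = SpeechEntry(speaker_id="s1",
                    video_path="data/s1/clip.mpg",
                    audio_path="data/s1/clip.wav")
sample = preprocess_sample(entry, noise_file_path="data/noise/babble.wav")
print(sample.mixed_spectrograms.shape)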