# Shared imports for the snippets below (the project-level helpers --
# FaceDetector, VideoFileReader, AudioSignal, AudioMixer, Sample,
# preprocess_audio_signal, reconstruct_speech_signal -- are assumed to come
# from the surrounding package).
import logging
import math
import os
import subprocess
from datetime import datetime

import cv2
import numpy as np
import pyaudio
import scipy.io.wavfile as wav


def preprocess_video_sample(video_file_path, slice_duration_ms=330):
    print("preprocessing %s" % video_file_path)

    face_detector = FaceDetector()

    with VideoFileReader(video_file_path) as reader:
        features = np.zeros(shape=(reader.get_frame_count(), 512),
                            dtype=np.float32)
        for i in range(reader.get_frame_count()):
            frame = reader.read_next_frame()

            face = face_detector.crop_face(frame)
            face = cv2.resize(face, (224, 224))

            x = np.expand_dims(face, axis=0)
            x = x.astype(np.float32)

            x[:, :, :, 0] -= 93.5940
            x[:, :, :, 1] -= 104.7624
            x[:, :, :, 2] -= 129.1863

            features[i, :] = vgg_model.predict(x)

        frames_per_slice = int(
            math.ceil(
                (float(slice_duration_ms) / 1000) * reader.get_frame_rate()))
        n_slices = int(float(reader.get_frame_count()) / frames_per_slice)

        slices = [
            features[(i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(n_slices)
        ]

        return np.stack(slices)
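# Hedged usage sketch for the variant above: `vgg_model` is assumed to be a
# module-level 512-d face embedding network, and the per-channel constants
# subtracted above match the keras-vggface preprocessing means. One plausible
# (unconfirmed) choice for the model is the VGG16 backbone with global average
# pooling:
#
#     from keras_vggface.vggface import VGGFace
#     vgg_model = VGGFace(include_top=False, input_shape=(224, 224, 3), pooling='avg')
#
#     video_slices = preprocess_video_sample("speaker_01.mp4", slice_duration_ms=330)
#     # video_slices.shape -> (n_slices, frames_per_slice, 512)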
def preprocess_video_sample(video_file_path,
                            slice_duration_ms=330,
                            mouth_height=50,
                            mouth_width=100):
    print("preprocessing %s" % video_file_path)

    face_detector = FaceDetector()

    with VideoFileReader(video_file_path) as reader:
        cropped_frames = np.zeros(shape=(reader.get_frame_count(),
                                         mouth_height, mouth_width, 3),
                                  dtype=np.float32)

        for i in range(reader.get_frame_count()):
            frame = reader.read_next_frame()
            cropped_frames[i, :] = face_detector.crop_mouth(
                frame, bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(
            math.ceil(
                (float(slice_duration_ms) / 1000) * reader.get_frame_rate()))
        n_slices = int(float(reader.get_frame_count()) / frames_per_slice)

        slices = [
            cropped_frames[(i * frames_per_slice):((i + 1) * frames_per_slice)]
            for i in range(n_slices)
        ]

        return np.stack(slices)
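# Worked example of the slicing arithmetic above (illustrative numbers): at
# 25 fps with slice_duration_ms=330, frames_per_slice = ceil(0.33 * 25) = 9, so
# a 75-frame clip yields n_slices = 75 // 9 = 8 and the returned array has shape
# (8, 9, mouth_height, mouth_width, 3); the trailing 75 - 8 * 9 = 3 frames are
# dropped.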
def preprocess_video_sample(video_file_path,
                            slice_duration_ms,
                            mouth_height=128,
                            mouth_width=128):
    print("preprocessing %s" % video_file_path)

    face_detector = FaceDetector()

    with VideoFileReader(video_file_path) as reader:
        frames = reader.read_all_frames(convert_to_gray_scale=True)

        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width,
                                               reader.get_frame_count()),
                                        dtype=np.float32)
        for i in range(reader.get_frame_count()):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(
            (float(slice_duration_ms) / 1000) * reader.get_frame_rate())
        n_slices = int(float(reader.get_frame_count()) / frames_per_slice)

        slices = [
            mouth_cropped_frames[:, :,
                                 (i * frames_per_slice):((i + 1) *
                                                         frames_per_slice)]
            for i in range(n_slices)
        ]

        return np.stack(slices), reader.get_frame_rate()
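# Note on this variant: mouth crops are stored with time on the last axis
# (mouth_height, mouth_width, n_frames) and sliced along that axis, so np.stack
# returns shape (n_slices, mouth_height, mouth_width, frames_per_slice) together
# with the frame rate; frames_per_slice here uses int() rather than math.ceil(),
# so it truncates instead of rounding up.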
def create_pkl(mp4path):
    tgtpklpath = (mp4path.parent / (mp4path.stem + ".pkl"))
    if not tgtpklpath.exists():
        try:
            images = rgb2gray(vread(mp4path)).astype(np.uint8).squeeze()
            face_detector = FaceDetector()
            faces = np.stack([
                face_detector.crop_mouth(image, bounding_box_shape=(220, 150))
                for image in images
            ], 0)
            faces.dump(tgtpklpath.as_posix())
        except Exception as e:
            print("{}: {}".format(tgtpklpath, e))
class RunPredict:
    """
    This class runs real time predict.
    """

    FRAMES_PER_SLICE = 6

    NUMBER_OF_FRAMES = 6
    # NUMBER_OF_FRAMES = 12
    # NUMBER_OF_FRAMES = 30
    # NUMBER_OF_FRAMES = 150

    NUMBER_OF_SLICES = int(NUMBER_OF_FRAMES / FRAMES_PER_SLICE)
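    # With the defaults above, each pre-processing iteration buffers
    # NUMBER_OF_FRAMES = 6 mouth crops and splits them into
    # NUMBER_OF_SLICES = 1 slice of FRAMES_PER_SLICE = 6 frames; this matches
    # the 200 ms slice_duration_ms set in __init__ assuming a 30 fps source
    # (0.2 s * 30 fps = 6 frames).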

    def __init__(self,
                 network,
                 video_path,
                 storage_dir,
                 width=128,
                 height=128):

        print("*Initialize RunPredict class*\n")

        self.face_detector = FaceDetector()
        self.path_video_writer_path = os.path.join(storage_dir,
                                                   "video_input_realtime.avi")
        self.video_path = video_path
        self.frame_size = (width, height)
        self.video_writer_frame_size = (1280, 720)
        self.fps, self.bounding_box = self.detect_bounding_box()
        self.open = True
        self.slice_of_frames = \
            np.zeros(shape=(height, width, RunPredict.NUMBER_OF_FRAMES), dtype=np.float32)
        self.frames_counter = 0
        self.slice_duration_ms = 200
        self.n_video_slices = RunPredict.NUMBER_OF_SLICES
        self.network = network
        self.save_predicted = []
        self.save_original = []
        self.save_signal = []
        self.num_iteration = 0
        self.sample_type_info = np.iinfo(np.int16)

    def detect_bounding_box(self):

        print("*Detect bounding box*")

        video_cap = cv2.VideoCapture(self.video_path)
        fps = video_cap.get(cv2.CAP_PROP_FPS)
        success, image = video_cap.read()

        try:
            if success:
                bounding_box = self.face_detector.detect_mouth(
                    image, bounding_box_shape=self.frame_size)
                print("*Finish to create bounding box*\n")

            else:
                raise Exception('*Error - Detecting bounding box*')

        except Exception as error:
            print('Caught this error: ' + repr(error))
            raise

        finally:
            video_cap.release()
            cv2.destroyAllWindows()

        return fps, bounding_box

    def run_pre_process(self, v_queue, a_queue, predict_queue,
                        video_normalizer, lock):

        with lock:
            print("*Start pre-process*\n")

        video_out = cv2.VideoWriter(self.path_video_writer_path,
                                    cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                    self.fps, self.frame_size)

        while self.open:

            video_frames_list, video_slice_number = v_queue.get()
            audio_frames_list, audio_slice_number = a_queue.get()

            # Video pre-process
            for frame in video_frames_list:
                im_crop = self.face_detector.crop_mouth(
                    frame, self.bounding_box)
                video_out.write(im_crop)

                im_gray = cv2.cvtColor(im_crop, cv2.COLOR_BGR2GRAY)
                self.slice_of_frames[:, :, self.frames_counter] = im_gray
                self.frames_counter += 1

            slices = [
                self.slice_of_frames[:, :, (i * RunPredict.FRAMES_PER_SLICE):(
                    (i + 1) * RunPredict.FRAMES_PER_SLICE)]
                for i in range(RunPredict.NUMBER_OF_SLICES)
            ]

            slices = np.stack(slices)
            video_normalizer.normalize(slices)

            # Audio pre-process (np.fromstring is deprecated for binary input;
            # frombuffer + copy gives an equivalent writable int16 array)
            data = np.frombuffer(audio_frames_list, dtype=np.int16).copy()
            mixed_signal = AudioSignal(data, 16000)

            self.num_iteration += 1

            mixed_spectrograms = preprocess_audio_signal(
                mixed_signal, self.slice_duration_ms, self.n_video_slices,
                self.fps)

            # Predict
            predict_queue.put(
                (slices, mixed_signal, mixed_spectrograms,
                 int(video_slice_number), int(audio_slice_number)))

            self.slice_of_frames = np.zeros(
                shape=(self.frame_size[1], self.frame_size[0],
                       RunPredict.NUMBER_OF_FRAMES),
                dtype=np.float32)
            self.frames_counter = 0

            if (v_queue.empty() and a_queue.empty()) or audio_slice_number == 0\
                    or video_slice_number == 0:

                with lock:
                    print(
                        "****************************************************************"
                    )
                    print("Video - slice number: " + str(video_slice_number))
                    print("Audio - slice number: " + str(audio_slice_number))
                    print("Predict - number of iterations: " +
                          str(self.num_iteration))
                    print(
                        "****************************************************************"
                    )

                if not v_queue.empty():
                    video_slice_number -= 1
                    v_queue.get()

                elif not a_queue.empty():
                    audio_slice_number -= 1
                    a_queue.get()

                v_queue.close()
                a_queue.close()

                with lock:
                    print("*Video queue and Audio queue are empty*\n")
                break

    def predict(self, predict_queue, play_queue, lock):

        with lock:
            print("*Start Predict*\n")

        counter = 0
        while True:

            (video_data, mixed_signal, mixed_spectrograms, video_slice_number,
             audio_slice_number) = predict_queue.get()

            # Spectrogram Test
            # self.collect_frames_for_saving(mixed_signal, self.save_original, object_flag=False)
            # after_spectrograms = reconstruct_speech_signal(mixed_signal, mixed_spectrograms, self.fps)
            # self.collect_frames_for_saving(after_spectrograms, self.save_signal, object_flag=True)

            counter += 1
            try:

                predicted_speech_spectrograms = self.network.predict(
                    mixed_spectrograms, video_data)
                predicted_speech_signal = reconstruct_speech_signal(
                    mixed_signal, predicted_speech_spectrograms, self.fps)

                self.collect_frames_for_saving(predicted_speech_signal,
                                               self.save_predicted,
                                               object_flag=True)

                play_queue.put((predicted_speech_signal, video_slice_number,
                                audio_slice_number))

                if (video_slice_number == 0 and predict_queue.empty()) or\
                        (audio_slice_number == 0 and predict_queue.empty()):
                    with lock:
                        print("*Predict queue is empty*\n")
                        print("Predict: " + str(counter))
                    predict_queue.close()
                    break

            except Exception:
                logging.exception("Failed to predict")

    def play(self, play_queue, lock):

        # Live audio stream
        counter = 0
        with lock:
            print("*Start Play*\n")

        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=16000,
                        output=True)

        # sd._initialize()

        while True:
            counter += 1

            (predicted_speech_signal, video_slice_number,
             audio_slice_number) = play_queue.get()
            clean_audio = predicted_speech_signal.get_data() \
                .clip(self.sample_type_info.min, self.sample_type_info.max).astype(np.int16)

            if counter == 1:
                with lock:
                    print("First play time: " + str(datetime.now()))

            stream.write(clean_audio.tobytes(), exception_on_underflow=False)

            # sd.play(clean_audio, 16000, blocking=True)

            if (video_slice_number == 0
                    and play_queue.empty()) or (audio_slice_number == 0
                                                and play_queue.empty()):
                with lock:
                    print("*play queue is empty*\n")
                    print("Play: " + str(counter))
                play_queue.close()

                stream.stop_stream()
                stream.close()
                p.terminate()

                break

    def collect_frames_for_saving(self, data, list_name, object_flag=False):
        if object_flag:
            data = data.get_data().clip(self.sample_type_info.min,
                                        self.sample_type_info.max).astype(
                                            np.int16)
            list_name.append(data)
        else:
            list_name.append(data.get_data())

    # TODO: Move to PredictionStorage class
    def save_files(self, storage):

        rate = 16000

        print("*Saving wav files*")

        prediction_dir = storage.storage_dir

        # audio_input_real_time = os.path.join(prediction_dir, "audio_input_real_time.wav")
        video_input_realtime = os.path.join(prediction_dir,
                                            "video_input_realtime.avi")
        # audio_reconstruct_real_time = os.path.join(prediction_dir, "audio_reconstruct_real_time.wav")
        enhanced_real_time = os.path.join(prediction_dir,
                                          "enhanced_real_time.wav")
        enhanced_video_output = os.path.join(prediction_dir,
                                             "enhanced_video_output.avi")

        # wav.write(audio_input_real_time, rate, np.hstack(self.save_original))
        # wav.write(audio_reconstruct_real_time, rate, np.hstack(self.save_signal))
        wav.write(enhanced_real_time, rate, np.hstack(self.save_predicted))

        print("*Mixing*")

        cmd = "ffmpeg -i {} -i {} -codec copy {}".format(
            video_input_realtime, enhanced_real_time, enhanced_video_output)

        subprocess.call(cmd, shell=True)
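        # The mux relies on ffmpeg taking the video stream from the first input
        # and the audio stream from the second. An equivalent, more explicit
        # form (an assumption, not part of the original pipeline) would be:
        #
        #     ffmpeg -i video_input_realtime.avi -i enhanced_real_time.wav \
        #            -map 0:v -map 1:a -codec copy enhanced_video_output.avi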
def preprocess_sample(speech_entry, noise_file_path, slice_duration_ms=200):
    print("preprocessing sample: %s, %s, %s..." %
          (speech_entry.video_path, speech_entry.audio_path, noise_file_path))

    mouth_height = 128
    mouth_width = 128

    print("preprocessing %s" % speech_entry.video_path)

    face_detector = FaceDetector()

    with VideoFileReader(speech_entry.video_path) as reader:

        frames = reader.read_all_frames(convert_to_gray_scale=True)

        mouth_cropped_frames = np.zeros(shape=(mouth_height, mouth_width, 75),
                                        dtype=np.float32)
        for i in range(75):
            mouth_cropped_frames[:, :, i] = face_detector.crop_mouth(
                frames[i], bounding_box_shape=(mouth_width, mouth_height))

        frames_per_slice = int(slice_duration_ms / 1000 *
                               reader.get_frame_rate())

        slices = [
            mouth_cropped_frames[:, :,
                                 (i * frames_per_slice):((i + 1) *
                                                         frames_per_slice)]
            for i in range(int(75 / frames_per_slice))
        ]

        video_samples = np.stack(slices)
        video_frame_rate = reader.get_frame_rate()

    print("preprocessing pair: %s, %s" %
          (speech_entry.audio_path, noise_file_path))

    speech_signal = AudioSignal.from_wav_file(speech_entry.audio_path)
    print(noise_file_path)
    noise_signal = AudioSignal.from_wav_file(noise_file_path)
    print(noise_signal.get_data())
    print(noise_signal.get_sample_rate())
    noise_signal.save_to_wav_file('./noise.wav')
    while (noise_signal.get_number_of_samples() <
           speech_signal.get_number_of_samples()):
        noise_signal = AudioSignal.concat([noise_signal, noise_signal])

    noise_signal.truncate(speech_signal.get_number_of_samples())

    factor = AudioMixer.snr_factor(speech_signal, noise_signal, snr_db=0)
    # print(factor)
    noise_signal.amplify_by_factor(factor)

    #noise_signal.save_to_wav_file('./noise.wav')
    mixed_signal = AudioMixer.mix([speech_signal, noise_signal],
                                  mixing_weights=[1, 1])
    mixed_signal.save_to_wav_file('./mixed.wav')
    mixed_spectrograms = preprocess_audio_signal(mixed_signal,
                                                 slice_duration_ms,
                                                 video_samples.shape[0],
                                                 video_frame_rate)
    speech_spectrograms = preprocess_audio_signal(speech_signal,
                                                  slice_duration_ms,
                                                  video_samples.shape[0],
                                                  video_frame_rate)
    noise_spectrograms = preprocess_audio_signal(noise_signal,
                                                 slice_duration_ms,
                                                 video_samples.shape[0],
                                                 video_frame_rate)

    n_slices = min(video_samples.shape[0], mixed_spectrograms.shape[0])

    return Sample(speaker_id=speech_entry.speaker_id,
                  video_file_path=speech_entry.video_path,
                  speech_file_path=speech_entry.audio_path,
                  noise_file_path=noise_file_path,
                  video_samples=video_samples[:n_slices],
                  mixed_spectrograms=mixed_spectrograms[:n_slices],
                  speech_spectrograms=speech_spectrograms[:n_slices],
                  noise_spectrograms=noise_spectrograms[:n_slices],
                  mixed_signal=mixed_signal,
                  video_frame_rate=video_frame_rate)
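

# Hedged usage sketch: any object with speaker_id / video_path / audio_path
# attributes satisfies the accesses in preprocess_sample; a namedtuple and the
# file paths below are used purely for illustration.
#
#     from collections import namedtuple
#     SpeechEntry = namedtuple("SpeechEntry", ["speaker_id", "video_path", "audio_path"])
#     entry = SpeechEntry("s1", "s1/clip_00.mp4", "s1/clip_00.wav")
#     sample = preprocess_sample(entry, "noise/babble.wav", slice_duration_ms=200)
#     # sample.video_samples.shape -> (n_slices, 128, 128, frames_per_slice)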