import cv2
from tqdm import trange

# Project-local classes (DetectionReader, EmotionRecognizer, Captioner,
# ClothesDetector, FaceRecognizer, Recognizer, DB, VideoReader, Media,
# Command, CommandType, MediaType, SceneSegmentator, Color, VideoOverlay,
# ImageOverlay, TextOverlay, draw_det_boxes, generate_thought_balloon_by_text,
# JokePicker, ObjectDetector, AgeGenderPredictor, BeautyEstimator,
# MainPersonDefiner, ContextGenerator, SpeechRecognizer) are imported from
# elsewhere in the project; their import paths are not part of this fragment.


class InteractionMaker:
    EMOTION_PROB_THRESH = 0

    def __init__(self):
        self.detection_reader = DetectionReader('detections.json')
        self.project_file_name = '/home/algernon/andro2'
        self.video_file_name = ''
        self.db_name = ''
        self.data_base = None
        self.video_maker = None
        self.db_user_name = 'root'
        self.db_user_pass = '******'
        self.db_host = 'localhost'
        self.commands = []
        self.output_video_file_name = 'output.mkv'
        self.video_reader = None
        self.video_writer = None
        self.emotion_detection_reader = DetectionReader('emotion_results/er.json')
        self.emotion_recognizer = EmotionRecognizer(self.EMOTION_PROB_THRESH)
        self.captioner = Captioner(
            '/home/algernon/a-PyTorch-Tutorial-to-Image-Captioning/weights/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar',
            '/home/algernon/a-PyTorch-Tutorial-to-Image-Captioning/weights/WORDMAP_coco_5_cap_per_img_5_min_word_freq.json')
        self.segmentator = None
        self.clothes_detector = ClothesDetector("yolo/df2cfg/yolov3-df2.cfg",
                                                "yolo/weights/yolov3-df2_15000.weights",
                                                "yolo/df2cfg/df2.names")
        self.face_recognizer = FaceRecognizer()
        self.open_project()
        self.recognizer = Recognizer(
            '/home/algernon/PycharmProjects/AIVlog/mmdetection/configs/pascal_voc/faster_rcnn_r50_fpn_1x_voc0712.py',
            '/home/algernon/PycharmProjects/AIVlog/mmdetection/work_dirs/faster_rcnn_r50_fpn_1x_voc0712/epoch_10.pth')

    def open_project(self):
        # The project file holds the video path and the database name.
        with open(self.project_file_name, 'r') as project_file:
            self.video_file_name = project_file.readline().strip()
            self.db_name = project_file.readline().strip()
        self.data_base = DB(self.db_host, self.db_user_name, self.db_user_pass, self.db_name)
        self.video_reader = VideoReader(self.video_file_name)
        self.video_writer = cv2.VideoWriter(self.output_video_file_name,
                                            cv2.VideoWriter_fourcc(*"XVID"),
                                            self.video_reader.fps,
                                            (self.video_reader.width, self.video_reader.height))
        self.segmentator = SceneSegmentator(self.video_reader.fps * 5)
        self.load_commands_from_db()

    def load_commands_from_db(self):
        # Load every command row and resolve its foreign keys to display names.
        cursor = self.data_base.exec_query("SELECT * FROM Command")
        while cursor.rownumber < cursor.rowcount:
            command_response = cursor.fetchone()
            query = "SELECT name FROM Labels WHERE label_id=%s"
            attached_character_class = self.data_base.exec_template_query(
                query, [command_response['attached_character_class']]).fetchone()['name']
            relation_class = ''
            if command_response['relation_class'] is not None:
                relation_class = self.data_base.exec_template_query(
                    query, [command_response['relation_class']]).fetchone()['name']
            media_response = self.data_base.exec_query(
                f"SELECT * FROM Media WHERE media_id={command_response['media_id']}").fetchone()
            media = Media(media_response['file_name'], media_response['type'], media_response['duration'])
            trigger_cmd_name = ''
            trigger_cmd_id = command_response['trigger_event_id']
            if trigger_cmd_id is not None:
                trigger_cmd_name = self.data_base.exec_query(
                    f"SELECT name FROM Command WHERE command_id={trigger_cmd_id}").fetchone()['name']
            delay = command_response['delay']
            emotion = ''
            emotion_id = command_response['expected_emotion_id']
            if emotion_id is not None:
                emotion = self.data_base.exec_query(
                    f"SELECT name FROM Emotion WHERE emotion_id={emotion_id}").fetchone()['name']
            command = Command(command_response['name'], command_response['centered'],
                              command_response['trigger_event_id'], attached_character_class,
                              relation_class, CommandType(command_response['command_type_id']),
                              trigger_cmd_name, media, command_response['duration'], delay, emotion)
            self.commands.append(command)
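    # open_project() expects the project file to contain two lines: the video
    # path first, then the database name. An illustrative example (these
    # values are made up, not from the original project):
    #
    #   /home/algernon/videos/episode01.mkv
    #   andro2_db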
    def process_commands(self):
        for _ in trange(self.video_reader.frame_count):
            frame = self.video_reader.get_next_frame()
            cur_frame_num = self.video_reader.cur_frame_num
            # emotion_detections = self.detect_emotions_on_frame(frame)
            emotion_detections = []
            # self.segmentator.push_frame(frame)
            clothes_detections = self.clothes_detector.detect_clothes(frame)
            self.draw_clothes(frame, clothes_detections)
            emotions_per_frame = []
            for emotion_pos, emotion in emotion_detections:
                emotions_per_frame.append((emotion_pos, emotion))
                self.draw_emotion_box(frame, emotion_pos, emotion)
            # _, object_detections_per_frame = self.recognizer.inference(frame)
            object_detections_per_frame = []
            draw_det_boxes(frame, object_detections_per_frame)
            labels_per_frame = [detection[0] for detection in object_detections_per_frame]
            states_needed_to_be_checked_on_event = [Command.State.WAITING,
                                                    Command.State.EXECUTING,
                                                    Command.State.AFTER_DELAYING]
            commands_needed_to_be_checked_on_event = [
                cmd for cmd in self.commands
                if cmd.cur_state in states_needed_to_be_checked_on_event]
            for command in commands_needed_to_be_checked_on_event:
                self.update_commands(command, object_detections_per_frame,
                                     emotions_per_frame, labels_per_frame)
            executing_commands = [cmd for cmd in self.commands
                                  if cmd.cur_state == cmd.State.EXECUTING]
            for active_cmd in executing_commands:
                active_cmd.exec(frame)
            delaying_commands = [cmd for cmd in self.commands
                                 if cmd.cur_state == cmd.State.DELAYING]
            for delaying_command in delaying_commands:
                if delaying_command.wait_out_delay():
                    delaying_command.set_as_after_delay()
            # self.show_caption(frame)
            cv2.imshow('frame', frame)
            self.video_writer.write(frame)
            cv2.waitKey(1)

    def show_caption(self, frame):
        most_clear_img = self.segmentator.get_most_clear_frame()
        caption = self.captioner.caption_img(most_clear_img)
        cv2.putText(frame, caption, (0, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, Color.GOLD, 2)
    def draw_clothes(self, frame, clothes_detections):
        # clothes_detections: [[label: str, ((x1, y1), (x2, y2)), prob], ...]
        font = cv2.FONT_HERSHEY_SIMPLEX
        color = Color.BLACK
        for label, ((x1, y1), (x2, y2)), prob in clothes_detections:
            text = f'{label} ({prob}%)'
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 3)
            # The label background is sized to the text; cv2 needs int coords.
            cv2.rectangle(frame, (x1 - 2, y1 - 25), (int(x1 + 8.5 * len(text)), y1), color, -1)
            cv2.putText(frame, text, (x1, y1 - 5), font, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

    def detect_emotions_on_frame(self, frame):
        # Returns a list of items of the following format:
        # ((lt_point: tuple, rb_point: tuple), (emotion: str, prob: int))
        detected_faces = self.face_recognizer.recognize_faces_on_image(frame)
        emotions = []
        for face_pos in detected_faces:
            (l, t), (r, b) = face_pos
            face_img = frame[t:b, l:r]
            emotion = self.emotion_recognizer.recognize_emotion_by_face(face_img)
            if emotion:
                emotions.append((face_pos, emotion))
        return emotions

    def draw_emotion_box(self, frame, emotion_pos, emotion: list):
        cv2.rectangle(frame, *emotion_pos, Color.GOLD, 2)
        font = cv2.FONT_HERSHEY_SIMPLEX
        cv2.putText(frame, f'{emotion[0]} - {emotion[1]}',
                    (emotion_pos[0][0], emotion_pos[0][1] - 5), font, 1, Color.YELLOW, 3)

    def update_commands(self, command, detections_per_frame, emotions_per_frame, labels_per_frame):
        # Dispatch to the event check matching the command's trigger type.
        if command.command_type == CommandType.OBJECTS_TRIGGER:
            self.check_object_on_the_screen_event(command, detections_per_frame, labels_per_frame)
        elif command.command_type == CommandType.REACTION_CHAIN_TRIGGER:
            self.check_reactions_chain_event(command, detections_per_frame, labels_per_frame)
        elif command.command_type == CommandType.EMOTION_TRIGGER:
            self.check_emotion_event(command, detections_per_frame, emotions_per_frame, labels_per_frame)
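    # Command lifecycle, as inferred from update_state and process_commands:
    # WAITING --trigger event--> DELAYING --delay elapsed--> AFTER_DELAYING
    # --event still holds--> EXECUTING; if the event no longer holds while the
    # command is AFTER_DELAYING, it falls back to WAITING.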
    def check_emotion_event(self, command: Command, objects_detections, emotion_detections, labels_per_frame):
        # emotion_detections format: [((start_point, end_point), (emotion, prob)), ...]
        # Check whether the main object is present.
        if command.attached_character_class in labels_per_frame:
            # Check whether the expected emotion was detected.
            expected_emotions = [emotion for emotion in emotion_detections
                                 if emotion[1][0] == command.emotion]
            # Check whether an emotion box lies inside the main object's box.
            main_object_box = self.get_coords(command, objects_detections, labels_per_frame)
            main_object_box = tuple(main_object_box[:2]), tuple(main_object_box[2:])
            emotion = [emotion for emotion in expected_emotions
                       if self.is_rect_inside_rect(emotion[0], main_object_box)]
            assert len(emotion) <= 1
            if emotion:
                coords = *emotion[0][0][0], *emotion[0][0][1]
                self.update_state(True, command, objects_detections, labels_per_frame, coords=coords)

    def is_rect_inside_rect(self, in_rect: tuple, out_rect: tuple):
        # Both rects are ((left, top), (right, bottom)); the inner rect is
        # inside when both of its corners fall within the outer one.
        lt_in_box_point_inside_out_box = all(
            out_rect[0][i] <= in_rect[0][i] <= out_rect[1][i] for i in range(2))
        rb_in_box_point_inside_out_box = all(
            out_rect[0][i] <= in_rect[1][i] <= out_rect[1][i] for i in range(2))
        return lt_in_box_point_inside_out_box and rb_in_box_point_inside_out_box

    def check_reactions_chain_event(self, command: Command, detections_per_frame, labels_per_frame):
        # The main object must be on screen and the trigger command active.
        if command.attached_character_class in labels_per_frame:
            active_command_names = [cmd.name for cmd in self.commands
                                    if cmd.cur_state == cmd.State.EXECUTING]
            event_happened = command.trigger_cmd_name in active_command_names
            self.update_state(event_happened, command, detections_per_frame, labels_per_frame)

    def check_object_on_the_screen_event(self, command: Command, detections_per_frame, labels_per_frame):
        desired_classes = {command.attached_character_class, command.relation_class}
        # The event fires when every desired label is on screen.
        event_happened = desired_classes.issubset(labels_per_frame)
        self.update_state(event_happened, command, detections_per_frame, labels_per_frame)

    def update_state(self, event_happened, command, detections_per_frame, labels_per_frame, coords=None):
        if event_happened:
            if command.cur_state == command.State.WAITING:
                command.set_as_delaying(self.video_reader.one_frame_duration)
                return
            coords = self.get_coords(command, detections_per_frame, labels_per_frame) if not coords else coords
            if command.cur_state == command.State.EXECUTING:
                command.overlay.set_coords(coords)
            # TODO: extract this overlay-creation part into its own method.
            if command.cur_state == command.State.AFTER_DELAYING:
                if command.media.type == MediaType.VIDEO:
                    command.overlay = self.generate_video_overlay(command, coords)
                elif command.media.type == MediaType.IMAGE:
                    command.overlay = self.generate_image_overlay_object(command, coords)
                elif command.media.type == MediaType.TEXT:
                    command.overlay = self.generate_text_overlay_object(command, coords)
                command.set_as_executing()
        elif command.cur_state == command.State.AFTER_DELAYING:
            command.set_as_waiting()

    @staticmethod
    def get_coords(command: Command, detections_per_frame, labels_per_frame):
        main_box = detections_per_frame[labels_per_frame.index(command.attached_character_class)][1]
        coords = main_box
        if command.centered:
            # Place the overlay at the midpoint between the two boxes' centers.
            secondary_box = detections_per_frame[labels_per_frame.index(command.relation_class)][1]
            main_box_center = [(main_box[i + 2] + main_box[i]) // 2 for i in range(2)]
            secondary_box_center = [(secondary_box[i + 2] + secondary_box[i]) // 2 for i in range(2)]
            boxes_center = [(main_box_center[i] + secondary_box_center[i]) // 2 for i in range(2)]
            coords = boxes_center
        return coords
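    # Worked example for get_coords (illustrative numbers): with a main box
    # (10, 20, 110, 220) and a secondary box (50, 60, 150, 260), the centers
    # are (60, 120) and (100, 160), so a centered command gets coords (80, 140).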
    def generate_video_overlay(self, command: Command, coords: tuple):
        video_cap = cv2.VideoCapture(command.media.file_name)
        # A command duration of 0 means "play the whole media file".
        duration = command.media.duration if command.duration == 0 else command.duration
        return VideoOverlay(video_cap, duration, coords, self.video_reader.one_frame_duration)

    def generate_image_overlay_object(self, command: Command, coords: tuple):
        image = cv2.imread(command.media.file_name)
        return ImageOverlay(image, command.duration, coords, self.video_reader.one_frame_duration)

    def generate_text_overlay_object(self, command: Command, coords: tuple):
        texts = self.read_text_from_file(command.media.file_name)
        ellipse, text_rect = generate_thought_balloon_by_text(texts)
        return TextOverlay((ellipse, text_rect), command.duration, coords, self.video_reader.one_frame_duration)

    def read_text_from_file(self, txt_file):
        with open(txt_file) as txt:
            texts = txt.readlines()
        return texts

    def close(self):
        if self.video_reader:
            self.video_reader.close()
        if self.video_writer:
            self.video_writer.release()
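
# Minimal usage sketch for InteractionMaker (illustrative; assumes the project
# file, database, and model weight paths configured in __init__ all exist):
#
#   maker = InteractionMaker()
#   try:
#       maker.process_commands()
#   finally:
#       maker.close()
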
class SalienceRecognizer:
    EMOTION_PROB_THRESH = 0
    DELAY_TO_DETECT_IN_SECS = 5
    FONT = cv2.FONT_HERSHEY_SIMPLEX

    def __init__(self):
        self.output_video_file_name = '/home/algernon/samba/video_queue/SalienceRecognizer/videos/processed/output2.mkv'
        self.emotion_recognizer = EmotionRecognizer(self.EMOTION_PROB_THRESH)
        self.segmentator = None
        # self.clothes_detector = ClothesDetector("yolo/df2cfg/yolov3-df2.cfg",
        #                                         "yolo/weights/yolov3-df2_15000.weights",
        #                                         "yolo/df2cfg/df2.names")
        self.video_file_name = '/home/algernon/Videos/source_videos/interview_anna.webm'
        self.captioner = Captioner()
        self.video_reader = VideoReader(self.video_file_name)
        self.joke_picker = JokePicker('joke_picker/shortjokes.csv', 'joke_picker/joke_picker.fse')
        self.video_writer = cv2.VideoWriter(self.output_video_file_name,
                                            cv2.VideoWriter_fourcc(*"XVID"),
                                            self.video_reader.fps,
                                            (self.video_reader.width, self.video_reader.height))
        self.face_recognizer = FaceRecognizer()
        self.segmentator = SceneSegmentator(self.video_reader.fps * self.DELAY_TO_DETECT_IN_SECS)
        self.object_detection_reader = DetectionReader('detections.json')
        self.object_detector = ObjectDetector(
            './configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml',
            'https://dl.fbaipublicfiles.com/detectron2/COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl')
        self.age_gender_predictor = AgeGenderPredictor()
        self.beauty_estimator = BeautyEstimator('/home/algernon/DNNS/isBeauty/weights/epoch_50.pkl')
        self.main_person_definer = MainPersonDefiner()
        self.context_generator = ContextGenerator(self.object_detector.classes)
        self.speech_recognizer = SpeechRecognizer(self.video_file_name)

    def launch(self):
        for _ in trange(self.video_reader.frame_count):
            frame = self.video_reader.get_next_frame()
            frame_for_detection = self.get_clearest_frame(frame)
            # When no sufficiently clear frame is available yet, reuse the
            # detections from the previous pass.
            use_prev_detects = frame_for_detection is None
            speech = self.speech_recognizer.rewind_audio(self.video_reader.get_cur_timestamp())
            if not use_prev_detects:
                # faces
                detected_faces = self.face_recognizer.recognize_faces_on_image(
                    frame_for_detection, get_prev=use_prev_detects)
                # main person
                main_person_index = self.main_person_definer.get_main_person_by_face_size(
                    detected_faces, get_prev=use_prev_detects)
                # emotions
                emotion_detections = self.emotion_recognizer.detect_emotions_on_frame(
                    frame_for_detection, detected_faces, get_prev=use_prev_detects)
                emotion = [emotion_detections[main_person_index]] if main_person_index is not None else []
                # age and gender
                age_gender_predictions = self.age_gender_predictor.detect_age_dender_by_faces(
                    frame_for_detection, detected_faces, get_prev=use_prev_detects)
                age_gender_prediction = [age_gender_predictions[main_person_index]] if main_person_index is not None else []
                # beauty
                beauty_scores = self.beauty_estimator.estimate_beauty_by_face(
                    frame_for_detection, detected_faces, get_prev=use_prev_detects)
                beauty_score = [beauty_scores[main_person_index]] if main_person_index is not None else []
                # clothes
                # clothes_detections = self.clothes_detector.detect_clothes(frame)
                # clothes_detections = []
                # self.draw_clothes(frame, clothes_detections)
                # caption
                # caption = self.captioner.caption_img(frame_for_detection, get_prev=use_prev_detects)
                # objects
                object_detections = self.object_detector.forward(
                    frame_for_detection, get_prev=use_prev_detects)
                # object_detections = self.object_detection_reader.get_detections_per_specified_frame(cur_frame_num)
                # context = self.context_generator.generate_context(object_detections, caption, emotion,
                #                                                   age_gender_prediction, beauty_score,
                #                                                   get_prev=use_prev_detects)
                # cv2.putText(frame, 'Context:', (0, 360), cv2.FONT_HERSHEY_SIMPLEX, 0.8, Color.GOLD, 2)
                # cv2.putText(frame, context, (0, 400), cv2.FONT_HERSHEY_SIMPLEX, 0.8, Color.GOLD, 2)
                # jokes = self.joke_picker.pick_jokes_by_context(context, get_prev=use_prev_detects)
                # self.apply_jokes_on_frame(jokes, frame)

            # When use_prev_detects is True, these names keep the values bound
            # on the previous loop iteration.
            self.apply_emotions_on_frame(frame, emotion_detections)
            self.apply_beauty_scores_on_frame(frame, detected_faces, beauty_scores)
            self.apply_age_gender_on_frame(frame, detected_faces, age_gender_predictions)
            # self.apply_caption_on_frame(frame, caption)
            frame = self.object_detector.draw_boxes(frame, object_detections)
            cv2.imshow('frame', frame)
            self.video_writer.write(frame)
            cv2.waitKey(1)

    def apply_age_gender_on_frame(self, frame, faces, age_gender_predictions):
        for i, face in enumerate(faces):
            tl_x, tl_y = face[0]
            cv2.putText(frame, f'{age_gender_predictions[i]}', (tl_x - 5, tl_y + 60),
                        self.FONT, 1, Color.BLACK, 2)

    def apply_beauty_scores_on_frame(self, frame, faces, beauty_scores):
        for i, face in enumerate(faces):
            tl_x, tl_y = face[0]
            cv2.putText(frame,
                        f'{ContextGenerator.beauty_score2desc(beauty_scores[i])} ({beauty_scores[i]})',
                        (tl_x - 5, tl_y + 30), self.FONT, 1, Color.BLACK, 2)

    def apply_jokes_on_frame(self, jokes, frame):
        # Stack the jokes upward from the bottom edge of the frame.
        height = frame.shape[0]
        joke_height = 40
        joke_y = height - joke_height * len(jokes)
        for joke in jokes:
            cv2.putText(frame, joke, (0, joke_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, Color.GOLD, 2)
            joke_y += joke_height

    def apply_emotions_on_frame(self, frame, emotion_detections):
        for emotion_pos, emotion in emotion_detections:
            self.draw_emotion_box(frame, emotion_pos, emotion)

    def get_clearest_frame(self, frame):
        self.segmentator.push_frame(frame)
        most_clear_img = self.segmentator.get_most_clear_frame()
        return most_clear_img

    def apply_caption_on_frame(self, frame, caption):
        cv2.putText(frame, caption, (0, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, Color.BLUE, 2)
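    # SceneSegmentator is not shown in this fragment. A common sharpness
    # ranking that get_most_clear_frame() could plausibly rely on is variance
    # of the Laplacian; a minimal sketch under that assumption (not the
    # project's actual implementation):
    #
    #   def sharpness(img):
    #       gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    #       return cv2.Laplacian(gray, cv2.CV_64F).var()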
    def draw_clothes(self, frame, clothes_detections):
        # clothes_detections: [[label: str, ((x1, y1), (x2, y2)), prob], ...]
        color = Color.BLACK
        for label, ((x1, y1), (x2, y2)), prob in clothes_detections:
            text = f'{label} ({prob}%)'
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 3)
            # The label background is sized to the text; cv2 needs int coords.
            cv2.rectangle(frame, (x1 - 2, y1 - 25), (int(x1 + 8.5 * len(text)), y1), color, -1)
            cv2.putText(frame, text, (x1, y1 - 5), self.FONT, 0.5, Color.WHITE, 1, cv2.LINE_AA)

    def draw_emotion_box(self, frame, emotion_pos, emotion: list):
        cv2.rectangle(frame, *emotion_pos, Color.GOLD, 2)
        cv2.putText(frame, f'{emotion[0]} - {emotion[1]}',
                    (emotion_pos[0][0], emotion_pos[0][1] - 5), self.FONT, 1, Color.BLACK, 3)

    def is_rect_inside_rect(self, in_rect: tuple, out_rect: tuple):
        # Both rects are ((left, top), (right, bottom)); the inner rect is
        # inside when both of its corners fall within the outer one.
        lt_in_box_point_inside_out_box = all(
            out_rect[0][i] <= in_rect[0][i] <= out_rect[1][i] for i in range(2))
        rb_in_box_point_inside_out_box = all(
            out_rect[0][i] <= in_rect[1][i] <= out_rect[1][i] for i in range(2))
        return lt_in_box_point_inside_out_box and rb_in_box_point_inside_out_box

    # def generate_video_overlay(self, command: Command, coords: tuple):
    #     video_cap = cv2.VideoCapture(command.media.file_name)
    #     duration = command.media.duration if command.duration == 0 else command.duration
    #     return VideoOverlay(video_cap, duration, coords, self.video_reader.one_frame_duration)
    #
    # def generate_image_overlay_object(self, command: Command, coords: tuple):
    #     image = cv2.imread(command.media.file_name)
    #     return ImageOverlay(image, command.duration, coords, self.video_reader.one_frame_duration)
    #
    # def generate_text_overlay_object(self, command: Command, coords: tuple):
    #     texts = self.read_text_from_file(command.media.file_name)
    #     ellipse, text_rect = generate_thought_balloon_by_text(texts)
    #     return TextOverlay((ellipse, text_rect), command.duration, coords, self.video_reader.one_frame_duration)

    # def read_text_from_file(self, txt_file):
    #     with open(txt_file) as txt:
    #         texts = txt.readlines()
    #     return texts

    def close(self):
        if self.video_reader:
            self.video_reader.close()
        if self.video_writer:
            self.video_writer.release()
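
# Minimal driver (a sketch; assumes the hard-coded video, weight, and output
# paths in SalienceRecognizer.__init__ exist on the target machine):
if __name__ == '__main__':
    recognizer = SalienceRecognizer()
    try:
        recognizer.launch()
    finally:
        recognizer.close()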