def three_people():
    """Return Esper results for frames showing three large faces in a row.

    Every ``Face`` row is loaded as a one-frame interval carrying its bounding
    box; the intervals are then coalesced with ``payload_plus`` so the boxes
    end up together in one list payload. Each payload is matched against a
    three-node scene graph (``exact=True``): every face must be at least
    ``MIN_FACE_HEIGHT`` tall, face1 is left of face2, face2 is left of face3,
    and neighbouring faces have matching ``y1`` within ``EPSILON``.

    ``F`` is not imported here and has to be available at module scope.

    Returns:
        The output of ``intrvllists_to_result_bbox`` for the matching
        intervals, called with ``limit=100`` and ``stride=100``.
    """
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.bbox_predicates import height_at_least, left_of, same_value

    MIN_FACE_HEIGHT = 0.3
    EPSILON = 0.05

    # Give each face row the start frame, end frame and video id that
    # from_django_qs reads.
    annotated_faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    # NOTE: slow, because every face in the table is materialized.
    bbox_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    single_faces = VideoIntervalCollection.from_django_qs(
        annotated_faces, with_payload=bbox_parser)
    face_lists = single_faces.coalesce(payload_merge_op=payload_plus)

    # Three nodes with the same size constraint, chained left-to-right.
    face_names = ('face1', 'face2', 'face3')
    three_people_scene_graph = {
        'nodes': [
            {
                'name': name,
                'predicates': [height_at_least(MIN_FACE_HEIGHT)],
            }
            for name in face_names
        ],
        'edges': [
            {
                'start': left_name,
                'end': right_name,
                'predicates': [left_of(),
                               same_value('y1', epsilon=EPSILON)],
            }
            for left_name, right_name in zip(face_names, face_names[1:])
        ],
    }

    matches_layout = payload_satisfies(
        scene_graph(three_people_scene_graph, exact=True))
    three_people = face_lists.filter(matches_layout)

    # Post-process to display in Esper widget
    return intrvllists_to_result_bbox(
        three_people.get_allintervals(), limit=100, stride=100)
def man_woman_up_close():
    """Return Esper results for frames with one large male and one large female face.

    ``FaceGender`` rows are loaded as one-frame intervals whose payload holds
    the face bounding box, the gender name, the gender probability and the
    face probability. Intervals are coalesced with ``payload_plus`` and then
    matched against a two-node scene graph (``exact=True``, no edges): one
    node must be labelled ``'M'`` and the other ``'F'``, and both must be at
    least ``MIN_FACE_HEIGHT`` tall with face and gender confidences above
    their thresholds.

    ``F`` is not imported here and has to be available at module scope.

    Returns:
        The output of ``intrvllists_to_result_bbox`` for the matching
        intervals, called with ``limit=100`` and ``stride=100``.
    """
    from query.models import FaceGender
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.bbox_predicates import height_at_least

    MIN_FACE_CONFIDENCE = 0.95
    MIN_GENDER_CONFIDENCE = 0.95
    MIN_FACE_HEIGHT = 0.6

    # Annotate face rows with start and end frames and the video ID.
    # NOTE(review): `video_name` is not defined in this function; it must be
    # supplied at module scope or this line raises NameError - confirm.
    faces_with_gender = FaceGender.objects.filter(
        face__frame__video__name=video_name).annotate(
            min_frame=F('face__frame__number'),
            max_frame=F('face__frame__number'),
            video_id=F('face__frame__video_id'),
            bbox_x1=F('face__bbox_x1'),
            bbox_y1=F('face__bbox_y1'),
            bbox_x2=F('face__bbox_x2'),
            bbox_y2=F('face__bbox_y2'),
            gender_name=F('gender__name'),
            face_probability=F('face__probability'))

    accessor = VideoIntervalCollection.django_accessor
    faces = VideoIntervalCollection.from_django_qs(
        faces_with_gender,
        with_payload=in_array(merge_dict_parsers([
            bbox_payload_parser(accessor),
            dict_payload_parser(accessor, {'gender': 'gender_name'}),
            dict_payload_parser(accessor, {'gender_probability': 'probability'}),
            dict_payload_parser(accessor, {'face_probability': 'face_probability'}),
        ]))
    ).coalesce(payload_merge_op=payload_plus)

    def confident_face_of(gender_code):
        """Build node predicates for a large, confident face of one gender."""
        return [
            height_at_least(MIN_FACE_HEIGHT),
            # Compare by value: identity (`is`) on strings that come from the
            # database is not guaranteed to hold even when the text is equal.
            lambda payload: payload['gender'] == gender_code,
            lambda payload: payload['face_probability'] > MIN_FACE_CONFIDENCE,
            lambda payload: payload['gender_probability'] > MIN_GENDER_CONFIDENCE,
        ]

    graph = {
        'nodes': [
            {'name': 'face_male', 'predicates': confident_face_of('M')},
            {'name': 'face_female', 'predicates': confident_face_of('F')},
        ],
        'edges': [],
    }

    mf_up_close = faces.filter(payload_satisfies(
        scene_graph(graph, exact=True)))

    return intrvllists_to_result_bbox(
        mf_up_close.get_allintervals(), limit=100, stride=100)
def panels_rekall():
    """Return Esper results for panel-like shots: three large faces in a row.

    Only faces whose shot belongs to a video with a ``LabeledCommercial`` row
    are considered. Each face becomes an interval spanning its shot
    (``shot__min_frame`` to ``shot__max_frame``) with its bounding box as
    payload; intervals are coalesced with ``payload_plus`` and matched against
    a three-node scene graph (``exact=True``) requiring faces at least 0.3
    tall, ordered left to right, with matching ``y1`` within 0.05.

    ``F`` is not imported here and has to be available at module scope.

    Returns:
        The output of ``intrvllists_to_result_bbox`` for the matching
        intervals, called with its default arguments.
    """
    from query.models import LabeledCommercial, Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.bbox_predicates import height_at_least, same_value, left_of
    from rekall.spatial_predicates import scene_graph
    from rekall.payload_predicates import payload_satisfies
    from esper.rekall import intrvllists_to_result_bbox

    min_height = 0.3
    y_tolerance = 0.05

    # Get list of sandbox video IDs
    sandbox_videos = []
    for row in LabeledCommercial.objects.distinct('video_id'):
        sandbox_videos.append(row.video_id)

    faces_qs = Face.objects.filter(shot__video_id__in=sandbox_videos).annotate(
        video_id=F("shot__video_id"),
        min_frame=F("shot__min_frame"),
        max_frame=F("shot__max_frame"))

    # One interval for each face, then merge the intervals of each shot.
    bbox_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    faces = VideoIntervalCollection.from_django_qs(
        faces_qs, with_payload=bbox_parser)
    faces = faces.coalesce(payload_merge_op=payload_plus)

    def big_face(name):
        """Scene-graph node for a face that is tall enough."""
        return {'name': name, 'predicates': [height_at_least(min_height)]}

    def beside(left_name, right_name):
        """Scene-graph edge: same row, `left_name` to the left of `right_name`."""
        return {
            'start': left_name,
            'end': right_name,
            'predicates': [same_value('y1', epsilon=y_tolerance), left_of()],
        }

    # Define a scene graph for things that look like panels
    three_faces_scene_graph = {
        'nodes': [big_face('face1'), big_face('face2'), big_face('face3')],
        'edges': [beside('face1', 'face2'), beside('face2', 'face3')],
    }

    looks_like_panel = payload_satisfies(
        scene_graph(three_faces_scene_graph, exact=True))
    panels = faces.filter(looks_like_panel)

    return intrvllists_to_result_bbox(panels.get_allintervals())
def frames_with_character_x(character_name="harry potter"):
    """Return Esper results for frames containing a face labelled as a character.

    ``FaceCharacterActor`` rows are loaded as one-frame intervals whose
    payload holds the face bounding box and the character name. Intervals are
    coalesced with ``payload_plus`` and kept when a one-node scene graph
    matches, i.e. some face in the payload has ``character`` equal to
    ``character_name``.

    ``F`` is not imported here and has to be available at module scope.

    Args:
        character_name: Character name compared (by equality) with the
            ``characteractor__character__name`` column. Defaults to
            ``"harry potter"``, the value that used to be hard-coded, so
            calling the function without arguments behaves as before.

    Returns:
        The output of ``intrvllists_to_result_bbox`` for the matching
        intervals, called with ``limit=100`` and ``stride=1000``.
    """
    from query.models import FaceCharacterActor
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from esper.rekall import intrvllists_to_result_bbox

    # Annotate face rows with start and end frames and the video ID
    faces_with_character_actor_qs = FaceCharacterActor.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        bbox_x1=F('face__bbox_x1'),
        bbox_y1=F('face__bbox_y1'),
        bbox_x2=F('face__bbox_x2'),
        bbox_y2=F('face__bbox_y2'),
        character_name=F('characteractor__character__name'))

    accessor = VideoIntervalCollection.django_accessor
    faces_with_identity = VideoIntervalCollection.from_django_qs(
        faces_with_character_actor_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(accessor),
                dict_payload_parser(accessor, {'character': 'character_name'}),
            ]))).coalesce(payload_merge_op=payload_plus)

    # Single-node graph without `exact`: one matching face is enough.
    character_graph = {
        'nodes': [{
            'name': 'face1',
            'predicates': [lambda f: f['character'] == character_name],
        }],
        'edges': [],
    }
    faces_with_actor = faces_with_identity.filter(
        payload_satisfies(scene_graph(character_graph)))

    return intrvllists_to_result_bbox(
        faces_with_actor.get_allintervals(), limit=100, stride=1000)
def get_numface_intrvlcol(relevant_shots, num_face=1):
    """Build the interval collection of shots whose payload has `num_face` faces.

    Faces belonging to ``relevant_shots`` are loaded as intervals spanning
    their shot (``shot__min_frame`` to ``shot__max_frame``) with the bounding
    box as payload, coalesced with ``payload_plus``, and kept only when the
    resulting payload list has exactly ``num_face`` entries. The number of
    surviving intervals is printed.

    ``Face``, ``F`` and the rekall helpers used here are not imported locally
    and have to be available at module scope.

    Args:
        relevant_shots: Iterable of shots; it is turned into a list and used
            in a ``shot__in`` filter.
        num_face: Required payload length. Defaults to 1.

    Returns:
        The filtered interval collection.
    """
    shot_faces = (
        Face.objects.filter(shot__in=list(relevant_shots))
        .annotate(video_id=F('shot__video_id'))
        .annotate(min_frame=F('shot__min_frame'))
        .annotate(max_frame=F('shot__max_frame'))
    )

    # Materialize the faces into rekall with bounding box payloads, then
    # coalesce so the faces sharing a shot's frame range share one interval.
    # NOTE: slow, because every selected face is loaded.
    bbox_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    per_face = VideoIntervalCollection.from_django_qs(
        shot_faces, with_payload=bbox_parser)
    per_shot = per_face.coalesce(payload_merge_op=payload_plus)
    numface_intrvlcol = per_shot.filter(
        payload_satisfies(length_exactly(num_face)))

    num_intrvl = sum(
        intrvllist.size()
        for intrvllist in numface_intrvlcol.get_allintervals().values())
    print("Get {} relevant {} face intervals".format(num_intrvl, num_face))

    return numface_intrvlcol
def shot_reverse_shot_complex():
    """Return Esper results for shot/reverse-shot sequences in one film.

    Pipeline, for videos whose name contains ``VIDEO_NAME``:

    1. Load faces with probability >= 0.99 as one-frame intervals and coalesce
       them with ``payload_plus``.
    2. Keep frames with at most ``MAX_FACES_ON_SCREEN`` faces that contain a
       face at least ``MIN_FACE_HEIGHT`` tall inside the right region, and
       separately inside the left region.
    3. Merge each of those with the shots labelled by ``SHOTS_LABELER_ID``,
       reduce every frame to its tallest face, and keep shots where that
       face's centre never moves more than ``MAX_FACE_MOVEMENT`` between
       consecutive sampled frames.
    4. Chain right-left-right and left-right-left shots with
       ``before(max_dist=1)``, union the two and coalesce.

    ``F`` and ``np`` are not imported here and have to be available at module
    scope.

    Returns:
        The output of ``intrvllists_to_result_with_objects`` with a
        ``payload_to_objs`` callback that always yields an empty list.
    """
    from query.models import Face, Shot
    from rekall.temporal_predicates import overlaps
    from rekall.merge_ops import payload_second, payload_plus
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.interval_list import Interval, IntervalList
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result_with_objects

    VIDEO_NAME = 'godfather part iii'

    MAX_FACE_MOVEMENT = 0.15
    MIN_FACE_HEIGHT = 0.2
    MAX_FACES_ON_SCREEN = 4
    RIGHT_HALF_MIN_X = 0.33
    LEFT_HALF_MAX_X = 0.66
    SHOTS_LABELER_ID = 64
    # faces are sampled every 12 frames
    SAMPLING_RATE = 12

    # Annotate face rows with start and end frames and the video ID
    faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')).filter(
            frame__video__name__contains=VIDEO_NAME)

    shot_rows = Shot.objects.filter(
        video__name__contains=VIDEO_NAME, labeler_id=SHOTS_LABELER_ID)
    shots = VideoIntervalCollection.from_django_qs(
        shot_rows, with_payload=lambda obj: [])

    # vids are all faces for each frame
    bbox_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    vids = VideoIntervalCollection.from_django_qs(
        faces.filter(probability__gte=0.99),
        with_payload=bbox_parser).coalesce(payload_merge_op=payload_plus)

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)],
        }],
        'edges': [],
    }

    def frames_with_face_in(region):
        """Frames with few faces and a tall-enough face inside `region`."""
        not_crowded = payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN))
        face_in_region = payload_satisfies(scene_graph(graph, region=region))
        return vids.filter(and_pred(not_crowded, face_in_region))

    faces_on_right = frames_with_face_in(right_half)
    faces_on_left = frames_with_face_in(left_half)

    def wrap_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    def get_height(box):
        return box['y2'] - box['y1']

    def get_center(box):
        center_x = (box['x1'] + box['x2']) / 2
        center_y = (box['y1'] + box['y2']) / 2
        return (center_x, center_y)

    def get_distance(pt1, pt2):
        return np.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)

    def find_highest_box(boxes):
        # max() keeps the first of equally tall boxes.
        if len(boxes) == 0:
            return None
        return max(boxes, key=get_height)

    def take_highest_in_frame(intvl):
        tallest_per_frame = (
            find_highest_box(faces_in_frame)
            for faces_in_frame in intvl.payload)
        intvl.payload = [
            box for box in tallest_per_frame if box is not None]
        return intvl

    # Check if displacement of box center between frames are within `dist`
    def inter_frame_movement_less_than(dist):
        def check(boxes):
            return not any(
                get_distance(get_center(prev), get_center(cur)) > dist
                for prev, cur in zip(boxes, boxes[1:]))

        return check

    def shots_with_steady_face(frames):
        """Shots overlapping `frames` whose tallest face stays put."""
        # Payload is a list, each element is a list of faces for a frame
        per_frame = shots.merge(
            frames, predicate=overlaps(),
            payload_merge_op=payload_second).map(wrap_list)
        per_shot = per_frame.coalesce(payload_merge_op=payload_plus)
        tallest_only = per_shot.map(take_highest_in_frame)
        return tallest_only.filter(
            payload_satisfies(
                inter_frame_movement_less_than(MAX_FACE_MOVEMENT)))

    shots_with_face_on_right = shots_with_steady_face(faces_on_right)
    shots_with_face_on_left = shots_with_steady_face(faces_on_left)

    # Right-Left-Right sequences
    shot_reverse_shot_1 = shots_with_face_on_right.merge(
        shots_with_face_on_left, predicate=before(max_dist=1)).merge(
            shots_with_face_on_right, predicate=before(max_dist=1))

    # Left-Right-Left sequences
    shot_reverse_shot_2 = shots_with_face_on_left.merge(
        shots_with_face_on_right, predicate=before(max_dist=1)).merge(
            shots_with_face_on_left, predicate=before(max_dist=1))

    shot_reverse_shot = shot_reverse_shot_1.set_union(
        shot_reverse_shot_2).coalesce()

    result = intrvllists_to_result_with_objects(
        shot_reverse_shot.get_allintervals(),
        payload_to_objs=lambda p, v: [])
    return result
def shot_reverse_shot():
    """Return Esper results for long stretches alternating left and right faces.

    All faces are loaded as one-frame intervals and coalesced with
    ``payload_plus``. Frames with at most ``MAX_FACES_ON_SCREEN`` faces and a
    face at least ``MIN_FACE_HEIGHT`` tall in the right (respectively left)
    region are dilated by half the sampling rate and coalesced into
    sequences. Right sequences are merged with left sequences that start or
    end within one second of them; the merged intervals are dilated by four
    seconds, coalesced, shrunk back, and only those at least ten seconds long
    are kept.

    ``F`` is not imported here and has to be available at module scope.

    Returns:
        A timeline result when ``TIMELINE_OUTPUT`` is true, otherwise the
        output of ``intrvllists_to_result_with_objects`` with a callback that
        always yields an empty list.
    """
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result, intrvllists_to_result_with_objects, add_intrvllists_to_result

    # If True, visualize results in a timeline
    TIMELINE_OUTPUT = False

    RIGHT_HALF_MIN_X = 0.45
    LEFT_HALF_MAX_X = 0.55
    MIN_FACE_HEIGHT = 0.4
    MAX_FACES_ON_SCREEN = 2
    # faces are sampled every 12 frames
    SAMPLING_RATE = 12
    ONE_SECOND = 24
    FOUR_SECONDS = 96
    TEN_SECONDS = 240

    # Annotate face rows with start and end frames and the video ID
    faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)],
        }],
        'edges': [],
    }

    bbox_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    vids = VideoIntervalCollection.from_django_qs(
        faces, with_payload=bbox_parser).coalesce(
            payload_merge_op=payload_plus)

    def sequences_with_face_in(region):
        """Sequences with a tall face in `region` and at most two faces."""
        few_faces = payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN))
        face_in_region = payload_satisfies(scene_graph(graph, region=region))
        matching_frames = vids.filter(and_pred(few_faces, face_in_region))
        # Dilating by half the sampling rate lets consecutive sampled frames
        # join up when coalesced.
        return matching_frames.dilate(SAMPLING_RATE / 2).coalesce()

    faces_on_right = sequences_with_face_in(right_half)
    faces_on_left = sequences_with_face_in(left_half)

    # Sequences where faces on right are up to one second before/after faces
    # on left.
    within_one_second = or_pred(
        before(max_dist=ONE_SECOND), after(max_dist=ONE_SECOND), arity=2)
    alternations = faces_on_right.merge(
        faces_on_left, predicate=within_one_second)

    # Four seconds of buffer time between left-then-right/right-then-left
    # segments: grow, fuse, then shrink back.
    fused = alternations.dilate(FOUR_SECONDS).coalesce().dilate(
        -1 * FOUR_SECONDS)

    # Only keep remaining sequences that last longer than ten seconds
    shot_reverse_shot = fused.filter_length(min_length=TEN_SECONDS)

    # Post-process to display in Esper widget
    if TIMELINE_OUTPUT:
        results = intrvllists_to_result(shot_reverse_shot.get_allintervals())
        add_intrvllists_to_result(
            results, faces_on_left.get_allintervals(), color='black')
        add_intrvllists_to_result(
            results, faces_on_right.get_allintervals(), color='green')
    else:
        results = intrvllists_to_result_with_objects(
            shot_reverse_shot.get_allintervals(), lambda payload, video: [])
    return results
def kissing():
    """Return Esper results for candidate kissing shots.

    Takes 7min to run!

    Pipeline:

    1. Load confident, tall faces as one-frame intervals (bounding box plus
       face id) and coalesce them with ``payload_plus``.
    2. Keep frames matching ``graph`` (``exact=True``): two faces, one to the
       left of the other, overlapping horizontally and vertically, of similar
       height.
    3. Attach facial landmarks and keep frames matching ``graph2``: the left
       face is angled right, the right face is angled left, and the two
       mouths are within ``MAX_MOUTH_DIFF`` of each other.
    4. Extend each match to the end of the overlapping shot, then clip it to
       the last frame in which exactly two tall faces are present.
    5. Subtract caption word intervals and drop results shorter than 12
       frames.

    ``F``, ``np``, ``Labeler`` and ``Video`` are not imported here and have to
    be available at module scope.

    Returns:
        The output of ``intrvllists_to_result_with_objects`` (``stride=1``),
        with landmark and bounding-box objects built from each payload.
    """
    from query.models import Face, Shot
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.temporal_predicates import overlaps
    from rekall.face_landmark_predicates import looking_left, looking_right
    from rekall.bbox_predicates import height_at_least, same_height
    import esper.face_landmarks_wrapper as flw
    from esper.captions import get_all_segments
    from esper.rekall import intrvllists_to_result_with_objects, bbox_to_result_object
    from esper.stdlib import face_landmarks_to_dict

    MAX_MOUTH_DIFF = 0.12
    MIN_FACE_CONFIDENCE = 0.8
    MIN_FACE_HEIGHT = 0.4
    MAX_FACE_HEIGHT_DIFF = 0.1
    MIN_FACE_OVERLAP_X = 0.05
    MIN_FACE_OVERLAP_Y = 0.2
    MAX_FACE_OVERLAP_X_FRACTION = 0.7
    MIN_FACE_ANGLE = 0.1

    def map_payload(func):
        """Lift a payload -> payload function to an interval -> interval one."""
        def map_fn(intvl):
            intvl.payload = func(intvl.payload)
            return intvl

        return map_fn

    def get_landmarks(faces):
        """Attach a ``'landmarks'`` entry to every face dict in `faces`."""
        ids = [face['id'] for face in faces]
        # NOTE(review): zip() pairs faces with landmarks by position; this
        # assumes flw.get returns landmarks in the same order as `ids`,
        # which the queryset does not obviously guarantee - confirm.
        landmarks = flw.get(Face.objects.filter(id__in=ids))
        for face, landmark in zip(faces, landmarks):
            face['landmarks'] = landmark
        return faces

    # Annotate face rows with start and end frames and the video ID
    faces_qs = Face.objects.filter(
        probability__gte=MIN_FACE_CONFIDENCE).annotate(
            min_frame=F('frame__number'),
            max_frame=F('frame__number'),
            height=F('bbox_y2') - F('bbox_y1'),
            video_id=F('frame__video_id')).filter(height__gte=MIN_FACE_HEIGHT)

    faces = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'id': 'id'})
            ]))).coalesce(payload_merge_op=payload_plus)

    graph = {
        'nodes': [
            {
                'name': 'face_left',
                'predicates': []
            },
            {
                'name': 'face_right',
                'predicates': []
            },
        ],
        'edges': [
            {
                'start': 'face_left',
                'end': 'face_right',
                'predicates': [
                    # Left face on the left
                    lambda f1, f2: (f1['x2'] < f2['x2']
                                    and f1['x1'] < f2['x1']),
                    # Faces overlap horizontally
                    lambda f1, f2: f1['x2'] - f2['x1'] > MIN_FACE_OVERLAP_X,
                    # Faces overlap vertically: the shared span runs from the
                    # lower of the two top edges to the higher of the two
                    # bottom edges, so both faces' y1 must be considered.
                    lambda f1, f2: (min(f1['y2'], f2['y2'])
                                    - max(f1['y1'], f2['y1'])
                                    > MIN_FACE_OVERLAP_Y),
                    # No face is entirely above another
                    lambda f1, f2: (f1['y2'] > f2['y1']
                                    and f1['y1'] < f2['y2']),
                    same_height(MAX_FACE_HEIGHT_DIFF),
                    # Horizontal overlap stays below a fraction of the wider
                    # face.
                    lambda f1, f2: ((f1['x2'] - f2['x1'])
                                    / max(f1['x2'] - f1['x1'],
                                          f2['x2'] - f2['x1'])
                                    < MAX_FACE_OVERLAP_X_FRACTION),
                ]
            },
        ]
    }

    def mouths_are_close(lm1, lm2):
        """True when the mean selected lip points are within MAX_MOUTH_DIFF."""
        select_outer = [2, 3, 4, 8, 9, 10]
        select_inner = [1, 2, 3, 5, 6, 7]
        mouth1 = np.concatenate(
            (lm1.outer_lips()[select_outer], lm1.inner_lips()[select_inner]))
        mouth2 = np.concatenate(
            (lm2.outer_lips()[select_outer], lm2.inner_lips()[select_inner]))
        mean1 = np.mean(mouth1, axis=0)
        mean2 = np.mean(mouth2, axis=0)
        return np.linalg.norm(mean1 - mean2) <= MAX_MOUTH_DIFF

    # Face is profile if both eyes are on the same side of the nose bridge
    # horizontally. (Not used by graph2 below, which relies on
    # signed_face_angle instead; kept as the alternative the node comments
    # refer to.)
    def is_left_profile(f):
        lm = f['landmarks']
        nose_x = min(lm.nose_bridge()[:, 0])
        left = np.all(lm.left_eye()[:, 0] >= nose_x)
        right = np.all(lm.right_eye()[:, 0] >= nose_x)
        return left and right

    def is_right_profile(f):
        lm = f['landmarks']
        nose_x = max(lm.nose_bridge()[:, 0])
        left = np.all(lm.left_eye()[:, 0] <= nose_x)
        right = np.all(lm.right_eye()[:, 0] <= nose_x)
        return left and right

    # Line is ax+by+c=0
    def project_point_to_line(pt, a, b, c):
        x0, y0 = pt[0], pt[1]
        d = a * a + b * b
        x = (b * (b * x0 - a * y0) - a * c) / d
        y = (a * (-b * x0 + a * y0) - b * c) / d
        return np.array([x, y])

    # Positive if facing right
    def signed_face_angle(lm):
        center_line_indices = [27, 28, 32, 33, 34, 51, 62, 66, 57]
        data = lm.landmarks[center_line_indices]
        fit = np.polyfit(data[:, 0], data[:, 1], 1)  # y = ax+b
        a, b = fit[0], fit[1]
        # Project the first and last centre-line landmarks onto the fitted
        # line (written as a*x - y + b = 0).
        A = project_point_to_line(lm.landmarks[center_line_indices[0]], a, -1,
                                  b)
        B = project_point_to_line(lm.landmarks[center_line_indices[-1]], a,
                                  -1, b)
        AB = B - A
        AB = AB / np.linalg.norm(AB)
        C = np.mean(lm.nose_bridge()[2:4], axis=0)
        AC = C - A
        AC = AC / np.linalg.norm(AC)
        # NOTE(review): np.cross on 2-element vectors is deprecated as of
        # NumPy 2.0; revisit when upgrading.
        return np.cross(AB, AC)

    graph2 = {
        'nodes': [
            {
                'name': 'left',
                'predicates': [
                    lambda f: signed_face_angle(f['landmarks']) > MIN_FACE_ANGLE
                    # is_right_profile
                ]
            },
            {
                'name': 'right',
                'predicates': [
                    lambda f: signed_face_angle(f['landmarks']) < -MIN_FACE_ANGLE
                    # is_left_profile
                ]
            },
        ],
        'edges': [{
            'start': 'left',
            'end': 'right',
            'predicates': [
                lambda l, r: mouths_are_close(l['landmarks'], r['landmarks']),
            ]
        }]
    }

    # Landmarks are only fetched for frames that already passed the cheaper
    # bounding-box graph.
    mf_up_close = faces.filter(
        payload_satisfies(scene_graph(graph, exact=True))).map(
            map_payload(get_landmarks)).filter(
                payload_satisfies(scene_graph(graph2, exact=True)))
    vids = mf_up_close.get_allintervals().keys()

    # Merge with shots
    shots_qs = Shot.objects.filter(
        video_id__in=vids,
        labeler=Labeler.objects.get(name='shot-hsvhist-face')).all()
    total = shots_qs.count()
    print("Total shots:", total)
    # use empty list as payload
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda row: [], progress=True, total=total)

    # Each kiss frame is stretched to the end of the shot it overlaps.
    kissing_shots = mf_up_close.join(
        shots,
        lambda kiss, shot: [(kiss.get_start(), shot.get_end(),
                             kiss.get_payload())],
        predicate=overlaps(),
        working_window=1).coalesce()

    # Getting faces in the shot
    def wrap_in_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    print("Getting faces...")
    faces_qs2 = Face.objects.filter(frame__video_id__in=vids,
                                    probability__gte=MIN_FACE_CONFIDENCE)
    total = faces_qs2.count()
    faces2 = VideoIntervalCollection.from_django_qs(
        faces_qs2.annotate(min_frame=F('frame__number'),
                           max_frame=F('frame__number'),
                           video_id=F('frame__video_id')),
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'frame': 'min_frame'})
            ])),
        progress=True,
        total=total).coalesce(payload_merge_op=payload_plus).map(wrap_in_list)

    def clip_to_last_frame_with_two_faces(intvl):
        """End the interval at the last frame holding exactly two tall faces."""
        faces = intvl.get_payload()[1]
        two_faces = [(f[0], f[1]) for f in faces if len(f) == 2]
        two_high_faces = [
            (a, b) for a, b in two_faces
            if min(a['y2'] - a['y1'], b['y2'] - b['y1']) >= MIN_FACE_HEIGHT
        ]
        frame = [a['frame'] for a, b in two_high_faces]

        if len(frame) > 0:
            intvl.end = frame[-1]
        return intvl

    # Payload becomes (kiss payload, list of per-frame face lists).
    clipped_kissing_shots = kissing_shots.merge(
        faces2,
        payload_merge_op=lambda p1, p2: (p1, p2),
        predicate=overlaps(),
        working_window=1).coalesce(
            payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1])).map(
                clip_to_last_frame_with_two_faces).filter_length(min_length=12)

    results = get_all_segments(vids)
    fps_map = dict((i, Video.objects.get(id=i).fps) for i in vids)
    caption_results = VideoIntervalCollection({
        video_id: [
            (
                word[0] * fps_map[video_id],  # start frame
                word[1] * fps_map[video_id],  # end frame
                word[2])  # payload is the word
            for word in words
        ]
        for video_id, words in results
    })
    kissing_without_words = clipped_kissing_shots.minus(caption_results)
    # Subtracting caption intervals can leave fractional bounds; cast them
    # back to integers.
    kissing_final = kissing_without_words.map(lambda intvl: (int(
        intvl.start), int(intvl.end), intvl.payload)).coalesce().filter_length(
            min_length=12)

    def payload_to_objects(p, video_id):
        return [face_landmarks_to_dict(face['landmarks']) for face in p[0]
                ] + [bbox_to_result_object(face, video_id) for face in p[0]]

    return intrvllists_to_result_with_objects(
        kissing_final.get_allintervals(),
        payload_to_objects,
        stride=1)
def harry_ron_hermione():
    """Return Esper results for frames showing Ron, Harry and Hermione together.

    ``FaceCharacterActor`` rows from videos whose name contains
    "harry potter" are loaded as one-frame intervals carrying the face
    bounding box and character name, then coalesced with ``payload_plus``.
    A three-node scene graph (``exact=True``) assigns one character from
    ``NAMES`` to each node, requires every face to be at least
    ``MIN_FACE_HEIGHT`` tall, and requires every pair of faces to have
    matching ``y1`` and matching height within ``EPSILON``.

    ``F`` is not imported here and has to be available at module scope.

    Returns:
        The output of ``intrvllists_to_result_bbox`` for the matching
        intervals, called with ``limit=100`` and ``stride=10``.
    """
    from query.models import FaceCharacterActor
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.bbox_predicates import height_at_least, left_of, same_value, same_height
    from esper.rekall import intrvllists_to_result_bbox

    MIN_FACE_HEIGHT = 0.25
    EPSILON = 0.15
    NAMES = ['ron weasley', 'harry potter', 'hermione granger']

    # Annotate face rows with start and end frames and the video ID
    annotated = FaceCharacterActor.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        bbox_x1=F('face__bbox_x1'),
        bbox_y1=F('face__bbox_y1'),
        bbox_x2=F('face__bbox_x2'),
        bbox_y2=F('face__bbox_y2'),
        character_name=F('characteractor__character__name'))
    faces_with_character_actor_qs = annotated.filter(
        face__frame__video__name__contains="harry potter")

    accessor = VideoIntervalCollection.django_accessor
    payload_parser = in_array(merge_dict_parsers([
        bbox_payload_parser(accessor),
        dict_payload_parser(accessor, {'character': 'character_name'}),
    ]))
    faces_with_identity = VideoIntervalCollection.from_django_qs(
        faces_with_character_actor_qs,
        with_payload=payload_parser).coalesce(payload_merge_op=payload_plus)

    def is_character(index):
        """Predicate: the face is labelled with ``NAMES[index]``."""
        return lambda f: f['character'] == NAMES[index]

    def character_node(node_name, index):
        return {
            'name': node_name,
            'predicates': [height_at_least(MIN_FACE_HEIGHT),
                           is_character(index)],
        }

    def similar_faces(start, end):
        """Edge: both faces sit at a similar height and are similar in size."""
        return {
            'start': start,
            'end': end,
            'predicates': [same_value('y1', epsilon=EPSILON),
                           same_height(epsilon=EPSILON)],
        }

    harry_ron_hermione_scene_graph = {
        'nodes': [
            character_node('face1', 0),
            character_node('face2', 1),
            character_node('face3', 2),
        ],
        'edges': [
            similar_faces('face1', 'face2'),
            similar_faces('face2', 'face3'),
            similar_faces('face1', 'face3'),
        ],
    }

    trio_present = payload_satisfies(
        scene_graph(harry_ron_hermione_scene_graph, exact=True))
    harry_ron_hermione = faces_with_identity.filter(trio_present)

    return intrvllists_to_result_bbox(
        harry_ron_hermione.get_allintervals(), limit=100, stride=10)
def conversations_for_display():
    """Return Esper results for conversation-like shot sequences in video 15.

    Pipeline:

    1. Load regularly sampled faces of ``video_id`` as one-frame intervals
       (bounding box plus ``face_id``) and coalesce them with
       ``payload_plus``.
    2. Merge them into cinematic shots, giving each shot the list of its
       per-frame face lists.
    3. Cluster each shot's faces by embedding and keep one representative
       ("center") face per cluster.
    4. Join each shot with the shot one frame away and pair up centers that
       are *not* the same face ('A' and 'B').
    5. Coalesce and merge adjacent two-shot intervals that share a face pair.
    6. Keep intervals whose payload references at least three distinct shot
       payload values, then coalesce.

    ``F`` is not imported here and has to be available at module scope.

    Returns:
        The output of ``intrvllists_to_result_with_objects`` with a callback
        that always yields an empty list.
    """
    from query.models import Face, FaceCharacterActor, Shot
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus, merge_named_payload, payload_second
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred, true_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after, overlaps, equal
    from rekall.bbox_predicates import height_at_least
    from rekall.interval_list import Interval, IntervalList
    from esper.rekall import intrvllists_to_result_bbox
    from esper.rekall import intrvllists_to_result, intrvllists_to_result_with_objects, add_intrvllists_to_result
    from esper.prelude import esper_widget
    import esper.face_embeddings as face_embeddings

    video_id = 15
    EMBEDDING_EQUALITY_THRESHOLD = 1.
    ONE_FRAME = 1

    faces_qs = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')).filter(
            frame__video_id=video_id, frame__regularly_sampled=True)

    faces_per_frame = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            merge_dict_parsers([
                bbox_payload_parser(VideoIntervalCollection.django_accessor),
                dict_payload_parser(VideoIntervalCollection.django_accessor,
                                    {'face_id': 'id'}),
            ]))).coalesce(payload_merge_op=payload_plus)

    shots_qs = Shot.objects.filter(cinematic=True)
    shots = VideoIntervalCollection.from_django_qs(shots_qs)

    # Payload becomes (shot payload, [faces of frame 1, faces of frame 2, ...]).
    shots_with_faces = shots.merge(
        faces_per_frame,
        predicate=overlaps(),
        payload_merge_op=lambda shot_id, faces_in_frame:
        (shot_id, [faces_in_frame])).coalesce(
            payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1]))

    def cluster_center(face_ids):
        """Return the face id whose embedding is closest to the cluster mean."""
        mean_embedding = face_embeddings.mean(face_ids)
        dists = face_embeddings.dist(face_ids, [mean_embedding])
        return min(zip(dists, face_ids))[1]

    def cluster_and_compute_centers(faces_in_frame_list, shot_id):
        """Cluster a shot's faces; return one tuple per cluster.

        Each tuple is ``(center face id, member face ids, shot_id,
        max member face height)``. The number of clusters is the largest
        number of faces seen in any single frame.
        """
        num_people = max(
            len(faces_in_frame) for faces_in_frame in faces_in_frame_list)
        all_faces = [
            face for faces_in_frame in faces_in_frame_list
            for face in faces_in_frame
        ]
        face_ids = [face['face_id'] for face in all_faces]
        # First occurrence wins, matching a list.index() lookup, without the
        # quadratic scan.
        height_by_id = {}
        for face in all_faces:
            height_by_id.setdefault(face['face_id'], face['y2'] - face['y1'])

        print(num_people)
        if num_people == 1:
            clusters = [(fid, 0) for fid in face_ids]
        else:
            clusters = face_embeddings.kmeans(face_ids, num_people)

        centers = []
        for i in range(num_people):
            members = [
                face_id for face_id, cluster_id in clusters if cluster_id == i
            ]
            centers.append((
                cluster_center(members),
                members,
                shot_id,
                max([height_by_id[face_id] for face_id in members]),
            ))
        return centers

    shots_with_centers = shots_with_faces.map(lambda intrvl: (
        intrvl.start, intrvl.end,
        (intrvl.payload[0],
         cluster_and_compute_centers(intrvl.payload[1], intrvl.payload[0]))))

    def same_face(center1, center2):
        """True when two center faces are closer than the embedding threshold."""
        return face_embeddings.dist(
            [center1], target_ids=[center2])[0] < EMBEDDING_EQUALITY_THRESHOLD

    def cross_product_faces(intrvl1, intrvl2):
        """Pair every cluster of one shot with every different face of the other."""
        payload1 = intrvl1.get_payload()
        payload2 = intrvl2.get_payload()
        payload = []
        for cluster1 in payload1[1]:
            for cluster2 in payload2[1]:
                if not same_face(cluster1[0], cluster2[0]):
                    payload.append({'A': cluster1, 'B': cluster2})

        return [(min(intrvl1.get_start(), intrvl2.get_start()),
                 max(intrvl1.get_end(), intrvl2.get_end()), {
                     'chrs': payload,
                     'shots': [payload1[0], payload2[0]]
                 })]

    two_shots = shots_with_centers.join(
        shots_with_centers,
        predicate=after(max_dist=ONE_FRAME, min_dist=ONE_FRAME),
        merge_op=cross_product_faces)

    def faces_equal(payload1, payload2):
        """True when the payloads share a face pair, in either A/B order."""
        for face_pair1 in payload1['chrs']:
            for face_pair2 in payload2['chrs']:
                if (same_face(face_pair1['A'][0], face_pair2['A'][0])
                        and same_face(face_pair1['B'][0], face_pair2['B'][0])):
                    return True
                if (same_face(face_pair1['A'][0], face_pair2['B'][0])
                        and same_face(face_pair1['B'][0], face_pair2['A'][0])):
                    return True
        return False

    def merge_conversations(payload1, payload2):
        """Concatenate the face pairs and shot lists of two payloads."""
        return {
            'chrs': payload1['chrs'] + payload2['chrs'],
            'shots': payload1['shots'] + payload2['shots']
        }

    convs = two_shots.coalesce(
        predicate=payload_satisfies(faces_equal, arity=2),
        payload_merge_op=merge_conversations)

    adjacent_seq = convs.merge(
        convs,
        predicate=and_pred(after(max_dist=ONE_FRAME, min_dist=ONE_FRAME),
                           payload_satisfies(faces_equal, arity=2),
                           arity=2),
        payload_merge_op=merge_conversations,
        working_window=1)
    convs = convs.set_union(adjacent_seq)

    def filter_fn(intvl):
        """Keep intervals that span at least three distinct shot payloads."""
        payload = intvl.get_payload()
        if isinstance(payload, dict) and 'shots' in payload:
            return len(set(payload['shots'])) >= 3
        return False

    convs = convs.filter(filter_fn)
    convs = convs.coalesce()

    # The helper imported above is spelled `intrvllists_...`, and like the
    # other query functions it is handed the per-video interval lists rather
    # than the collection itself.
    return intrvllists_to_result_with_objects(
        convs.get_allintervals(), lambda a, b: [])
def reaction_shots_apollo_13():
    """Find reaction shots in videos whose name contains "apollo 13".

    A result is a stretch of captions that overlaps a close shot (medium
    close-up, close-up or extreme close-up) in which the first word of the
    caption's speaker label is NOT among the first names of the characters
    identified on screen during that shot.

    Pipeline, as implemented below:
      1. Load caption/script metadata and keep lines with a usable speaker.
      2. Attach the speaker to every caption segment that overlaps an
         aligned script line.
      3. Collect, per frame, the character names attached to faces.
      4. Give every cinematic shot its fps and its most common shot scale.
      5. Attach the set of on-screen character first names to each shot and
         rescale shot bounds by 1/fps.
      6. Keep captions overlapping close shots where the speaker is not on
         screen, rescale back by fps, and merge/filter the intervals.

    Returns:
        The value produced by ``intrvllists_to_result_with_objects`` for the
        reaction-shot intervals, with no objects attached per interval.
    """
    from collections import Counter

    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.temporal_predicates import overlaps
    from rekall.parsers import in_array, dict_payload_parser
    from esper.caption_metadata import caption_metadata_for_video
    from esper.captions import get_all_segments
    from esper.rekall import intrvllists_to_result_with_objects
    from query.models import FaceCharacterActor, Shot

    CLOSE_SHOT_SCALES = ['medium_close_up', 'close_up', 'extreme_close_up']

    # NOTE(review): Video, Frame and F are not imported in this function;
    # presumably they are available at module level -- confirm.
    videos = Video.objects.filter(name__contains="apollo 13").all()
    # Computed once; the same ids drive every query below.
    video_ids = [video.id for video in videos]

    def has_usable_speaker(meta_interval):
        """Keep script lines with a speaker that is neither a generic
        "man's voice" label nor "gene krantz"."""
        speaker = meta_interval.payload['speaker']
        return (speaker is not None
                and "man's voice" not in speaker
                and speaker.strip() != "gene krantz")

    # Load script data
    metadata = VideoIntervalCollection({
        video_id: caption_metadata_for_video(video_id)
        for video_id in video_ids
    }).filter(has_usable_speaker)

    all_segments = get_all_segments(video_ids)
    captions_interval_collection = VideoIntervalCollection(
        {video: intervals for video, intervals in all_segments})

    # Pair each caption (first payload element) with the speaker of the
    # aligned script line it overlaps.
    captions_with_speaker_id = captions_interval_collection.overlaps(
        metadata.filter(payload_satisfies(lambda p: p['aligned'])),
        payload_merge_op=lambda word, script_meta: (
            word[0], script_meta['speaker']))

    # Annotate face rows with start and end frames and the video ID
    faces_with_character_actor_qs = FaceCharacterActor.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        character_name=F('characteractor__character__name')).filter(
            video_id__in=video_ids)

    # One interval per frame; payload is a list of {'character': name}
    frames_with_identity = VideoIntervalCollection.from_django_qs(
        faces_with_character_actor_qs,
        with_payload=in_array(
            dict_payload_parser(VideoIntervalCollection.django_accessor,
                                {'character': 'character_name'}),
        )).coalesce(payload_merge_op=payload_plus)

    # Cinematic shots, each carrying the fps of its video
    shots_qs = Shot.objects.filter(
        cinematic=True,
        video_id__in=video_ids).annotate(fps=F('video__fps'))
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda shot: shot.fps)

    # Regularly sampled frames, each carrying its shot scale name
    frames_with_shot_scale_qs = Frame.objects.filter(
        regularly_sampled=True,
        video_id__in=video_ids).annotate(
            min_frame=F('number'),
            max_frame=F('number'),
            shot_scale_name=F('shot_scale__name')).all()
    frames_with_shot_scale = VideoIntervalCollection.from_django_qs(
        frames_with_shot_scale_qs,
        with_payload=lambda f: f.shot_scale_name)

    def get_mode(items):
        """Return the most frequent element of ``items``.

        Counter.most_common orders equal counts by first occurrence
        (Python 3.7+), so ties are resolved deterministically instead of
        depending on set iteration order, and counting is a single pass.
        """
        return Counter(items).most_common(1)[0][0]

    def summarize_shot(intrvl):
        """Collapse a shot's (fps, scale) pairs into fps + modal scale."""
        return (intrvl.start, intrvl.end, {
            'fps': intrvl.payload[0][0],
            'shot_scale': get_mode([pair[1] for pair in intrvl.payload]),
        })

    # Annotate shots with mode shot scale
    shots_with_scale = shots.merge(
        frames_with_shot_scale,
        predicate=overlaps(),
        payload_merge_op=lambda shot_fps, shot_scale: [(shot_fps, shot_scale)]
    ).coalesce(payload_merge_op=payload_plus).map(summarize_shot)

    def rescale_and_name_characters(intrvl):
        """Divide the shot bounds by fps and gather character first names.

        Names are split on '/', blank pieces are dropped, and only the
        first space-separated token of each piece is kept.
        """
        shot_payload, identities = intrvl.payload[0], intrvl.payload[1]
        fps = shot_payload['fps']
        return (intrvl.start / fps, intrvl.end / fps, {
            'fps': fps,
            'shot_scale': shot_payload['shot_scale'],
            'characters': {
                name.strip().split(' ')[0].strip()
                for identity in identities
                for name in identity['character'].split('/')
                if name.strip()
            },
        })

    # Annotate shots with all the people in them.
    # Bounds are divided by fps here, presumably so they are in the same
    # unit as the caption intervals -- TODO confirm.
    shots_with_people_in_them = shots_with_scale.overlaps(
        frames_with_identity,
        payload_merge_op=lambda shot_payload, identities: (
            shot_payload, identities),
        working_window=1
    ).coalesce(
        payload_merge_op=lambda p1, p2: (p1[0], p1[1] + p2[1])
    ).map(rescale_and_name_characters)

    close_shots = shots_with_people_in_them.filter(
        payload_satisfies(lambda p: p['shot_scale'] in CLOSE_SHOT_SCALES))

    def speaker_not_on_screen(caption, shot):
        """True when the speaker's first name is not among the shot's
        characters."""
        speaker_first_name = caption.payload[1].strip().split(' ')[0]
        return speaker_first_name not in shot.payload['characters']

    def rescale_by_fps(intrvl):
        """Multiply the bounds by fps (undoing the earlier division) and
        wrap the caption payload in a list so payload_plus can concatenate."""
        fps, word_and_speaker = intrvl.payload[0], intrvl.payload[1]
        return (int(intrvl.start * fps), int(intrvl.end * fps),
                [word_and_speaker])

    # dilate -> coalesce -> negative dilate, then drop intervals shorter
    # than 12; presumably this bridges small gaps between neighbouring
    # intervals -- verify against rekall's dilate semantics.
    reaction_shots = captions_with_speaker_id.overlaps(
        close_shots,
        predicate=speaker_not_on_screen,
        payload_merge_op=lambda word_and_speaker, fps_and_characters: (
            fps_and_characters['fps'], word_and_speaker)
    ).map(rescale_by_fps).dilate(12).coalesce(
        payload_merge_op=payload_plus).dilate(-12).filter_length(
            min_length=12)

    # NOTE(review): other queries in this file pass `.get_allintervals()`
    # to this helper; here the collection itself is passed -- confirm the
    # helper accepts both.
    return intrvllists_to_result_with_objects(reaction_shots,
                                              lambda a, b: [])
def shot_reverse_shot_intensification():
    """Find shot/reverse-shot sequences that pass the shot-scale checks.

    Frames with at most two faces and a sufficiently tall face in the right
    (resp. left) part of the frame are grouped into sequences. A right
    sequence followed within one second by a left sequence (or vice versa)
    forms a candidate; candidates are merged with a four-second buffer,
    must pass ``shot_scales_decreasing``, must last at least ten seconds,
    and must contain more than one distinct shot scale.

    Returns:
        A timeline-style result when ``TIMELINE_OUTPUT`` is True, otherwise
        the value of ``intrvllists_to_result_with_objects`` with no objects
        attached per interval.
    """
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import (in_array, bbox_payload_parser,
                                merge_dict_parsers, named_payload)
    from rekall.merge_ops import (payload_plus, merge_named_payload,
                                  payload_first)
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies, on_name
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import (intrvllists_to_result,
                              intrvllists_to_result_with_objects,
                              add_intrvllists_to_result)

    # If True, visualize results in a timeline
    TIMELINE_OUTPUT = False

    RIGHT_HALF_MIN_X = 0.45
    LEFT_HALF_MAX_X = 0.55
    MIN_FACE_HEIGHT = 0.4
    MAX_FACES_ON_SCREEN = 2

    # faces are sampled every 12 frames
    SAMPLING_RATE = 12
    # Durations are frame counts (24 frames per second)
    ONE_SECOND = 24
    FOUR_SECONDS = 4 * ONE_SECOND
    TEN_SECONDS = 10 * ONE_SECOND

    # Annotate face rows with start and end frames and the video ID
    # NOTE(review): F is not imported in this function; presumably it is
    # available at module level -- confirm.
    faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'),
        shot_scale=F('frame__shot_scale'))

    # The two regions overlap in the middle band of the frame
    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    # A single face node that must be at least MIN_FACE_HEIGHT tall
    graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)]
        }],
        'edges': []
    }

    # One interval per frame: all face boxes, plus the shot scale taken
    # from the first face row of that frame
    per_face_payload = merge_dict_parsers([
        named_payload(
            'faces',
            in_array(
                bbox_payload_parser(
                    VideoIntervalCollection.django_accessor))),
        named_payload('shot_scale', in_array(lambda obj: obj.shot_scale))
    ])
    vids = VideoIntervalCollection.from_django_qs(
        faces, with_payload=per_face_payload
    ).coalesce(payload_merge_op=merge_named_payload({
        'faces': payload_plus,
        'shot_scale': payload_first
    }))

    def shot_scales_decreasing(scales):
        """Return False if any scale is smaller (i.e. farther away, per the
        original author's comment) than the first scale that is not 1.

        Scale value 1 is ignored everywhere; presumably it marks an unknown
        scale -- TODO confirm.

        NOTE(review): the comparison is always against the FIRST non-1
        scale, never the previous one, so e.g. [2, 4, 3] passes. If a
        strictly monotone check was intended, the baseline should advance.
        """
        if len(scales) <= 1:
            return True
        # Default of 1 is never compared against: it is only used when
        # every scale is 1, in which case nothing survives the filter below.
        baseline = next((s for s in scales if s != 1), 1)
        return not any(s < baseline for s in scales if s != 1)

    def has_acceptable_scales(intrvl):
        return shot_scales_decreasing(intrvl.get_payload()['shot_scale'])

    def face_sequences_in(region):
        """Sequences of frames with at most MAX_FACES_ON_SCREEN faces and a
        tall-enough face inside ``region``.

        Payload is the faces of the first frame and the list of shot scales
        throughout the sequence; sequences failing the shot-scale check are
        dropped.
        """
        few_faces = payload_satisfies(
            on_name('faces', length_at_most(MAX_FACES_ON_SCREEN)))
        face_in_region = payload_satisfies(
            on_name('faces', scene_graph(graph, region=region)))
        matching_frames = vids.filter(and_pred(few_faces, face_in_region))
        # NOTE(review): true division, so the dilation amount is 6.0
        sequences = matching_frames.dilate(SAMPLING_RATE / 2).coalesce(
            payload_merge_op=merge_named_payload({
                'faces': payload_first,
                'shot_scale': payload_plus
            }))
        return sequences.filter(has_acceptable_scales)

    faces_on_right = face_sequences_in(right_half)
    faces_on_left = face_sequences_in(left_half)

    # Sequences where faces on the right come up to one second before/after
    # faces on the left
    right_then_left = faces_on_right.merge(
        faces_on_left, predicate=before(max_dist=ONE_SECOND))
    left_then_right = faces_on_left.merge(
        faces_on_right, predicate=before(max_dist=ONE_SECOND))

    # Four seconds of buffer time between left-then-right/right-then-left
    # segments, then keep only sequences that pass the shot-scale check and
    # last at least ten seconds
    candidates = right_then_left.set_union(left_then_right)
    buffered = candidates.dilate(FOUR_SECONDS).coalesce(
        payload_merge_op=merge_named_payload({
            'faces': payload_first,
            'shot_scale': payload_plus
        })).dilate(-1 * FOUR_SECONDS)
    long_sequences = buffered.filter(has_acceptable_scales).filter_length(
        min_length=TEN_SECONDS)

    def non_uniform(shot_scales):
        """True when more than one distinct scale remains after ignoring
        the value 1."""
        return len(set(shot_scales) - {1}) > 1

    # Finally, filter out any shot sequences where the shot scales are
    # uniform
    sequences = long_sequences.filter(
        lambda intrvl: non_uniform(intrvl.get_payload()['shot_scale']))

    # Post-process to display in Esper widget
    if not TIMELINE_OUTPUT:
        return intrvllists_to_result_with_objects(
            sequences.get_allintervals(), lambda payload, video: [])

    results = intrvllists_to_result(sequences.get_allintervals())
    add_intrvllists_to_result(results,
                              faces_on_left.get_allintervals(),
                              color='black')
    add_intrvllists_to_result(results,
                              faces_on_right.get_allintervals(),
                              color='green')
    return results
def hero_shot():
    """Pick candidate "hero shot" frames from one film.

    A hero shot is a frame that would make a good still (for a preview
    thumbnail, for instance): it matches a one-face scene graph whose face
    is at least MIN_FACE_HEIGHT tall, and the frame's brightness, contrast
    and sharpness each exceed a minimum.

    Returns:
        The value of ``intrvllists_to_result_with_objects`` for the
        matching frames, with one result object per face bounding box
        (called with ``limit=100, stride=10``).
    """
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import named_payload, in_array, bbox_payload_parser
    from rekall.parsers import merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import (payload_plus, payload_first,
                                  merge_named_payload)
    from rekall.payload_predicates import payload_satisfies, on_name
    from rekall.spatial_predicates import scene_graph
    from rekall.logical_predicates import and_pred
    from rekall.bbox_predicates import height_at_least, left_of, same_value
    from esper.rekall import (intrvllists_to_result_with_objects,
                              bbox_to_result_object)

    MIN_FACE_HEIGHT = 0.2
    MIN_BRIGHTNESS = 50
    MIN_SHARPNESS = 50
    MIN_CONTRAST = 30
    FILM_NAME = "star wars the force awakens"

    # Annotate face rows with start and end frames, video ID, and frame
    # image information; rows missing any image statistic are excluded.
    # NOTE(review): F is not imported in this function; presumably it is
    # available at module level -- confirm.
    annotated_faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'),
        brightness=F('frame__brightness'),
        contrast=F('frame__contrast'),
        sharpness=F('frame__sharpness'))
    faces_qs = annotated_faces.filter(
        frame__video__name=FILM_NAME,
        brightness__isnull=False,
        contrast__isnull=False,
        sharpness__isnull=False)

    # Per-row payload: the face box (as a one-element list under 'faces')
    # plus the frame's image statistics
    row_payload = merge_dict_parsers([
        named_payload(
            'faces',
            in_array(
                bbox_payload_parser(
                    VideoIntervalCollection.django_accessor))),
        dict_payload_parser(
            VideoIntervalCollection.django_accessor, {
                'brightness': 'brightness',
                'contrast': 'contrast',
                'sharpness': 'sharpness'
            })
    ])

    # When rows of the same frame are coalesced, face lists are
    # concatenated and the image statistics of the first row are kept
    combine_rows = merge_named_payload({
        'faces': payload_plus,
        'brightness': payload_first,
        'contrast': payload_first,
        'sharpness': payload_first
    })

    # Load bounding boxes and faces into rekall, and put all faces in one
    # frame
    faces = VideoIntervalCollection.from_django_qs(
        faces_qs, with_payload=row_payload).coalesce(combine_rows)

    # exact=True presumably requires the frame's faces to match the graph
    # one-to-one, i.e. exactly one face -- TODO confirm against rekall.
    one_tall_face = on_name(
        'faces',
        scene_graph(
            {
                'nodes': [{
                    'name': 'face',
                    'predicates': [height_at_least(MIN_FACE_HEIGHT)]
                }],
                'edges': []
            },
            exact=True))

    def image_quality_ok(payload):
        """All three image statistics strictly exceed their minimums."""
        return (payload['brightness'] > MIN_BRIGHTNESS
                and payload['contrast'] > MIN_CONTRAST
                and payload['sharpness'] > MIN_SHARPNESS)

    # Hero shots are shots where there is exactly one face of at least a
    # certain height, and brightness, contrast, and sharpness are at least
    # some amount
    hero_shots = faces.filter(
        payload_satisfies(and_pred(one_tall_face, image_quality_ok)))

    def face_boxes(payload, video_id):
        """One result object per face bounding box in the frame."""
        return [
            bbox_to_result_object(bbox, video_id)
            for bbox in payload['faces']
        ]

    return intrvllists_to_result_with_objects(
        hero_shots.get_allintervals(), face_boxes, limit=100, stride=10)