def interview_with_person_x():
    """Find candidate interview segments between a host and a fixed guest.

    Shots where the guest and a host overlap are chained with nearby
    host or guest shots, neighbouring sequences are merged by dilating and
    coalescing, and only sequences of at least 1350 frames are kept.
    """
    from query.models import LabeledCommercial, FaceIdentity
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.temporal_predicates import before, after, overlaps
    from rekall.logical_predicates import or_pred
    from esper.rekall import intrvllists_to_result

    guest_name = "bernie sanders"

    def shots_of(identity_qs):
        # One interval per identity row, spanning the shot the face is in.
        return VideoIntervalCollection.from_django_qs(
            identity_qs.annotate(
                video_id=F("face__shot__video_id"),
                min_frame=F("face__shot__min_frame"),
                max_frame=F("face__shot__max_frame")))

    # Restrict everything to the sandbox videos.
    sandbox_videos = [
        row.video_id
        for row in LabeledCommercial.objects.distinct('video_id')
    ]
    in_sandbox = FaceIdentity.objects.filter(
        face__shot__video_id__in=sandbox_videos)

    hosts = shots_of(in_sandbox.filter(face__is_host=True))
    guest = shots_of(
        in_sandbox.filter(identity__name=guest_name)
        .filter(probability__gt=0.7))

    # Shots where the guest and a host are on screen together.
    together = guest.overlaps(hosts).coalesce()

    # A overlaps B, or A is before/after B by less than 10 frames.
    near = or_pred(
        or_pred(overlaps(), before(max_dist=10), arity=2),
        after(max_dist=10),
        arity=2)

    # Chain the joint shots with nearby host shots and nearby guest shots.
    chained_with_hosts = together.merge(hosts, predicate=near)
    chained_with_guest = together.merge(guest, predicate=near)
    candidates = chained_with_hosts.set_union(chained_with_guest).coalesce()

    # Sequences may be interrupted by shots without guest or host, so
    # dilate, coalesce and shrink back before filtering on length.
    bridged = candidates.dilate(600).coalesce().dilate(-600)
    interviews = bridged.filter_length(min_length=1350)

    return intrvllists_to_result(interviews.get_allintervals())
def all_faces_rekall():
    """Return bounding boxes of all faces in one hard-coded video, per frame."""
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from esper.stdlib import qs_to_result

    video_name = "inception"

    # Each face becomes a single-frame interval keyed by its video.
    faces_qs = Face.objects.filter(frame__video__name=video_name).annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    # Payload of each interval is a one-element list holding the face bbox.
    bbox_in_list = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))

    # NOTE: slow, since every matching face is materialized.
    per_face = VideoIntervalCollection.from_django_qs(
        faces_qs, with_payload=bbox_in_list)

    # Coalescing with payload_plus gathers all faces of a frame together.
    per_frame = per_face.coalesce(payload_merge_op=payload_plus)

    # Post-process to display in Esper widget.
    return intrvllists_to_result_bbox(
        per_frame.get_allintervals(), limit=100, stride=100)
def consecutive_short_shots():
    """Find runs of NUM_SHOTS back-to-back shots shorter than MAX_SHOT_DURATION."""
    from query.models import Shot
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.temporal_predicates import meets_before
    from esper.rekall import intrvllists_to_result_with_objects
    from django.db.models import ExpressionWrapper, FloatField

    NUM_SHOTS = 3
    MAX_SHOT_DURATION = 0.5

    # Shot duration: frame span divided by the video's fps.
    duration_expr = ExpressionWrapper(
        (F('max_frame') - F('min_frame')) / F('video__fps'),
        output_field=FloatField())
    short_shots_qs = Shot.objects.annotate(duration=duration_expr).filter(
        duration__lt=MAX_SHOT_DURATION,
        duration__gt=0.,
        labeler__name='shot-hsvhist-face').all()
    short_shots = VideoIntervalCollection.from_django_qs(short_shots_qs)

    # Grow the run one short shot at a time.
    n_shots = short_shots
    for n in range(2, NUM_SHOTS + 1):
        print('Constructing {} consecutive short shots'.format(n))
        extended = n_shots.merge(
            short_shots,
            predicate=meets_before(epsilon=1),
            working_window=1)
        n_shots = extended.coalesce().filter_length(min_length=1)
        video_count = len(n_shots.get_allintervals())
        print('There are {} videos with {} consecutive short shots'.format(
            video_count, n))

    return intrvllists_to_result_with_objects(
        n_shots, lambda a, b: [], limit=100, stride=1)
def get_caption_intrvlcol(phrase, video_ids=None):
    """Build a VideoIntervalCollection of the places a phrase occurs.

    Args:
        phrase: phrase handed to ``phrase_search``.
        video_ids: optional iterable of video ids restricting both the search
            and the videos loaded for fps lookup; ``None`` loads every video.

    Returns:
        A VideoIntervalCollection with one ``(start, end, 0)`` interval per
        posting, where start/end are the posting times multiplied by the
        video's fps and truncated with ``int``.

    Raises:
        KeyError: if a posting belongs to a video that was not loaded.
    """
    results = phrase_search(phrase, video_ids)

    # `is None` rather than `== None`: identity is the correct None test.
    if video_ids is None:
        video_qs = Video.objects.all()
    else:
        video_qs = Video.objects.filter(id__in=video_ids).all()
    videos = {v.id: v for v in video_qs}

    def convert_time(video_id, t):
        return int(t * videos[video_id].fps)

    # Group the converted postings by video.
    grouped = {}
    for doc in results:
        for p in doc.postings:
            grouped.setdefault(doc.id, []).append(
                (convert_time(doc.id, p.start), convert_time(doc.id, p.end), 0))

    # Build a fresh dict instead of rewriting the one being iterated.
    phrase_intrvllists = {
        video_id: IntervalList(intervals)
        for video_id, intervals in grouped.items()
    }
    phrase_intrvlcol = VideoIntervalCollection(phrase_intrvllists)

    print('Get {} intervals for phrase \"{}\"'.format(
        count_intervals(phrase_intrvlcol), phrase))
    return phrase_intrvlcol
def faces_with_gender():
    """Return per-frame face boxes tagged with gender for matching videos."""
    from query.models import FaceGender
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox

    VIDEO_NAME_CONTAINS = "harry potter"

    # Flatten the face's frame, video and bbox columns onto each gender row.
    gender_rows = FaceGender.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        bbox_x1=F('face__bbox_x1'),
        bbox_y1=F('face__bbox_y1'),
        bbox_x2=F('face__bbox_x2'),
        bbox_y2=F('face__bbox_y2'),
        gender_name=F('gender__name')).filter(
            face__frame__video__name__contains=VIDEO_NAME_CONTAINS)

    accessor = VideoIntervalCollection.django_accessor
    # Payload: one-element list with the bbox dict plus a 'gender' entry.
    payload_parser = in_array(
        merge_dict_parsers([
            bbox_payload_parser(accessor),
            dict_payload_parser(accessor, {'gender': 'gender_name'})
        ]))

    per_face = VideoIntervalCollection.from_django_qs(
        gender_rows, with_payload=payload_parser)
    per_frame = per_face.coalesce(payload_merge_op=payload_plus)

    return intrvllists_to_result_bbox(
        per_frame.get_allintervals(), limit=100, stride=1000)
def three_people():
    """Find frames with exactly three large faces lined up left to right."""
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.bbox_predicates import height_at_least, left_of, same_value

    MIN_FACE_HEIGHT = 0.3
    EPSILON = 0.05

    # Each face becomes a single-frame interval keyed by its video.
    faces_qs = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    # NOTE: slow, since every face is materialized before coalescing the
    # faces of each frame into one interval.
    bbox_in_list = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    faces_per_frame = VideoIntervalCollection.from_django_qs(
        faces_qs, with_payload=bbox_in_list).coalesce(
            payload_merge_op=payload_plus)

    # Three tall-enough faces; consecutive ones are left-of each other and
    # share (within EPSILON) the same y1.
    face_names = ['face1', 'face2', 'face3']
    nodes = [
        {'name': name, 'predicates': [height_at_least(MIN_FACE_HEIGHT)]}
        for name in face_names
    ]
    edges = [
        {
            'start': left,
            'end': right,
            'predicates': [left_of(), same_value('y1', epsilon=EPSILON)]
        }
        for left, right in zip(face_names, face_names[1:])
    ]
    layout = {'nodes': nodes, 'edges': edges}

    matches = faces_per_frame.filter(
        payload_satisfies(scene_graph(layout, exact=True)))

    # Post-process to display in Esper widget.
    return intrvllists_to_result_bbox(
        matches.get_allintervals(), limit=100, stride=100)
def intrvlcol_second2frame(intrvlcol):
    """Scale every interval's start/end by its video's fps, truncating to int.

    Payloads are carried over unchanged. One Video lookup is made per video
    id present in ``intrvlcol``.
    """
    def to_frames(video_id, intrvllist):
        fps = Video.objects.filter(id=video_id)[0].fps
        scaled = [
            (int(intrvl.start * fps), int(intrvl.end * fps), intrvl.payload)
            for intrvl in intrvllist.get_intervals()
        ]
        return IntervalList(scaled)

    return VideoIntervalCollection({
        video_id: to_frames(video_id, intrvllist)
        for video_id, intrvllist in intrvlcol.get_allintervals().items()
    })
def man_woman_up_close():
    """Find frames containing exactly one large male and one large female face.

    Both faces must be at least MIN_FACE_HEIGHT tall and exceed the face and
    gender confidence thresholds.
    """
    from query.models import FaceGender
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from rekall.bbox_predicates import height_at_least

    MIN_FACE_CONFIDENCE = 0.95
    MIN_GENDER_CONFIDENCE = 0.95
    MIN_FACE_HEIGHT = 0.6

    # NOTE(review): `video_name` is not defined inside this function; it is
    # presumably provided by an enclosing/module scope -- confirm.
    # Annotate face rows with start and end frames and the video ID
    faces_with_gender = FaceGender.objects.filter(
        face__frame__video__name=video_name).annotate(
            min_frame=F('face__frame__number'),
            max_frame=F('face__frame__number'),
            video_id=F('face__frame__video_id'),
            bbox_x1=F('face__bbox_x1'),
            bbox_y1=F('face__bbox_y1'),
            bbox_x2=F('face__bbox_x2'),
            bbox_y2=F('face__bbox_y2'),
            gender_name=F('gender__name'),
            face_probability=F('face__probability'))

    accessor = VideoIntervalCollection.django_accessor
    faces = VideoIntervalCollection.from_django_qs(
        faces_with_gender,
        with_payload=in_array(merge_dict_parsers([
            bbox_payload_parser(accessor),
            dict_payload_parser(accessor, {'gender': 'gender_name'}),
            dict_payload_parser(accessor, {'gender_probability': 'probability'}),
            dict_payload_parser(accessor, {'face_probability': 'face_probability'})
        ]))
    ).coalesce(payload_merge_op=payload_plus)

    def face_node(name, gender):
        # Gender is compared with `==`: the original used `is`, an identity
        # test against a string literal that is not a reliable equality check.
        return {
            'name': name,
            'predicates': [
                height_at_least(MIN_FACE_HEIGHT),
                lambda payload: payload['gender'] == gender,
                lambda payload: payload['face_probability'] > MIN_FACE_CONFIDENCE,
                lambda payload: payload['gender_probability'] > MIN_GENDER_CONFIDENCE
            ]
        }

    graph = {
        'nodes': [
            face_node('face_male', 'M'),
            face_node('face_female', 'F'),
        ],
        'edges': []
    }

    mf_up_close = faces.filter(payload_satisfies(
        scene_graph(graph, exact=True)))

    return intrvllists_to_result_bbox(
        mf_up_close.get_allintervals(), limit=100, stride=100)
def label_videos_with_shot_scale(video_ids):
    """Combine face and pose frames of the given videos and add a shot scale.

    Returns the collection produced by mapping each coalesced frame interval
    to a payload with 'pose', 'face', 'frame_id' and 'shot_scale' keys, where
    'shot_scale' comes from ``payload_to_shot_scale``.
    """
    accessor = VideoIntervalCollection.django_accessor

    def frames_of(manager):
        # Single-frame intervals restricted to the requested videos.
        return manager.annotate(
            min_frame=F('frame__number'),
            max_frame=F('frame__number'),
            video_id=F('frame__video__id')).filter(video_id__in=video_ids)

    faces = frames_of(Face.objects)
    poses = frames_of(PoseMeta.objects)

    # Face rows carry an empty 'pose' list and pose rows an empty 'face'
    # list, so both payloads share the same keys before the union.
    face_frames = VideoIntervalCollection.from_django_qs(
        faces,
        with_payload=rk.parsers.merge_dict_parsers([
            with_face(),
            with_named_empty_list('pose'),
            rk.parsers.dict_payload_parser(accessor, {'frame_id': 'frame_id'})
        ]))
    pose_frames = VideoIntervalCollection.from_django_qs(
        poses,
        with_payload=rk.parsers.merge_dict_parsers([
            with_pose(),
            with_named_empty_list('face'),
            rk.parsers.dict_payload_parser(accessor, {'frame_id': 'frame_id'})
        ]))

    merge_frame_payloads = merge_named_payload({
        'pose': rk.merge_ops.payload_plus,
        'face': rk.merge_ops.payload_plus,
        'frame_id': rk.merge_ops.payload_first
    })
    faces_with_pose = face_frames.set_union(pose_frames).coalesce(
        merge_frame_payloads)

    def add_shot_scale(intrvl):
        payload = intrvl.payload
        return (intrvl.start, intrvl.end, {
            'pose': payload['pose'],
            'face': payload['face'],
            'frame_id': payload['frame_id'],
            'shot_scale': payload_to_shot_scale(payload)
        })

    return faces_with_pose.map(add_shot_scale)
def get_commercial_intrvlcol(video_ids=None, granularity='frame'):
    """Load commercials as a VideoIntervalCollection.

    Args:
        video_ids: optional iterable of video ids; ``None`` loads all
            commercials.
        granularity: when equal to ``'second'`` the collection is passed
            through ``intrvlcol_frame2second``; any other value returns the
            collection as loaded.
    """
    commercial_qs = (
        Commercial.objects.all()
        if video_ids is None
        else Commercial.objects.filter(video_id__in=video_ids))

    commercial = VideoIntervalCollection(
        qs_to_intrvllists(commercial_qs.annotate(video_id=F("video_id"))))

    if granularity != 'second':
        return commercial
    return intrvlcol_frame2second(commercial)
def manual_shots_rekall():
    """Return all shots whose labeler name contains 'manual'."""
    from query.models import Shot, Labeler
    from rekall.video_interval_collection import VideoIntervalCollection
    from esper.rekall import intrvllists_to_result
    from esper.stdlib import qs_to_result

    shots_qs = Shot.objects.filter(
        labeler__name__contains='manual')
    shots = VideoIntervalCollection.from_django_qs(shots_qs)

    # Use the converter this function actually imports. The original called
    # `intrvllists_to_result_with_objects`, which is not imported here and
    # was given no payload-to-objects callback, unlike every other call site.
    return intrvllists_to_result(shots.get_allintervals())
def panels_rekall():
    """Find sandbox shots that look like three-person panels."""
    from query.models import LabeledCommercial, Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.bbox_predicates import height_at_least, same_value, left_of
    from rekall.spatial_predicates import scene_graph
    from rekall.payload_predicates import payload_satisfies
    from esper.rekall import intrvllists_to_result_bbox

    # Get list of sandbox video IDs
    sandbox_videos = [
        row.video_id
        for row in LabeledCommercial.objects.distinct('video_id')
    ]

    # Each face is given the frame span of the shot it belongs to.
    faces_qs = Face.objects.filter(shot__video_id__in=sandbox_videos).annotate(
        video_id=F("shot__video_id"),
        min_frame=F("shot__min_frame"),
        max_frame=F("shot__max_frame"))

    # One interval per face, then merge the faces of each shot.
    per_face = VideoIntervalCollection.from_django_qs(
        faces_qs,
        with_payload=in_array(
            bbox_payload_parser(VideoIntervalCollection.django_accessor)))
    per_shot = per_face.coalesce(payload_merge_op=payload_plus)

    # Scene graph for things that look like panels: three tall faces,
    # consecutive ones sharing y1 (within 0.05) and ordered left to right.
    names = ['face1', 'face2', 'face3']
    three_faces_scene_graph = {
        'nodes': [
            {'name': name, 'predicates': [height_at_least(0.3)]}
            for name in names
        ],
        'edges': [
            {
                'start': first,
                'end': second,
                'predicates': [same_value('y1', epsilon=0.05), left_of()]
            }
            for first, second in zip(names, names[1:])
        ]
    }

    panels = per_shot.filter(payload_satisfies(
        scene_graph(three_faces_scene_graph, exact=True)))

    return intrvllists_to_result_bbox(panels.get_allintervals())
def cinematic_shots_rekall():
    """Return the shots flagged ``cinematic`` in the hard-coded video list."""
    from query.models import Shot, Labeler
    from rekall.video_interval_collection import VideoIntervalCollection
    from esper.rekall import intrvllists_to_result_with_objects
    from esper.stdlib import qs_to_result

    video_ids = [1]

    cinematic_qs = Shot.objects.filter(video_id__in=video_ids, cinematic=True)
    cinematic = VideoIntervalCollection.from_django_qs(cinematic_qs)

    def no_objects(payload, video):
        # Nothing is drawn on top of the shots.
        return []

    return intrvllists_to_result_with_objects(
        cinematic.get_allintervals(), no_objects)
def split_intrvlcol(intrvlcol, seg_length):
    """Cut every interval into consecutive pieces of at most ``seg_length``.

    Each piece keeps the payload of the interval it came from. The last piece
    of an interval holds whatever remains and may be shorter than
    ``seg_length``. Intervals with zero or negative duration yield no pieces.

    Args:
        intrvlcol: collection exposing ``get_allintervals()``.
        seg_length: maximum piece length; must be positive.

    Returns:
        A VideoIntervalCollection with the split intervals per video.

    Raises:
        ValueError: if ``seg_length`` is not positive (the splitting loop
            would otherwise never terminate).
    """
    if seg_length <= 0:
        raise ValueError(
            'seg_length must be positive, got {!r}'.format(seg_length))

    intrvllists_split = {}
    for video_id, intrvllist in intrvlcol.get_allintervals().items():
        intervals_split = []
        for i in intrvllist.get_intervals():
            duration = i.end - i.start
            start = i.start
            while duration > 0:
                if duration > seg_length:
                    # Full-length piece; keep going from its end.
                    intervals_split.append(
                        (start, start + seg_length, i.payload))
                    duration -= seg_length
                    start += seg_length
                else:
                    # Final (possibly shorter) piece of this interval.
                    intervals_split.append(
                        (start, start + duration, i.payload))
                    duration = 0
        intrvllists_split[video_id] = IntervalList(intervals_split)
    return VideoIntervalCollection(intrvllists_split)
def frames_with_character_x():
    """Return frames in which a face is labeled as the hard-coded character."""
    from query.models import FaceCharacterActor
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser, merge_dict_parsers, dict_payload_parser
    from rekall.merge_ops import payload_plus
    from rekall.payload_predicates import payload_satisfies
    from rekall.spatial_predicates import scene_graph
    from esper.rekall import intrvllists_to_result_bbox

    character_name = "harry potter"

    # Flatten frame, video, bbox and character name onto each row.
    labeled_faces_qs = FaceCharacterActor.objects.annotate(
        min_frame=F('face__frame__number'),
        max_frame=F('face__frame__number'),
        video_id=F('face__frame__video_id'),
        bbox_x1=F('face__bbox_x1'),
        bbox_y1=F('face__bbox_y1'),
        bbox_x2=F('face__bbox_x2'),
        bbox_y2=F('face__bbox_y2'),
        character_name=F('characteractor__character__name'))

    accessor = VideoIntervalCollection.django_accessor
    # Payload: one-element list with the bbox dict plus a 'character' entry.
    payload_parser = in_array(
        merge_dict_parsers([
            bbox_payload_parser(accessor),
            dict_payload_parser(accessor, {'character': 'character_name'}),
        ]))
    faces_per_frame = VideoIntervalCollection.from_django_qs(
        labeled_faces_qs, with_payload=payload_parser).coalesce(
            payload_merge_op=payload_plus)

    def is_wanted_character(f):
        return f['character'] == character_name

    wanted_face_graph = {
        'nodes': [{
            'name': 'face1',
            'predicates': [is_wanted_character]
        }],
        'edges': []
    }
    frames_with_character = faces_per_frame.filter(
        payload_satisfies(scene_graph(wanted_face_graph)))

    return intrvllists_to_result_bbox(
        frames_with_character.get_allintervals(), limit=100, stride=1000)
def faces_from_poses_rekall():
    """Derive face bounding boxes from pose keypoints, grouped per frame."""
    from query.models import Pose
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from esper.stdlib import qs_to_result

    # Annotate pose rows with start and end frames and the video ID
    poses = Pose.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    face_indices = [
        Pose.Nose, Pose.Neck, Pose.REye, Pose.LEye, Pose.REar, Pose.LEar
    ]

    def get_face_bbox(pose):
        """Return the box enclosing the pose's face keypoints.

        Keypoints whose third component is 0.0 are ignored.
        """
        pose_keypoints = pose.pose_keypoints()
        face_points = [pose_keypoints[index] for index in face_indices]

        # Compare by value. The original `is not 0.0` was an identity test
        # against a float literal, so no keypoint was ever filtered out.
        visible = [point for point in face_points if point[2] != 0.0]
        if not visible:
            # Nothing to bound; fall back to all face keypoints, which is
            # what the previous (non-filtering) code always used.
            visible = face_points

        x_vals = [point[0] for point in visible]
        y_vals = [point[1] for point in visible]
        return {
            'x1': min(x_vals),
            'y1': min(y_vals),
            'x2': max(x_vals),
            'y2': max(y_vals)
        }

    # Materialize all the poses and load them into rekall with bounding box
    # payloads, then coalesce so that all boxes of a frame share an interval.
    # NOTE that this is slow right now since we're loading all poses!
    vids = VideoIntervalCollection.from_django_qs(
        poses, with_payload=in_array(get_face_bbox)).coalesce(
            payload_merge_op=payload_plus)

    # Post-process to display in Esper widget
    return intrvllists_to_result_bbox(
        vids.get_allintervals(), limit=100, stride=100)
def all_captions():
    """Return every caption word of the hard-coded videos as an interval.

    Word start/end values from ``get_all_segments`` are multiplied by the
    video's fps; the payload of each interval is the word itself.
    """
    from esper.captions import get_all_segments
    from rekall.video_interval_collection import VideoIntervalCollection
    from esper.rekall import intrvllists_to_result_with_objects

    video_ids = [1]

    # Only aligned captions are in the caption index
    results = get_all_segments(video_ids)

    # Look each video's fps up once instead of twice per word. The lookup
    # stays lazy so a video without words is never queried.
    fps_by_video = {}

    def fps_of(video_id):
        if video_id not in fps_by_video:
            fps_by_video[video_id] = Video.objects.get(id=video_id).fps
        return fps_by_video[video_id]

    caption_results = VideoIntervalCollection({
        video_id: [(
            word[0] * fps_of(video_id),  # start frame
            word[1] * fps_of(video_id),  # end frame
            word[2])  # payload is the word (string)
            for word in words]
        for video_id, words in results
    })

    return intrvllists_to_result_with_objects(caption_results, lambda a, b: [])
def multi_person_one_phrase(phrase, filters=None):
    '''
    Get all intervals which the phrase is being said

    @phrase: input phrase to be searched (upper-cased before searching)
    @filters: optional dict; defaults to no filtering
        'with_face': must contain exactly one face
        'gender': filter by gender (only applied together with 'with_face')
        'limit': stop once this many videos have matching intervals
                 (only applied together with 'with_face')

    Returns whatever ``intrvlcol2list`` produces for the resulting collection.
    '''
    # A fresh dict per call; a `{}` default would be shared between calls.
    if filters is None:
        filters = {}

    videos = Video.objects.filter(threeyears_dataset=True)
    video_ids = [video.id for video in videos]
    phrase_intrvlcol = get_caption_intrvlcol(phrase.upper(), video_ids)

    def single_face_filter(video_id):
        # Bind the video explicitly rather than relying on the loop variable
        # of the enclosing scope.
        def fn(i):
            faces = Face.objects.filter(
                shot__video__id=video_id,
                shot__min_frame__lte=i.start,
                shot__max_frame__gte=i.end)
            if len(faces) != 1:
                return False
            if 'gender' in filters:
                faceGender = FaceGender.objects.filter(
                    face__id=faces[0].id)[0]
                if faceGender.gender.name != filters['gender']:
                    return False
            return True
        return fn

    if 'with_face' in filters:
        print('Filtering with face...')
        intrvlcol_withface = {}
        for video_id, intrvllist in phrase_intrvlcol.intervals.items():
            intrvllist_withface = intrvllist.filter(
                single_face_filter(video_id))
            if intrvllist_withface.size() > 0:
                intrvlcol_withface[video_id] = intrvllist_withface
            if 'limit' in filters and len(
                    intrvlcol_withface) == filters['limit']:
                break
        phrase_intrvlcol = VideoIntervalCollection(intrvlcol_withface)

    return intrvlcol2list(phrase_intrvlcol)
def get_numface_intrvlcol(relevant_shots, num_face=1):
    """Return the shots among ``relevant_shots`` with exactly ``num_face`` faces.

    Each face is given the frame span of its shot, the faces of a shot are
    coalesced into one interval, and intervals whose payload list length is
    not ``num_face`` are dropped. The number of remaining intervals is
    printed.
    """
    # Each face gets its shot's video id and frame span.
    faces_qs = Face.objects.filter(shot__in=list(relevant_shots)) \
        .annotate(video_id=F('shot__video_id')) \
        .annotate(min_frame=F('shot__min_frame')) \
        .annotate(max_frame=F('shot__max_frame'))

    # NOTE: slow, since every matching face is materialized.
    bbox_in_list = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    faces_per_shot = VideoIntervalCollection.from_django_qs(
        faces_qs, with_payload=bbox_in_list).coalesce(
            payload_merge_op=payload_plus)
    numface_intrvlcol = faces_per_shot.filter(
        payload_satisfies(length_exactly(num_face)))

    num_intrvl = sum(
        intrvllist.size()
        for intrvllist in numface_intrvlcol.get_allintervals().values())
    print("Get {} relevant {} face intervals".format(num_intrvl, num_face))
    return numface_intrvlcol
def all_poses():
    """Return the poses of a strided sample of frames for display."""
    from query.models import PoseMeta
    from esper.stdlib import pose_to_dict, simple_result
    import esper.pose_wrapper as pw
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_with_objects

    STRIDE = 1000
    LIMIT = 100

    # PoseMeta is a table that contains pose ID, labeler, and a pointer to
    # a Frame.
    # NOTE that PoseMeta ID's have NO RELATION to Pose ID's.
    pose_meta_qs = PoseMeta.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    def frame_key(pose_meta_obj):
        # Payload is (video id, frame number) so the frame can be re-queried.
        return (pose_meta_obj.video_id, pose_meta_obj.min_frame)

    # Take every STRIDE-th row of the first LIMIT * STRIDE rows and coalesce
    # them into a list of frames.
    sampled_rows = pose_meta_qs[:LIMIT * STRIDE:STRIDE]
    frames = VideoIntervalCollection.from_django_qs(
        sampled_rows, with_payload=frame_key).coalesce()

    def load_poses(interval):
        # pose_wrapper.get takes in a PoseMeta queryset or list of PoseMeta
        # objects and returns a list of PoseWrapper objects.
        rows_in_frame = pose_meta_qs.filter(
            video_id=interval.payload[0],
            min_frame=interval.payload[1]).all()
        return (interval.start, interval.end, pw.get(rows_in_frame))

    poses = frames.map(load_poses)

    def draw(pose_wrappers, video_id):
        # We use pose_to_dict to draw PoseWrapper objects.
        return [pose_to_dict(wrapper) for wrapper in pose_wrappers]

    return intrvllists_to_result_with_objects(poses, draw)
def all_face_landmarks():
    """Return the face landmarks of a strided sample of frames for display."""
    from query.models import Face
    from esper.stdlib import face_landmarks_to_dict, simple_result
    import esper.face_landmarks_wrapper as flw
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_with_objects

    STRIDE = 1000
    LIMIT = 100

    # Face landmarks are keyed by Face ID's.
    faces_qs = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    def frame_key(face_obj):
        # Payload is (video id, frame number) so the frame can be re-queried.
        return (face_obj.video_id, face_obj.min_frame)

    # Take every STRIDE-th row of the first LIMIT * STRIDE rows and coalesce
    # them into a list of frames.
    sampled_rows = faces_qs[:LIMIT * STRIDE:STRIDE]
    frames = VideoIntervalCollection.from_django_qs(
        sampled_rows, with_payload=frame_key).coalesce()

    def load_landmarks(interval):
        # face_landmarks_wrapper.get takes in a Face queryset or list of Face
        # objects and returns a list of LandmarksWrapper objects.
        faces_in_frame = faces_qs.filter(
            video_id=interval.payload[0],
            min_frame=interval.payload[1]).all()
        return (interval.start, interval.end, flw.get(faces_in_frame))

    landmarks = frames.map(load_landmarks)

    def draw(landmarks_wrappers, video_id):
        # We use face_landmarks_to_dict to draw LandmarksWrapper objects.
        return [face_landmarks_to_dict(wrapper)
                for wrapper in landmarks_wrappers]

    return intrvllists_to_result_with_objects(landmarks, draw)
def caption_search():
    """Find a fixed set of phrases in the captions of 'star wars' videos.

    Posting start/end values are multiplied by the video's fps; every
    interval carries payload 0.
    """
    from esper.captions import topic_search
    from rekall.video_interval_collection import VideoIntervalCollection
    from esper.rekall import intrvllists_to_result_with_objects

    phrases = [
        'may the Force be with you',
        'may the force be with you',
        'May the Force be with you',
        'May the force be with you'
    ]
    results = topic_search(
        phrases,
        window_size=0,
        video_ids=[
            vid.id
            for vid in Video.objects.filter(name__contains="star wars").all()
        ])

    # Look each video's fps up once instead of twice per posting. The lookup
    # stays lazy so a result without postings is never queried.
    fps_by_video = {}

    def fps_of(video_id):
        if video_id not in fps_by_video:
            fps_by_video[video_id] = Video.objects.get(id=video_id).fps
        return fps_by_video[video_id]

    caption_results = VideoIntervalCollection({
        r.id: [((p.start * fps_of(r.id)), (p.end * fps_of(r.id)), 0)
               for p in r.postings]
        for r in results
    })

    return intrvllists_to_result_with_objects(caption_results, lambda a, b: [])
def get_person_intrvlcol(person_name, **kwargs):
    """Build a VideoIntervalCollection of where a person is identified.

    Keyword arguments read:
        labeler: required. ``'old'`` selects shot-level intervals (payload is
            the shot id); any other value selects frame-level intervals
            (payload is the frame number) that are dilated by
            ``int(fps * 1.6)`` and coalesced per video.
        large_face: if present (any value), only faces whose bbox height is
            at least 0.3 are kept.

    Raises:
        KeyError: if ``labeler`` is not supplied.
    """
    use_old_model = kwargs['labeler'] == 'old'

    def confident_matches(identity_qs):
        # Rows labeled as this person with probability above 0.9,
        # optionally restricted to large faces.
        lowered = person_name.lower()
        matches = identity_qs \
            .filter(Q(labeler__name='face-identity-converted:' + lowered)
                    | Q(labeler__name='face-identity:' + lowered)) \
            .filter(probability__gt=0.9) \
            .annotate(height=F("face__bbox_y2") - F("face__bbox_y1"))
        if 'large_face' in kwargs:
            matches = matches.filter(height__gte=0.3)
        return matches

    if use_old_model:
        # old labeler model: one interval per face, spanning its shot
        faceIDs = confident_matches(
            FaceIdentity.objects.exclude(face__shot__isnull=True))
        shot_rows = faceIDs.annotate(video_id=F("face__shot__video_id")) \
            .annotate(shot_id=F("face__shot_id")) \
            .annotate(min_frame=F("face__shot__min_frame")) \
            .annotate(max_frame=F("face__shot__max_frame"))
        person_intrvllists = qs_to_intrvllists(
            shot_rows,
            schema={
                'start': 'min_frame',
                'end': 'max_frame',
                'payload': 'shot_id'
            })
        person_intrvlcol = VideoIntervalCollection(person_intrvllists)
    else:
        # new labeler model: one single-frame interval per face
        faceIDs = confident_matches(
            FaceIdentity.objects.filter(face__frame__shot_boundary=False))
        frame_rows = faceIDs.annotate(video_id=F("face__frame__video_id")) \
            .annotate(frame_id=F("face__frame__number")) \
            .annotate(min_frame=F("face__frame__number")) \
            .annotate(max_frame=F("face__frame__number") + 1)
        person_intrvllists_raw = qs_to_intrvllists(
            frame_rows,
            schema={
                'start': 'min_frame',
                'end': 'max_frame',
                'payload': 'frame_id'
            })

        # dilate and coalesce
        person_intrvllists = {}
        for video_id, intrvllist in person_intrvllists_raw.items():
            fps = Video.objects.filter(id=video_id)[0].fps
            widened = intrvllist.dilate(int(fps * 1.6))
            person_intrvllists[video_id] = widened.coalesce()
        person_intrvlcol = VideoIntervalCollection(person_intrvllists)

    print("Get {} intervals for person {}".format(
        count_intervals(person_intrvlcol), person_name))
    return person_intrvlcol
def shot_reverse_shot_complex():
    """Find right-left-right and left-right-left shot sequences in one film.

    A shot qualifies for a side when its sampled frames have a large face in
    that region, at most MAX_FACES_ON_SCREEN faces, and the tallest face's
    centre moves no more than MAX_FACE_MOVEMENT between consecutive frames.
    """
    from query.models import Face, Shot
    from rekall.temporal_predicates import overlaps
    from rekall.merge_ops import payload_second, payload_plus
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.interval_list import Interval, IntervalList
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import intrvllists_to_result_with_objects

    VIDEO_NAME = 'godfather part iii'

    MAX_FACE_MOVEMENT = 0.15
    MIN_FACE_HEIGHT = 0.2
    MAX_FACES_ON_SCREEN = 4
    RIGHT_HALF_MIN_X = 0.33
    LEFT_HALF_MAX_X = 0.66
    SHOTS_LABELER_ID = 64
    # faces are sampled every 12 frames
    SAMPLING_RATE = 12

    # Single-frame intervals for the faces of the chosen video.
    faces = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id')).filter(
            frame__video__name__contains=VIDEO_NAME)

    shots_qs = Shot.objects.filter(
        video__name__contains=VIDEO_NAME, labeler_id=SHOTS_LABELER_ID)
    shots = VideoIntervalCollection.from_django_qs(
        shots_qs, with_payload=lambda obj: [])

    # All confident faces of each frame, gathered into one interval.
    frame_faces = VideoIntervalCollection.from_django_qs(
        faces.filter(probability__gte=0.99),
        with_payload=in_array(
            bbox_payload_parser(
                VideoIntervalCollection.django_accessor))).coalesce(
                    payload_merge_op=payload_plus)

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    big_face_graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)]
        }],
        'edges': []
    }

    def frames_with_face_in(region):
        return frame_faces.filter(
            and_pred(
                payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN)),
                payload_satisfies(
                    scene_graph(big_face_graph, region=region))))

    faces_on_right = frames_with_face_in(right_half)
    faces_on_left = frames_with_face_in(left_half)

    def wrap_list(intvl):
        intvl.payload = [intvl.payload]
        return intvl

    def box_height(box):
        return box['y2'] - box['y1']

    def box_center(box):
        return ((box['x1'] + box['x2']) / 2, (box['y1'] + box['y2']) / 2)

    def distance(pt1, pt2):
        return np.sqrt((pt1[0] - pt2[0])**2 + (pt1[1] - pt2[1])**2)

    def tallest_box(boxes):
        # First box with the greatest height, or None for an empty frame.
        if not boxes:
            return None
        return max(boxes, key=box_height)

    def take_highest_in_frame(intvl):
        per_frame_best = (tallest_box(frame) for frame in intvl.payload)
        intvl.payload = [box for box in per_frame_best if box is not None]
        return intvl

    # Check if displacement of box center between frames are within `dist`
    def inter_frame_movement_less_than(dist):
        def check(boxes):
            return not any(
                distance(box_center(prev), box_center(curr)) > dist
                for prev, curr in zip(boxes, boxes[1:]))
        return check

    def steady_shots(side_frames):
        # Payload after coalesce is a list, each element is a list of faces
        # for a frame; it is then reduced to the tallest face per frame.
        framed = shots.merge(
            side_frames,
            predicate=overlaps(),
            payload_merge_op=payload_second)
        per_shot = framed.map(wrap_list).coalesce(
            payload_merge_op=payload_plus)
        return per_shot.map(take_highest_in_frame).filter(
            payload_satisfies(
                inter_frame_movement_less_than(MAX_FACE_MOVEMENT)))

    shots_with_face_on_right = steady_shots(faces_on_right)
    shots_with_face_on_left = steady_shots(faces_on_left)

    def alternating(first, second):
        # first -> second -> first, each at most one frame apart.
        return first.merge(
            second, predicate=before(max_dist=1)).merge(
                first, predicate=before(max_dist=1))

    # Right-Left-Right sequences
    right_left_right = alternating(
        shots_with_face_on_right, shots_with_face_on_left)
    # Left-Right-Left sequences
    left_right_left = alternating(
        shots_with_face_on_left, shots_with_face_on_right)

    shot_reverse_shot = right_left_right.set_union(
        left_right_left).coalesce()

    return intrvllists_to_result_with_objects(
        shot_reverse_shot.get_allintervals(),
        payload_to_objs=lambda p, v: [])
# CONTINUE_PATH = '/app/notebooks/learning/models/deepsbd_resnet_train_on_40000_min_weak/fold1_270000_iteration.pth' device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print('Initialized constants') # load folds from disk with open(FOLDS_PATH, 'rb') as f: folds = pickle.load(f) # Load DeepSBD datasets for each fold. This is used for testing. deepsbd_datasets_weak_testing = [] for fold in folds: shots_in_fold_qs = Shot.objects.filter(labeler__name__contains='manual', video_id__in=fold) shots_in_fold = VideoIntervalCollection.from_django_qs(shots_in_fold_qs) data = movies_deepsbd_data.DeepSBDDataset(shots_in_fold, verbose=True, preload=False, logits=True, local_path=LOCAL_PATH) deepsbd_datasets_weak_testing.append(data) print('Loaded test data') # load weak labels if TRAINING_SET != 'ground_truth': with open(WEAK_LABELS_PATH, 'rb') as f: weak_labels_windows = np.load(f)
def __init__(self, shots, window_size=16, stride=8, size=128, verbose=False,
             preload=True, logits=False, local_path=None):
    """Constructor for ShotDetectionDataset.

    Cuts the given shots into fixed-length, strided frame windows and labels
    each window by whether it contains a shot boundary.

    Args:
        shots: VideoIntervalCollection of all the intervals to get frames
            from. If the payload is -1, then the interval is not an actual
            shot and just needs to be included in the dataset (it
            contributes windows but no shot boundaries).
        window_size: Number of frames covered by each window.
        stride: Frame distance between the starts of consecutive windows;
            clip bounds are also rounded to multiples of this value.
        size: Side length, in pixels, that frames are scaled to by the
            transform. Defaults to 128.
        verbose: Accepted for interface compatibility; not read by the
            code visible in this block.
        preload: Stored on the instance as ``self.preload``.
        logits: If False, windows are labeled 0 (no boundary) or 2
            (boundary). If True, the labels are the tuples (1, 0, 0) and
            (0, 0, 1) instead.
        local_path: Stored on the instance as ``self.local_path``.
    """
    self.window_size = window_size
    self.preload = preload
    self.logits = logits
    self.local_path = local_path
    self.storehouse_backend = storehouse.StorageBackend.make_from_config(
        storehouse.StorageConfig.make_posix_config())

    items = set()
    # Frames touched by any window, per video. Not consumed in this block;
    # kept because code outside this view may rely on it.
    frame_nums = {}

    # Boundaries are single-frame intervals at each shot's first frame and
    # at the frame just after its last one. Intervals with payload -1 are
    # not real shots and therefore contribute no boundaries.
    shot_boundaries = shots.map(lambda intrvl: (
        intrvl.start, intrvl.start, intrvl.payload)).set_union(
            shots.map(lambda intrvl: (
                intrvl.end + 1, intrvl.end + 1, intrvl.payload))
        ).coalesce().filter(lambda intrvl: intrvl.payload != -1)

    # Pad every clip by one stride on both sides and round both ends down
    # to a multiple of `stride`. The dilate(1)/coalesce()/dilate(-1)
    # sandwiches merge intervals that touch or overlap.
    clips = shots.dilate(1).coalesce().dilate(-1).map(lambda intrvl: (
        intrvl.start - stride - ((intrvl.start - stride) % stride),
        intrvl.end + stride - ((intrvl.end + stride) % stride),
        intrvl.payload)).dilate(1).coalesce().dilate(-1)

    # Slide a window over every clip; every window starts out labeled as
    # "no boundary".
    items_intrvls = {}
    for video_id in clips.get_allintervals():
        items_intrvls[video_id] = []
        for intrvl in clips.get_intervallist(video_id).get_intervals():
            items_intrvls[video_id] += [
                (f, f + window_size, 0 if not logits else (1, 0, 0))
                for f in range(intrvl.start, intrvl.end - stride, stride)
            ]
    items_col = VideoIntervalCollection(items_intrvls)

    # Relabel the windows selected against the boundaries
    # (presumably during_inv() keeps windows that contain a boundary --
    # TODO confirm against rekall's predicate semantics).
    items_w_boundaries = items_col.filter_against(
        shot_boundaries, predicate=during_inv()).map(lambda intrvl: (
            intrvl.start, intrvl.end, 2 if not logits else (0, 0, 1)))

    # Swap the relabeled windows in for their unlabeled twins.
    items_w_labels = items_col.minus(
        items_w_boundaries, predicate=equal()).set_union(items_w_boundaries)

    for video_id in items_w_labels.get_allintervals():
        path = Video.objects.get(id=video_id).path
        frame_nums[video_id] = set()
        for intrvl in items_w_labels.get_intervallist(
                video_id).get_intervals():
            items.add(
                (video_id, intrvl.start, intrvl.end, intrvl.payload, path))
            frame_nums[video_id].update(range(intrvl.start, intrvl.end))

    # Deterministic order: by video, then window start, then window end.
    self.items = sorted(items,
                        key=lambda item: (item[0], item[1], item[2]))

    # Honor the `size` argument instead of a hard-coded 128x128 scale.
    self.transform = transforms.Compose([
        transforms.ToPILImage(),
        Scale((size, size)),
        ToTensor(1),
        Normalize(get_mean(1), (1, 1, 1))
    ])
window) > POSITIVE_OUTLIER_THRESHOLD_FLOW_MAGNITUDE * np.std( window): positive_boundaries.append(i) if avg_magnitudes[i] - np.mean( window) < NEGATIVE_OUTLIER_THRESHOLD_FLOW_MAGNITUDE * np.std( window): negative_boundaries.append(i) return positive_boundaries, negative_boundaries db = scannerpy.Database() # Load up all manually annotated shots shots_qs = Shot.objects.filter(labeler__name__contains='manual') shots = VideoIntervalCollection.from_django_qs(shots_qs) shot_video_ids = sorted(list(shots.get_allintervals().keys())) #videos = list(Video.objects.filter(ignore_film=False).exclude(id__in=shot_video_ids).order_by('id').all()) videos = list(Video.objects.filter(ignore_film=False).order_by('id').all()) frames = [range(0, video.num_frames) for video in videos] print("Generating weak labels from RGB histograms") output_directory = '/app/data/shot_detection_weak_labels/rgb_hists_high_pre' rgb_hists = st.histograms.compute_histograms( db, videos=[video.for_scannertools() for video in videos], frames=frames) for video, rgb_hist in tqdm(zip(videos, rgb_hists), total=len(videos)): pos_bounds, neg_bounds = color_histogram_shot_labels( rgb_hist.load(), WINDOW_SIZE, POSITIVE_OUTLIER_THRESHOLD_COLOR_HIST, NEGATIVE_OUTLIER_THRESHOLD_COLOR_HIST)
def shot_reverse_shot():
    """Query for shot-reverse-shot style sequences based on face positions.

    Looks for stretches where a tall face in the right part of the frame
    occurs within one second before or after a tall face in the left part
    of the frame, bridges gaps of up to four seconds between such pairs,
    and keeps only the sequences of at least ten seconds.

    Returns:
        The Esper result built from the matching sequences. When
        TIMELINE_OUTPUT is enabled, the left/right face sequences are added
        to the result as extra tracks.
    """
    from query.models import Face
    from rekall.video_interval_collection import VideoIntervalCollection
    from rekall.parsers import in_array, bbox_payload_parser
    from rekall.merge_ops import payload_plus
    from esper.rekall import intrvllists_to_result_bbox
    from rekall.payload_predicates import payload_satisfies
    from rekall.list_predicates import length_at_most
    from rekall.logical_predicates import and_pred, or_pred
    from rekall.spatial_predicates import scene_graph, make_region
    from rekall.temporal_predicates import before, after
    from rekall.bbox_predicates import height_at_least
    from esper.rekall import (intrvllists_to_result,
                              intrvllists_to_result_with_objects,
                              add_intrvllists_to_result)

    # If True, visualize results in a timeline
    TIMELINE_OUTPUT = False

    # The two halves overlap slightly around the middle of the frame.
    RIGHT_HALF_MIN_X = 0.45
    LEFT_HALF_MAX_X = 0.55
    MIN_FACE_HEIGHT = 0.4
    MAX_FACES_ON_SCREEN = 2
    # faces are sampled every 12 frames
    SAMPLING_RATE = 12
    # Durations expressed in frames.
    ONE_SECOND = 24
    FOUR_SECONDS = 96
    TEN_SECONDS = 240

    # Every face row becomes a one-frame interval tagged with its video ID.
    face_rows = Face.objects.annotate(
        min_frame=F('frame__number'),
        max_frame=F('frame__number'),
        video_id=F('frame__video_id'))

    right_half = make_region(RIGHT_HALF_MIN_X, 0.0, 1.0, 1.0)
    left_half = make_region(0.0, 0.0, LEFT_HALF_MAX_X, 1.0)

    # Scene graph with a single node: one face that is tall enough.
    tall_face_graph = {
        'nodes': [{
            'name': 'face',
            'predicates': [height_at_least(MIN_FACE_HEIGHT)]
        }],
        'edges': []
    }

    # One interval per sampled frame; its payload lists that frame's boxes.
    box_parser = in_array(
        bbox_payload_parser(VideoIntervalCollection.django_accessor))
    faces_per_frame = VideoIntervalCollection.from_django_qs(
        face_rows, with_payload=box_parser).coalesce(
            payload_merge_op=payload_plus)

    def sequences_with_face_in(region):
        # Frames with at most MAX_FACES_ON_SCREEN faces and a tall face
        # inside `region`, widened by half the sampling rate and merged.
        few_faces = payload_satisfies(length_at_most(MAX_FACES_ON_SCREEN))
        tall_face_here = payload_satisfies(
            scene_graph(tall_face_graph, region=region))
        matching = faces_per_frame.filter(and_pred(few_faces, tall_face_here))
        return matching.dilate(SAMPLING_RATE / 2).coalesce()

    faces_on_right = sequences_with_face_in(right_half)
    faces_on_left = sequences_with_face_in(left_half)

    # A right-side sequence paired with a left-side sequence that lies up
    # to one second before or after it.
    within_one_second = or_pred(
        before(max_dist=ONE_SECOND), after(max_dist=ONE_SECOND), arity=2)
    paired = faces_on_right.merge(faces_on_left, predicate=within_one_second)

    # Allow four seconds of buffer between neighboring pairs, then keep
    # only what lasts at least ten seconds.
    bridged = paired.dilate(FOUR_SECONDS).coalesce().dilate(-1 * FOUR_SECONDS)
    sequences = bridged.filter_length(min_length=TEN_SECONDS)

    # Post-process to display in Esper widget
    if not TIMELINE_OUTPUT:
        return intrvllists_to_result_with_objects(
            sequences.get_allintervals(), lambda payload, video: [])

    results = intrvllists_to_result(sequences.get_allintervals())
    add_intrvllists_to_result(
        results, faces_on_left.get_allintervals(), color='black')
    add_intrvllists_to_result(
        results, faces_on_right.get_allintervals(), color='green')
    return results
# resnet deepSBD pre-trained on Kinetics deepsbd_resnet_model_no_clipshots = deepsbd_resnet.resnet18(num_classes=3, sample_size=128, sample_duration=16) deepsbd_resnet_model_no_clipshots = deepsbd_resnet_model_no_clipshots.to( device).train() if TRAINING_SET in ["kfolds", "ground_truth"]: # Load DeepSBD datasets for each fold. This is used for testing. deepsbd_datasets_weak_testing = [] if not SAME_VAL_TEST: for fold in folds: shots_in_fold_qs = Shot.objects.filter( labeler__name__contains='manual', video_id__in=fold) shots_in_fold = VideoIntervalCollection.from_django_qs( shots_in_fold_qs) shots_per_fold.append(shots_in_fold) data = movies_deepsbd_data.DeepSBDDataset(shots_in_fold, verbose=True, preload=False, logits=True, local_path=LOCAL_PATH) deepsbd_datasets_weak_testing.append(data) else: with open(VAL_WINDOWS, 'rb') as f: val_windows_by_video_id = pickle.load(f) with open(Y_VAL, 'rb') as f: Y_val = np.load(f) paths = { video_id: Video.objects.get(id=video_id).path
def get_person_intrvlcol(person_list=None, video_ids=None, probability=0.9,
                         face_size=None, stride_face=False, labeler='new',
                         exclude_person=False, granularity='frame',
                         payload_type='shot_id'):
    """Build a VideoIntervalCollection of where the given people appear.

    Args:
        person_list: A single name, an iterable of names, or None. Names are
            lower-cased before matching. None applies no identity filter at
            all (every identity passing the other filters is used).
        video_ids: If given, restrict to faces in these videos.
        probability: Keep only identities with probability strictly greater
            than this value.
        face_size: If given, keep only faces whose bounding-box height
            (bbox_y2 - bbox_y1) is at least this value.
        stride_face: If False, every matching face contributes the interval
            of its shot (shot min_frame to max_frame) and faces without a
            shot are dropped. If True, every matching face contributes a
            single-frame interval, faces on shot-boundary frames are
            dropped, and neighboring intervals are merged by dilating with
            half of a 3-second sample period. This mode forces
            labeler='new' and replaces a 'shot_id' payload by 'frame_id'.
        labeler: 'new' matches labeler names 'face-identity-converted:<name>'
            or 'face-identity:<name>'; any other value matches
            'face-identity-old:<name>'.
        exclude_person: If True, exclude the listed people instead of
            selecting them. Has no effect when person_list is None.
        granularity: 'second' converts the result with
            intrvlcol_frame2second; any other value leaves it in frames.
        payload_type: Name of the annotated column used as interval payload.

    Returns:
        The resulting VideoIntervalCollection.

    Raises:
        ValueError: If person_list is an empty collection.
    """
    # Strided faces only exist under the 'new' labeler naming.
    if stride_face:
        labeler = 'new'

    # Normalize the names. None is kept as "no identity filter".
    if person_list is not None:
        if isinstance(person_list, str):
            person_list = [person_list.lower()]
        else:
            person_list = [p.lower() for p in person_list]
        if not person_list:
            raise ValueError(
                "person_list must name at least one person (or be None)")

    def identity_filter(names):
        # OR together one labeler-name condition per person.
        combined = None
        for name in names:
            if labeler == 'new':
                name_q = (Q(labeler__name='face-identity-converted:' + name)
                          | Q(labeler__name='face-identity:' + name))
            else:
                name_q = Q(labeler__name='face-identity-old:' + name)
            combined = name_q if combined is None else combined | name_q
        return combined

    faceIDs = (FaceIdentity.objects
               .filter(probability__gt=probability)
               .annotate(face_size=F("face__bbox_y2") - F("face__bbox_y1"))
               .annotate(video_id=F("face__frame__video_id")))

    if stride_face:
        faceIDs = faceIDs.filter(face__frame__shot_boundary=False)
    else:
        faceIDs = faceIDs.exclude(face__shot__isnull=True)

    if person_list is not None:
        if exclude_person:
            faceIDs = faceIDs.exclude(identity_filter(person_list))
        else:
            faceIDs = faceIDs.filter(identity_filter(person_list))
    if face_size is not None:
        faceIDs = faceIDs.filter(face_size__gte=face_size)
    if video_ids is not None:
        faceIDs = faceIDs.filter(video_id__in=video_ids)

    if not stride_face:
        # One interval per face, spanning the whole shot the face is in.
        person_intrvllists = qs_to_intrvllists(
            faceIDs.annotate(video_id=F("face__shot__video_id"))
            .annotate(shot_id=F("face__shot_id"))
            .annotate(min_frame=F("face__shot__min_frame"))
            .annotate(max_frame=F("face__shot__max_frame"))
            .annotate(faceID_id=F("identity_id")),
            schema={
                'start': 'min_frame',
                'end': 'max_frame',
                'payload': payload_type
            })
        person_intrvlcol = VideoIntervalCollection(
            person_intrvllists).coalesce()
    else:
        # No shot is attached in this mode, so fall back to the frame number.
        if payload_type == 'shot_id':
            payload_type = 'frame_id'
        person_intrvllists_raw = qs_to_intrvllists(
            faceIDs.annotate(video_id=F("face__frame__video_id"))
            .annotate(frame_id=F("face__frame__number"))
            .annotate(min_frame=F("face__frame__number"))
            .annotate(max_frame=F("face__frame__number") + 1)
            .annotate(faceID_id=F("identity_id")),
            schema={
                'start': 'min_frame',
                'end': 'max_frame',
                'payload': payload_type
            })
        # dilate and coalesce
        SAMPLE_RATE = 3
        person_intrvllists = {}
        for video_id, intrvllist in person_intrvllists_raw.items():
            video = Video.objects.filter(id=video_id)[0]
            # Half a sample period in frames: joins consecutive samples,
            # then the negative dilation restores the outer bounds.
            dilation = int(video.fps * SAMPLE_RATE / 2)
            person_intrvllists[video_id] = intrvllist.dilate(
                dilation).coalesce().dilate(-dilation)
        person_intrvlcol = VideoIntervalCollection(person_intrvllists)

    if granularity == 'second':
        person_intrvlcol = intrvlcol_frame2second(person_intrvlcol)

    # Short label for the log line: first name, plus an ellipsis when
    # several names were given.
    if person_list is None:
        person_label = '<all>'
    elif len(person_list) > 1:
        person_label = person_list[0] + ' ...'
    else:
        person_label = person_list[0]
    print("Get {} intervals for person {}".format(
        count_intervals(person_intrvlcol), person_label))
    return person_intrvlcol