def process_video(item: Tuple[pd.Index, pd.Series],
                  source_dir: Path,
                  facedestination_dir: Path,
                  checkpoint_folder: Path,
                  face_size: int,
                  face_extractor: FaceExtractor,
                  lazycheck: bool = False,
                  deepcheck: bool = False,
                  ) -> Optional[Tuple[pd.DataFrame, Path, List[Tuple[Image.Image, Path]]]]:
    idx, record = item

    # Checkpoint: verify a previous extraction for this video, removing it if corrupt
    video_faces_checkpoint_path = checkpoint_folder.joinpath(record['path']).with_suffix('.faces.pkl')
    if not lazycheck:
        if video_faces_checkpoint_path.exists():
            try:
                df_video_faces = pd.read_pickle(str(video_faces_checkpoint_path))
                for _, r in df_video_faces.iterrows():
                    face_path = facedestination_dir.joinpath(r.name)
                    assert face_path.exists()
                    if deepcheck:
                        img = Image.open(face_path)
                        img_arr = np.asarray(img)
                        assert img_arr.ndim == 3
                        assert np.prod(img_arr.shape) > 0
            except Exception as e:
                print('Error while checking: {}'.format(video_faces_checkpoint_path))
                print(e)
                video_faces_checkpoint_path.unlink()

    if not video_faces_checkpoint_path.exists():
        try:
            video_face_dict_list = []

            # Load faces
            frames = face_extractor.process_video(source_dir.joinpath(record['path']))
            if len(frames) == 0:
                return

            # Keep only the most confident face in each frame
            face_extractor.keep_only_best_face(frames)
            for frame_idx, frame in enumerate(frames):
                frames[frame_idx]['subjects'] = [0] * len(frames[frame_idx]['detections'])

            # Extract and save faces, bounding boxes, keypoints
            images_to_save: List[Tuple[Image.Image, Path]] = []
            for frame_idx, frame in enumerate(frames):
                if len(frames[frame_idx]['detections']):
                    fullframe = Image.fromarray(frames[frame_idx]['frame'])
                    # Preserve the only found face even if not a good one,
                    # otherwise preserve only clusters > -1
                    subjects = np.unique(frames[frame_idx]['subjects'])
                    if len(subjects) > 1:
                        subjects = np.asarray([s for s in subjects if s > -1])
                    for face_idx, _ in enumerate(frame['faces']):
                        subj_id = frames[frame_idx]['subjects'][face_idx]
                        if subj_id in subjects:  # Exclude outliers if other faces are detected
                            face_path = facedestination_dir.joinpath(
                                record['path'],
                                'fr{:03d}_subj{:1d}.jpg'.format(frames[frame_idx]['frame_idx'], subj_id))
                            face_dict = {
                                'facepath': str(face_path.relative_to(facedestination_dir)),
                                'video': idx,
                                'label': record['label'],
                                'videosubject': subj_id
                            }
                            for field_idx, key in enumerate(blazeface.BlazeFace.detection_keys):
                                face_dict[key] = frames[frame_idx]['detections'][face_idx][field_idx]

                            # Adapt the detection bounding box to the requested face size
                            cropping_bb = adapt_bb(frame_height=fullframe.height,
                                                   frame_width=fullframe.width,
                                                   bb_height=face_size,
                                                   bb_width=face_size,
                                                   left=face_dict['xmin'],
                                                   top=face_dict['ymin'],
                                                   right=face_dict['xmax'],
                                                   bottom=face_dict['ymax'])
                            face = fullframe.crop(cropping_bb)

                            # Shift box and keypoint coordinates into the cropped face reference frame
                            for key in blazeface.BlazeFace.detection_keys:
                                if (key[0] == 'k' and key[-1] == 'x') or (key[0] == 'x'):
                                    face_dict[key] -= cropping_bb[0]
                                elif (key[0] == 'k' and key[-1] == 'y') or (key[0] == 'y'):
                                    face_dict[key] -= cropping_bb[1]

                            face_dict['left'] = face_dict.pop('xmin')
                            face_dict['top'] = face_dict.pop('ymin')
                            face_dict['right'] = face_dict.pop('xmax')
                            face_dict['bottom'] = face_dict.pop('ymax')

                            face_path.parent.mkdir(parents=True, exist_ok=True)
                            images_to_save.append((face, face_path))

                            video_face_dict_list.append(face_dict)

            if len(video_face_dict_list) > 0:
                df_video_faces = pd.DataFrame(video_face_dict_list)
                df_video_faces.index = df_video_faces['facepath']
                del df_video_faces['facepath']

                # Type conversions to shrink the checkpoint DataFrame
                for key in ['kp1x', 'kp1y', 'kp2x', 'kp2y', 'kp3x', 'kp3y',
                            'kp4x', 'kp4y', 'kp5x', 'kp5y', 'kp6x', 'kp6y',
                            'left', 'top', 'right', 'bottom']:
                    df_video_faces[key] = df_video_faces[key].astype(np.int16)
                df_video_faces['conf'] = df_video_faces['conf'].astype(np.float32)
                df_video_faces['video'] = df_video_faces['video'].astype('category')

                video_faces_checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
            else:
                print('No faces extracted for video {}'.format(record['path']))
                df_video_faces = pd.DataFrame()

            return df_video_faces, video_faces_checkpoint_path, images_to_save

        except Exception as e:
            print('Error while processing: {}'.format(record['path']))
            print("-" * 60)
            traceback.print_exc(file=sys.stdout, limit=5)
            print("-" * 60)
            return
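
# Minimal usage sketch for process_video (an assumption, not part of the original
# pipeline): the one-row DataFrame, the directory paths, and the saving loop are
# hypothetical placeholders. process_video expects one (index, row) pair from a
# DataFrame whose rows carry at least a 'path' and a 'label' field.
def _example_process_video(face_extractor: FaceExtractor) -> None:
    df_videos = pd.DataFrame({'path': ['subset/video0.mp4'], 'label': [True]})  # hypothetical
    item = next(df_videos.iterrows())
    result = process_video(item,
                           source_dir=Path('source_videos'),       # hypothetical path
                           facedestination_dir=Path('faces'),      # hypothetical path
                           checkpoint_folder=Path('checkpoints'),  # hypothetical path
                           face_size=224,
                           face_extractor=face_extractor)
    if result is not None:  # None on cached checkpoint, empty video, or error
        df_video_faces, video_faces_checkpoint_path, images_to_save = result
        # One plausible way to persist the crops returned alongside the DataFrame
        for face_img, face_path in images_to_save:
            face_img.save(face_path)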
print('Model loaded!')

transf = utils.get_transformer(face_policy, face_size, net.get_normalizer(), train=False)

facedet = BlazeFace().to(device)
facedet.load_weights("blazeface/blazeface.pth")
facedet.load_anchors("blazeface/anchors.npy")
videoreader = VideoReader(verbose=False)
video_read_fn = lambda x: videoreader.read_frames(x, num_frames=frames_per_video)
face_extractor = FaceExtractor(video_read_fn=video_read_fn, facedet=facedet)

vid_real_faces = face_extractor.process_video('samples/490868123550446422495477631417.mp4')
vid_fake_faces = face_extractor.process_video('samples/284649338838012868101332189709.mp4')

# ## Predict scores for each frame

# For each frame, we consider the face with the highest confidence score found by BlazeFace (= frame['faces'][0])
faces_real_t = torch.stack([transf(image=frame['faces'][0])['image']
                            for frame in vid_real_faces if len(frame['faces'])])
faces_fake_t = torch.stack([transf(image=frame['faces'][0])['image']
                            for frame in vid_fake_faces if len(frame['faces'])])
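
# A minimal scoring sketch for the tensors stacked above, mirroring the
# prediction step in run_nb below; it assumes `net`, `device`, and scipy's
# `expit` are already in scope, as they are in that function.
with torch.no_grad():
    faces_real_pred = net(faces_real_t.to(device)).cpu().numpy().flatten()
    faces_fake_pred = net(faces_fake_t.to(device)).cpu().numpy().flatten()
# Scores close to 0 predict REAL, scores close to 1 predict FAKE
print('Average score for REAL video: {:.4f}'.format(expit(faces_real_pred.mean())))
print('Average score for FAKE video: {:.4f}'.format(expit(faces_fake_pred.mean())))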
def run_nb(modelname):
    # ## Parameters

    # In[2]:

    """
    Choose an architecture between
    - EfficientNetB4
    - EfficientNetB4ST
    - EfficientNetAutoAttB4
    - EfficientNetAutoAttB4ST
    - Xception
    """
    net_model = modelname

    """
    Choose a training dataset between
    - DFDC
    - FFPP
    """
    train_db = 'DFDC'

    # In[3]:

    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    face_policy = 'scale'
    face_size = 224
    frames_per_video = 32

    # ## Initialization

    # In[4]:

    print('=' * 20)
    model_url = weights.weight_url['{:s}_{:s}'.format(net_model, train_db)]
    print('=' * 20)
    net = getattr(fornet, net_model)().eval().to(device)
    print('=' * 20)
    net.load_state_dict(load_url(model_url, map_location=device, check_hash=True))

    # In[5]:

    transf = utils.get_transformer(face_policy, face_size, net.get_normalizer(), train=False)

    # In[6]:

    facedet = BlazeFace().to(device)
    facedet.load_weights("../blazeface/blazeface.pth")
    facedet.load_anchors("../blazeface/anchors.npy")
    videoreader = VideoReader(verbose=False)
    video_read_fn = lambda x: videoreader.read_frames(x, num_frames=frames_per_video)
    face_extractor = FaceExtractor(video_read_fn=video_read_fn, facedet=facedet)

    # ## Detect faces

    # In[7]:

    torch.cuda.is_available()

    # In[8]:

    torch.cuda.current_device()

    # In[9]:

    torch.cuda.device(0)

    # In[10]:

    torch.cuda.device_count()

    # In[11]:

    torch.cuda.get_device_name(0)

    # In[12]:

    vid_real_faces = face_extractor.process_video('samples/lynaeydofd.mp4')
    vid_fake_faces = face_extractor.process_video('samples/mqzvfufzoq.mp4')

    # In[13]:

    im_real_face = vid_real_faces[0]['faces'][0]
    im_fake_face = vid_fake_faces[0]['faces'][0]

    # In[14]:

    fig, ax = plt.subplots(1, 2, figsize=(8, 4))
    ax[0].imshow(im_real_face)
    ax[0].set_title('REAL')
    ax[1].imshow(im_fake_face)
    ax[1].set_title('FAKE')

    # ## Predict scores for each frame

    # In[15]:

    # For each frame, we consider the face with the highest confidence score found by BlazeFace (= frame['faces'][0])
    faces_real_t = torch.stack([transf(image=frame['faces'][0])['image']
                                for frame in vid_real_faces if len(frame['faces'])])
    faces_fake_t = torch.stack([transf(image=frame['faces'][0])['image']
                                for frame in vid_fake_faces if len(frame['faces'])])

    with torch.no_grad():
        faces_real_pred = net(faces_real_t.to(device)).cpu().numpy().flatten()
        faces_fake_pred = net(faces_fake_t.to(device)).cpu().numpy().flatten()

    # In[16]:

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    ax[0].stem([f['frame_idx'] for f in vid_real_faces if len(f['faces'])],
               expit(faces_real_pred),
               use_line_collection=True)
    ax[0].set_title('REAL')
    ax[0].set_xlabel('Frame')
    ax[0].set_ylabel('Score')
    ax[0].set_ylim([0, 1])
    ax[0].grid(True)

    ax[1].stem([f['frame_idx'] for f in vid_fake_faces if len(f['faces'])],
               expit(faces_fake_pred),
               use_line_collection=True)
    ax[1].set_title('FAKE')
    ax[1].set_xlabel('Frame')
    ax[1].set_ylabel('Score')
    ax[1].set_ylim([0, 1])
    ax[1].set_yticks([0, 1], ['REAL', 'FAKE'])

    # In[17]:

    """
    Print average scores.
    An average score close to 0 predicts REAL. An average score close to 1 predicts FAKE.
    """
    print('Average score for REAL video: {:.4f}'.format(expit(faces_real_pred.mean())))
    print('Average score for FAKE video: {:.4f}'.format(expit(faces_fake_pred.mean())))
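

# A minimal driver sketch for run_nb (an assumption, not part of the original
# script): it simply runs the pipeline for two of the architectures listed in
# run_nb's docstring; any other name would need an entry in weights.weight_url.
if __name__ == '__main__':
    for modelname in ['EfficientNetB4', 'EfficientNetAutoAttB4']:
        run_nb(modelname)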