class EmotionRecognition(object): def __init__(self, device, gpu_id=0): assert device == 'cpu' or device == 'gpu' if torch.cuda.is_available(): if device == 'cpu': print( '[*]Warning: Your device have GPU, for better performance do EmotionRecognition(device=gpu)' ) self.device = torch.device('cpu') if device == 'gpu': self.device = torch.device(f'cuda:{str(gpu_id)}') else: if device == 'gpu': print( '[*]Warning: No GPU is detected, so cpu is selected as device' ) self.device = torch.device('cpu') if device == 'cpu': self.device = torch.device('cpu') self.network = NetworkV2(in_c=1, nl=32, out_f=7).to(self.device) self.transform = transforms.Compose([ transforms.ToPILImage(), transforms.Resize((48, 48)), transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5]) ]) self.mtcnn = MTCNN(keep_all=True, device=self.device) # print(os.path.join(os.path.dirname(__file__))) C:\Users\mustdur\AppData\Local\Programs\Python\Python37\lib\site-packages\facial_emotion_recognition model_dict = torch.load(os.path.join(os.path.dirname(__file__), 'model', 'model.pkl'), map_location=torch.device('cpu')) # print(f'[*] Accuracy: {model_dict["accuracy"]}') # Accuracy 0.9565809379727686 self.emotions = { 0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Sad', 5: 'Surprise', 6: 'Neutral' } self.network.load_state_dict(model_dict['network']) self.network.eval() def _predict(self, image): tensor = self.transform(image).unsqueeze(0).to(self.device) output = self.network(tensor) ps = torch.exp(output).tolist() index = np.argmax(ps) return self.emotions[index] def recognise_emotion(self, frame): emotions = [] f_h, f_w, c = frame.shape gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY) boxes, _ = self.mtcnn.detect(frame) if boxes is not None: for i in range(len(boxes)): x1, y1, x2, y2 = int(round(boxes[i][0])), int( round(boxes[i][1])), int(round(boxes[i][2])), int( round(boxes[i][3])) emotion = self._predict(gray[y1:y2, x1:x2]) emotions.append(emotion) frame = cv.rectangle(frame, (x1, y1), (x2, y2), color=[0, 255, 0], thickness=1) frame = cv.rectangle(frame, (x1, y1 - int(f_h * 0.03125)), (x1 + int(f_w * 0.125), y1), color=[0, 255, 0], thickness=-1) frame = cv.putText(frame, text=emotion, org=(x1 + 5, y1 - 3), fontFace=cv.FONT_HERSHEY_PLAIN, color=[0, 0, 0], fontScale=1, thickness=1) return frame, emotions, boxes else: #print('No face detected') return frame, emotions, boxes def recognise_emotion_fast(self, frame): emotions = [] gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY) boxes, _ = self.mtcnn.detect(frame) if boxes is not None: for i in range(len(boxes)): x1, y1, x2, y2 = int(round(boxes[i][0])), int( round(boxes[i][1])), int(round(boxes[i][2])), int( round(boxes[i][3])) emotion = self._predict(gray[y1:y2, x1:x2]) emotions.append(emotion) return frame, emotions, boxes else: #print('No face detected') return frame, emotions, boxes
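A minimal usage sketch for the EmotionRecognition class above, assuming the class is importable in the current scope and a local webcam is available; the window name and loop structure are illustrative only, not part of the original package.

import cv2 as cv

er = EmotionRecognition(device='gpu', gpu_id=0)
cam = cv.VideoCapture(0)
while True:
    ok, frame = cam.read()
    if not ok:
        break
    # recognise_emotion() returns the annotated frame plus labels and boxes
    frame, emotions, boxes = er.recognise_emotion(frame)
    cv.imshow('emotions', frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break
cam.release()
cv.destroyAllWindows()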
class MTCNN_extractor(Face_extractor): def __init__(self, down_sample=2, batch_size=30, my_device=device, keep_empty=False, factor=0.709, size_range=None, prob_limit=None, same_bbox_size=False, scale=1.2): super().__init__() self.prob_limit = prob_limit self.size_range = size_range self.extractor = MTCNN(keep_all=True, device=my_device, min_face_size=80 // down_sample, factor=factor).eval() self.down_sample = down_sample self.batch_size = batch_size self.keep_empty = keep_empty self.same_bbox_size = same_bbox_size self.scale = scale def _get(self, images): ret = [] for start in range(0, len(images), self.batch_size): ret.extend(self._limited_get(images[start:start + self.batch_size])) if self.size_range is not None: ret = self._filter(ret) return ret def _limited_get(self, images): h, w = images.shape[1:3] if h * w < 1280 * 720: down_sample = max(1, self.down_sample // 2) elif h * w >= 1280 * 720 * 4: down_sample = self.down_sample * 2 else: down_sample = self.down_sample pils = [ Image.fromarray(img).resize((w // down_sample, h // down_sample)) for img in images ] bboxes, probs = self.extractor.detect(pils) clean_bboxes, clean_probs = [], [] for boxes, prob in zip(bboxes, probs): if boxes is not None: rets = sorted([(p, box) for box, p in zip(boxes, prob)], key=lambda x: x[0]) if len(rets) >= 2: rets = rets[ -1:] if rets[-1][0] - rets[-2][0] > 0.05 else rets[-2:] clean_bboxes.append(np.array([box for p, box in rets])) clean_probs.append(np.array([p for p, box in rets])) else: clean_bboxes.append([]) clean_probs.append([]) bsize = sorted([ max(box[2] - box[0], box[3] - box[1]) * self.scale for boxes in clean_bboxes for box in boxes * down_sample ]) if len(bsize) > 0: bsize = int(bsize[-len(bsize) // 4]) # -1//4 = -1 ret = [] for boxes, img, prob, idx in zip(clean_bboxes, images, clean_probs, range(len(clean_probs))): faceInfo = [] if boxes is not None and len(boxes) > 0: min_size = bsize if self.same_bbox_size else None max_size = bsize if self.same_bbox_size else None faceInfo = [ FaceInfo(face=self._rectang_crop(img, box, self.scale, min_size, max_size), box=self._get_boundingbox(box, w, h, self.scale, min_size, max_size), prob=p, frame=idx) for box, p in zip(boxes * down_sample, prob) ] faceInfo = sorted(faceInfo, key=lambda x: -x.prob) elif not self.keep_empty: continue ret.append(faceInfo) return ret def _filter(self, ret): new_ret = [] for frames in ret: ret_frame = [] for face in frames: size = (face.box[2] - face.box[0]) * (face.box[3] - face.box[1]) if self.size_range[0] < size < self.size_range[ 1] and face.prob > self.prob_limit: ret_frame.append(face) new_ret.append(ret_frame) return new_ret
def input_face_embeddings(frames: Union[List[str], np.ndarray], is_path: bool, mtcnn: MTCNN, resnet: InceptionResnetV1, face_embed_cuda: bool, use_half: bool, coord: List, name: str = None, save_frames: bool = False) -> torch.Tensor: """ Get the face embedding NOTE: If a face is not detected by the detector, instead of throwing an error it zeros the input for embedder. NOTE: Memory hungry function, hence the profiler. Args: frames: Frames from the video is_path: Whether to read from filesystem or memory mtcnn: face detector resnet: face embedder face_embed_cuda: use cuda for model use_half: use half precision Returns: emb: Embedding for all input frames """ if face_embed_cuda: device = torch.device("cuda:0") else: device = torch.device("cpu") result_cropped_tensors = [] no_face_indices = [] for i, f in enumerate(frames): if is_path: frame = Image.open(f) else: frame = Image.fromarray(f.astype("uint8")) with torch.no_grad(): cropped_tensors = None height, width, c = f.shape bounding_box, prob = mtcnn.detect(frame) if bounding_box is not None: for box in bounding_box: x1, y1, x2, y2 = box if (x1 > x2): x1, x2 = x2, x1 if (y1 > y2): y1, y2 = y2, y1 #for point in coord: x, y = coord[0], coord[1] x *= width y *= height if (x >= x1 and y >= y1 and x <= x2 and y <= y2): cropped_tensors = extract_face(frame, box) #print("found", box, x, y, end='\r') break if cropped_tensors is None: #Face not detected, for some reason cropped_tensors = torch.zeros((3, 160, 160)) no_face_indices.append(i) if save_frames: name = name.replace(".mp4", "") saveimg = cropped_tensors.detach().cpu().numpy().astype("uint8") saveimg = np.squeeze(saveimg.transpose(1, 2, 0)) Image.fromarray(saveimg).save(f"{name}_{i}.png") result_cropped_tensors.append(cropped_tensors.to(device)) if len(no_face_indices) > 20: #few videos start with silence, allow 0.5 seconds of silence else remove return None del frames #Stack all frames result_cropped_tensors = torch.stack(result_cropped_tensors) #Embed all frames result_cropped_tensors = result_cropped_tensors.to(device) if use_half: result_cropped_tensors = result_cropped_tensors.half() with torch.no_grad(): emb = resnet(result_cropped_tensors) if use_half: emb = emb.float() return emb.to(cpu_device)
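A hedged calling sketch for input_face_embeddings(), not taken from the original source: it assumes `frames` is a list of HxWx3 uint8 numpy arrays, that `coord` is the normalised (x, y) position the function checks against each box, and that `cpu_device` is defined elsewhere in the module (the function returns onto it).

import torch
from facenet_pytorch import MTCNN, InceptionResnetV1

use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
mtcnn = MTCNN(keep_all=True, device=device)
resnet = InceptionResnetV1(pretrained='vggface2').eval().to(device)

emb = input_face_embeddings(frames, is_path=False, mtcnn=mtcnn, resnet=resnet,
                            face_embed_cuda=use_cuda, use_half=False,
                            coord=[0.5, 0.5], name='clip.mp4', save_frames=False)
if emb is not None:
    print(emb.shape)  # one 512-d embedding per input frame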
class EmotionRecognition(object): def __init__(self, model_path, device='cpu', gpu_id=0): assert device == 'cpu' or device == 'gpu',"Need to specify device! ('cpu' or 'gpu')" # Set the device according to arguments and what is available if torch.cuda.is_available(): if device == 'cpu': logging.warning('Your machine has a GPU. Performance would be better with EmotionRecognition(device=gpu)!') self.device = torch.device('cpu') if device == 'gpu': self.device = torch.device(f'cuda:{str(gpu_id)}') else: if device == 'gpu': logging.warning('No GPU is detected, so cpu is selected as device!') self.device = torch.device('cpu') if device == 'cpu': self.device = torch.device('cpu') self.emotions = FERPlus.classes #{0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Sad', 5: 'Surprise', 6: 'Neutral'} # Model used for emotion recognition (from cropped face images) self.network = NetworkBasic(in_c=1, nl=32, out_f=len(self.emotions)).to(self.device) # Load the saved state model_dict = torch.load(model_path, map_location=self.device) self.network.load_state_dict(model_dict) self.network.eval() # Normalization self.transform = transforms.Compose([ transforms.ToPILImage(), transforms.Grayscale(num_output_channels=1), transforms.Resize((48, 48)), transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5]) ]) # Model used to detect faces in the video stream self.mtcnn = MTCNN(keep_all=True, device=self.device) def _predict(self, image): """Given an image of a face, return the primary emotion shown in the face.""" tensor = self.transform(image).unsqueeze(0).to(self.device) output = self.network(tensor) ps = torch.exp(output).tolist() index = np.argmax(ps) score = np.amax(output.detach().numpy()) return self.emotions[index], score def run_on_face(self, face): gray = cv.cvtColor(face, cv.COLOR_BGR2GRAY) emotion, score = self._predict(gray) return {'emotion': emotion, 'score': score} def run(self, frame): """Perform emotion recognition on a single frame and return the results. Different from show_emotions(), this method does not return a modified frame.""" f_h, f_w, c = frame.shape gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY) boxes, _ = self.mtcnn.detect(frame) results = [] if boxes is not None: for i in range(len(boxes)): x1, y1, x2, y2 = int(round(boxes[i][0])), int(round(boxes[i][1])), int(round(boxes[i][2])), int( round(boxes[i][3])) emotion, score = self._predict(gray[y1:y2, x1:x2]) results.append( {'emotion': emotion, 'score': score, 'position': (x1, y1, x2, y2)} ) return results def show(self, frame, return_type='BGR'): """Perform emotion recognition on a single frame and show the result by returning a modified frame. The returned frame has a bounding box around all detected faces plus the names of the detected emotions.""" f_h, f_w, c = frame.shape detection = self.recognize(frame) for result in detection: x1, y1, x2, y2 = result['position'] emotion = result['emotion'] score = result['score'] frame = cv.rectangle(frame, (x1, y1), (x2, y2), color=[0, 255, 0], thickness=1) frame = cv.rectangle(frame, (x1, y1 - int(f_h*0.03125)), (x1 + int(f_w*0.21), y1), color=[0, 255, 0], thickness=-1) frame = cv.putText(frame, text=emotion+' (%0.2f)'%score, org=(x1 + 5, y1 - 3), fontFace=cv.FONT_HERSHEY_PLAIN, color=[0, 0, 0], fontScale=1, thickness=1) if return_type == 'BGR': return frame if return_type == 'RGB': return cv.cvtColor(frame, cv.COLOR_BGR2RGB) else: raise Exception("Unknown return_type!")
class VideoTracker(object): def __init__(self, args): print('Initialize DeepSORT & YOLO-V5') # ***************** Initialize ****************************************************** self.args = args self.scale = args.scale # 2 self.margin_ratio = args.margin_ratio # 0.2 self.frame_interval = args.frame_interval # frequency self.device = torch.device( 'cuda:0' if torch.cuda.is_available() else 'cpu') self.half = self.device.type != 'cpu' # half precision only supported on CUDA # create video capture **************** if args.display: cv2.namedWindow("test", cv2.WINDOW_NORMAL) cv2.resizeWindow("test", args.display_width, args.display_height) if args.cam != -1: print("Using webcam " + str(args.cam)) self.vdo = cv2.VideoCapture(args.cam) else: self.vdo = cv2.VideoCapture() # ***************************** initialize DeepSORT ********************************** cfg = get_config() cfg.merge_from_file(args.config_deepsort) use_cuda = self.device.type != 'cpu' and torch.cuda.is_available() self.deepsort = build_tracker(cfg, use_cuda=use_cuda) # ***************************** initialize Face Det ********************************** self.face_detector = MTCNN(keep_all=True, device=self.device) print('Done..') if self.device == 'cpu': warnings.warn("Running in cpu mode which maybe very slow!", UserWarning) def __enter__(self): # ************************* Load video from camera ************************* if self.args.cam != -1: print('Camera ...') ret, frame = self.vdo.read() assert ret, "Error: Camera error" self.im_width = int(self.vdo.get(cv2.CAP_PROP_FRAME_WIDTH)) self.im_height = int(self.vdo.get(cv2.CAP_PROP_FRAME_HEIGHT)) # ************************* Load video from file ************************* else: assert os.path.isfile(self.args.input_path), "Path error" self.vdo.open(self.args.input_path) self.im_width = int(self.vdo.get(cv2.CAP_PROP_FRAME_WIDTH)) self.im_height = int(self.vdo.get(cv2.CAP_PROP_FRAME_HEIGHT)) assert self.vdo.isOpened() print('Done. Load video file ', self.args.input_path) # ************************* create output ************************* if self.args.save_path: os.makedirs(self.args.save_path, exist_ok=True) # path of saved video and results self.save_video_path = os.path.join(self.args.save_path, "results.mp4") # create video writer fourcc = cv2.VideoWriter_fourcc(*self.args.fourcc) self.writer = cv2.VideoWriter(self.save_video_path, fourcc, self.vdo.get(cv2.CAP_PROP_FPS), (self.im_width, self.im_height)) print('Done. Create output file ', self.save_video_path) if self.args.save_txt: os.makedirs(self.args.save_txt, exist_ok=True) return self def __exit__(self, exc_type, exc_value, exc_traceback): self.vdo.release() self.writer.release() if exc_type: print(exc_type, exc_value, exc_traceback) def run(self): yolo_time, sort_time, avg_fps = [], [], [] t_start = time.time() idx_frame = 0 last_out = None while self.vdo.grab(): # Inference ********************************************************************* t0 = time.time() _, img0 = self.vdo.retrieve() if idx_frame % self.args.frame_interval == 0: outputs, yt, st = self.image_track( img0) # (#ID, 5) x1,y1,x2,y2,id last_out = outputs yolo_time.append(yt) sort_time.append(st) print('Frame %d Done. 
Det-time:(%.3fs) SORT-time:(%.3fs)' % (idx_frame, yt, st)) else: outputs = last_out # directly use prediction in last frames t1 = time.time() avg_fps.append(t1 - t0) # post-processing *************************************************************** # visualize bbox ******************************** if len(outputs) > 0: bbox_xyxy = outputs[:, :4] identities = outputs[:, -1] img0 = draw_boxes(img0, bbox_xyxy, identities) # BGR # add FPS information on output video text_scale = max(1, img0.shape[1] // 1600) cv2.putText(img0, 'frame: %d fps: %.2f ' % (idx_frame, len(avg_fps) / sum(avg_fps)), (20, 20 + text_scale), cv2.FONT_HERSHEY_PLAIN, text_scale, (0, 0, 255), thickness=2) # display on window ****************************** if self.args.display: cv2.imshow("test", img0) if cv2.waitKey(1) == ord('q'): # q to quit cv2.destroyAllWindows() break # save to video file ***************************** if self.args.save_path: self.writer.write(img0) if self.args.save_txt: with open( self.args.save_txt + str(idx_frame).zfill(4) + '.txt', 'a') as f: for i in range(len(outputs)): x1, y1, x2, y2, idx = outputs[i] f.write('{}\t{}\t{}\t{}\t{}\n'.format( x1, y1, x2, y2, idx)) idx_frame += 1 print( 'Avg Det time (%.3fs), Sort time (%.3fs) per frame' % (sum(yolo_time) / len(yolo_time), sum(sort_time) / len(sort_time))) t_end = time.time() print('Total time (%.3fs), Total Frame: %d' % (t_end - t_start, idx_frame)) def image_track(self, im0): """ :param im0: original image, BGR format cv2 :return: """ # preprocess ************************************************************ h, w, _ = im0.shape img = cv2.resize( im0, (w // self.scale, h // self.scale)) # down sample to speed up img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # # Detection time ********************************************************* # Inference t1 = time.time() with torch.no_grad(): boxes, confs = self.face_detector.detect(img) # boxes: (#obj, 4) x1,y1,x2,y2 in img scale ! # confs: () t2 = time.time() # get all obj ************************************************************ if boxes is not None and len(boxes): boxes = boxes * self.scale # x1,y1,x2,y2 go back to original image bbox_xywh = xyxy2xywh(boxes) # (#obj, 4) xc,yc,w,h # add margin here. only need to revise width and height bbox_xywh[:, 2:] = bbox_xywh[:, 2:] * (1 + self.margin_ratio) # ****************************** deepsort **************************** outputs = self.deepsort.update(bbox_xywh, confs, im0) # (#ID, 5) x1,y1,x2,y2,track_ID else: outputs = torch.zeros((0, 5)) t3 = time.time() return outputs, t2 - t1, t3 - t2
        # cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)
        # Draw the five facial landmarks
        cv2.circle(img, tuple(ld[0]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[1]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[2]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[3]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[4]), 10, (0, 0, 255), 8)
    return img


path = glob.glob("D:/Image Dataset/val/not_aryan/*.*")
save_path = "D:/Python Files/Face_Detection/New_Image/val/not_aryan"
mtcnn = MTCNN()

for count, file in enumerate(path):
    img = cv2.imread(file)
    # new_image = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    try:
        boxes, probs, landmarks = mtcnn.detect(img, landmarks=True)
        final_image = _draw(img, boxes, probs, landmarks)
    except Exception:
        # Skip images where detection or drawing fails so `final_image`
        # is never written before it has been assigned.
        continue
    cv2.imwrite(os.path.join(save_path, str(count) + "new.jpg"), final_image)

print("Done")
class VideoToFrames: """ Transforms input video files into image frames. Additionally detects all faces for each frame. """ def __init__(self, num_frames_per_video=3, face_additional_area=0.5): self.num_frames_per_video = num_frames_per_video self.face_additional_area = face_additional_area self.face_detection_model = MTCNN( image_size=224, margin=0, keep_all=True, select_largest=False, post_process=False, thresholds=[0.8, 0.9, 0.9], device="cuda", ).eval() def video_to_frames(self, video_path, output_path): output_image_paths = [] video_id = os.path.split(video_path)[-1] # Read video orig_capture = cv2.VideoCapture(video_path) # Select only self.num_frames_per_video uniform frames n_frames = int(orig_capture.get(cv2.CAP_PROP_FRAME_COUNT)) frames_idx = np.linspace(0, n_frames, self.num_frames_per_video, endpoint=False, dtype=np.int) # Loop through all frames for frame_num in range(n_frames): ret = orig_capture.grab() if not ret: continue # Retrieve only required frames if frame_num in frames_idx: ret, frame_orig = orig_capture.retrieve() if ret: # Save the whole video frame to the image # img_path = os.path.join(output_path, f"{video_id}_frame_{frame_num}.png") # cv2.imwrite(frame_orig, img_path) # output_image_paths.append(os.path.split(img_path)[-1]) # Skip the next part if want to save the whole frame only frame_orig = cv2.cvtColor(frame_orig, cv2.COLOR_BGR2RGB) # Detect all faces faces, _ = self.face_detection_model.detect(frame_orig) if faces is None: return [] # For each detected face for face_id, box in enumerate(faces): # Get face coordinates c0_start, c0_end, c1_start, c1_end = self.get_face_coordinates( frame_orig, box) # Crop face face_full = frame_orig[c0_start:c0_end, c1_start:c1_end] # Save face to the file img_path = os.path.join( output_path, f"{video_id}_frame_{frame_num}_face_{face_id}.png", ) # Return BGR before saving face_full = cv2.cvtColor(face_full, cv2.COLOR_RGB2BGR) cv2.imwrite(img_path, face_full) output_image_paths.append(os.path.split(img_path)[-1]) return output_image_paths def get_face_coordinates(self, frame_orig, box): sh0_start = int(box[1]) sh0_end = int(box[3]) sh1_start = int(box[0]) sh1_end = int(box[2]) # Add area around the face d0 = int((sh0_end - sh0_start) * self.face_additional_area) d1 = int((sh1_end - sh1_start) * self.face_additional_area) c0_start = max(sh0_start - d0, 0) c0_end = min(sh0_end + d0, frame_orig.shape[0]) c1_start = max(sh1_start - d1, 0) c1_end = min(sh1_end + d1, frame_orig.shape[1]) return c0_start, c0_end, c1_start, c1_end
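An illustrative call of the VideoToFrames helper above (the paths are hypothetical, and a CUDA device is required because the detector is built with device="cuda" in __init__):

converter = VideoToFrames(num_frames_per_video=3, face_additional_area=0.5)
saved = converter.video_to_frames("/data/videos/sample.mp4", "/data/frames")
print(saved)  # e.g. ['sample.mp4_frame_0_face_0.png', ...]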
import os
import sys

import cv2
import numpy as np
import torch
import torchvision.transforms as transforms
from facenet_pytorch import MTCNN, InceptionResnetV1
from tqdm import tqdm_notebook as tqdm

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
mtcnn = MTCNN(device=device)
model = InceptionResnetV1(pretrained='vggface2').eval()

img = cv2.imread('./test.jpg')
if img is None:
    print("Img Err")
    sys.exit()

s_face = 0
faces, _ = mtcnn.detect(img)
model.classify = True
try:
    for face in faces:
        # Bounding box coordinates must be integers for slicing and drawing
        x1, y1, x2, y2 = [int(v) for v in np.trunc(face)]
        s_face = img[y1:y2, x1:x2]
        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 3)
        i_c = mtcnn(s_face)
        emb = model(i_c.unsqueeze(0))
        print(emb)
except Exception as e:
    print(e)
class FaceCam():
    # Video class based on openCV
    def __init__(self):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.mtcnn = MTCNN(device=self.device)
        self.open = True
        self.gender_model = def_model('gender', self.device)
        self.gaze_model = def_model('gaze', self.device)
        self.emotion_model = def_model('emotion', self.device)
        self.multimodal_model = def_model('multimodal', self.device)

    def rec(self):
        global label
        cap = cv2.VideoCapture(0)
        while self.open:
            timer_start = time.time()
            print('start camera!')
            ret, frame = cap.read()
            try:
                # detect face box and probability
                boxes, probs = self.mtcnn.detect(frame, landmarks=False)
                # draw box on frame
                frame = draw_bbox(frame, boxes, probs)
                # perform only when face is detected
                if len(boxes) > 0:
                    # extract the face rois
                    rois = detect_rois(boxes)
                    for roi in rois:
                        (start_Y, end_Y, start_X, end_X) = roi
                        face = frame[start_Y:end_Y, start_X:end_X]
                        print('detect time: ', time.time() - timer_start)
                        predict_start = time.time()
                        gender_i = predict(self.gender_model, face, self.device)
                        gaze_i = predict(self.gaze_model, face, self.device)
                        emotion_i = predict(self.emotion_model, face, self.device)
                        multimodal_i = predict(self.multimodal_model, face, self.device)
                        cv2.putText(frame, label['gender'][gender_i], (end_X - 50, start_Y - 55),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
                        cv2.putText(frame, label['gaze'][gaze_i], (end_X - 50, start_Y - 40),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
                        cv2.putText(frame, label['emotion'][emotion_i], (end_X - 50, start_Y - 25),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
                        cv2.putText(frame, label['multimodal'][multimodal_i], (end_X - 50, start_Y - 10),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2, cv2.LINE_AA)
                        print('predict time: ', time.time() - predict_start)
            except Exception as e:
                print(e)
            # show the frame
            cv2.imshow('Demo', frame)
            # q to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                print('Interrupted by user!')
                break
        # clear program and close windows
        cap.release()
        cv2.destroyAllWindows()
        print('All done!')
class FaceNet: '''Face Net ''' def __init__(self, mtcnn=dict(), resnet=dict(), threshold=1, device='cpu', data=None): # default arguments default_mtcnn = dict( image_size=160, margin=80, min_face_size=20, thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=False, keep_all=True, device=device, ) default_resnet = dict(pretrained='vggface2', device=device) default_mtcnn.update(mtcnn) default_resnet.update(resnet) data = data or collections.defaultdict(list) # assign values self._kwargs = dict(mtcnn=mtcnn, resnet=resnet, threshold=threshold, device=device, data=data) self._mtcnn = MTCNN(**default_mtcnn) self._resnet = InceptionResnetV1(**default_resnet).eval() def add_image(self, image, label): for embedding in self._embed(image): self._kwargs['data'][label].append(embedding) def add_images_from_folder(self, root, progress_bar=True): dataset = datasets.ImageFolder(root) idx_to_class = {v: k for k, v in dataset.class_to_idx.items()} for image, idx in (tqdm.tqdm(dataset) if progress_bar else dataset): self.add_image(image, idx_to_class[idx]) return self def image_to_labels(self, image_or_path, key=None, crop=True): '''返回图片人脸的标签 ''' key = key or (lambda x: sum(x)/len(x)) result = list() embeddings = self._embed(self.imread(image_or_path), crop=crop) for embedding in embeddings: distances = {k: key(v) for k, v in self._distances(embedding).items()} label = min(distances, key=lambda x: distances[x]) result.append(label if distances[label]<self._kwargs['threshold'] else None) return result def image_to_image(self, image_or_path, mark=True, font=5, size=1, thickness=1, offset=(5, 5), color=(255, 0, 0)): '''返回人脸标注的图片 Argument: - image_or_path: [str, numpy.ndarray] - mark: bool - font: int, default is cv2.FONT_HERSHEY_COMPLEX_SMALL - size: float - thickness: float - offset: Tuple[float] - color: Tuple[int] ''' image = self.imread(image_or_path) boxes, _ = self._mtcnn.detect(image, landmarks=False) if isinstance(boxes, numpy.ndarray): for box in boxes.astype(numpy.int): image = cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), color) if mark: crop = image[box[1]: box[3], box[0]: box[2], :] try: label, = self.image_to_labels(crop, crop=False) except: label = 'ERROR' coord = tuple(int(box[i]-offset[i]) for i in range(2)) image = cv2.putText(image, label or 'other', coord, font, size, color, thickness) return image def image_to_crops(self, image_or_path): ''' Argument: - image_or_path: [str, numpy.ndarray] ''' result = list() image = self.imread(image_or_path) boxes, _ = self._mtcnn.detect(image, landmarks=False) if isinstance(boxes, numpy.ndarray): for box in boxes.astype(numpy.int): result.append(image[box[1]: box[3], box[0]: box[2], :]) return result def save(self, path): with open(path, 'wb') as f: torch.save(self._kwargs, f) @classmethod def load(cls, path, **kwargs): with open(path, 'rb') as f: data = torch.load(f) data.update(kwargs) return cls(**data) @classmethod def imread(cls, image_or_path): if isinstance(image_or_path, str): return cv2.imread(image_or_path)[:, :, ::-1].copy() elif isinstance(image_or_path, numpy.ndarray): return image_or_path else: raise NotImplementedError def _embed(self, image, crop=True): # __import__('IPython').embed(colors='Linux') if crop: faces = self._mtcnn(image) if faces is None: return numpy.array(tuple()) else: face = cv2.resize(image, (self._mtcnn.image_size, self._mtcnn.image_size)) faces = (torch.Tensor(face.transpose(2, 1, 0)), ) faces = faces if self._kwargs['mtcnn'].get('keep_all', True) else (faces, ) return 
self._resnet(torch.stack(faces).to(self._kwargs['device'])).detach().cpu() def _distances(self, embedding, **kwargs): return { k: tuple((embedding-v).norm(**kwargs) for v in vs) for k, vs in self._kwargs['data'].items() }
def detect_face(tensor_image_stack):
    # Build the detector and run batched detection on the stacked images
    mtcnn = MTCNN(image_size=160, margin=0, min_face_size=20,
                  thresholds=[0.6, 0.7, 0.7], factor=0.709,
                  post_process=False, device='cuda')
    return mtcnn.detect(tensor_image_stack)
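Constructing MTCNN inside detect_face() reloads the network weights on every call. A sketch of the same helper with the detector created once; `detect_face_cached` and `_mtcnn` are names introduced here for illustration, with the parameter values kept from the snippet above.

_mtcnn = MTCNN(image_size=160, margin=0, min_face_size=20,
               thresholds=[0.6, 0.7, 0.7], factor=0.709,
               post_process=False, device='cuda')

def detect_face_cached(tensor_image_stack):
    # Reuse the module-level detector instead of rebuilding it per call
    return _mtcnn.detect(tensor_image_stack)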
while True:
    success, frame = cap.read()
    if success:
        height, width, _ = frame.shape  # cv2 frames are (height, width, channels)
        # if the video is too big uncomment the below code
        # frame = resize(frame, height, width)
        # padding the image to avoid the bounding box going out of the image
        # and crashing the program
        padding = cv.copyMakeBorder(frame, 50, 50, 50, 50, cv.BORDER_CONSTANT)
        # converting numpy array into image
        image = Image.fromarray(padding)
        # gives the face co-ordinates
        face_coord, _ = mtcnn.detect(image)
        if face_coord is not None:
            for coord in face_coord:
                x1, y1, x2, y2 = r(coord[0]), r(coord[1]), r(coord[2]), r(coord[3])
                # face array
                face = padding[y1:y2, x1:x2]
                # Preprocessing
                preprocess = Preprocessing(img=Image.fromarray(face))
                # tensor array
                tensor_img_array = preprocess.preprocessed_arrays()
                # Predicting
boxes = []
tocni = 0
ukupno_pronadenih = 0
stvarna_kolicina = len(new_dataset)
for i in range(0, len(new_dataset), 1):
    # Accumulate ground-truth boxes for consecutive samples of the same frame
    if (i + 1 < len(new_dataset)) and new_dataset[i]['name'] == new_dataset[i + 1]['name']:
        boxes += [new_dataset[i]['box_frame'].numpy()]
        # print("adding to box")
        continue
    boxes += [new_dataset[i]['box_frame'].numpy()]
    pixels = new_dataset[i]['image'].transpose(0, 1).transpose(1, 2).numpy()
    pixels = pixels * new_dataset[i]['image'].size()[2]
    faces = detector.detect(pixels)
    max_iou = [0] * len(boxes)
    if faces[0] is None:
        boxes = []
        continue
    ukupno_pronadenih += len(faces[0])
    for j in range(len(boxes)):
        found = False
        for k in range(len(faces[0])):
            pxmin, pymin, pxmax, pymax = faces[0][k]
            xmin, ymin, xmax, ymax = boxes[j]
            interxmin = max(xmin, pxmin)
            interymin = max(ymin, pymin)
            interxmax = min(xmax, pxmax)
class Check_In_Window(QMainWindow): def __init__(self): super(Check_In_Window, self).__init__() loadUi("Check_In_Window.ui", self) self.mtcnn = MTCNN(select_largest=True, device='cuda') # some constants kept as default from facenet self.input_image_size = 160 self.sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto( log_device_placement=True)) pre_trained_facenet.load_model('model/20170512-110547.pb') self.images_placeholder = tf.get_default_graph().get_tensor_by_name( "input:0") self.embeddings = tf.get_default_graph().get_tensor_by_name( "embeddings:0") self.phase_train_placeholder = tf.get_default_graph( ).get_tensor_by_name("phase_train:0") self.embedding_size = self.embeddings.get_shape()[1] self.startVideo('0') def startVideo(self, camera_name): """ :param camera_name: link of camera or usb camera :return: """ if len(camera_name) == 1: self.capture = cv2.VideoCapture(int(camera_name)) else: self.capture = cv2.VideoCapture(camera_name) self.timer = QTimer(self) # Create Timer # path = './Face_Recognition/images' path = '/home/ftpuser/ftp/files/' if not os.path.exists(path): os.mkdir(path) # known face encoding and known face name list self.images = [] self.class_names = [] self.faces = [] attendance_list = os.listdir(path) self.attendance_num = len(attendance_list) for cl in attendance_list: cur_img = cv2.imread(f'{path}/{cl}') print(cur_img) self.images.append(cur_img) # print('image',cur_img) # cur_img = cv2.resize(cur_img, (504,378)) faces_detected = 0 start = time.time() # result = self.detector.detect_faces(cur_img) box = self.mtcnn.detect(cur_img, True) faces_detected += len(box) print(box) print(f'Frames per second: {(time.time() - start):.3f},', f'faces detected: {faces_detected}\r') face = self.getFace(cur_img, box) self.faces.append(face) self.class_names.append(os.path.splitext(cl)[0]) self.timer.timeout.connect( self.update_frame) # Connect timeout to the output function self.timer.start(10) # emit the timeout() signal at x=10ms def face_rec_(self, frame): """ :param frame: frame from camera :param encode_list_known: known face encoding :param class_names: known face names :return: """ box = self.mtcnn.detect(frame, True) # print(box) # print(box[0]) # print('길이',len(box[0])) if box[0] is None: print('없') else: print('heeeeeeeeeeeeeeeeeeeeeeee') print(box) print(len(self.images), len(self.faces), len(self.class_names)) for f, c in zip(self.faces, self.class_names): print('for loop') distance = self.compare2face(f, frame, box) print('여기까진?') threshold = 0.7 # set yourself to meet your requirement print("distance = " + str(distance), ' 사진번호: ', c) name = 'unknonw' if (distance <= threshold): name = c print(name) print("distance = " + str(distance), ' 인덱: ', c) self.mark_attendance(name) return frame def mark_attendance(self, name): """ :param name: detected face known or unknown one :return: """ if name != 'unknonw': print(name) self.logIn(name) def logIn(self, name): customer_id = int(name) DB = DB_Connection() cnt = DB.select_user(customer_id) if cnt[0] == "False": greeting = ' Welcome, ' + cnt[1] + '.' 
print(name, '님이 입장하셨습니다.') DB.update_login_session_T(customer_id) DB.insert_check_In_Time(customer_id) self.GreetingLabel.setText(greeting) self.timer1 = QTimer(self) self.timer1.start(5000) self.timer1.timeout.connect(self.clearLabel) def clearLabel(self): self.GreetingLabel.clear() def update_frame(self): path = '/home/ftpuser/ftp/files/' new_attendance_list = os.listdir(path) image_num = len(new_attendance_list) ret, image = self.capture.read() if image_num == self.attendance_num: self.displayImage(image) else: for cl in new_attendance_list: if os.path.splitext(cl)[0] not in self.class_names: cur_img = cv2.imread(f'{path}/{cl}') faces_detected = 0 start = time.time() # result = self.detector.detect_faces(cur_img) box = self.mtcnn.detect(cur_img, True) faces_detected += len(box) print(f'Frames per second: {(time.time() - start):.3f},', f'faces detected: {faces_detected}\r') face = self.getFace(cur_img, box) self.faces.append(face) self.class_names.append(os.path.splitext(cl)[0]) self.displayImage(image) def displayImage(self, image, window=1): """ :param image: frame from camera :param encode_list: known face encoding list :param class_names: known face names :param window: number of window :return: """ print(image.shape) try: image = self.face_rec_(image) except Exception as e: print('뭐지?', e) image = cv2.resize(image, (640, 480)) qformat = QImage.Format_Indexed8 if len(image.shape) == 3: if image.shape[2] == 4: qformat = QImage.Format_RGBA8888 else: qformat = QImage.Format_RGB888 outImage = QImage(image, image.shape[1], image.shape[0], image.strides[0], qformat) outImage = outImage.rgbSwapped() if window == 1: self.imgLabel.setPixmap(QPixmap.fromImage(outImage)) self.imgLabel.setScaledContents(True) def getFace(self, img, box): faces = [] box = box[0][0] box = np.int32(box) # Result is an array with all the bounding boxes detected. We know that for 'ivan.jpg' there is only one. cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 155, 255), 2) cropped = img[box[1]:box[3], box[0]:box[2]] rearranged = cv2.resize(cropped, (self.input_image_size, self.input_image_size), interpolation=cv2.INTER_CUBIC) prewhitened = pre_trained_facenet.prewhiten(rearranged) faces.append({ 'face': rearranged, 'embedding': self.getEmbedding(prewhitened) }) return faces def getEmbedding(self, resized): reshaped = resized.reshape(-1, self.input_image_size, self.input_image_size, 3) feed_dict = { self.images_placeholder: reshaped, self.phase_train_placeholder: False } embedding = self.sess.run(self.embeddings, feed_dict=feed_dict) return embedding def compare2face(self, face, img2, box2): face1 = face print('여기') face2 = self.getFace(img2, box2) print('여기2') if face1 and face2: dist = np.sqrt( np.sum( np.square( np.subtract(face1[0]['embedding'], face2[0]['embedding'])))) return dist return -1
# Check whether a GPU is available
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print('Running on device: {}'.format(device))

# Set the MTCNN module parameters
mtcnn = MTCNN(keep_all=False, device=device, post_process=False)

# Get the list of files and directories
fname, dname = listar_imagens(basedir)

# Detect faces and save them to the facecrops folder
inicio = time.time()
print('Processing started')
facecrop = [it.replace(basedir, basedir + '_faces') for it in fname]
for f, filename in enumerate(fname):
    try:
        img = Image.open(filename)
        box, prob = mtcnn.detect(img)
    except Exception:
        print('Failed to process file ' + filename)
        continue
    if prob[0] and prob[0] >= 0.95:
        savepath = '/projects/jeff/TUMGAIDimage_facecrops3' + '' + \
            os.path.dirname(filename)[-9:] + '-' + os.path.basename(filename)
        extract_face(img, box[0], save_path=savepath)

print('Processing finished')
print(time.strftime('%H:%M:%S', time.localtime()))
tempo_total = time.time() - inicio
print("Total time: %02dm:%02ds" % divmod(tempo_total, 60))
def detect_live(self): mtcnn = MTCNN() faces = {} frameCount = 0 vid = cv2.VideoCapture(0) if self.record_for is not None : start_time = time.time() while vid.isOpened(): if self.record_for is not None : curr_time = time.time() - start_time if curr_time > self.record_for : break _, frame = vid.read() frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)) frameCount = frameCount + 1 boxes, probs = mtcnn.detect(frame) frame_draw = frame.copy() draw = ImageDraw.Draw(frame_draw) if boxes is not None: faces["frame_{}".format(frameCount)] = [] for box, p in zip(boxes, probs) : if p > 0.70 : draw.rectangle(box.tolist(), outline = (255, 0, 0), width = 1) if self.extract == True : face = extract_face(frame, box.tolist()) faces["frame_{}".format(frameCount)].append(face) if self.save == True : img = self.tsfms(face) if self.saveIn is None : raise ValueError else : img.save(os.path.join(self.saveIn, "frame_{}.jpg".format(len(faces)))) cv2.imshow("Tracking window", cv2.cvtColor(np.array(frame_draw), cv2.COLOR_RGB2BGR)) if self.save_video == True : self.frames_tracked.append(frame_draw) if cv2.waitKey(1) == ord("a") : break vid.release() if self.save_video == True: print(len(self.frames_tracked)) self.saveVideo(self.saveIn, self.frames_tracked, "trackedVid") if self.save == True : return len(faces.keys()), faces else : return None, None
class FaceNetSegmenter(TorchDevice, BaseSegmenter): """FaceNetSegmenter segments faces from an image. - Input shape: `(Height x Width x Channels)` - Output shape: `NumFaces x (Channels x ImageSize x ImageSize)` `Channels` dimension can be changed (e.g. set `channel_axis` to 0 for channels first mode instead of channels last). :param image_size: Height and width of a detected face. Smaller faces are upscaled. :param margin: Margin to add to bounding box, in terms of pixels in the final image. :param selection_method: Heuristic to use to select a single face from the image. Options: "probability": highest probability selected "largest": largest box selected "largest_over_threshold": largest box over a certain probability selected "center_weighted_size": box size minus weighted squared offset from image center :param post_process: Flag for normalizing the output image. Required if you want to pass these face to the FaceNetEmbedder. :param min_face_size: Minimum face size to search for. :param channel_axis: Axis of channels in the image. Default is 2 (channels-last), use 0 for channels-first. """ def __init__(self, image_size: int = 160, margin: int = 0, selection_method: str = 'largest', post_process: bool = True, min_face_size: int = 20, channel_axis: int = 2, *args, **kwargs): super().__init__(*args, **kwargs) self.image_size = image_size self.margin = margin self.selection_method = selection_method self.post_process = post_process self.min_face_size = min_face_size self.channel_axis = channel_axis self._default_channel_axis = 2 def post_init(self): from facenet_pytorch import MTCNN self.face_detector = MTCNN(selection_method=self.selection_method, image_size=self.image_size, margin=self.margin, device=self.device, post_process=self.post_process, min_face_size=self.min_face_size, keep_all=True) @batching def segment(self, blob: 'np.ndarray', *args, **kwargs) -> List[List[Dict]]: """Transform a numpy `ndarray` of shape `(Height x Width x Channel)` into a list with dicts that contain cropped images. :param blob: A numpy `ndarray` that represents a single image. :param args: Additional positional arguments. :param kwargs: Additional positional arguments. :return: A list with dicts that contain cropped images. """ if self.channel_axis != self._default_channel_axis: blob = np.moveaxis(blob, self.channel_axis, self._default_channel_axis + 1) batch = blob results = [] batch = np.asarray(batch) with torch.no_grad(): image = torch.from_numpy(data.astype('float32')).to(self.device) # Create a batch of size 1 image = image.unsqueeze(0) # Detect faces batch_boxes, batch_probs, _ = self.face_detector.detect( image, landmarks=True) # Select faces if not self.keep_all: batch_boxes, batch_probs, _ = self.face_detector.select_boxes( batch_boxes, batch_probs, _, image, method=self.selection_method) # Extract faces faces = self.face_detector.extract(image, batch_boxes, save_path=None) if faces[0] is not None: faces = faces[0].view(-1, image.shape[-1], self.image_size, self.image_size) batch_boxes = batch_boxes[0] batch_probs = batch_probs[0] results = [ dict(offset=0, weight=probability, blob=face.numpy(), location=bounding_box.tolist()) for face, probability, bounding_box in zip( faces, batch_probs, batch_boxes) if face is not None ] return results
def detect(self): vid = cv2.VideoCapture(self.lookIn) frameCount = int(vid.get(cv2.CAP_PROP_FRAME_COUNT)) - 1 mtcnn = MTCNN() bboxes_and_probs = [] count = frameCount while vid.isOpened(): #if count < frameCount: #break _, frame = vid.read() print("%d to go.." %(count)) count -= 1 frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) boxes, prob = mtcnn.detect(frame) frame_draw = frame.copy() draw = ImageDraw.Draw(frame_draw) if boxes is None : #print("Skipping Frame") if self.writeMode == True: detected_frames.append(frame_draw) cv2.imshow("Frame", cv2.cvtColor(np.asarray(frame_draw), cv2.COLOR_BGR2RGB)) if cv2.waitKey(2) & 0xFF == ord('y'): break continue for box, p in zip(boxes,prob): if p > 0.80: #print("Not skipping!") draw.rectangle(box.tolist(), outline= (255, 0, 0), width= 1) bboxes_and_probs.append({"bbox":box, "prob":p}) if self.writeMode == True: detected_frames.append(frame_draw) cv2.imshow("Frame", cv2.cvtColor(np.asarray(frame_draw), cv2.COLOR_BGR2RGB)) if cv2.waitKey(1) & 0xFF == ord('y'): break print("releasing capture") vid.release() if self.writeMode == True : dim = detected_frames[0].size print(dim , int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))) fourcc = cv2.VideoWriter_fourcc(*"mp4v") video_tracked = cv2.VideoWriter(self.saveIn, fourcc, 25.0, dim) for frame in detected_frames: video_tracked.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR)) video_tracked.release() return bboxes_and_probs
class FaceRecognition: def __init__(self, capture, min_face=300, accuracy_th=0.7): self.min_face = min_face self.mtcnn_pt = MTCNN(image_size=160, margin=0, min_face_size=self.min_face ) # initializing mtcnn for face detection self.resnet = InceptionResnetV1(pretrained='vggface2').eval( ) # initializing resnet for face img to embeding conversion self.model_path = 'classify_model.pkl' self.accuracy_th = accuracy_th self.new_boxes = False self.lock_boxes = threading.Lock() self.lock_cap = threading.Lock() self.lock_flag = threading.Lock() self.cap = capture # self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280) # self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720) with open(self.model_path, 'rb') as infile: (self.model, self.class_names) = pickle.load(infile) # cv2.namedWindow('frame', cv2.WINDOW_AUTOSIZE) self.box_draw = [[]] self.text_draw = [[]] self.mark_draw = [[]] self.stop_flag = [False] self.mask = cv2.imread('images/fm2.png') def set_params(self, min_face=None, accuracy_th=None): if min_face is not None and 0 < min_face <= 1000: self.min_face = min_face self.mtcnn_pt = MTCNN(image_size=160, margin=0, min_face_size=self.min_face) if accuracy_th is not None and 0 < accuracy_th < 1: self.accuracy_th = accuracy_th def load_model(self, path=''): if path != '' and os.path.isfile(path): with open(path, 'rb') as infile: (self.model, self.class_names) = pickle.load(infile) else: with open(self.model_path, 'rb') as infile: (self.model, self.class_names) = pickle.load(infile) # Draw bounding box and text on image def draw_frame(self, image, bounding_boxes, label_texts=[], landmarks=[], face_mask_anchor=False, color=[], thick=2, text_scale=0.5, skip_list=[]): if bounding_boxes is None: return if not color: color = [(255, 255, 0)] * len(bounding_boxes) for i, box in enumerate(bounding_boxes): if i in skip_list: continue cv2.rectangle(image, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), color[i], thick) if label_texts: cv2.putText(image, label_texts[i], (int(box[0]), int(box[1] - 5)), cv2.FONT_HERSHEY_SIMPLEX, text_scale, color[i], thick) if landmarks: for point in landmarks[i]: cv2.circle(image, (int(point[0]), int(point[1])), 2, color, thick) if face_mask_anchor: center_eye = (landmarks[i][0] + landmarks[i][1]) / 2 center_lip = (landmarks[i][3] + landmarks[i][4]) / 2 slope_ver = (center_eye[1] - center_lip[1]) / ( center_eye[0] - center_lip[0]) slope_hor = -1 / slope_ver chin = ((box[3] - center_eye[1]) / slope_ver + center_eye[0], box[3]) center = (center_eye + landmarks[i][2]) / 2 left_ear = (box[2], slope_hor * (box[2] - center[0]) + center[1]) right_ear = (box[0], slope_hor * (box[0] - center[0]) + center[1]) cv2.circle(image, (int(chin[0]), int(chin[1])), 2, (255, 255, 255), 2) cv2.circle(image, (int(center[0]), int(center[1])), 2, (255, 255, 255), 2) cv2.circle(image, (int(left_ear[0]), int(left_ear[1])), 2, (255, 255, 255), 2) cv2.circle(image, (int(right_ear[0]), int(right_ear[1])), 2, (255, 255, 255), 2) # Detect face on image and match with classify model, update result to bounding boxes and texts def face_match(self, image, classify_model, person_names): box_dr = [] text_dr = [] mark_dr = [] try: bboxes, prob, landmarks = self.mtcnn_pt.detect(image, landmarks=True) except Exception as ex: with self.lock_boxes: self.box_draw[0] = box_dr self.text_draw[0] = text_dr return box_dr, text_dr, mark_dr if bboxes is None: with self.lock_boxes: self.box_draw[0] = box_dr self.text_draw[0] = text_dr return box_dr, text_dr, mark_dr for idx, box in enumerate(bboxes): if prob[idx] > 0.90: # if face 
detected and probability > 90% box_dr.append(box) mark_dr.append(landmarks[idx]) face = extract_face(image, box, image_size=self.mtcnn_pt.image_size, margin=self.mtcnn_pt.margin) face = fixed_image_standardization(face) emb = self.resnet( face.unsqueeze(0) ) # passing cropped face into resnet model to get embedding matrix emb_array = emb.detach().numpy() predictions = classify_model.predict_proba(emb_array) best_class_indices = np.argmax(predictions, axis=1) best_class_probabilities = predictions[ np.arange(len(best_class_indices)), best_class_indices] if best_class_probabilities[0] > self.accuracy_th: text = '{0}: {1:.0%}'.format( person_names[best_class_indices[0]], best_class_probabilities[0]) else: text = '{0}'.format('Unknown') text_dr.append(text) elif prob[idx] > 0.10: continue else: continue with self.lock_boxes: self.box_draw[0] = box_dr self.text_draw[0] = text_dr self.mark_draw[0] = mark_dr self.new_boxes = True return box_dr, text_dr, mark_dr # A thread to apply function face_match def thread_face_recog(self): while True: if self.cap is None: break with self.lock_flag: if self.stop_flag[0]: break with self.lock_cap: ret_copy, frame_copy = self.cap.read() frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB) self.face_match(frame_copy, self.model, self.class_names) print('thread_face_recog stoped') # Stop above thread def stop_thread_face_recog(self): with self.lock_flag: self.stop_flag[0] = True with self.lock_boxes: self.box_draw[0] = [] self.text_draw[0] = [] self.mark_draw[0] = [] # Sample to implement with camera def face_recog_cam(self): thread = threading.Thread(target=self.thread_face_recog, args=(), daemon=True) thread.start() while True: # Capture frame-by-frame with self.lock_cap: ret, frame = self.cap.read() with self.lock_boxes: boxes = self.box_draw[0] texts = self.text_draw[0] marks = self.mark_draw[0] self.draw_frame(frame, boxes, texts) # frame = add_face_mask(frame, mask) # Display the resulting frame cv2.imshow('frame', frame) if cv2.waitKey(1) & 0xFF == ord('q'): break # Sample with image folder or file def face_recog_image(self, path): if not os.path.exists(path): return if os.path.isfile(path): image = cv2.imread(path) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) bboxes, texts, marks = self.face_match(image, self.model, self.class_names) self.draw_frame(image, bboxes, texts) cv2.imshow('', image) cv2.waitKey() cv2.destroyWindow('') if os.path.isdir(path): filenames = glob.glob(path + '/*.jpg') images = [cv2.imread(img) for img in filenames] for idx, img in enumerate(images): img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) bboxes, texts, marks = self.face_match(img, self.model, self.class_names) img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) self.draw_frame(img, bboxes, texts) cv2.imshow(str(idx), img) cv2.waitKey() cv2.destroyWindow(str(idx)) def __del__(self): self.cap.release() cv2.destroyAllWindows()
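A minimal driver for the FaceRecognition class above, assuming the pickled classifier (`classify_model.pkl`) and a webcam exist as required by __init__; the folder path in the commented call is hypothetical.

cap = cv2.VideoCapture(0)
recognizer = FaceRecognition(cap, min_face=300, accuracy_th=0.7)
recognizer.face_recog_cam()              # live camera demo with a background match thread
# recognizer.face_recog_image('images')  # or run on a folder of .jpg files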
v_cap.set(cv2.CAP_PROP_FRAME_WIDTH, image_size)
v_cap.set(cv2.CAP_PROP_FRAME_HEIGHT, image_size)
flag = False
face_results = []
start = time.time()
while True:
    time_elapsed = time.time() - prev
    break_time = time.time() - start
    if break_time > 10:
        break
    ret, frame = v_cap.read()
    if time_elapsed > 1. / frame_rate:
        # Collect frames every 1/frame_rate of a second
        prev = time.time()
        frame_ = Image.fromarray(frame)
        frames.append(frame_)
        batch_boxes, prob, landmark = mtcnn.detect(frames, landmarks=True)
        frames_duplicate = frames.copy()
        boxes.append(batch_boxes)
        boxes_duplicate = boxes.copy()
        # show imgs with bbxs
        face_results.append(
            show_images(frames_duplicate, boxes_duplicate, bbx_color))
        frames = []
        boxes = []
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
v_cap.release()
cv2.destroyAllWindows()
accuracy = (sum(face_results) / len(face_results)) * 100
print('Percentage match ' + '{:.2f}'.format(accuracy))
if accuracy > 0.75:
name = str(input("Person Name: "))
DATASET_PATH = os.path.join("datasets", name)
if not os.path.isdir(DATASET_PATH):
    os.mkdir(DATASET_PATH)

# Note: newer facenet_pytorch releases use `post_process` instead of `prewhiten`
mtcnn = MTCNN(prewhiten=False, keep_all=True, thresholds=[0.6, 0.7, 0.9])
image_no = 0
capture = cv2.VideoCapture(0)
count = 0
while True:
    count += 1
    check, frame = capture.read()
    frame = cv2.resize(frame, (400, 300))
    faces, _ = mtcnn.detect(Image.fromarray(frame))
    if faces is not None and count % 7 == 0:
        image_no += 1
        cv2.imwrite(os.path.join(DATASET_PATH, f"{name}_{image_no}.jpg"), frame)
        if image_no == 100:
            break
    image_text = f"Number of images taken {image_no} for {name}"
    cv2.putText(frame, image_text, (20, 20), cv2.FONT_HERSHEY_SIMPLEX, .5,
                (100, 0, 200), 1)
    if faces is not None:
        # MTCNN returns corner coordinates (x1, y1, x2, y2), not width/height
        for (x1, y1, x2, y2) in faces:
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
            cv2.rectangle(frame, (x1, y1), (x2, y2), (200, 100, 0), 2)
    cv2.imshow('frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
def find_face(self): """ find face on the frames create: self.faces self.frame_ids """ def del_skipped_frames(): idxs = [idx for idx, val in enumerate(self.centers) if val == 0] for index in sorted(idxs, reverse=True): del self.centers[index] del self.frames[index] del self.frame_ids[index] def medfilt_filter(step=7): y_ = medfilt([i[0] for i in self.centers], step) x_ = medfilt([i[1] for i in self.centers], step) return y_, x_ self.centers, h_shift, w_shift, centers = ([], [], [], None) # fast mtcnn pytorch; uses with cuda if cuda.is_available(): frames_cropped = [] box_prev = None mtcnn = MTCNN(image_size=200, device=device) for frame in tqdm(self.frames): box, _ = mtcnn.detect(frame) if box is not None: box = np.array(box[0]).astype(int) x1, x2, y1, y2 = box[1], box[3], box[0], box[2] h_shift += [(y2 - y1) // 2] w_shift += [(x2 - x1) // 2] centers = [y1 + h_shift[-1], x1 + w_shift[-1]] #plt.imshow(frame[x1:x2, y1:y2]) #plt.show() if centers is not None: self.centers += [centers] else: self.centers += [0] else: self.centers += [0] del mtcnn del_skipped_frames() # haard; uses without cuda else: face_cascade = cv2.CascadeClassifier( 'haarcascade_frontalface_default.xml') for frame in tqdm(self.frames): gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) faces = face_cascade.detectMultiScale(gray) for (x, y, w, h) in faces: h_shift += [h // 2] w_shift += [w // 2] centers = [y + h // 2, x + w // 2] if centers is not None: self.centers += [centers] else: self.centers += [0] del face_cascade del_skipped_frames() self.box_shift = [ np.mean(w_shift, dtype=int), np.mean(h_shift, dtype=int) ] # drop discharges from signal if len(self.centers) == 0: raise ValueError("Невозможно определить лицо") if cuda.is_available(): y_, x_ = medfilt_filter(5) else: y_, x_ = medfilt_filter() self.centers = [[int(y), int(x)] for x, y in zip(x_, y_)] for frame, (y, x) in tqdm(zip(self.frames, self.centers)): face = frame[x - self.box_shift[0]:x + self.box_shift[0], y - self.box_shift[1]:y + self.box_shift[1]] self.faces += [face]
class MTCNN_Model: def __init__(self, general_parameters, model_parameters, inference_parameters): #---------dataset_infos self.X = None self.input_images = None self.subfolders = None #--------general_parameters self.root_folder_name = general_parameters['root_folder_name'] #---------model_parameters self.image_size = model_parameters['image_size'] self.margin = model_parameters['margin'] self.min_face_size = model_parameters['min_face_size'] self.thresholds = model_parameters['thresholds'] self.factor = model_parameters['factor'] self.keep_all = model_parameters['keep_all'] self.device = 'cuda:0' if (model_parameters['device'] == "cuda" and torch.cuda.is_available()) else 'cpu' self.seed = model_parameters['seed'] self.post_process = False #---------Inference_parameters self.inference_batch_size = inference_parameters[ 'inference_batch_size'] self.input_square_transformation_size = inference_parameters[ 'input_square_transformation_size'] #------- Other self.num_workers = cpu_count() #------- MTCNN self.mtcnn = MTCNN(image_size=self.image_size, margin=self.margin, min_face_size=self.min_face_size, thresholds=self.thresholds, factor=self.factor, post_process=self.post_process, keep_all=self.keep_all, device=self.device) #------- Reproducibility random.seed(self.seed) np.random.seed(self.seed) torch.random.manual_seed(self.seed) torch.cuda.manual_seed(self.seed) #------- Results self.df_result = None def predict(self, img_reference, step): if step == "Experiment": image_array = img_reference if step == "Deployment": img = img_reference image_array = Image.fromarray(img) boxes, probs = self.mtcnn.detect(image_array, landmarks=False) return (boxes, probs) def _construct_result_dataframe(self, step): boxes = [] probs = [] for i in range(0, len(self.X), self.inference_batch_size): img_reference = [] batch = self.X[i:i + self.inference_batch_size] for row in batch: v_cap = cv2.VideoCapture(row[0]) success, frame = v_cap.read() img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) img = cv2.resize(img, (self.input_square_transformation_size, self.input_square_transformation_size)) img_reference.append(Image.fromarray(img)) batch_result = self.predict(img_reference, step) if self.keep_all: for b, p in zip(batch_result[0], batch_result[1]): boxes.append(b) probs.append(p) else: for b, p in zip(batch_result[0], batch_result[1]): max_prob_position = np.argmax(p) boxes.append(b[max_prob_position]) probs.append(np.max(p)) self.df_result = pd.DataFrame({ 'Input_image': self.input_images, 'Subfolder': self.subfolders, 'Bboxes(x1,y1,x2,y2)': boxes, 'Probabilities': probs }) def get_result_dataframe(self, X, step='Experiment'): self.X = X self.input_images = X[:, 0] self.subfolders = X[:, 1] self._construct_result_dataframe(step) return self.df_result
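A hedged example of the three parameter dictionaries the MTCNN_Model constructor above reads; the concrete values and the layout of `X` (pairs of video path and subfolder, as consumed by _construct_result_dataframe) are illustrative only.

general_parameters = {'root_folder_name': 'videos'}
model_parameters = {
    'image_size': 160, 'margin': 0, 'min_face_size': 20,
    'thresholds': [0.6, 0.7, 0.7], 'factor': 0.709,
    'keep_all': True, 'device': 'cuda', 'seed': 42,
}
inference_parameters = {
    'inference_batch_size': 8,
    'input_square_transformation_size': 512,
}
model = MTCNN_Model(general_parameters, model_parameters, inference_parameters)
# X: array of (video_path, subfolder) rows, e.g. np.array([['clip.mp4', 'train']])
# df = model.get_result_dataframe(X, step='Experiment')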
class FaceAndHandDetector(QThread):
    frame_update_signal = pyqtSignal(QPixmap)

    def __init__(self):
        QThread.__init__(self)
        self.frame = 0
        self.mtcnn = MTCNN()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # print(self.device)
        self.frame_counter = 0
        self.prev_frame_counter = 0
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.fps_count)
        self.timer.start(1000)
        self.model = HPSearchNET(cnn_num=3, fc_num=2, kern_size=3, func_act='elu',
                                 nn_prn=True, in_shape=160).to(self.device)
        self.model.load_state_dict(torch.load("hnd_net_elu_cnn3_fc2_kr3.pth",
                                              map_location=self.device))
        self.model.eval()

    # Draw a bounding rectangle around each detected face
    def draw_face(self, frame, boxes, probs):  # , landmarks
        try:
            cnt = 0
            for box, prob in zip(boxes, probs):  # , ld , landmarks
                cnt += 1
                print(f"Face {cnt} box: {box} prob: {prob:.4f}")
                # Draw the face bounding rectangle on the frame (cast to int for OpenCV)
                x1, y1, x2, y2 = [int(b) for b in box]
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), thickness=2)
        except Exception as e:
            print('Error in _draw')
            print(f'error : {e}')
        return frame

    # Draw a bounding rectangle around a hand
    def draw_hand(self, frame, hand_landmarks):
        # mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        max_x = max_y = 0
        min_x = min_y = 65535
        for mark in hand_landmarks.landmark:
            if mark.x > max_x:
                max_x = mark.x
            if mark.x < min_x:
                min_x = mark.x
            if mark.y > max_y:
                max_y = mark.y
            if mark.y < min_y:
                min_y = mark.y
        max_x = round(max_x * IMAGE_WIDTH) + 30
        min_x = round(min_x * IMAGE_WIDTH) - 30
        max_y = round(max_y * IMAGE_HEIGHT) + 30
        min_y = round(min_y * IMAGE_HEIGHT) - 30
        if min_x < 0:
            min_x = 0
        if min_y < 0:
            min_y = 0
        if max_x > IMAGE_WIDTH:
            max_x = IMAGE_WIDTH
        if max_y > IMAGE_HEIGHT:
            max_y = IMAGE_HEIGHT
        print(f"\tmax_x: {max_x} min_x: {min_x} max_y: {max_y} min_y: {min_y}")
        # Draw the hand bounding rectangle on the frame
        cv2.rectangle(frame, (min_x, min_y), (max_x, max_y), (0, 255, 0), thickness=2)
        return frame, [min_x, min_y, max_x, max_y]

    def fps_count(self):
        self.prev_frame_counter, self.frame_counter = self.frame_counter, 0

    # Detect whether there are hands in the frame
    def hand_detection_mp(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # cv2.flip(frame, 1)
        frame.flags.writeable = False
        results = hands.process(frame)
        frame.flags.writeable = True
        if results.multi_hand_landmarks:
            count = 0
            for hand_landmarks in results.multi_hand_landmarks:
                count += 1
                print(f"Hand {count}")
                print(
                    f'\tIndex finger tip coordinates: ('
                    f'x: {round(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x * IMAGE_WIDTH)}, '
                    f'y: {round(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y * IMAGE_HEIGHT)})'
                )
                for num, mark in enumerate(hand_landmarks.landmark):
                    print(f"\tLandmark {arm_marks[num]}"
                          f"- x: {round(mark.x * IMAGE_WIDTH)}, y: {round(mark.y * IMAGE_HEIGHT)}")
        return results

    # Main loop: reads and processes every frame
    def run(self):
        # Run until the thread is stopped
        while True:
            if cam_index_list:
                # Read the next frame; ret is a boolean flag telling whether a frame
                # was actually grabbed from the stream
                hands = []
                ret, self.frame = cam.read()
                self.frame = cv2.flip(self.frame, 1)
                try:
                    # Detect face locations in the frame and the probability that each one is a face
                    boxes, probs = self.mtcnn.detect(self.frame, landmarks=False)  # , landmarks
                    if boxes is not None:
                        # Draw the faces on the frame
                        self.frame = self.draw_face(self.frame, boxes, probs)  # , landmarks
                    # Look for hands
                    hand_detect_rez = self.hand_detection_mp(self.frame)
                    if hand_detect_rez.multi_hand_landmarks:
                        for hand_landmarks in hand_detect_rez.multi_hand_landmarks:
                            self.frame, hand_box = self.draw_hand(self.frame, hand_landmarks)
                            hands.append(self.filter_hand(self.frame, hand_box))  # 160x160 crop
                            # Normalise the image to the [0, 1] range
                            img = torch.from_numpy(hands[-1]) / 255
                            img = img.unsqueeze(0).unsqueeze(0)
                            with torch.no_grad():
                                outputs = self.model(img.to(self.device))
                            _, predicted = torch.max(outputs.data, 1)
                            print(f"predicted: {labels_texts[int(predicted)]}")
                            # Write the recognised gesture on the frame
                            cv2.putText(self.frame, labels_texts[int(predicted)],
                                        (hand_box[2], hand_box[3]),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
                except Exception as e:
                    print(f'Error {e} in run')
                # Write the FPS value on the frame
                cv2.putText(self.frame, f"FPS: {self.prev_frame_counter}", (20, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1, cv2.LINE_AA)
                self.frame_counter += 1
                self.frame_update_signal.emit(self.frame_to_qpixmap(self.frame))
                # cv2.imshow(self.label, self.frame)
                # if hands:
                #     self.frame_update_signal.emit(self.frame_to_qpixmap(hands[0]))

    # Convert a frame to QPixmap
    def frame_to_qpixmap(self, frame):
        rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        convert_to_qt_format = QImage(rgb_image.data, rgb_image.shape[1], rgb_image.shape[0],
                                      QImage.Format_RGB888)
        pixmap = QPixmap.fromImage(convert_to_qt_format)
        return pixmap

    def filter_hand(self, frame, hand_box):
        hand_img = frame[int(hand_box[1]):int(hand_box[3]), int(hand_box[0]):int(hand_box[2])]
        # hand_img = cv2.resize(hand_img, (48, 48))
        hsv = cv2.cvtColor(hand_img, cv2.COLOR_BGR2HSV)
        # Define the range of skin colour in HSV
        lower_skin = np.array([0, 20, 70], dtype=np.uint8)
        upper_skin = np.array([20, 255, 255], dtype=np.uint8)
        # Extract the skin-coloured part of the image
        mask = cv2.inRange(hsv, lower_skin, upper_skin)
        # Structuring element for the morphological operations below
        kernel = np.ones((3, 3), np.uint8)
        # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))
        # Fill dark spots within the hand region
        mask = cv2.erode(mask, kernel, iterations=2)
        mask = cv2.dilate(mask, kernel, iterations=2)
        # mask = cv2.dilate(mask, kernel, iterations=4)
        # Blur the mask
        mask = cv2.GaussianBlur(mask, (3, 3), 0)  # 10
        # mask = cv2.resize(mask, (48, 48))
        # hand_img = cv2.resize(hand_img, (48, 48))
        res = cv2.bitwise_and(hand_img, hand_img, mask=mask)
        res = cv2.resize(res, (160, 160))
        # Convert to a single-channel grayscale image
        res = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
        return res
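# Minimal usage sketch for FaceAndHandDetector in a PyQt5 window. The window layout
# is an assumption; the module-level globals used inside run() (cam, cam_index_list,
# hands, mp_hands, arm_marks, labels_texts, IMAGE_WIDTH, IMAGE_HEIGHT) must already
# be initialised as in the snippet above.
import sys
from PyQt5.QtWidgets import QApplication, QLabel

app = QApplication(sys.argv)
video_label = QLabel()
video_label.show()

detector_thread = FaceAndHandDetector()
detector_thread.frame_update_signal.connect(video_label.setPixmap)
detector_thread.start()

sys.exit(app.exec_())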
labelColor = [(10, 255, 0), (10, 0, 255)]
cap = cv2.VideoCapture(0)
# MTCNN for detecting the presence of faces
mtcnn = MTCNN(keep_all=True, device=device)
model.to(device)
model.eval()
while True:
    ret, frame = cap.read()
    if not ret:
        continue  # skip this iteration if no frame could be read
    img_ = frame.copy()
    boxes, _ = mtcnn.detect(img_)
    # Using PIL to draw boxes
    '''frame_draw = frame.copy()
    draw = ImageDraw.Draw(frame_draw)
    for box in boxes:
        draw.rectangle(box.tolist(), outline=(255, 0, 0), width=6)'''
    '''
    try:
        for x1, y1, x2, y2 in boxes:
            frame = cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 3)
            roi = img_[int(y1):int(y2), int(x1):int(x2)]
    except TypeError as e:
        pass'''
    try:
        for i in range(len(boxes)):
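# A self-contained sketch of the same webcam loop (an illustrative reconstruction,
# not the original author's code): it only detects faces with MTCNN, draws the
# boxes and shows the frame until 'q' is pressed.
import cv2
import torch
from facenet_pytorch import MTCNN

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
mtcnn = MTCNN(keep_all=True, device=device)
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        continue
    boxes, _ = mtcnn.detect(frame)
    if boxes is not None:
        for x1, y1, x2, y2 in boxes:
            cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 3)
    cv2.imshow('faces', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()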
class Demo(): def __init__(self, args): ## configs self.device = 'cuda:0' if args.gpu else 'cpu' self.checkpoint_path = args.checkpoint self.detect_human_face = args.detect_human_face self.render_video = args.render_video self.output_size = args.output_size self.image_size = 64 self.min_depth = 0.9 self.max_depth = 1.1 self.border_depth = 1.05 self.xyz_rotation_range = 60 self.xy_translation_range = 0.1 self.z_translation_range = 0 self.fov = 10 # in degrees self.depth_rescaler = lambda d: (1 + d) / 2 * self.max_depth + ( 1 - d) / 2 * self.min_depth # (-1,1) => (min_depth,max_depth) self.depth_inv_rescaler = lambda d: (d - self.min_depth) / ( self.max_depth - self.min_depth) # (min_depth,max_depth) => (0,1) fx = (self.image_size - 1) / 2 / (np.tan(self.fov / 2 * np.pi / 180)) fy = (self.image_size - 1) / 2 / (np.tan(self.fov / 2 * np.pi / 180)) cx = (self.image_size - 1) / 2 cy = (self.image_size - 1) / 2 K = [[fx, 0., cx], [0., fy, cy], [0., 0., 1.]] K = torch.FloatTensor(K).to(self.device) self.inv_K = torch.inverse(K).unsqueeze(0) self.K = K.unsqueeze(0) ## NN models self.netD = EDDeconv(cin=3, cout=1, nf=64, zdim=256, activation=None) self.netA = EDDeconv(cin=3, cout=3, nf=64, zdim=256) self.netL = Encoder(cin=3, cout=4, nf=32) self.netV = Encoder(cin=3, cout=6, nf=32) self.netD = self.netD.to(self.device) self.netA = self.netA.to(self.device) self.netL = self.netL.to(self.device) self.netV = self.netV.to(self.device) self.load_checkpoint() self.netD.eval() self.netA.eval() self.netL.eval() self.netV.eval() ## face detecter if self.detect_human_face: from facenet_pytorch import MTCNN self.face_detector = MTCNN(select_largest=True, device=self.device) ## renderer if self.render_video: from unsup3d.renderer import Renderer assert 'cuda' in self.device, 'A GPU device is required for rendering because the neural_renderer only has GPU implementation.' 
cfgs = { 'device': self.device, 'image_size': self.output_size, 'min_depth': self.min_depth, 'max_depth': self.max_depth, 'fov': self.fov, } self.renderer = Renderer(cfgs) def load_checkpoint(self): print(f"Loading checkpoint from {self.checkpoint_path}") cp = torch.load(self.checkpoint_path, map_location=self.device) self.netD.load_state_dict(cp['netD']) self.netA.load_state_dict(cp['netA']) self.netL.load_state_dict(cp['netL']) self.netV.load_state_dict(cp['netV']) def depth_to_3d_grid(self, depth, inv_K=None): if inv_K is None: inv_K = self.inv_K b, h, w = depth.shape grid_2d = get_grid(b, h, w, normalize=False).to(depth.device) # Nxhxwx2 depth = depth.unsqueeze(-1) grid_3d = torch.cat((grid_2d, torch.ones_like(depth)), dim=3) grid_3d = grid_3d.matmul(inv_K.transpose(2, 1)) * depth return grid_3d def get_normal_from_depth(self, depth): b, h, w = depth.shape grid_3d = self.depth_to_3d_grid(depth) tu = grid_3d[:, 1:-1, 2:] - grid_3d[:, 1:-1, :-2] tv = grid_3d[:, 2:, 1:-1] - grid_3d[:, :-2, 1:-1] normal = tu.cross(tv, dim=3) zero = normal.new_tensor([0, 0, 1]) normal = torch.cat( [zero.repeat(b, h - 2, 1, 1), normal, zero.repeat(b, h - 2, 1, 1)], 2) normal = torch.cat( [zero.repeat(b, 1, w, 1), normal, zero.repeat(b, 1, w, 1)], 1) normal = normal / (((normal**2).sum(3, keepdim=True))**0.5 + EPS) return normal def detect_face(self, im): print("Detecting face using MTCNN face detector") try: bboxes, prob = self.face_detector.detect(im) w0, h0, w1, h1 = bboxes[0] except: print("Could not detect faces in the image") return None hc, wc = (h0 + h1) / 2, (w0 + w1) / 2 crop = int(((h1 - h0) + (w1 - w0)) / 2 / 2 * 1.1) im = np.pad( im, ((crop, crop), (crop, crop), (0, 0)), mode='edge') # allow cropping outside by replicating borders h0 = int(hc - crop + crop + crop * 0.15) w0 = int(wc - crop + crop) return im[h0:h0 + crop * 2, w0:w0 + crop * 2] def run(self, pil_im): im = np.uint8(pil_im) ## face detection if self.detect_human_face: im = self.detect_face(im) if im is None: return -1 h, w, _ = im.shape im = torch.FloatTensor(im / 255.).permute(2, 0, 1).unsqueeze(0) # resize to 128 first if too large, to avoid bilinear downsampling artifacts if h > self.image_size * 4 and w > self.image_size * 4: im = nn.functional.interpolate( im, (self.image_size * 2, self.image_size * 2), mode='bilinear', align_corners=False) im = nn.functional.interpolate(im, (self.image_size, self.image_size), mode='bilinear', align_corners=False) with torch.no_grad(): self.input_im = im.to(self.device) * 2. - 1. 
b, c, h, w = self.input_im.shape ## predict canonical depth self.canon_depth_raw = self.netD(self.input_im).squeeze(1) # BxHxW self.canon_depth = self.canon_depth_raw - self.canon_depth_raw.view( b, -1).mean(1).view(b, 1, 1) self.canon_depth = self.canon_depth.tanh() self.canon_depth = self.depth_rescaler(self.canon_depth) ## clamp border depth depth_border = torch.zeros(1, h, w - 4).to(self.input_im.device) depth_border = nn.functional.pad(depth_border, (2, 2), mode='constant', value=1) self.canon_depth = self.canon_depth * ( 1 - depth_border) + depth_border * self.border_depth ## predict canonical albedo self.canon_albedo = self.netA(self.input_im) # Bx3xHxW ## predict lighting canon_light = self.netL(self.input_im) # Bx4 self.canon_light_a = canon_light[:, :1] / 2 + 0.5 # ambience term self.canon_light_b = canon_light[:, 1:2] / 2 + 0.5 # diffuse term canon_light_dxy = canon_light[:, 2:] self.canon_light_d = torch.cat( [canon_light_dxy, torch.ones(b, 1).to(self.input_im.device)], 1) self.canon_light_d = self.canon_light_d / ( (self.canon_light_d**2).sum( 1, keepdim=True))**0.5 # diffuse light direction ## shading self.canon_normal = self.get_normal_from_depth(self.canon_depth) self.canon_diffuse_shading = ( self.canon_normal * self.canon_light_d.view(-1, 1, 1, 3)).sum(3).clamp( min=0).unsqueeze(1) canon_shading = self.canon_light_a.view( -1, 1, 1, 1) + self.canon_light_b.view( -1, 1, 1, 1) * self.canon_diffuse_shading self.canon_im = (self.canon_albedo / 2 + 0.5) * canon_shading * 2 - 1 ## predict viewpoint transformation self.view = self.netV(self.input_im) self.view = torch.cat([ self.view[:, :3] * np.pi / 180 * self.xyz_rotation_range, self.view[:, 3:5] * self.xy_translation_range, self.view[:, 5:] * self.z_translation_range ], 1) ## export to obj strings vertices = self.depth_to_3d_grid(self.canon_depth) # BxHxWx3 self.objs, self.mtls = export_to_obj_string( vertices, self.canon_normal) ## resize to output size self.canon_depth = nn.functional.interpolate( self.canon_depth.unsqueeze(1), (self.output_size, self.output_size), mode='bilinear', align_corners=False).squeeze(1) self.canon_normal = nn.functional.interpolate( self.canon_normal.permute(0, 3, 1, 2), (self.output_size, self.output_size), mode='bilinear', align_corners=False).permute(0, 2, 3, 1) self.canon_normal = self.canon_normal / (self.canon_normal**2).sum( 3, keepdim=True)**0.5 self.canon_diffuse_shading = nn.functional.interpolate( self.canon_diffuse_shading, (self.output_size, self.output_size), mode='bilinear', align_corners=False) self.canon_albedo = nn.functional.interpolate( self.canon_albedo, (self.output_size, self.output_size), mode='bilinear', align_corners=False) self.canon_im = nn.functional.interpolate( self.canon_im, (self.output_size, self.output_size), mode='bilinear', align_corners=False) if self.render_video: self.render_animation() def render_animation(self): print(f"Rendering video animations") b, h, w = self.canon_depth.shape ## morph from target view to canonical morph_frames = 15 view_zero = torch.FloatTensor([0.15 * np.pi / 180 * 60, 0, 0, 0, 0, 0]).to(self.canon_depth.device) morph_s = torch.linspace(0, 1, morph_frames).to(self.canon_depth.device) view_morph = morph_s.view(-1, 1, 1) * view_zero.view(1, 1, -1) + ( 1 - morph_s.view(-1, 1, 1)) * self.view.unsqueeze(0) # TxBx6 ## yaw from canonical to both sides yaw_frames = 80 yaw_rotations = np.linspace(-np.pi / 2, np.pi / 2, yaw_frames) # yaw_rotations = np.concatenate([yaw_rotations[40:], yaw_rotations[::-1], yaw_rotations[:40]], 0) ## whole 
rotation sequence view_after = torch.cat( [view_morph, view_zero.repeat(yaw_frames, b, 1)], 0) yaw_rotations = np.concatenate([np.zeros(morph_frames), yaw_rotations], 0) def rearrange_frames(frames): morph_seq = frames[:, :morph_frames] yaw_seq = frames[:, morph_frames:] out_seq = torch.cat([ morph_seq[:, :1].repeat(1, 5, 1, 1, 1), morph_seq, morph_seq[:, -1:].repeat(1, 5, 1, 1, 1), yaw_seq[:, yaw_frames // 2:], yaw_seq.flip(1), yaw_seq[:, :yaw_frames // 2], morph_seq[:, -1:].repeat(1, 5, 1, 1, 1), morph_seq.flip(1), morph_seq[:, :1].repeat(1, 5, 1, 1, 1), ], 1) return out_seq ## textureless shape front_light = torch.FloatTensor([0, 0, 1]).to(self.canon_depth.device) canon_shape_im = (self.canon_normal * front_light.view(1, 1, 1, 3)).sum(3).clamp( min=0).unsqueeze(1) canon_shape_im = canon_shape_im.repeat(1, 3, 1, 1) * 0.7 shape_animation = self.renderer.render_yaw( canon_shape_im, self.canon_depth, v_after=view_after, rotations=yaw_rotations) # BxTxCxHxW self.shape_animation = rearrange_frames(shape_animation) ## normal map canon_normal_im = self.canon_normal.permute(0, 3, 1, 2) / 2 + 0.5 normal_animation = self.renderer.render_yaw( canon_normal_im, self.canon_depth, v_after=view_after, rotations=yaw_rotations) # BxTxCxHxW self.normal_animation = rearrange_frames(normal_animation) ## textured texture_animation = self.renderer.render_yaw( self.canon_im / 2 + 0.5, self.canon_depth, v_after=view_after, rotations=yaw_rotations) # BxTxCxHxW self.texture_animation = rearrange_frames(texture_animation) def save_results(self, save_dir): print(f"Saving results to {save_dir}") save_image(save_dir, self.input_im[0] / 2 + 0.5, 'input_image') save_image( save_dir, self.depth_inv_rescaler(self.canon_depth)[0].repeat(3, 1, 1), 'canonical_depth') save_image(save_dir, self.canon_normal[0].permute(2, 0, 1) / 2 + 0.5, 'canonical_normal') save_image(save_dir, self.canon_diffuse_shading[0].repeat(3, 1, 1), 'canonical_diffuse_shading') save_image(save_dir, self.canon_albedo[0] / 2 + 0.5, 'canonical_albedo') save_image(save_dir, self.canon_im[0].clamp(-1, 1) / 2 + 0.5, 'canonical_image') with open(os.path.join(save_dir, 'result.mtl'), "w") as f: f.write(self.mtls[0].replace('$TXTFILE', './canonical_image.png')) with open(os.path.join(save_dir, 'result.obj'), "w") as f: f.write(self.objs[0].replace('$MTLFILE', './result.mtl')) if self.render_video: save_video(save_dir, self.shape_animation[0], 'shape_animation') save_video(save_dir, self.normal_animation[0], 'normal_animation') save_video(save_dir, self.texture_animation[0], 'texture_animation')
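# Usage sketch for the Demo class above. The command-line flags mirror the attributes
# read in __init__ (gpu, checkpoint, detect_human_face, render_video, output_size);
# the default values and file paths are assumptions.
import argparse
import os
from PIL import Image

parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', default='pretrained/checkpoint.pth')
parser.add_argument('--gpu', action='store_true')
parser.add_argument('--detect_human_face', action='store_true')
parser.add_argument('--render_video', action='store_true')
parser.add_argument('--output_size', type=int, default=128)
args = parser.parse_args()

demo = Demo(args)
im = Image.open('input/face.jpg').convert('RGB')
if demo.run(im) != -1:  # run() returns -1 when no face is detected
    os.makedirs('results/face', exist_ok=True)
    demo.save_results('results/face')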
video = mmcv.VideoReader('video.mp4') frames = [ Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in video ] print(len(frames)) display.Video('video.mp4', width=640) frames_tracked = [] for i, frame in enumerate(frames): print('\rTracking frame: {}'.format(i + 1), end='') # Detect faces boxes, _ = mtcnn.detect(frame) # Draw faces frame_draw = frame.copy() draw = ImageDraw.Draw(frame_draw) if (boxes is not None): for box in boxes: draw.rectangle(box.tolist(), outline=(255, 0, 0), width=6) # Add to frame list # frames_tracked.append(frame_draw.resize((640, 360), Image.BILINEAR)) frames_tracked.append(frame_draw) print('\nDone') dim = frames_tracked[0].size fourcc = cv2.VideoWriter_fourcc(*'mp4v')
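# A possible continuation of the tracking snippet above (the 25 fps rate and the
# output file name are assumptions): the tracked PIL frames are converted back to
# BGR and written out with the fourcc and frame size prepared above.
import numpy as np

video_tracked = cv2.VideoWriter('video_tracked.mp4', fourcc, 25.0, dim)
for frame in frames_tracked:
    video_tracked.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
video_tracked.release()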
class FaceDetect:
    def __init__(self, thresholds=[0.9, 0.9, 0.9], min_face_size=100):
        self.mtcnn = MTCNN(thresholds=thresholds, select_largest=True, post_process=False,
                           device='cuda:0', min_face_size=min_face_size)

    def detect(self, img_ls, crop_size=None, mode='Extract_largest', save_faces=False,
               save_annotate=False, save_path='face_result'):
        """Face detection.

        Args:
            img_ls (list): list of PIL images (all of the same size).
            crop_size (tuple, optional): crop images with (left, top, right, bottom). Defaults to None.
            mode (str, optional): one of 'Detect_bool', 'Detect', 'Extract_largest' or 'Extract_all'.
                Use 'Detect_bool' if you only want to know whether any face is present,
                'Detect' to get the boxes and probabilities, 'Extract_largest' to extract the
                largest face of each image, and 'Extract_all' to extract every detected face.
                Defaults to 'Extract_largest'.
            save_faces (bool, optional): for the 'Extract_*' modes, save the cropped faces. Defaults to False.
            save_annotate (bool, optional): for the 'Extract_*' modes, save images with annotations. Defaults to False.
            save_path (str, optional): output folder for the saved results. Defaults to 'face_result'.

        Returns:
            tuple: depends on the mode.
        """
        if crop_size:
            for i, img in enumerate(img_ls):
                img_ls[i] = img.crop(crop_size)
        try:
            boxes, probs = self.mtcnn.detect(img_ls)
        except Exception as e:
            print(f'{e} \n...add crop_size=(left, top, right, bottom) to make images the same')
        if mode == 'Detect_bool':
            return isinstance(boxes, np.ndarray)
        elif mode == 'Detect':
            return boxes, probs
        elif 'Extract' in mode:
            faces = []
            annotates = []
            boxes = boxes.tolist()
            probs = probs.tolist()
            for id_, img in enumerate(img_ls):
                face_batch = []
                img_annotate = img.copy()
                draw = ImageDraw.Draw(img_annotate)
                box_all = boxes[id_]
                if mode == 'Extract_largest':
                    # Clip every box to the image borders, then keep the largest one
                    for i, box in enumerate(box_all):
                        left = max(0, box[0])
                        top = max(0, box[1])
                        right = min(np.array(img_ls[id_]).shape[1], box[2])
                        down = min(np.array(img_ls[id_]).shape[0], box[3])
                        box_all[i] = [left, top, right, down]
                    area = list(map(self._cal_area, box_all))
                    max_id = area.index(max(area))
                    box = box_all[max_id]
                    box_head = [box[0] - box[0] / 8, box[1] - box[1] / 5,
                                box[2] + box[2] / 8, box[3] + box[3] / 10]
                    boxes[id_] = [box_head]
                    probs[id_] = [probs[id_][max_id]]
                    draw.rectangle(box_head, width=5)
                    if save_faces:
                        if not os.path.exists(save_path):
                            os.mkdir(save_path)
                        if not os.path.exists(os.path.join(save_path, 'faces')):
                            os.mkdir(os.path.join(save_path, 'faces'))
                        face_batch.append(
                            extract_face(img, box_head,
                                         save_path=os.path.join(save_path,
                                                                f'detected_face_{id_}-{0}.png')))
                    else:
                        face_batch.append(extract_face(img, box_head))
                elif mode == 'Extract_all':
                    for i, box in enumerate(box_all):
                        box_head = [box[0] - box[0] / 3, box[1] - box[1] / 3,
                                    box[2] + box[2] / 83, box[3] + box[3] / 10]
                        box_all[i] = box_head
                        draw.rectangle(box_head, width=5)  # box.tolist()
                        if save_faces:
                            if not os.path.exists(save_path):
                                os.mkdir(save_path)
                            if not os.path.exists(os.path.join(save_path, 'faces')):
                                os.mkdir(os.path.join(save_path, 'faces'))
                            face_batch.append(
                                extract_face(img, box_head,
                                             save_path=os.path.join(save_path,
                                                                    f'detected_face_{id_}-{i}.png')))
                        else:
                            face_batch.append(extract_face(img, box_head))
                else:
                    print(f"Error: there's no mode called {mode}")
                faces.append(face_batch)
                annotates.append(np.asarray(img_annotate))
                if save_annotate:
                    if not os.path.exists(save_path):
                        os.mkdir(save_path)
                    if not os.path.exists(os.path.join(save_path, 'annotations')):
                        os.mkdir(os.path.join(save_path, 'annotations'))
                    img_annotate.save(os.path.join(save_path, f'annotated_faces_{id_}.png'))
            return np.asarray(boxes), probs, annotates, faces
        else:
            print(f"Error: there's no mode called {mode}")

    def _cal_area(self, ls):
        return (ls[2] - ls[0]) * (ls[3] - ls[1])
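# Usage sketch for FaceDetect (the image paths are illustrative; note that the
# detector is hard-coded to device='cuda:0', so a GPU is required). 'Detect_bool'
# only reports whether any face was found, 'Detect' returns boxes and probabilities,
# and the 'Extract_*' modes additionally return annotated frames and cropped faces.
from PIL import Image

face_detector = FaceDetect(thresholds=[0.9, 0.9, 0.9], min_face_size=100)
img_ls = [Image.open(p).convert('RGB') for p in ['img_0.jpg', 'img_1.jpg']]

boxes, probs = face_detector.detect(img_ls, mode='Detect')
boxes, probs, annotates, faces = face_detector.detect(
    img_ls, mode='Extract_largest', save_faces=True, save_annotate=True,
    save_path='face_result')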
                    image_new = image[box[1]:box[3], box[0]:box[2]]  # crop the face out of the photo
                    try:
                        image_new = cv2.resize(image_new, (200, 200))
                        cv2.imwrite(f'./face/with_mask/{ind}.jpg', image_new)
                        ind += 1
                        print(ind)
                    except Exception as e:
                        print(e)
    except:
        continue
'''
for filename in glob.iglob('with_mask/*.jpg', recursive=True):
    image = cv2.imread(filename)
    try:
        boxes, probs = mtcnn.detect(image)
        if any(probs):  # if there is a face (or something that looks like one) in the frame
            for prob, box in zip(probs, boxes):
                if prob > 0.90:  # if the detection probability is high enough
                    # we only get here when the face is clearly recognised
                    box = [int(v) for v in box]  # face coordinates as ints
                    image_new = image[box[1]:box[3], box[0]:box[2]]  # crop the face out of the photo
                    try:
                        image_new = cv2.resize(image_new, (200, 200))
                        cv2.imwrite(f'./face/with_mask/{ind}.jpg', image_new)
                        ind += 1
                        print(ind)
                    except Exception as e:
                        print(e)
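# A self-contained sketch of the same face-cropping pipeline (an illustrative
# reconstruction of the fragment above, keeping its folder names and the 0.90
# probability threshold; it is not the original script).
import glob
import os
import cv2
from facenet_pytorch import MTCNN

mtcnn = MTCNN(keep_all=True)
os.makedirs('./face/with_mask', exist_ok=True)
ind = 0
for filename in glob.iglob('with_mask/*.jpg', recursive=True):
    image = cv2.imread(filename)
    if image is None:
        continue
    boxes, probs = mtcnn.detect(image)
    if boxes is None:
        continue
    for prob, box in zip(probs, boxes):
        if prob is not None and prob > 0.90:
            x1, y1, x2, y2 = [int(v) for v in box]
            face = image[y1:y2, x1:x2]
            if face.size == 0:
                continue
            cv2.imwrite(f'./face/with_mask/{ind}.jpg', cv2.resize(face, (200, 200)))
            ind += 1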
class FaceDetector: def __init__(self, min_face_size, margin, prob_threshold): self.detector = MTCNN(keep_all=True, post_process=False, device='cuda:0', select_largest=False, min_face_size=min_face_size) self.margin = margin self.prob_threshold = prob_threshold def crop_face(self, img, box): max_x = img.shape[1] - 1 max_y = img.shape[0] - 1 x1, y1, x2, y2 = [int(round(c)) for c in box] x1 = max([x1 - self.margin, 0]) y1 = max([y1 - self.margin, 0]) x2 = min([x2 + self.margin, max_x]) y2 = min([y2 + self.margin, max_y]) face = img[y1:y2, x1:x2] new_box = [[x1, y1], [x2, y2]] return face, new_box def filter_faces(self, frame_data): faces = {} for frame_datum in frame_data: for face_data in frame_datum['faces']: face_id = face_data['id'] if face_id in faces: faces[face_id].append(face_data['prob']) else: faces[face_id] = [face_data['prob']] num_frames = len(frame_data) face_ids_to_del = set() avg_probs = {} for face_id, probs in faces.items(): if len(probs) < (num_frames / 2): print( f'FaceDetector::filter_faces: Face with id {face_id} failed to appear in >= {num_frames / 2} frames.' ) face_ids_to_del.add(face_id) avg_probs[face_id] = sum(probs) / len(probs) faces_remaining = len(faces) - len(face_ids_to_del) if faces_remaining > 2: print( 'FaceDetector::filter_faces: More than 2 faces. Only keeping the two with the highest avg prob.' ) avg_probs_sorted = sorted(avg_probs.items(), key=lambda x: x[1], reverse=True) for face_id, avg_prob in avg_probs_sorted[2:]: face_ids_to_del.add(face_id) filtered_frame_data = [] for frame_datum in frame_data: faces_filtered = [] for face_data in frame_datum['faces']: if face_data['id'] in face_ids_to_del: continue else: faces_filtered.append(face_data) frame_datum['faces'] = faces_filtered filtered_frame_data.append(frame_datum) return filtered_frame_data def detect(self, video, num_frames, filt=True): vc = cv2.VideoCapture(video) imgs = [] for i in range(num_frames): success, img = vc.read() if success: imgs.append(img) else: print( 'FaceDetector::detect: cv2::VideoCapture::read call failed.' ) return [] imgs_pil = [Image.fromarray(i) for i in imgs] video_boxes, video_probs, video_landmarks = self.detector.detect( imgs_pil, landmarks=True) past_faces = [] frame_data = [] face_id = 0 def search_past_faces(box, iou_threshold): i = 0 for past_face in past_faces: iou = bb_iou(past_face['box'], face_box) if iou > iou_threshold: return (True, i) else: i += 1 return (False, i) for frame, (frame_boxes, frame_probs, frame_landmarks) in enumerate( zip(video_boxes, video_probs, video_landmarks)): contrast_boosted = False img = imgs[frame] if frame_boxes is None: # print(f'FaceDetector::detect: No faces in frame {frame}. Boosting contrast.') img = boost_contrast([img], 3.0)[0] img_pil = Image.fromarray(img) frame_boxes, frame_probs, frame_landmarks = self.detector.detect( img_pil, landmarks=True) contrast_boosted = True if frame_boxes is None: # print(f'FaceDetector::detect: No faces after contrast boost. 
Proceeding to next frame.') frame_data.append({ 'frame': frame, 'faces': [], 'contrast_boosted': contrast_boosted }) continue face_data = [] for face_box, face_prob, face_landmarks in zip( frame_boxes, frame_probs, frame_landmarks): if face_prob < self.prob_threshold: continue face_cropped, new_box = self.crop_face(img, face_box) found, idx = search_past_faces(face_box, 0.5) if found: past_faces[idx]['box'] = face_box face_data.append({ 'id': past_faces[idx]['id'], 'box': new_box, 'prob': face_prob, 'landmarks': face_landmarks, 'img': face_cropped }) else: past_faces.append({'box': face_box, 'id': face_id}) face_data.append({ 'id': face_id, 'box': new_box, 'prob': face_prob, 'landmarks': face_landmarks, 'img': face_cropped }) face_id += 1 frame_data.append({ 'frame': frame, 'faces': face_data, 'contrast_boosted': contrast_boosted }) return frame_data if not filt else self.filter_faces(frame_data)
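# Usage sketch for FaceDetector (the margin/threshold values and the video path are
# assumptions; bb_iou and boost_contrast are helpers referenced by the class and
# must be available in the module).
face_detector = FaceDetector(min_face_size=60, margin=20, prob_threshold=0.9)
frame_data = face_detector.detect('clip.mp4', num_frames=32, filt=True)

for frame_datum in frame_data:
    for face in frame_datum['faces']:
        print(frame_datum['frame'], face['id'], face['prob'], face['box'])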