Example #1
class EmotionRecognition(object):
    def __init__(self, device, gpu_id=0):
        assert device == 'cpu' or device == 'gpu'
        if torch.cuda.is_available():
            if device == 'cpu':
                print(
                    "[*]Warning: Your device has a GPU; for better performance use EmotionRecognition(device='gpu')"
                )
                self.device = torch.device('cpu')
            if device == 'gpu':
                self.device = torch.device(f'cuda:{str(gpu_id)}')
        else:
            if device == 'gpu':
                print(
                    '[*]Warning: No GPU is detected, so cpu is selected as device'
                )
                self.device = torch.device('cpu')
            if device == 'cpu':
                self.device = torch.device('cpu')
        self.network = NetworkV2(in_c=1, nl=32, out_f=7).to(self.device)
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((48, 48)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])
        self.mtcnn = MTCNN(keep_all=True, device=self.device)
        # print(os.path.join(os.path.dirname(__file__))) C:\Users\mustdur\AppData\Local\Programs\Python\Python37\lib\site-packages\facial_emotion_recognition
        model_dict = torch.load(os.path.join(os.path.dirname(__file__),
                                             'model', 'model.pkl'),
                                map_location=torch.device('cpu'))
        # print(f'[*] Accuracy: {model_dict["accuracy"]}') # Accuracy 0.9565809379727686
        self.emotions = {
            0: 'Angry',
            1: 'Disgust',
            2: 'Fear',
            3: 'Happy',
            4: 'Sad',
            5: 'Surprise',
            6: 'Neutral'
        }
        self.network.load_state_dict(model_dict['network'])
        self.network.eval()

    def _predict(self, image):
        tensor = self.transform(image).unsqueeze(0).to(self.device)
        output = self.network(tensor)
        ps = torch.exp(output).tolist()
        index = np.argmax(ps)
        return self.emotions[index]

    def recognise_emotion(self, frame):
        emotions = []
        f_h, f_w, c = frame.shape
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        boxes, _ = self.mtcnn.detect(frame)
        if boxes is not None:
            for i in range(len(boxes)):
                x1, y1, x2, y2 = int(round(boxes[i][0])), int(
                    round(boxes[i][1])), int(round(boxes[i][2])), int(
                        round(boxes[i][3]))
                emotion = self._predict(gray[y1:y2, x1:x2])
                emotions.append(emotion)
                frame = cv.rectangle(frame, (x1, y1), (x2, y2),
                                     color=[0, 255, 0],
                                     thickness=1)
                frame = cv.rectangle(frame, (x1, y1 - int(f_h * 0.03125)),
                                     (x1 + int(f_w * 0.125), y1),
                                     color=[0, 255, 0],
                                     thickness=-1)
                frame = cv.putText(frame,
                                   text=emotion,
                                   org=(x1 + 5, y1 - 3),
                                   fontFace=cv.FONT_HERSHEY_PLAIN,
                                   color=[0, 0, 0],
                                   fontScale=1,
                                   thickness=1)
            return frame, emotions, boxes

        else:
            #print('No face detected')
            return frame, emotions, boxes

    def recognise_emotion_fast(self, frame):
        emotions = []
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        boxes, _ = self.mtcnn.detect(frame)
        if boxes is not None:
            for i in range(len(boxes)):
                x1, y1, x2, y2 = int(round(boxes[i][0])), int(
                    round(boxes[i][1])), int(round(boxes[i][2])), int(
                        round(boxes[i][3]))
                emotion = self._predict(gray[y1:y2, x1:x2])
                emotions.append(emotion)
            return frame, emotions, boxes

        else:
            #print('No face detected')
            return frame, emotions, boxes
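
A minimal usage sketch for the class above (not part of the original example; the webcam index and window handling are illustrative assumptions):

# Hypothetical usage sketch for EmotionRecognition; assumes OpenCV is imported
# as `cv`, as in the class itself.
er = EmotionRecognition(device='gpu' if torch.cuda.is_available() else 'cpu')
cap = cv.VideoCapture(0)  # default webcam
while True:
    ret, frame = cap.read()  # BGR frame, as expected by recognise_emotion
    if not ret:
        break
    frame, emotions, boxes = er.recognise_emotion(frame)
    cv.imshow('emotion', frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv.destroyAllWindows()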
Example #2
class MTCNN_extractor(Face_extractor):
    def __init__(self,
                 down_sample=2,
                 batch_size=30,
                 my_device=device,
                 keep_empty=False,
                 factor=0.709,
                 size_range=None,
                 prob_limit=None,
                 same_bbox_size=False,
                 scale=1.2):
        super().__init__()
        self.prob_limit = prob_limit
        self.size_range = size_range
        self.extractor = MTCNN(keep_all=True,
                               device=my_device,
                               min_face_size=80 // down_sample,
                               factor=factor).eval()
        self.down_sample = down_sample
        self.batch_size = batch_size
        self.keep_empty = keep_empty
        self.same_bbox_size = same_bbox_size
        self.scale = scale

    def _get(self, images):
        ret = []
        for start in range(0, len(images), self.batch_size):
            ret.extend(self._limited_get(images[start:start +
                                                self.batch_size]))
        if self.size_range is not None:
            ret = self._filter(ret)
        return ret

    def _limited_get(self, images):
        h, w = images.shape[1:3]
        if h * w < 1280 * 720:
            down_sample = max(1, self.down_sample // 2)
        elif h * w >= 1280 * 720 * 4:
            down_sample = self.down_sample * 2
        else:
            down_sample = self.down_sample
        pils = [
            Image.fromarray(img).resize((w // down_sample, h // down_sample))
            for img in images
        ]
        bboxes, probs = self.extractor.detect(pils)

        clean_bboxes, clean_probs = [], []
        for boxes, prob in zip(bboxes, probs):
            if boxes is not None:
                rets = sorted([(p, box) for box, p in zip(boxes, prob)],
                              key=lambda x: x[0])
                if len(rets) >= 2:
                    rets = rets[
                        -1:] if rets[-1][0] - rets[-2][0] > 0.05 else rets[-2:]
                clean_bboxes.append(np.array([box for p, box in rets]))
                clean_probs.append(np.array([p for p, box in rets]))
            else:
                clean_bboxes.append([])
                clean_probs.append([])

        bsize = sorted([
            max(box[2] - box[0], box[3] - box[1]) * self.scale
            for boxes in clean_bboxes for box in boxes * down_sample
        ])
        if len(bsize) > 0:
            bsize = int(bsize[-len(bsize) // 4])  # -1//4 = -1

        ret = []
        for boxes, img, prob, idx in zip(clean_bboxes, images, clean_probs,
                                         range(len(clean_probs))):
            faceInfo = []
            if boxes is not None and len(boxes) > 0:
                min_size = bsize if self.same_bbox_size else None
                max_size = bsize if self.same_bbox_size else None

                faceInfo = [
                    FaceInfo(face=self._rectang_crop(img, box, self.scale,
                                                     min_size, max_size),
                             box=self._get_boundingbox(box, w, h, self.scale,
                                                       min_size, max_size),
                             prob=p,
                             frame=idx)
                    for box, p in zip(boxes * down_sample, prob)
                ]
                faceInfo = sorted(faceInfo, key=lambda x: -x.prob)
            elif not self.keep_empty:
                continue
            ret.append(faceInfo)
        return ret

    def _filter(self, ret):
        new_ret = []
        for frames in ret:
            ret_frame = []
            for face in frames:
                size = (face.box[2] - face.box[0]) * (face.box[3] -
                                                      face.box[1])
                if self.size_range[0] < size < self.size_range[
                        1] and face.prob > self.prob_limit:
                    ret_frame.append(face)
            new_ret.append(ret_frame)
        return new_ret
Example #3
def input_face_embeddings(frames: Union[List[str], np.ndarray],
                          is_path: bool,
                          mtcnn: MTCNN,
                          resnet: InceptionResnetV1,
                          face_embed_cuda: bool,
                          use_half: bool,
                          coord: List,
                          name: str = None,
                          save_frames: bool = False) -> torch.Tensor:
    """
        Get the face embedding

        NOTE: If a face is not detected by the detector,
        instead of throwing an error it zeros the input
        for embedder.

        NOTE: Memory hungry function, hence the profiler.

        Args:
            frames: Frames from the video
            is_path: Whether to read from filesystem or memory
            mtcnn: face detector
            resnet: face embedder
            face_embed_cuda: use cuda for model
            use_half: use half precision

        Returns:
            emb: Embedding for all input frames
    """
    if face_embed_cuda:
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    result_cropped_tensors = []
    no_face_indices = []
    for i, f in enumerate(frames):
        if is_path:
            frame = Image.open(f)
        else:
            frame = Image.fromarray(f.astype("uint8"))

        with torch.no_grad():
            cropped_tensors = None
            width, height = frame.size  # PIL size works for both the path and the array case
            bounding_box, prob = mtcnn.detect(frame)

            if bounding_box is not None:
                for box in bounding_box:
                    x1, y1, x2, y2 = box
                    if (x1 > x2):
                        x1, x2 = x2, x1
                    if (y1 > y2):
                        y1, y2 = y2, y1

                    #for point in coord:
                    x, y = coord[0], coord[1]
                    x *= width
                    y *= height
                    if (x >= x1 and y >= y1 and x <= x2 and y <= y2):
                        cropped_tensors = extract_face(frame, box)
                        #print("found", box, x, y, end='\r')
                        break

        if cropped_tensors is None:
            #Face not detected, for some reason
            cropped_tensors = torch.zeros((3, 160, 160))
            no_face_indices.append(i)

        if save_frames:
            name = name.replace(".mp4", "")
            saveimg = cropped_tensors.detach().cpu().numpy().astype("uint8")
            saveimg = np.squeeze(saveimg.transpose(1, 2, 0))
            Image.fromarray(saveimg).save(f"{name}_{i}.png")

        result_cropped_tensors.append(cropped_tensors.to(device))

    if len(no_face_indices) > 20:
        #few videos start with silence, allow 0.5 seconds of silence else remove
        return None
    del frames
    #Stack all frames
    result_cropped_tensors = torch.stack(result_cropped_tensors)
    #Embed all frames
    result_cropped_tensors = result_cropped_tensors.to(device)
    if use_half:
        result_cropped_tensors = result_cropped_tensors.half()

    with torch.no_grad():
        emb = resnet(result_cropped_tensors)
    if use_half:
        emb = emb.float()
    return emb.to(torch.device("cpu"))
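
A rough calling sketch for input_face_embeddings (not from the original source; the frame file names and the coord value are placeholder assumptions, coord being the normalised (x, y) position expected to fall inside the target face box):

# Sketch only: build the detector/embedder and embed a few in-memory frames.
mtcnn = MTCNN(keep_all=True, device='cpu')
resnet = InceptionResnetV1(pretrained='vggface2').eval()
frames = [np.asarray(Image.open(p)) for p in ['frame_000.png', 'frame_001.png']]  # placeholder files
emb = input_face_embeddings(frames, is_path=False, mtcnn=mtcnn, resnet=resnet,
                            face_embed_cuda=False, use_half=False, coord=[0.5, 0.5])
print(emb.shape)  # one embedding per input frame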
Example #4
class EmotionRecognition(object):
    def __init__(self, model_path, device='cpu', gpu_id=0):
        assert device in ('cpu', 'gpu'), "Need to specify device! ('cpu' or 'gpu')"

        # Set the device according to arguments and what is available
        if torch.cuda.is_available():
            if device == 'cpu':
                logging.warning("Your machine has a GPU. Performance would be better with EmotionRecognition(device='gpu')!")
                self.device = torch.device('cpu')
            if device == 'gpu':
                self.device = torch.device(f'cuda:{str(gpu_id)}')
        else:
            if device == 'gpu':
                logging.warning('No GPU is detected, so cpu is selected as device!')
                self.device = torch.device('cpu')
            if device == 'cpu':
                self.device = torch.device('cpu')

        self.emotions = FERPlus.classes #{0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Sad', 5: 'Surprise', 6: 'Neutral'}

        # Model used for emotion recognition (from cropped face images)
        self.network = NetworkBasic(in_c=1, nl=32, out_f=len(self.emotions)).to(self.device)

        # Load the saved state
        model_dict = torch.load(model_path, map_location=self.device)
        self.network.load_state_dict(model_dict)
        self.network.eval()

        # Normalization
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Grayscale(num_output_channels=1),
            transforms.Resize((48, 48)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])

        # Model used to detect faces in the video stream
        self.mtcnn = MTCNN(keep_all=True, device=self.device)

    def _predict(self, image):
        """Given an image of a face, return the primary emotion shown in the face."""
        tensor = self.transform(image).unsqueeze(0).to(self.device)
        output = self.network(tensor)
        ps = torch.exp(output).tolist()
        index = np.argmax(ps)
        score = np.amax(output.detach().cpu().numpy())

        return self.emotions[index], score

    def run_on_face(self, face):
        gray = cv.cvtColor(face, cv.COLOR_BGR2GRAY)
        emotion, score = self._predict(gray)

        return {'emotion': emotion, 'score': score}

    def run(self, frame):
        """Perform emotion recognition on a single frame and return the results.

        Different from show_emotions(), this method does not return a modified frame."""
        f_h, f_w, c = frame.shape
        gray = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        boxes, _ = self.mtcnn.detect(frame)

        results = []
        if boxes is not None:
            for i in range(len(boxes)):
                x1, y1, x2, y2 = int(round(boxes[i][0])), int(round(boxes[i][1])), int(round(boxes[i][2])), int(
                    round(boxes[i][3]))
                emotion, score = self._predict(gray[y1:y2, x1:x2])

                results.append( {'emotion': emotion, 'score': score, 'position': (x1, y1, x2, y2)} )
        return results

    def show(self, frame, return_type='BGR'):
        """Perform emotion recognition on a single frame and show the result
        by returning a modified frame.

        The returned frame has a bounding box around all detected faces
        plus the names of the detected emotions."""
        f_h, f_w, c = frame.shape
        detection = self.run(frame)

        for result in detection:
            x1, y1, x2, y2 = result['position']
            emotion = result['emotion']
            score = result['score']

            frame = cv.rectangle(frame, (x1, y1), (x2, y2), color=[0, 255, 0], thickness=1)
            frame = cv.rectangle(frame, (x1, y1 - int(f_h*0.03125)), (x1 + int(f_w*0.21), y1), color=[0, 255, 0], thickness=-1)
            frame = cv.putText(frame, text=emotion+' (%0.2f)'%score, org=(x1 + 5, y1 - 3), fontFace=cv.FONT_HERSHEY_PLAIN,
                               color=[0, 0, 0], fontScale=1, thickness=1)

        if return_type == 'BGR':
            return frame
        elif return_type == 'RGB':
            return cv.cvtColor(frame, cv.COLOR_BGR2RGB)
        else:
            raise Exception("Unknown return_type!")
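
A short usage sketch for this variant (illustrative only; 'model.pt' and 'photo.jpg' are placeholder paths, and FERPlus.classes must be importable as in the original module):

er = EmotionRecognition(model_path='model.pt', device='cpu')
frame = cv.imread('photo.jpg')                 # BGR image
for r in er.run(frame):                        # one dict per detected face
    print(r['emotion'], r['score'], r['position'])
annotated = er.show(frame, return_type='BGR')  # frame with boxes and labels drawn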
Example #5
class VideoTracker(object):
    def __init__(self, args):
        print('Initialize DeepSORT & YOLO-V5')
        # ***************** Initialize ******************************************************
        self.args = args
        self.scale = args.scale  # 2
        self.margin_ratio = args.margin_ratio  # 0.2
        self.frame_interval = args.frame_interval  # frequency

        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        self.half = self.device.type != 'cpu'  # half precision only supported on CUDA

        # create video capture ****************
        if args.display:
            cv2.namedWindow("test", cv2.WINDOW_NORMAL)
            cv2.resizeWindow("test", args.display_width, args.display_height)

        if args.cam != -1:
            print("Using webcam " + str(args.cam))
            self.vdo = cv2.VideoCapture(args.cam)
        else:
            self.vdo = cv2.VideoCapture()

        # ***************************** initialize DeepSORT **********************************
        cfg = get_config()
        cfg.merge_from_file(args.config_deepsort)

        use_cuda = self.device.type != 'cpu' and torch.cuda.is_available()
        self.deepsort = build_tracker(cfg, use_cuda=use_cuda)

        # ***************************** initialize Face Det **********************************
        self.face_detector = MTCNN(keep_all=True, device=self.device)

        print('Done..')
        if self.device.type == 'cpu':
            warnings.warn("Running in cpu mode which maybe very slow!",
                          UserWarning)

    def __enter__(self):
        # ************************* Load video from camera *************************
        if self.args.cam != -1:
            print('Camera ...')
            ret, frame = self.vdo.read()
            assert ret, "Error: Camera error"
            self.im_width = int(self.vdo.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.im_height = int(self.vdo.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # ************************* Load video from file *************************
        else:
            assert os.path.isfile(self.args.input_path), "Path error"
            self.vdo.open(self.args.input_path)
            self.im_width = int(self.vdo.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.im_height = int(self.vdo.get(cv2.CAP_PROP_FRAME_HEIGHT))
            assert self.vdo.isOpened()
            print('Done. Load video file ', self.args.input_path)

        # ************************* create output *************************
        if self.args.save_path:
            os.makedirs(self.args.save_path, exist_ok=True)
            # path of saved video and results
            self.save_video_path = os.path.join(self.args.save_path,
                                                "results.mp4")

            # create video writer
            fourcc = cv2.VideoWriter_fourcc(*self.args.fourcc)
            self.writer = cv2.VideoWriter(self.save_video_path, fourcc,
                                          self.vdo.get(cv2.CAP_PROP_FPS),
                                          (self.im_width, self.im_height))
            print('Done. Create output file ', self.save_video_path)

        if self.args.save_txt:
            os.makedirs(self.args.save_txt, exist_ok=True)

        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.vdo.release()
        if hasattr(self, 'writer'):  # writer only exists when save_path was given
            self.writer.release()
        if exc_type:
            print(exc_type, exc_value, exc_traceback)

    def run(self):
        yolo_time, sort_time, avg_fps = [], [], []
        t_start = time.time()

        idx_frame = 0
        last_out = None
        while self.vdo.grab():
            # Inference *********************************************************************
            t0 = time.time()
            _, img0 = self.vdo.retrieve()

            if idx_frame % self.args.frame_interval == 0:
                outputs, yt, st = self.image_track(
                    img0)  # (#ID, 5) x1,y1,x2,y2,id
                last_out = outputs
                yolo_time.append(yt)
                sort_time.append(st)
                print('Frame %d Done. Det-time:(%.3fs) SORT-time:(%.3fs)' %
                      (idx_frame, yt, st))
            else:
                outputs = last_out  # directly use prediction in last frames
            t1 = time.time()
            avg_fps.append(t1 - t0)

            # post-processing ***************************************************************
            # visualize bbox  ********************************
            if len(outputs) > 0:
                bbox_xyxy = outputs[:, :4]
                identities = outputs[:, -1]
                img0 = draw_boxes(img0, bbox_xyxy, identities)  # BGR

                # add FPS information on output video
                text_scale = max(1, img0.shape[1] // 1600)
                cv2.putText(img0,
                            'frame: %d fps: %.2f ' %
                            (idx_frame, len(avg_fps) / sum(avg_fps)),
                            (20, 20 + text_scale),
                            cv2.FONT_HERSHEY_PLAIN,
                            text_scale, (0, 0, 255),
                            thickness=2)

            # display on window ******************************
            if self.args.display:
                cv2.imshow("test", img0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    cv2.destroyAllWindows()
                    break

            # save to video file *****************************
            if self.args.save_path:
                self.writer.write(img0)

            if self.args.save_txt:
                with open(
                        self.args.save_txt + str(idx_frame).zfill(4) + '.txt',
                        'a') as f:
                    for i in range(len(outputs)):
                        x1, y1, x2, y2, idx = outputs[i]
                        f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                            x1, y1, x2, y2, idx))

            idx_frame += 1

        print(
            'Avg Det time (%.3fs), Sort time (%.3fs) per frame' %
            (sum(yolo_time) / len(yolo_time), sum(sort_time) / len(sort_time)))
        t_end = time.time()
        print('Total time (%.3fs), Total Frame: %d' %
              (t_end - t_start, idx_frame))

    def image_track(self, im0):
        """
        :param im0: original image, BGR format cv2
        :return:
        """
        # preprocess ************************************************************
        h, w, _ = im0.shape
        img = cv2.resize(
            im0, (w // self.scale, h // self.scale))  # down sample to speed up
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  #

        # Detection time *********************************************************
        # Inference
        t1 = time.time()
        with torch.no_grad():
            boxes, confs = self.face_detector.detect(img)
            # boxes: (#obj, 4) x1,y1,x2,y2      in img scale !
            # confs: ()

        t2 = time.time()

        # get all obj ************************************************************

        if boxes is not None and len(boxes):
            boxes = boxes * self.scale  # x1,y1,x2,y2  go back to original image

            bbox_xywh = xyxy2xywh(boxes)  # (#obj, 4)     xc,yc,w,h

            # add margin here. only need to revise width and height
            bbox_xywh[:, 2:] = bbox_xywh[:, 2:] * (1 + self.margin_ratio)

            # ****************************** deepsort ****************************
            outputs = self.deepsort.update(bbox_xywh, confs, im0)
            # (#ID, 5) x1,y1,x2,y2,track_ID
        else:
            outputs = torch.zeros((0, 5))

        t3 = time.time()
        return outputs, t2 - t1, t3 - t2
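
Because VideoTracker implements __enter__/__exit__, a typical entry point is a with-block (a sketch only; parse_args is a hypothetical argparse helper that supplies the attributes used above such as input_path, cam, scale, margin_ratio, frame_interval, display, save_path and save_txt):

if __name__ == '__main__':
    args = parse_args()  # hypothetical argument parser
    with VideoTracker(args) as vdo_trk:
        vdo_trk.run()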
Example #6
def _draw(img, boxes, probs, landmarks):
    # Draw the bounding box, detection probability and the five facial
    # landmarks for every detected face, then return the annotated image.
    for box, prob, ld in zip(boxes, probs, landmarks):
        box = box.astype(int)
        ld = ld.astype(int)
        cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 0, 255), 2)
        cv2.putText(img, str(prob), (box[2], box[3]),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 0, 0), 1, cv2.LINE_AA)

        # Draw landmarks(5)
        cv2.circle(img, tuple(ld[0]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[1]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[2]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[3]), 10, (0, 0, 255), 8)
        cv2.circle(img, tuple(ld[4]), 10, (0, 0, 255), 8)

    return img


path = glob.glob("D:/Image Dataset/val/not_aryan/*.*")
save_path = "D:/Python Files/Face_Detection/New_Image/val/not_aryan"

mtcnn = MTCNN()

for count,file in enumerate(path):
    img = cv2.imread(file)
    # new_image = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    try:
        boxes, probs, landmarks = mtcnn.detect(img, landmarks=True)
        final_image = _draw(img, boxes, probs, landmarks)
    except Exception:
        # skip images where detection or drawing fails instead of
        # re-writing a stale (or undefined) final_image
        continue
    cv2.imwrite(os.path.join(save_path, str(count) + "new.jpg"), final_image)


print("Done")

class VideoToFrames:
    """
    Transforms input video files into image frames.
    Additionally detects all faces for each frame.
    """
    def __init__(self, num_frames_per_video=3, face_additional_area=0.5):
        self.num_frames_per_video = num_frames_per_video
        self.face_additional_area = face_additional_area

        self.face_detection_model = MTCNN(
            image_size=224,
            margin=0,
            keep_all=True,
            select_largest=False,
            post_process=False,
            thresholds=[0.8, 0.9, 0.9],
            device="cuda",
        ).eval()

    def video_to_frames(self, video_path, output_path):
        output_image_paths = []
        video_id = os.path.split(video_path)[-1]

        # Read video
        orig_capture = cv2.VideoCapture(video_path)

        # Select only self.num_frames_per_video uniform frames
        n_frames = int(orig_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        frames_idx = np.linspace(0,
                                 n_frames,
                                 self.num_frames_per_video,
                                 endpoint=False,
                                 dtype=int)

        # Loop through all frames
        for frame_num in range(n_frames):
            ret = orig_capture.grab()

            if not ret:
                continue

            # Retrieve only required frames
            if frame_num in frames_idx:
                ret, frame_orig = orig_capture.retrieve()

                if ret:
                    # Save the whole video frame to the image
                    # img_path = os.path.join(output_path, f"{video_id}_frame_{frame_num}.png")
                    # cv2.imwrite(frame_orig, img_path)
                    # output_image_paths.append(os.path.split(img_path)[-1])

                    # Skip the next part if want to save the whole frame only
                    frame_orig = cv2.cvtColor(frame_orig, cv2.COLOR_BGR2RGB)

                    # Detect all faces
                    faces, _ = self.face_detection_model.detect(frame_orig)
                    if faces is None:
                        return []

                    # For each detected face
                    for face_id, box in enumerate(faces):

                        # Get face coordinates
                        c0_start, c0_end, c1_start, c1_end = self.get_face_coordinates(
                            frame_orig, box)

                        # Crop face
                        face_full = frame_orig[c0_start:c0_end,
                                               c1_start:c1_end]

                        # Save face to the file
                        img_path = os.path.join(
                            output_path,
                            f"{video_id}_frame_{frame_num}_face_{face_id}.png",
                        )
                        # Return BGR before saving
                        face_full = cv2.cvtColor(face_full, cv2.COLOR_RGB2BGR)
                        cv2.imwrite(img_path, face_full)
                        output_image_paths.append(os.path.split(img_path)[-1])

        return output_image_paths

    def get_face_coordinates(self, frame_orig, box):
        sh0_start = int(box[1])
        sh0_end = int(box[3])
        sh1_start = int(box[0])
        sh1_end = int(box[2])

        # Add area around the face
        d0 = int((sh0_end - sh0_start) * self.face_additional_area)
        d1 = int((sh1_end - sh1_start) * self.face_additional_area)

        c0_start = max(sh0_start - d0, 0)
        c0_end = min(sh0_end + d0, frame_orig.shape[0])

        c1_start = max(sh1_start - d1, 0)
        c1_end = min(sh1_end + d1, frame_orig.shape[1])

        return c0_start, c0_end, c1_start, c1_end
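
Illustrative use of VideoToFrames (the paths are placeholders; note the detector above is hard-coded to device="cuda", so a GPU is required as written):

extractor = VideoToFrames(num_frames_per_video=3, face_additional_area=0.5)
face_images = extractor.video_to_frames('input/video.mp4', 'output/faces')
print(face_images)  # file names of the cropped faces written to output/faces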
Example #8
from tqdm import tqdm_notebook as tqdm
import os
import torchvision.transforms as transforms

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
mtcnn = MTCNN(device=device)
model = InceptionResnetV1(pretrained='vggface2').eval()

img = cv2.imread('./test.jpg')
if img is None:
    print("Img Err")
    import sys
    sys.exit()

s_face = 0
faces, _ = mtcnn.detect(img)

model.classify = True
try:    
    for face in faces:
        face = np.trunc(face)
        s_face = img[int(face[1]):int(face[3]), int(face[0]):int(face[2])]
        
        cv2.rectangle(img, (int(face[0]), int(face[1])), (int(face[2]), int(face[3])), (255, 0, 0), 3)
        
        i_c = mtcnn(s_face)
        emb = model(i_c.unsqueeze(0))
        print(emb)
  
except Exception as e:
    print(e)
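
The embeddings printed in the loop above are typically compared with an L2 distance, much like the compare2face method further down; a small sketch (the 1.0 threshold is a common rule of thumb for vggface2 embeddings, not a value from this example):

def same_person(emb_a, emb_b, threshold=1.0):
    # emb_a and emb_b are embeddings produced as in the loop above
    return (emb_a - emb_b).norm().item() < threshold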
Example #9
class FaceCam():
    # Video class based on openCV
    def __init__(self):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.mtcnn = MTCNN(device=self.device)
        self.open = True
        self.gender_model = def_model('gender', self.device)
        self.gaze_model = def_model('gaze', self.device)
        self.emotion_model = def_model('emotion', self.device)
        self.multimodal_model = def_model('multimodal', self.device)
        

    def rec(self):
        global label

        cap = cv2.VideoCapture(0)
        
        while(self.open==True):
            timer_start = time.time()

            print('start camera!')
            ret, frame = cap.read()

            try:
                # detect face box and probability
                boxes, probs = self.mtcnn.detect(frame, landmarks=False)

                # draw box on frame
                frame = draw_bbox(frame, boxes, probs)

                # perform only when face is detected
                if boxes is not None and len(boxes) > 0:
                    # extract the face rois
                    rois = detect_rois(boxes)
                    for roi in rois:
                        (start_Y, end_Y, start_X, end_X) = roi
                        face = frame[start_Y:end_Y, start_X:end_X]
                        print('detect time: ', time.time()-timer_start)
                    
                    predict_start = time.time()
                    gender_i = predict(self.gender_model, face, self.device)
                    gaze_i = predict(self.gaze_model, face, self.device)
                    emotion_i = predict(self.emotion_model, face, self.device)
                    multimodal_i = predict(self.multimodal_model, face, self.device)

                    cv2.putText(frame, label['gender'][gender_i], (end_X-50, start_Y-55), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2, cv2.LINE_AA)
                    cv2.putText(frame, label['gaze'][gaze_i], (end_X-50, start_Y-40), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2, cv2.LINE_AA)
                    cv2.putText(frame, label['emotion'][emotion_i], (end_X-50, start_Y-25), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2, cv2.LINE_AA)
                    cv2.putText(frame, label['multimodal'][multimodal_i], (end_X-50, start_Y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2, cv2.LINE_AA)
                    print('predict time: ', time.time()-predict_start)
            except Exception as e:
                print(e)
                pass
            
            # show the frame
            cv2.imshow('Demo', frame)
            
            # q to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                print('Interrupted by user!')
                break

        # clear program and close windows
        cap.release()
        cv2.destroyAllWindows()
        print('All done!')
Example #10
class FaceNet:
    '''Face Net
    '''
    def __init__(self, mtcnn=dict(), resnet=dict(), threshold=1, device='cpu', data=None):
        # default arguments
        default_mtcnn = dict(
            image_size=160, margin=80, min_face_size=20, thresholds=[0.6, 0.7, 0.7],
            factor=0.709, post_process=False, keep_all=True, device=device,
        )
        default_resnet = dict(pretrained='vggface2', device=device)
        default_mtcnn.update(mtcnn)
        default_resnet.update(resnet)
        data = data or collections.defaultdict(list)
        # assign values
        self._kwargs = dict(mtcnn=mtcnn, resnet=resnet, threshold=threshold, device=device, data=data)
        self._mtcnn = MTCNN(**default_mtcnn)
        self._resnet = InceptionResnetV1(**default_resnet).eval()

    def add_image(self, image, label):
        for embedding in self._embed(image):
            self._kwargs['data'][label].append(embedding)

    def add_images_from_folder(self, root, progress_bar=True):
        dataset = datasets.ImageFolder(root)
        idx_to_class = {v: k for k, v in dataset.class_to_idx.items()}
        for image, idx in (tqdm.tqdm(dataset) if progress_bar else dataset):
            self.add_image(image, idx_to_class[idx])
        return self

    def image_to_labels(self, image_or_path, key=None, crop=True):
        '''Return the labels of the faces found in the image
        '''
        key = key or (lambda x: sum(x)/len(x))
        result = list()
        embeddings = self._embed(self.imread(image_or_path), crop=crop)
        for embedding in embeddings:
            distances = {k: key(v) for k, v in self._distances(embedding).items()}
            label = min(distances, key=lambda x: distances[x])
            result.append(label if distances[label]<self._kwargs['threshold'] else None)
        return result

    def image_to_image(self, image_or_path, mark=True, font=5, size=1, thickness=1, offset=(5, 5), color=(255, 0, 0)):
        '''Return the image with the detected faces annotated

        Argument:
            - image_or_path: [str, numpy.ndarray]
            - mark: bool
            - font: int, default is cv2.FONT_HERSHEY_COMPLEX_SMALL
            - size: float
            - thickness: float
            - offset: Tuple[float]
            - color: Tuple[int]
        '''
        image = self.imread(image_or_path)
        boxes, _ = self._mtcnn.detect(image, landmarks=False)
        if isinstance(boxes, numpy.ndarray):
            for box in boxes.astype(int):
                image = cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), color)
                if mark:
                    crop = image[box[1]: box[3], box[0]: box[2], :]
                    try:
                        label, = self.image_to_labels(crop, crop=False)
                    except:
                        label = 'ERROR'
                    coord = tuple(int(box[i]-offset[i]) for i in range(2))
                    image = cv2.putText(image, label or 'other', coord, font, size, color, thickness)
        return image

    def image_to_crops(self, image_or_path):
        '''
        Argument:
            - image_or_path: [str, numpy.ndarray]
        '''
        result = list()
        image = self.imread(image_or_path)
        boxes, _ = self._mtcnn.detect(image, landmarks=False)
        if isinstance(boxes, numpy.ndarray):
            for box in boxes.astype(int):
                result.append(image[box[1]: box[3], box[0]: box[2], :])
        return result

    def save(self, path):
        with open(path, 'wb') as f:
            torch.save(self._kwargs, f)

    @classmethod
    def load(cls, path, **kwargs):
        with open(path, 'rb') as f:
            data = torch.load(f)
            data.update(kwargs)
            return cls(**data)

    @classmethod
    def imread(cls, image_or_path):
        if isinstance(image_or_path, str):
            return cv2.imread(image_or_path)[:, :, ::-1].copy()
        elif isinstance(image_or_path, numpy.ndarray):
            return image_or_path
        else:
            raise NotImplementedError

    def _embed(self, image, crop=True):
        # __import__('IPython').embed(colors='Linux')
        if crop:
            faces = self._mtcnn(image)
            if faces is None:
                return numpy.array(tuple())
        else:
            face = cv2.resize(image, (self._mtcnn.image_size, self._mtcnn.image_size))
            faces = (torch.Tensor(face.transpose(2, 1, 0)), )
        faces = faces if self._kwargs['mtcnn'].get('keep_all', True) else (faces, )
        return self._resnet(torch.stack(faces).to(self._kwargs['device'])).detach().cpu()

    def _distances(self, embedding, **kwargs):
        return {
            k: tuple((embedding-v).norm(**kwargs) for v in vs)
            for k, vs in self._kwargs['data'].items()
        }
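
Usage sketch for the FaceNet wrapper above (folder layout and file names are assumptions: one sub-folder per person, as expected by datasets.ImageFolder):

fn = FaceNet(device='cuda' if torch.cuda.is_available() else 'cpu')
fn.add_images_from_folder('known_faces/')     # known_faces/<label>/<image>.jpg
print(fn.image_to_labels('group_photo.jpg'))  # one label (or None) per detected face
fn.save('facenet_db.pt')                      # persist embeddings for FaceNet.load() later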
Example #11
def detect_face(tensor_image_stack):
    mtcnn = MTCNN(image_size=160, margin=0, min_face_size=20, thresholds=[0.6, 0.7, 0.7], factor=0.709, post_process=False, device='cuda')
    return mtcnn.detect(tensor_image_stack)
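
Note that this helper rebuilds MTCNN (and reloads its weights) on every call, which is wasteful if it runs per frame; a rough calling sketch, with placeholder file names and a CUDA device required as written:

images = np.stack([np.asarray(Image.open(p).convert('RGB').resize((640, 480)))
                   for p in ['a.jpg', 'b.jpg']])  # same-sized RGB frames
boxes, probs = detect_face(images)                # per-image boxes and probabilities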
Example #12
while True:
    success, frame = cap.read()

    if success:
        height, width, _ = frame.shape
        # if the video is too big uncomment the below code
        #frame = resize(frame, height, width)

        # pad the image so the bounding box cannot fall outside the image
        # and crash the program
        padding = cv.copyMakeBorder(frame, 50, 50, 50, 50, cv.BORDER_CONSTANT)
        #converting numpy array into image
        image = Image.fromarray(padding)

        #gives the face co-ordinates
        face_coord, _ = mtcnn.detect(image)

        if face_coord is not None:
            for coord in face_coord:
                for x1, y1, x2, y2 in [coord]:
                    x1, y1, x2, y2 = r(x1), r(y1), r(x2), r(y2)

                    #face array
                    face = padding[y1:y2, x1:x2]

                    #Preprocessing
                    preprocess = Preprocessing(img=Image.fromarray(face))
                    #tensor array
                    tensor_img_array = preprocess.preprocessed_arrays()

                    #Predicting
Example #13
boxes = []
tocni = 0
ukupno_pronadenih = 0
stvarna_kolicina = len(new_dataset)

for i in range(0, len(new_dataset), 1):
    if ((i + 1 < len(new_dataset))
            and new_dataset[i]['name'] == new_dataset[i + 1]['name']):
        boxes += [new_dataset[i]['box_frame'].numpy()]
        #print("adding to box")
        continue
    boxes += [new_dataset[i]['box_frame'].numpy()]
    pixels = new_dataset[i]['image'].transpose(0, 1).transpose(1, 2).numpy()
    pixels = pixels * new_dataset[i]['image'].size()[2]
    faces = detector.detect(pixels)
    max_iou = [0] * len(boxes)

    if (faces[0] is None):
        boxes = []
        continue

    ukupno_pronadenih += len(faces[0])
    for j in range(len(boxes)):
        found = False
        for k in range(len(faces[0])):
            pxmin, pymin, pxmax, pymax = faces[0][k]
            xmin, ymin, xmax, ymax = boxes[j]
            interxmin = max(xmin, pxmin)
            interymin = max(ymin, pymin)
            interxmax = min(xmax, pxmax)
Example #14
class Check_In_Window(QMainWindow):
    def __init__(self):
        super(Check_In_Window, self).__init__()
        loadUi("Check_In_Window.ui", self)
        self.mtcnn = MTCNN(select_largest=True, device='cuda')
        # some constants kept as default from facenet
        self.input_image_size = 160

        self.sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(
            log_device_placement=True))
        pre_trained_facenet.load_model('model/20170512-110547.pb')
        self.images_placeholder = tf.compat.v1.get_default_graph().get_tensor_by_name(
            "input:0")
        self.embeddings = tf.compat.v1.get_default_graph().get_tensor_by_name(
            "embeddings:0")
        self.phase_train_placeholder = tf.compat.v1.get_default_graph(
        ).get_tensor_by_name("phase_train:0")
        self.embedding_size = self.embeddings.get_shape()[1]
        self.startVideo('0')

    def startVideo(self, camera_name):
        """
        :param camera_name: link of camera or usb camera
        :return:
        """
        if len(camera_name) == 1:
            self.capture = cv2.VideoCapture(int(camera_name))
        else:
            self.capture = cv2.VideoCapture(camera_name)
        self.timer = QTimer(self)  # Create Timer
        # path = './Face_Recognition/images'
        path = '/home/ftpuser/ftp/files/'
        if not os.path.exists(path):
            os.mkdir(path)
        # known face encoding and known face name list
        self.images = []
        self.class_names = []
        self.faces = []

        attendance_list = os.listdir(path)
        self.attendance_num = len(attendance_list)
        for cl in attendance_list:
            cur_img = cv2.imread(f'{path}/{cl}')
            print(cur_img)
            self.images.append(cur_img)
            # print('image',cur_img)
            # cur_img = cv2.resize(cur_img, (504,378))
            faces_detected = 0
            start = time.time()
            # result = self.detector.detect_faces(cur_img)
            box = self.mtcnn.detect(cur_img, True)

            faces_detected += len(box)
            print(box)
            print(f'Frames per second: {(time.time() - start):.3f},',
                  f'faces detected: {faces_detected}\r')
            face = self.getFace(cur_img, box)
            self.faces.append(face)
            self.class_names.append(os.path.splitext(cl)[0])

        self.timer.timeout.connect(
            self.update_frame)  # Connect timeout to the output function
        self.timer.start(10)  # emit the timeout() signal at x=10ms

    def face_rec_(self, frame):
        """
        :param frame: frame from camera
        :param encode_list_known: known face encoding
        :param class_names: known face names
        :return:
        """
        box = self.mtcnn.detect(frame, True)
        # print(box)
        # print(box[0])
        # print('length', len(box[0]))
        if box[0] is None:
            print('no face detected')
        else:
            print('face detected')
            print(box)
            print(len(self.images), len(self.faces), len(self.class_names))
            for f, c in zip(self.faces, self.class_names):
                print('for loop')
                distance = self.compare2face(f, frame, box)
                print('made it this far?')
                threshold = 0.7  # set yourself to meet your requirement
                print("distance = " + str(distance), ' photo id: ', c)
                name = 'unknown'
                if (distance <= threshold):
                    name = c
                    print(name)
                    print("distance = " + str(distance), ' index: ', c)
                self.mark_attendance(name)
        return frame

    def mark_attendance(self, name):
        """
        :param name: detected face known or unknown one
        :return:
        """

        if name != 'unknown':
            print(name)
            self.logIn(name)

    def logIn(self, name):
        customer_id = int(name)
        DB = DB_Connection()
        cnt = DB.select_user(customer_id)

        if cnt[0] == "False":
            greeting = ' Welcome, ' + cnt[1] + '.'
            print(name, 'has entered.')
            DB.update_login_session_T(customer_id)
            DB.insert_check_In_Time(customer_id)

            self.GreetingLabel.setText(greeting)
            self.timer1 = QTimer(self)
            self.timer1.start(5000)

            self.timer1.timeout.connect(self.clearLabel)

    def clearLabel(self):
        self.GreetingLabel.clear()

    def update_frame(self):
        path = '/home/ftpuser/ftp/files/'
        new_attendance_list = os.listdir(path)
        image_num = len(new_attendance_list)
        ret, image = self.capture.read()
        if image_num == self.attendance_num:
            self.displayImage(image)
        else:
            for cl in new_attendance_list:
                if os.path.splitext(cl)[0] not in self.class_names:
                    cur_img = cv2.imread(f'{path}/{cl}')
                    faces_detected = 0
                    start = time.time()
                    # result = self.detector.detect_faces(cur_img)
                    box = self.mtcnn.detect(cur_img, True)
                    faces_detected += len(box)
                    print(f'Frames per second: {(time.time() - start):.3f},',
                          f'faces detected: {faces_detected}\r')
                    face = self.getFace(cur_img, box)
                    self.faces.append(face)
                    self.class_names.append(os.path.splitext(cl)[0])
            self.displayImage(image)

    def displayImage(self, image, window=1):
        """
        :param image: frame from camera
        :param encode_list: known face encoding list
        :param class_names: known face names
        :param window: number of window
        :return:
        """
        print(image.shape)
        try:
            image = self.face_rec_(image)
        except Exception as e:
            print('unexpected error:', e)
        image = cv2.resize(image, (640, 480))
        qformat = QImage.Format_Indexed8
        if len(image.shape) == 3:
            if image.shape[2] == 4:
                qformat = QImage.Format_RGBA8888
            else:
                qformat = QImage.Format_RGB888
        outImage = QImage(image, image.shape[1], image.shape[0],
                          image.strides[0], qformat)
        outImage = outImage.rgbSwapped()

        if window == 1:
            self.imgLabel.setPixmap(QPixmap.fromImage(outImage))
            self.imgLabel.setScaledContents(True)

    def getFace(self, img, box):
        faces = []
        box = box[0][0]
        box = np.int32(box)
        # Result is an array with all the bounding boxes detected. We know that for 'ivan.jpg' there is only one.
        cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 155, 255),
                      2)
        cropped = img[box[1]:box[3], box[0]:box[2]]
        rearranged = cv2.resize(cropped,
                                (self.input_image_size, self.input_image_size),
                                interpolation=cv2.INTER_CUBIC)
        prewhitened = pre_trained_facenet.prewhiten(rearranged)
        faces.append({
            'face': rearranged,
            'embedding': self.getEmbedding(prewhitened)
        })
        return faces

    def getEmbedding(self, resized):
        reshaped = resized.reshape(-1, self.input_image_size,
                                   self.input_image_size, 3)
        feed_dict = {
            self.images_placeholder: reshaped,
            self.phase_train_placeholder: False
        }
        embedding = self.sess.run(self.embeddings, feed_dict=feed_dict)
        return embedding

    def compare2face(self, face, img2, box2):
        face1 = face
        print('here')
        face2 = self.getFace(img2, box2)
        print('here 2')
        if face1 and face2:
            dist = np.sqrt(
                np.sum(
                    np.square(
                        np.subtract(face1[0]['embedding'],
                                    face2[0]['embedding']))))
            return dist
        return -1
Example #15
    # Check whether a GPU is available
    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
    print('Running on device: {}'.format(device))

    # Set the MTCNN module parameters
    mtcnn = MTCNN(keep_all=False, device=device, post_process=False)

    # Get the list of files and directories
    fname, dname = listar_imagens(basedir)

    # Detect faces and save them to the facecrops folder
    inicio = time.time()
    print('Processing started')
    facecrop = [it.replace(basedir, basedir+'_faces') for it in fname]
    for f, filename in enumerate(fname):
        try:
            img = Image.open(filename)
            box, prob = mtcnn.detect(img)
        except Exception:
            print('Failed to process file ' + filename)
            continue
        if prob is not None and prob[0] and prob[0] >= 0.95:
            savepath = '/projects/jeff/TUMGAIDimage_facecrops3' + '' + \
                os.path.dirname(filename)[-9:]+'-'+os.path.basename(filename)
            extract_face(img, box[0], save_path=savepath)
    print('Processing finished')
    print(time.strftime('%H:%M:%S', time.localtime()))
    tempo_total = time.time() - inicio
    print("Total time: %02dm:%02ds" % divmod(tempo_total, 60))
Example #16
    def detect_live(self):
        
        mtcnn = MTCNN()
        faces = {}
        frameCount = 0

        vid = cv2.VideoCapture(0)

        if self.record_for is not None : 
            start_time = time.time()

        while vid.isOpened():

            if self.record_for is not None :
                curr_time = time.time() - start_time
                if curr_time > self.record_for :
                    break                 
        
            _, frame = vid.read()
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frameCount = frameCount + 1

            boxes, probs = mtcnn.detect(frame)

            frame_draw = frame.copy()
            draw = ImageDraw.Draw(frame_draw)
            if boxes is not None:

                faces["frame_{}".format(frameCount)] = []

                for box, p in zip(boxes, probs) : 
                    if p > 0.70 :
                        draw.rectangle(box.tolist(), outline = (255, 0, 0), width = 1)
                    if self.extract == True :
                        face = extract_face(frame, box.tolist())
                        faces["frame_{}".format(frameCount)].append(face)
                        if self.save == True :
                            img = self.tsfms(face)

                            if self.saveIn is None :
                                raise ValueError

                            else :
                                img.save(os.path.join(self.saveIn, "frame_{}.jpg".format(len(faces))))

                cv2.imshow("Tracking window", cv2.cvtColor(np.array(frame_draw), cv2.COLOR_RGB2BGR))
                if self.save_video == True : 
                    self.frames_tracked.append(frame_draw)                
                if cv2.waitKey(1) == ord("a") :
                    break
                

        
        vid.release()
        
        if self.save_video == True:
            print(len(self.frames_tracked))
            self.saveVideo(self.saveIn, self.frames_tracked, "trackedVid")

        if self.save == True :
            return len(faces.keys()), faces
        else :
            return None, None         
Example #17
class FaceNetSegmenter(TorchDevice, BaseSegmenter):
    """FaceNetSegmenter segments faces from an image.

    - Input shape: `(Height x Width x Channels)`
    - Output shape: `NumFaces x (Channels x ImageSize x ImageSize)`

    `Channels` dimension can be changed (e.g. set `channel_axis` to 0 for channels first mode instead of channels last).

    :param image_size: Height and width of a detected face. Smaller faces are upscaled.
    :param margin: Margin to add to bounding box, in terms of pixels in the final image.
    :param selection_method: Heuristic to use to select a single face from the image. Options:
        "probability": highest probability selected
        "largest": largest box selected
        "largest_over_threshold": largest box over a certain probability selected
        "center_weighted_size": box size minus weighted squared offset from image center
    :param post_process: Flag for normalizing the output image. Required if you want to pass
        these face to the FaceNetEmbedder.
    :param min_face_size: Minimum face size to search for.
    :param channel_axis: Axis of channels in the image. Default is 2 (channels-last), use 0 for channels-first.
    """
    def __init__(self,
                 image_size: int = 160,
                 margin: int = 0,
                 selection_method: str = 'largest',
                 post_process: bool = True,
                 min_face_size: int = 20,
                 channel_axis: int = 2,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.image_size = image_size
        self.margin = margin
        self.selection_method = selection_method
        self.post_process = post_process
        self.min_face_size = min_face_size
        self.channel_axis = channel_axis

        self._default_channel_axis = 2

    def post_init(self):
        from facenet_pytorch import MTCNN

        self.face_detector = MTCNN(selection_method=self.selection_method,
                                   image_size=self.image_size,
                                   margin=self.margin,
                                   device=self.device,
                                   post_process=self.post_process,
                                   min_face_size=self.min_face_size,
                                   keep_all=True)

    @batching
    def segment(self, blob: 'np.ndarray', *args, **kwargs) -> List[List[Dict]]:
        """Transform a numpy `ndarray` of shape `(Height x Width x Channel)`
        into a list with dicts that contain cropped images.

        :param blob: A numpy `ndarray` that represents a single image.
        :param args: Additional positional arguments.
        :param kwargs: Additional positional arguments.
        :return: A list with dicts that contain cropped images.
        """
        if self.channel_axis != self._default_channel_axis:
            blob = np.moveaxis(blob, self.channel_axis,
                               self._default_channel_axis + 1)

        batch = blob
        results = []
        batch = np.asarray(batch)
        with torch.no_grad():
            image = torch.from_numpy(batch.astype('float32')).to(self.device)
            # Create a batch of size 1
            image = image.unsqueeze(0)

            # Detect faces
            batch_boxes, batch_probs, _ = self.face_detector.detect(
                image, landmarks=True)

            # Select faces
            if not self.face_detector.keep_all:
                batch_boxes, batch_probs, _ = self.face_detector.select_boxes(
                    batch_boxes,
                    batch_probs,
                    _,
                    image,
                    method=self.selection_method)

            # Extract faces
            faces = self.face_detector.extract(image,
                                               batch_boxes,
                                               save_path=None)
            if faces[0] is not None:
                faces = faces[0].view(-1, image.shape[-1], self.image_size,
                                      self.image_size)
                batch_boxes = batch_boxes[0]
                batch_probs = batch_probs[0]

            results = [
                dict(offset=0,
                     weight=probability,
                     blob=face.numpy(),
                     location=bounding_box.tolist())
                for face, probability, bounding_box in zip(
                    faces, batch_probs, batch_boxes) if face is not None
            ]

            return results
Example #18
    def detect(self):
    
        vid = cv2.VideoCapture(self.lookIn)
        frameCount = int(vid.get(cv2.CAP_PROP_FRAME_COUNT)) - 1

        mtcnn = MTCNN()

        bboxes_and_probs = []
        detected_frames = []
        count = frameCount
        while vid.isOpened():
            
            #if count <  frameCount:
                #break

            ret, frame = vid.read()
            if not ret:
                break
            print("%d to go.." %(count))
            count -= 1
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            boxes, prob = mtcnn.detect(frame)
            
            frame_draw = frame.copy()
            draw = ImageDraw.Draw(frame_draw)
            if boxes is None :
                #print("Skipping Frame")
                if self.writeMode == True:
                    detected_frames.append(frame_draw)                 
                cv2.imshow("Frame", cv2.cvtColor(np.asarray(frame_draw), cv2.COLOR_BGR2RGB)) 
                if cv2.waitKey(2) & 0xFF == ord('y'):
                    break
                continue
            for box, p in zip(boxes, prob):
                if p > 0.80:
                    draw.rectangle(box.tolist(), outline=(255, 0, 0), width=1)
                    bboxes_and_probs.append({"bbox": box, "prob": p})

            if self.writeMode:
                detected_frames.append(frame_draw)

            cv2.imshow("Frame", cv2.cvtColor(np.asarray(frame_draw), cv2.COLOR_RGB2BGR))
            if cv2.waitKey(1) & 0xFF == ord('y'):
                break
                       
            
        
        print("releasing capture")
        vid.release()
        
        if self.writeMode == True :
            dim = detected_frames[0].size
            print(dim , int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)))
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")    
            video_tracked = cv2.VideoWriter(self.saveIn, fourcc, 25.0, dim)
            for frame in detected_frames:
                video_tracked.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
            video_tracked.release()
        return bboxes_and_probs
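
A compact, self-contained restatement of the per-frame detection loop above, without the display and video-writing parts; the function name and the 'input.mp4' path are placeholders.

import cv2
from PIL import Image
from facenet_pytorch import MTCNN

def detect_video_faces(video_path, prob_threshold=0.80):
    """Run MTCNN on every frame of a video and collect confident detections."""
    mtcnn = MTCNN()
    results = []
    vid = cv2.VideoCapture(video_path)
    while vid.isOpened():
        ret, frame = vid.read()
        if not ret:          # end of stream
            break
        rgb = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        boxes, probs = mtcnn.detect(rgb)
        if boxes is None:
            continue
        for box, prob in zip(boxes, probs):
            if prob > prob_threshold:
                results.append({"bbox": box, "prob": prob})
    vid.release()
    return results

# results = detect_video_faces('input.mp4')   # 'input.mp4' is a placeholder path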




        
Beispiel #19
0
class FaceRecognition:
    def __init__(self, capture, min_face=300, accuracy_th=0.7):
        self.min_face = min_face
        self.mtcnn_pt = MTCNN(image_size=160,
                              margin=0,
                              min_face_size=self.min_face
                              )  # initializing mtcnn for face detection
        self.resnet = InceptionResnetV1(pretrained='vggface2').eval(
        )  # initializing resnet for face img to embedding conversion
        self.model_path = 'classify_model.pkl'
        self.accuracy_th = accuracy_th
        self.new_boxes = False
        self.lock_boxes = threading.Lock()
        self.lock_cap = threading.Lock()
        self.lock_flag = threading.Lock()
        self.cap = capture
        # self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
        # self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)
        with open(self.model_path, 'rb') as infile:
            (self.model, self.class_names) = pickle.load(infile)

        # cv2.namedWindow('frame', cv2.WINDOW_AUTOSIZE)
        self.box_draw = [[]]
        self.text_draw = [[]]
        self.mark_draw = [[]]
        self.stop_flag = [False]
        self.mask = cv2.imread('images/fm2.png')

    def set_params(self, min_face=None, accuracy_th=None):
        if min_face is not None and 0 < min_face <= 1000:
            self.min_face = min_face
            self.mtcnn_pt = MTCNN(image_size=160,
                                  margin=0,
                                  min_face_size=self.min_face)
        if accuracy_th is not None and 0 < accuracy_th < 1:
            self.accuracy_th = accuracy_th

    def load_model(self, path=''):
        if path != '' and os.path.isfile(path):
            with open(path, 'rb') as infile:
                (self.model, self.class_names) = pickle.load(infile)
        else:
            with open(self.model_path, 'rb') as infile:
                (self.model, self.class_names) = pickle.load(infile)

    # Draw bounding box and text on image
    def draw_frame(self,
                   image,
                   bounding_boxes,
                   label_texts=[],
                   landmarks=[],
                   face_mask_anchor=False,
                   color=[],
                   thick=2,
                   text_scale=0.5,
                   skip_list=[]):
        if bounding_boxes is None:
            return
        if not color:
            color = [(255, 255, 0)] * len(bounding_boxes)
        for i, box in enumerate(bounding_boxes):
            if i in skip_list:
                continue
            cv2.rectangle(image, (int(box[0]), int(box[1])),
                          (int(box[2]), int(box[3])), color[i], thick)
            if label_texts:
                cv2.putText(image, label_texts[i],
                            (int(box[0]), int(box[1] - 5)),
                            cv2.FONT_HERSHEY_SIMPLEX, text_scale, color[i],
                            thick)
            if landmarks:
                for point in landmarks[i]:
                    cv2.circle(image, (int(point[0]), int(point[1])), 2,
                               color[i], thick)
                if face_mask_anchor:
                    center_eye = (landmarks[i][0] + landmarks[i][1]) / 2
                    center_lip = (landmarks[i][3] + landmarks[i][4]) / 2
                    slope_ver = (center_eye[1] - center_lip[1]) / (
                        center_eye[0] - center_lip[0])
                    slope_hor = -1 / slope_ver
                    chin = ((box[3] - center_eye[1]) / slope_ver +
                            center_eye[0], box[3])
                    center = (center_eye + landmarks[i][2]) / 2
                    left_ear = (box[2],
                                slope_hor * (box[2] - center[0]) + center[1])
                    right_ear = (box[0],
                                 slope_hor * (box[0] - center[0]) + center[1])
                    cv2.circle(image, (int(chin[0]), int(chin[1])), 2,
                               (255, 255, 255), 2)
                    cv2.circle(image, (int(center[0]), int(center[1])), 2,
                               (255, 255, 255), 2)
                    cv2.circle(image, (int(left_ear[0]), int(left_ear[1])), 2,
                               (255, 255, 255), 2)
                    cv2.circle(image, (int(right_ear[0]), int(right_ear[1])),
                               2, (255, 255, 255), 2)

    # Detect face on image and match with classify model, update result to bounding boxes and texts
    def face_match(self, image, classify_model, person_names):
        box_dr = []
        text_dr = []
        mark_dr = []
        try:
            bboxes, prob, landmarks = self.mtcnn_pt.detect(image,
                                                           landmarks=True)
        except Exception as ex:
            with self.lock_boxes:
                self.box_draw[0] = box_dr
                self.text_draw[0] = text_dr
            return box_dr, text_dr, mark_dr
        if bboxes is None:
            with self.lock_boxes:
                self.box_draw[0] = box_dr
                self.text_draw[0] = text_dr
            return box_dr, text_dr, mark_dr
        for idx, box in enumerate(bboxes):
            if prob[idx] > 0.90:  # if face detected and probability > 90%
                box_dr.append(box)
                mark_dr.append(landmarks[idx])
                face = extract_face(image,
                                    box,
                                    image_size=self.mtcnn_pt.image_size,
                                    margin=self.mtcnn_pt.margin)
                face = fixed_image_standardization(face)
                emb = self.resnet(
                    face.unsqueeze(0)
                )  # passing cropped face into resnet model to get embedding matrix
                emb_array = emb.detach().numpy()
                predictions = classify_model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[
                    np.arange(len(best_class_indices)), best_class_indices]
                if best_class_probabilities[0] > self.accuracy_th:
                    text = '{0}: {1:.0%}'.format(
                        person_names[best_class_indices[0]],
                        best_class_probabilities[0])
                else:
                    text = '{0}'.format('Unknown')
                text_dr.append(text)

            # lower-probability detections are ignored
        with self.lock_boxes:
            self.box_draw[0] = box_dr
            self.text_draw[0] = text_dr
            self.mark_draw[0] = mark_dr
            self.new_boxes = True
        return box_dr, text_dr, mark_dr

    # A thread to apply function face_match
    def thread_face_recog(self):
        while True:
            if self.cap is None:
                break
            with self.lock_flag:
                if self.stop_flag[0]:
                    break
            with self.lock_cap:
                ret_copy, frame_copy = self.cap.read()
            if not ret_copy:
                continue
            frame_copy = cv2.cvtColor(frame_copy, cv2.COLOR_BGR2RGB)
            self.face_match(frame_copy, self.model, self.class_names)
        print('thread_face_recog stopped')

    # Stop above thread
    def stop_thread_face_recog(self):
        with self.lock_flag:
            self.stop_flag[0] = True

        with self.lock_boxes:
            self.box_draw[0] = []
            self.text_draw[0] = []
            self.mark_draw[0] = []

    # Sample to implement with camera
    def face_recog_cam(self):
        thread = threading.Thread(target=self.thread_face_recog,
                                  args=(),
                                  daemon=True)
        thread.start()
        while True:
            # Capture frame-by-frame
            with self.lock_cap:
                ret, frame = self.cap.read()
            if not ret:
                break

            with self.lock_boxes:
                boxes = self.box_draw[0]
                texts = self.text_draw[0]
                marks = self.mark_draw[0]

            self.draw_frame(frame, boxes, texts)

            # frame = add_face_mask(frame, mask)
            # Display the resulting frame
            cv2.imshow('frame', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    # Sample with image folder or file
    def face_recog_image(self, path):
        if not os.path.exists(path):
            return
        if os.path.isfile(path):
            image = cv2.imread(path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            bboxes, texts, marks = self.face_match(image, self.model,
                                                   self.class_names)
            self.draw_frame(image, bboxes, texts)
            cv2.imshow('', image)
            cv2.waitKey()
            cv2.destroyWindow('')

        if os.path.isdir(path):
            filenames = glob.glob(path + '/*.jpg')
            images = [cv2.imread(img) for img in filenames]
            for idx, img in enumerate(images):
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                bboxes, texts, marks = self.face_match(img, self.model,
                                                       self.class_names)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                self.draw_frame(img, bboxes, texts)
                cv2.imshow(str(idx), img)
                cv2.waitKey()
                cv2.destroyWindow(str(idx))

    def __del__(self):
        self.cap.release()
        cv2.destroyAllWindows()
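
A hedged usage sketch for the class above. It assumes 'classify_model.pkl' and 'images/fm2.png' exist in the working directory (the constructor loads both) and that a webcam is available; press 'q' in the preview window to stop.

import cv2

cap = cv2.VideoCapture(0)                       # default webcam
recognizer = FaceRecognition(cap, min_face=300, accuracy_th=0.7)
recognizer.face_recog_cam()                     # starts the recognition thread and the display loop
recognizer.stop_thread_face_recog()             # signal the worker thread to stop
cv2.destroyAllWindows()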
Beispiel #20
0
v_cap.set(cv2.CAP_PROP_FRAME_WIDTH, image_size)
v_cap.set(cv2.CAP_PROP_FRAME_HEIGHT, image_size)
flag = False
face_results = []
start = time.time()
while (True):
    time_elapsed = time.time() - prev
    break_time = time.time() - start
    if break_time > 10:
        break
    ret, frame = v_cap.read()
    if time_elapsed > 1. / frame_rate:  # Collect frames every 1/frame_rate of a second
        prev = time.time()
        frame_ = Image.fromarray(frame)
        frames.append(frame_)
        batch_boxes, prob, landmark = mtcnn.detect(frames, landmarks=True)
        frames_duplicate = frames.copy()
        boxes.append(batch_boxes)
        boxes_duplicate = boxes.copy()
        # show imgs with bbxs
        face_results.append(
            show_images(frames_duplicate, boxes_duplicate, bbx_color))
        frames = []
        boxes = []
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
v_cap.release()
cv2.destroyAllWindows()
accuracy = (sum(face_results) / len(face_results)) * 100
print('Percentage match ' + '{:.2f}'.format(accuracy))
if accuracy > 0.75:
    name = str(input("Person Name: "))
DATASET_PATH = os.path.join("datasets", name)
if not os.path.isdir(DATASET_PATH):
    os.mkdir(DATASET_PATH)

mtcnn = MTCNN(prewhiten=False, keep_all=True, thresholds=[0.6, 0.7, 0.9])

image_no = 0
capture = cv2.VideoCapture(0)
count = 0
while True:
    count += 1
    check, frame = capture.read()
    frame = cv2.resize(frame, (400, 300))
    faces, _ = mtcnn.detect(Image.fromarray(frame))
    if faces is not None and count % 7 == 0:
        image_no += 1
        cv2.imwrite(os.path.join(DATASET_PATH, f"{name}_{image_no}.jpg"),
                    frame)
        if image_no == 100:
            break

    image_text = f"Number of image taken {image_no} for {name}"
    cv2.putText(frame, image_text, (20, 20), cv2.FONT_HERSHEY_SIMPLEX, .5, (100, 0, 200), 1)
    if faces is not None:
        for (x, y, w, h) in faces:
            x, y, w, h = int(x), int(y), int(w), int(h)
            cv2.rectangle(frame, (x, y), (w, h), (200, 100, 0), 2)
    cv2.imshow('frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
Beispiel #22
0
    def find_face(self):
        """
        find face on the frames

        create:
            self.faces
            self.frame_ids
        """
        def del_skipped_frames():
            idxs = [idx for idx, val in enumerate(self.centers) if val == 0]
            for index in sorted(idxs, reverse=True):
                del self.centers[index]
                del self.frames[index]
                del self.frame_ids[index]

        def medfilt_filter(step=7):
            y_ = medfilt([i[0] for i in self.centers], step)
            x_ = medfilt([i[1] for i in self.centers], step)
            return y_, x_

        self.centers, h_shift, w_shift, centers = ([], [], [], None)

        # fast MTCNN (PyTorch); used when CUDA is available
        if cuda.is_available():
            frames_cropped = []
            box_prev = None

            mtcnn = MTCNN(image_size=200, device=device)
            for frame in tqdm(self.frames):
                box, _ = mtcnn.detect(frame)
                if box is not None:
                    box = np.array(box[0]).astype(int)
                    x1, x2, y1, y2 = box[1], box[3], box[0], box[2]
                    h_shift += [(y2 - y1) // 2]
                    w_shift += [(x2 - x1) // 2]
                    centers = [y1 + h_shift[-1], x1 + w_shift[-1]]
                    #plt.imshow(frame[x1:x2, y1:y2])
                    #plt.show()
                    if centers is not None:
                        self.centers += [centers]
                    else:
                        self.centers += [0]
                else:
                    self.centers += [0]

            del mtcnn

            del_skipped_frames()

        # Haar cascade; used when CUDA is not available
        else:
            face_cascade = cv2.CascadeClassifier(
                'haarcascade_frontalface_default.xml')

            for frame in tqdm(self.frames):
                gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
                faces = face_cascade.detectMultiScale(gray)
                for (x, y, w, h) in faces:
                    h_shift += [h // 2]
                    w_shift += [w // 2]
                    centers = [y + h // 2, x + w // 2]
                if centers is not None:
                    self.centers += [centers]
                else:
                    self.centers += [0]

            del face_cascade

            del_skipped_frames()

        self.box_shift = [
            np.mean(w_shift, dtype=int),
            np.mean(h_shift, dtype=int)
        ]
        # drop discharges from signal
        if len(self.centers) == 0:
            raise ValueError("Невозможно определить лицо")
        if cuda.is_available(): y_, x_ = medfilt_filter(5)
        else: y_, x_ = medfilt_filter()

        self.centers = [[int(y), int(x)] for x, y in zip(x_, y_)]
        for frame, (y, x) in tqdm(zip(self.frames, self.centers)):
            face = frame[x - self.box_shift[0]:x + self.box_shift[0],
                         y - self.box_shift[1]:y + self.box_shift[1]]
            self.faces += [face]
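
The "drop discharges from signal" step above is a per-axis median filter over the detected face centres. A small self-contained illustration of that smoothing with made-up coordinates (the outlier at index 3 is hypothetical data):

from scipy.signal import medfilt

# made-up (y, x) face centres; index 3 is an outlier ("discharge")
centers = [[100, 200], [101, 201], [102, 202], [400, 50],
           [104, 204], [105, 205], [106, 206]]

y_ = medfilt([c[0] for c in centers], 5)   # kernel size 5, as in the CUDA branch above
x_ = medfilt([c[1] for c in centers], 5)
smoothed = [[int(y), int(x)] for y, x in zip(y_, x_)]
print(smoothed)   # the outlier is replaced by a neighbourhood median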
Beispiel #23
0
class MTCNN_Model:
    def __init__(self, general_parameters, model_parameters,
                 inference_parameters):

        #---------dataset_infos
        self.X = None
        self.input_images = None
        self.subfolders = None

        #--------general_parameters
        self.root_folder_name = general_parameters['root_folder_name']

        #---------model_parameters
        self.image_size = model_parameters['image_size']
        self.margin = model_parameters['margin']
        self.min_face_size = model_parameters['min_face_size']
        self.thresholds = model_parameters['thresholds']
        self.factor = model_parameters['factor']
        self.keep_all = model_parameters['keep_all']
        self.device = 'cuda:0' if (model_parameters['device'] == "cuda"
                                   and torch.cuda.is_available()) else 'cpu'
        self.seed = model_parameters['seed']
        self.post_process = False

        #---------Inference_parameters
        self.inference_batch_size = inference_parameters[
            'inference_batch_size']
        self.input_square_transformation_size = inference_parameters[
            'input_square_transformation_size']

        #------- Other
        self.num_workers = cpu_count()

        #------- MTCNN
        self.mtcnn = MTCNN(image_size=self.image_size,
                           margin=self.margin,
                           min_face_size=self.min_face_size,
                           thresholds=self.thresholds,
                           factor=self.factor,
                           post_process=self.post_process,
                           keep_all=self.keep_all,
                           device=self.device)

        #------- Reproducibility
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.random.manual_seed(self.seed)
        torch.cuda.manual_seed(self.seed)

        #------- Results
        self.df_result = None

    def predict(self, img_reference, step):
        if step == "Experiment":
            image_array = img_reference
        if step == "Deployment":
            img = img_reference
            image_array = Image.fromarray(img)

        boxes, probs = self.mtcnn.detect(image_array, landmarks=False)

        return (boxes, probs)

    def _construct_result_dataframe(self, step):
        boxes = []
        probs = []

        for i in range(0, len(self.X), self.inference_batch_size):
            img_reference = []
            batch = self.X[i:i + self.inference_batch_size]
            for row in batch:
                v_cap = cv2.VideoCapture(row[0])
                success, frame = v_cap.read()
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, (self.input_square_transformation_size,
                                       self.input_square_transformation_size))
                img_reference.append(Image.fromarray(img))

            batch_result = self.predict(img_reference, step)

            if self.keep_all:
                for b, p in zip(batch_result[0], batch_result[1]):
                    boxes.append(b)
                    probs.append(p)

            else:
                for b, p in zip(batch_result[0], batch_result[1]):
                    max_prob_position = np.argmax(p)
                    boxes.append(b[max_prob_position])
                    probs.append(np.max(p))

        self.df_result = pd.DataFrame({
            'Input_image': self.input_images,
            'Subfolder': self.subfolders,
            'Bboxes(x1,y1,x2,y2)': boxes,
            'Probabilities': probs
        })

    def get_result_dataframe(self, X, step='Experiment'):

        self.X = X
        self.input_images = X[:, 0]
        self.subfolders = X[:, 1]
        self._construct_result_dataframe(step)
        return self.df_result
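
A hedged usage sketch for MTCNN_Model; the parameter values, video paths, and subfolder labels below are placeholders chosen to illustrate the expected shapes, not values from the original code.

import numpy as np

general_parameters = {'root_folder_name': 'videos'}
model_parameters = {
    'image_size': 160, 'margin': 0, 'min_face_size': 20,
    'thresholds': [0.6, 0.7, 0.7], 'factor': 0.709,
    'keep_all': False, 'device': 'cuda', 'seed': 42,
}
inference_parameters = {
    'inference_batch_size': 8,
    'input_square_transformation_size': 512,
}

model = MTCNN_Model(general_parameters, model_parameters, inference_parameters)

# X: one row per video, columns = (video path, subfolder/label)
X = np.array([['videos/real/clip_0.mp4', 'real'],
              ['videos/fake/clip_1.mp4', 'fake']])
df = model.get_result_dataframe(X, step='Experiment')
print(df.head())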
Beispiel #24
0
class FaceAndHandDetector(QThread):
    frame_update_signal = pyqtSignal(QPixmap)

    def __init__(self):
        QThread.__init__(self)

        self.frame = 0
        self.mtcnn = MTCNN()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # print(self.device)
        self.frame_counter = 0
        self.prev_frame_counter = 0
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.fps_count)
        self.timer.start(1000)

        self.model = HPSearchNET(cnn_num=3,
                                 fc_num=2,
                                 kern_size=3,
                                 func_act='elu',
                                 nn_prn=True,
                                 in_shape=160).to(self.device)
        self.model.load_state_dict(torch.load("hnd_net_elu_cnn3_fc2_kr3.pth", map_location=self.device))
        self.model.eval()

    # Draw face bounding boxes on the frame
    def draw_face(self, frame, boxes, probs):   # , landmarks
        try:
            cnt = 0
            for box, prob in zip(boxes, probs):   # , ld , landmarks
                cnt += 1
                print(f"Face {cnt} box: {box} prob: {prob:.4f}")
                # Draw the bounding rectangle of the face on the frame
                cv2.rectangle(frame,
                              (int(box[0]), int(box[1])),
                              (int(box[2]), int(box[3])),
                              (0, 0, 255),
                              thickness=2)
        except Exception as e:
            print('Error in draw_face')
            print(f'error : {e}')
        return frame

    # Draw hand bounding boxes on the frame
    def draw_hand(self, frame, hand_landmarks):
        # mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        max_x = max_y = 0
        min_x = min_y = 65535
        for mark in hand_landmarks.landmark:
            if mark.x > max_x:
                max_x = mark.x
            if mark.x < min_x:
                min_x = mark.x
            if mark.y > max_y:
                max_y = mark.y
            if mark.y < min_y:
                min_y = mark.y
        max_x = round(max_x * IMAGE_WIDTH) + 30
        min_x = round(min_x * IMAGE_WIDTH) - 30
        max_y = round(max_y * IMAGE_HEIGHT) + 30
        min_y = round(min_y * IMAGE_HEIGHT) - 30
        if min_x < 0:
            min_x = 0
        if min_y < 0:
            min_y = 0
        if max_x > IMAGE_WIDTH:
            max_x = IMAGE_WIDTH
        if max_y > IMAGE_HEIGHT:
            max_y = IMAGE_HEIGHT
        print(f"\tmax_x: {max_x} min_x: {min_x} max_y: {max_y} min_y: {min_y}")
        # Draw the bounding rectangle of the hand on the frame
        cv2.rectangle(frame,
                      (min_x, min_y),
                      (max_x, max_y),
                      (0, 255, 0),
                      thickness=2)
        return frame, [min_x, min_y, max_x, max_y]

    def fps_count(self):
        self.prev_frame_counter, self.frame_counter = self.frame_counter, 0
        # self.frame_counter = 0

    # Detect whether hands are present in the frame
    def hand_detection_mp(self, frame):
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)   # cv2.flip(frame, 1)
        frame.flags.writeable = False
        results = hands.process(frame)
        frame.flags.writeable = True

        if results.multi_hand_landmarks:
            count = 0
            for hand_landmarks in results.multi_hand_landmarks:
                count += 1
                print(f"Рука {count}")
                print(
                    f'\tIndex finger tip coordinates: ('
                    f'x: {round(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x * IMAGE_WIDTH)}, '
                    f'y: {round(hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y * IMAGE_HEIGHT)})'
                )
                for num, mark in enumerate(hand_landmarks.landmark):
                    print(f"\tМетка {arm_marks[num]}"
                          f"- x: {round(mark.x * IMAGE_WIDTH)}, y: {round(mark.y * IMAGE_HEIGHT)}")

        return results

    # Main loop: read and process each frame
    def run(self):
        # Enter the endless processing loop
        while True:
            if cam_index_list:
                # Read each new frame from the camera
                # ret is a boolean: whether a frame was actually read from the stream

                hands = []
                ret, self.frame = cam.read()
                self.frame = cv2.flip(self.frame, 1)
                try:
                    # detect face locations in the frame and the probability that each is a face
                    boxes, probs = self.mtcnn.detect(self.frame, landmarks=False)   # , landmarks

                    if boxes is not None:
                        # Draw the detections on the frame
                        self.frame = self.draw_face(self.frame, boxes, probs)   # , landmarks
                        # Look for hands
                        hand_detect_rez = self.hand_detection_mp(self.frame)
                        if hand_detect_rez.multi_hand_landmarks:
                            for hand_landmarks in hand_detect_rez.multi_hand_landmarks:
                                self.frame, hand_box = self.draw_hand(self.frame, hand_landmarks)
                                hands.append(self.filter_hand(self.frame, hand_box))
                                # the crop is 160x160
                                # Normalize the image to the [0, 1] range
                                img = torch.from_numpy(hands[-1]) / 255
                                img = img.unsqueeze(0).unsqueeze(0)
                                with torch.no_grad():
                                    outputs = self.model(img.to(self.device))
                                    _, predicted = torch.max(outputs.data, 1)
                                    print(f"predicted: {labels_texts[int(predicted)]}")
                                    # write the predicted gesture on the frame
                                    cv2.putText(self.frame,
                                                labels_texts[int(predicted)],
                                                (hand_box[2], hand_box[3]),
                                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

                except Exception as e:
                    print(f'Error {e} in run')

                # write the FPS value on the frame
                cv2.putText(self.frame,
                            f"FPS: {self.prev_frame_counter}",
                            (20, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 1, cv2.LINE_AA)

                self.frame_counter += 1
                self.frame_update_signal.emit(self.frame_to_qpixmap(self.frame))   # cv2.imshow(self.label, self.frame)
                # if hands:
                #     self.frame_update_signal.emit(self.frame_to_qpixmap(hands[0]))

    # Convert a frame to a QPixmap
    def frame_to_qpixmap(self, frame):
        rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        convert_to_qt_format = QImage(rgb_image.data,
                                      rgb_image.shape[1],
                                      rgb_image.shape[0],
                                      QImage.Format_RGB888)
        convert_to_qt_format = QPixmap.fromImage(convert_to_qt_format)
        pixmap = QPixmap(convert_to_qt_format)
        return pixmap

    def filter_hand(self, frame, hand_box):
        hand_img = frame[int(hand_box[1]):int(hand_box[3]),
                         int(hand_box[0]):int(hand_box[2])]
        # hand_img = cv2.resize(hand_img, (48, 48))

        hsv = cv2.cvtColor(hand_img, cv2.COLOR_BGR2HSV)

        # define range of skin color in HSV
        lower_skin = np.array([0, 20, 70], dtype=np.uint8)
        upper_skin = np.array([20, 255, 255], dtype=np.uint8)

        # extract the skin-coloured image
        mask = cv2.inRange(hsv, lower_skin, upper_skin)

        # extrapolate the hand to fill dark spots within
        # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))   #
        kernel = np.ones((3, 3), np.uint8)
        mask = cv2.erode(mask, kernel, iterations=2)
        mask = cv2.dilate(mask, kernel, iterations=2)  # mask = cv2.dilate(mask, kernel, iterations=4)

        # blur the image
        mask = cv2.GaussianBlur(mask, (3, 3), 0)  # 10

        # mask = cv2.resize(mask, (48, 48))
        # hand_img = cv2.resize(hand_img, (48, 48))
        res = cv2.bitwise_and(hand_img, hand_img, mask=mask)
        res = cv2.resize(res, (160, 160))

        # Convert to a single-channel grayscale image
        res = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
        return res
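
A minimal sketch of wiring the thread above into a PyQt window: frame_update_signal carries a QPixmap, so it can be connected directly to QLabel.setPixmap. This assumes the surrounding module already provides the globals the thread uses (cam, hands, cam_index_list, labels_texts, IMAGE_WIDTH/IMAGE_HEIGHT) and the trained weights file.

import sys
from PyQt5.QtWidgets import QApplication, QLabel

app = QApplication(sys.argv)
label = QLabel()
label.show()

detector = FaceAndHandDetector()
detector.frame_update_signal.connect(label.setPixmap)   # each processed frame updates the label
detector.start()                                        # QThread.start() runs run() in a worker thread

sys.exit(app.exec_())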
Beispiel #25
0
labelColor = [(10, 255, 0), (10, 0, 255)]

cap = cv2.VideoCapture(0)

# MTCNN for detecting the presence of faces
mtcnn = MTCNN(keep_all=True, device=device)

model.to(device)
model.eval()
while True:
    ret, frame = cap.read()
    if not ret:
        continue

    img_ = frame.copy()
    boxes, _ = mtcnn.detect(img_)
    # Using PIL to draw boxes
    '''frame_draw = frame.copy()
    draw = ImageDraw.Draw(frame_draw)
    for box in boxes:
        draw.rectangle(box.tolist(), outline=(255, 0, 0), width=6)'''
    '''
    try:
        for x1,y1,x2,y2 in boxes:
            frame = cv2.rectangle(frame,(x1,y1),(x2,y2),(0,0,255),3)
            roi = img_[int(y1):int(y2) , int(x1):int(x2)]
    except TypeError as e:
        pass'''

    try:
        for i in range(len(boxes)):
Beispiel #26
0
class Demo():
    def __init__(self, args):
        ## configs
        self.device = 'cuda:0' if args.gpu else 'cpu'
        self.checkpoint_path = args.checkpoint
        self.detect_human_face = args.detect_human_face
        self.render_video = args.render_video
        self.output_size = args.output_size
        self.image_size = 64
        self.min_depth = 0.9
        self.max_depth = 1.1
        self.border_depth = 1.05
        self.xyz_rotation_range = 60
        self.xy_translation_range = 0.1
        self.z_translation_range = 0
        self.fov = 10  # in degrees

        self.depth_rescaler = lambda d: (1 + d) / 2 * self.max_depth + (
            1 - d) / 2 * self.min_depth  # (-1,1) => (min_depth,max_depth)
        self.depth_inv_rescaler = lambda d: (d - self.min_depth) / (
            self.max_depth - self.min_depth)  # (min_depth,max_depth) => (0,1)

        fx = (self.image_size - 1) / 2 / (np.tan(self.fov / 2 * np.pi / 180))
        fy = (self.image_size - 1) / 2 / (np.tan(self.fov / 2 * np.pi / 180))
        cx = (self.image_size - 1) / 2
        cy = (self.image_size - 1) / 2
        K = [[fx, 0., cx], [0., fy, cy], [0., 0., 1.]]
        K = torch.FloatTensor(K).to(self.device)
        self.inv_K = torch.inverse(K).unsqueeze(0)
        self.K = K.unsqueeze(0)

        ## NN models
        self.netD = EDDeconv(cin=3, cout=1, nf=64, zdim=256, activation=None)
        self.netA = EDDeconv(cin=3, cout=3, nf=64, zdim=256)
        self.netL = Encoder(cin=3, cout=4, nf=32)
        self.netV = Encoder(cin=3, cout=6, nf=32)

        self.netD = self.netD.to(self.device)
        self.netA = self.netA.to(self.device)
        self.netL = self.netL.to(self.device)
        self.netV = self.netV.to(self.device)
        self.load_checkpoint()

        self.netD.eval()
        self.netA.eval()
        self.netL.eval()
        self.netV.eval()

        ## face detector
        if self.detect_human_face:
            from facenet_pytorch import MTCNN
            self.face_detector = MTCNN(select_largest=True, device=self.device)

        ## renderer
        if self.render_video:
            from unsup3d.renderer import Renderer
            assert 'cuda' in self.device, 'A GPU device is required for rendering because the neural_renderer only has GPU implementation.'
            cfgs = {
                'device': self.device,
                'image_size': self.output_size,
                'min_depth': self.min_depth,
                'max_depth': self.max_depth,
                'fov': self.fov,
            }
            self.renderer = Renderer(cfgs)

    def load_checkpoint(self):
        print(f"Loading checkpoint from {self.checkpoint_path}")
        cp = torch.load(self.checkpoint_path, map_location=self.device)
        self.netD.load_state_dict(cp['netD'])
        self.netA.load_state_dict(cp['netA'])
        self.netL.load_state_dict(cp['netL'])
        self.netV.load_state_dict(cp['netV'])

    def depth_to_3d_grid(self, depth, inv_K=None):
        if inv_K is None:
            inv_K = self.inv_K
        b, h, w = depth.shape
        grid_2d = get_grid(b, h, w,
                           normalize=False).to(depth.device)  # Nxhxwx2
        depth = depth.unsqueeze(-1)
        grid_3d = torch.cat((grid_2d, torch.ones_like(depth)), dim=3)
        grid_3d = grid_3d.matmul(inv_K.transpose(2, 1)) * depth
        return grid_3d

    def get_normal_from_depth(self, depth):
        b, h, w = depth.shape
        grid_3d = self.depth_to_3d_grid(depth)

        tu = grid_3d[:, 1:-1, 2:] - grid_3d[:, 1:-1, :-2]
        tv = grid_3d[:, 2:, 1:-1] - grid_3d[:, :-2, 1:-1]
        normal = tu.cross(tv, dim=3)

        zero = normal.new_tensor([0, 0, 1])
        normal = torch.cat(
            [zero.repeat(b, h - 2, 1, 1), normal,
             zero.repeat(b, h - 2, 1, 1)], 2)
        normal = torch.cat(
            [zero.repeat(b, 1, w, 1), normal,
             zero.repeat(b, 1, w, 1)], 1)
        normal = normal / (((normal**2).sum(3, keepdim=True))**0.5 + EPS)
        return normal

    def detect_face(self, im):
        print("Detecting face using MTCNN face detector")
        try:
            bboxes, prob = self.face_detector.detect(im)
            w0, h0, w1, h1 = bboxes[0]
        except:
            print("Could not detect faces in the image")
            return None

        hc, wc = (h0 + h1) / 2, (w0 + w1) / 2
        crop = int(((h1 - h0) + (w1 - w0)) / 2 / 2 * 1.1)
        im = np.pad(
            im, ((crop, crop), (crop, crop), (0, 0)),
            mode='edge')  # allow cropping outside by replicating borders
        h0 = int(hc - crop + crop + crop * 0.15)
        w0 = int(wc - crop + crop)
        return im[h0:h0 + crop * 2, w0:w0 + crop * 2]

    def run(self, pil_im):
        im = np.uint8(pil_im)

        ## face detection
        if self.detect_human_face:
            im = self.detect_face(im)
            if im is None:
                return -1

        h, w, _ = im.shape
        im = torch.FloatTensor(im / 255.).permute(2, 0, 1).unsqueeze(0)
        # resize to 128 first if too large, to avoid bilinear downsampling artifacts
        if h > self.image_size * 4 and w > self.image_size * 4:
            im = nn.functional.interpolate(
                im, (self.image_size * 2, self.image_size * 2),
                mode='bilinear',
                align_corners=False)
        im = nn.functional.interpolate(im, (self.image_size, self.image_size),
                                       mode='bilinear',
                                       align_corners=False)

        with torch.no_grad():
            self.input_im = im.to(self.device) * 2. - 1.
            b, c, h, w = self.input_im.shape

            ## predict canonical depth
            self.canon_depth_raw = self.netD(self.input_im).squeeze(1)  # BxHxW
            self.canon_depth = self.canon_depth_raw - self.canon_depth_raw.view(
                b, -1).mean(1).view(b, 1, 1)
            self.canon_depth = self.canon_depth.tanh()
            self.canon_depth = self.depth_rescaler(self.canon_depth)

            ## clamp border depth
            depth_border = torch.zeros(1, h, w - 4).to(self.input_im.device)
            depth_border = nn.functional.pad(depth_border, (2, 2),
                                             mode='constant',
                                             value=1)
            self.canon_depth = self.canon_depth * (
                1 - depth_border) + depth_border * self.border_depth

            ## predict canonical albedo
            self.canon_albedo = self.netA(self.input_im)  # Bx3xHxW

            ## predict lighting
            canon_light = self.netL(self.input_im)  # Bx4
            self.canon_light_a = canon_light[:, :1] / 2 + 0.5  # ambience term
            self.canon_light_b = canon_light[:, 1:2] / 2 + 0.5  # diffuse term
            canon_light_dxy = canon_light[:, 2:]
            self.canon_light_d = torch.cat(
                [canon_light_dxy,
                 torch.ones(b, 1).to(self.input_im.device)], 1)
            self.canon_light_d = self.canon_light_d / (
                (self.canon_light_d**2).sum(
                    1, keepdim=True))**0.5  # diffuse light direction

            ## shading
            self.canon_normal = self.get_normal_from_depth(self.canon_depth)
            self.canon_diffuse_shading = (
                self.canon_normal *
                self.canon_light_d.view(-1, 1, 1, 3)).sum(3).clamp(
                    min=0).unsqueeze(1)
            canon_shading = self.canon_light_a.view(
                -1, 1, 1, 1) + self.canon_light_b.view(
                    -1, 1, 1, 1) * self.canon_diffuse_shading
            self.canon_im = (self.canon_albedo / 2 +
                             0.5) * canon_shading * 2 - 1

            ## predict viewpoint transformation
            self.view = self.netV(self.input_im)
            self.view = torch.cat([
                self.view[:, :3] * np.pi / 180 * self.xyz_rotation_range,
                self.view[:, 3:5] * self.xy_translation_range,
                self.view[:, 5:] * self.z_translation_range
            ], 1)

            ## export to obj strings
            vertices = self.depth_to_3d_grid(self.canon_depth)  # BxHxWx3
            self.objs, self.mtls = export_to_obj_string(
                vertices, self.canon_normal)

            ## resize to output size
            self.canon_depth = nn.functional.interpolate(
                self.canon_depth.unsqueeze(1),
                (self.output_size, self.output_size),
                mode='bilinear',
                align_corners=False).squeeze(1)
            self.canon_normal = nn.functional.interpolate(
                self.canon_normal.permute(0, 3, 1, 2),
                (self.output_size, self.output_size),
                mode='bilinear',
                align_corners=False).permute(0, 2, 3, 1)
            self.canon_normal = self.canon_normal / (self.canon_normal**2).sum(
                3, keepdim=True)**0.5
            self.canon_diffuse_shading = nn.functional.interpolate(
                self.canon_diffuse_shading,
                (self.output_size, self.output_size),
                mode='bilinear',
                align_corners=False)
            self.canon_albedo = nn.functional.interpolate(
                self.canon_albedo, (self.output_size, self.output_size),
                mode='bilinear',
                align_corners=False)
            self.canon_im = nn.functional.interpolate(
                self.canon_im, (self.output_size, self.output_size),
                mode='bilinear',
                align_corners=False)

            if self.render_video:
                self.render_animation()

    def render_animation(self):
        print(f"Rendering video animations")
        b, h, w = self.canon_depth.shape

        ## morph from target view to canonical
        morph_frames = 15
        view_zero = torch.FloatTensor([0.15 * np.pi / 180 * 60, 0, 0, 0, 0,
                                       0]).to(self.canon_depth.device)
        morph_s = torch.linspace(0, 1,
                                 morph_frames).to(self.canon_depth.device)
        view_morph = morph_s.view(-1, 1, 1) * view_zero.view(1, 1, -1) + (
            1 - morph_s.view(-1, 1, 1)) * self.view.unsqueeze(0)  # TxBx6

        ## yaw from canonical to both sides
        yaw_frames = 80
        yaw_rotations = np.linspace(-np.pi / 2, np.pi / 2, yaw_frames)
        # yaw_rotations = np.concatenate([yaw_rotations[40:], yaw_rotations[::-1], yaw_rotations[:40]], 0)

        ## whole rotation sequence
        view_after = torch.cat(
            [view_morph, view_zero.repeat(yaw_frames, b, 1)], 0)
        yaw_rotations = np.concatenate([np.zeros(morph_frames), yaw_rotations],
                                       0)

        def rearrange_frames(frames):
            morph_seq = frames[:, :morph_frames]
            yaw_seq = frames[:, morph_frames:]
            out_seq = torch.cat([
                morph_seq[:, :1].repeat(1, 5, 1, 1, 1),
                morph_seq,
                morph_seq[:, -1:].repeat(1, 5, 1, 1, 1),
                yaw_seq[:, yaw_frames // 2:],
                yaw_seq.flip(1),
                yaw_seq[:, :yaw_frames // 2],
                morph_seq[:, -1:].repeat(1, 5, 1, 1, 1),
                morph_seq.flip(1),
                morph_seq[:, :1].repeat(1, 5, 1, 1, 1),
            ], 1)
            return out_seq

        ## textureless shape
        front_light = torch.FloatTensor([0, 0, 1]).to(self.canon_depth.device)
        canon_shape_im = (self.canon_normal *
                          front_light.view(1, 1, 1, 3)).sum(3).clamp(
                              min=0).unsqueeze(1)
        canon_shape_im = canon_shape_im.repeat(1, 3, 1, 1) * 0.7
        shape_animation = self.renderer.render_yaw(
            canon_shape_im,
            self.canon_depth,
            v_after=view_after,
            rotations=yaw_rotations)  # BxTxCxHxW
        self.shape_animation = rearrange_frames(shape_animation)

        ## normal map
        canon_normal_im = self.canon_normal.permute(0, 3, 1, 2) / 2 + 0.5
        normal_animation = self.renderer.render_yaw(
            canon_normal_im,
            self.canon_depth,
            v_after=view_after,
            rotations=yaw_rotations)  # BxTxCxHxW
        self.normal_animation = rearrange_frames(normal_animation)

        ## textured
        texture_animation = self.renderer.render_yaw(
            self.canon_im / 2 + 0.5,
            self.canon_depth,
            v_after=view_after,
            rotations=yaw_rotations)  # BxTxCxHxW
        self.texture_animation = rearrange_frames(texture_animation)

    def save_results(self, save_dir):
        print(f"Saving results to {save_dir}")
        save_image(save_dir, self.input_im[0] / 2 + 0.5, 'input_image')
        save_image(
            save_dir,
            self.depth_inv_rescaler(self.canon_depth)[0].repeat(3, 1, 1),
            'canonical_depth')
        save_image(save_dir, self.canon_normal[0].permute(2, 0, 1) / 2 + 0.5,
                   'canonical_normal')
        save_image(save_dir, self.canon_diffuse_shading[0].repeat(3, 1, 1),
                   'canonical_diffuse_shading')
        save_image(save_dir, self.canon_albedo[0] / 2 + 0.5,
                   'canonical_albedo')
        save_image(save_dir, self.canon_im[0].clamp(-1, 1) / 2 + 0.5,
                   'canonical_image')

        with open(os.path.join(save_dir, 'result.mtl'), "w") as f:
            f.write(self.mtls[0].replace('$TXTFILE', './canonical_image.png'))
        with open(os.path.join(save_dir, 'result.obj'), "w") as f:
            f.write(self.objs[0].replace('$MTLFILE', './result.mtl'))

        if self.render_video:
            save_video(save_dir, self.shape_animation[0], 'shape_animation')
            save_video(save_dir, self.normal_animation[0], 'normal_animation')
            save_video(save_dir, self.texture_animation[0],
                       'texture_animation')
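
A hedged usage sketch for the Demo class above; the checkpoint and image paths are placeholders, args is built with argparse.Namespace rather than the original argument parser (not shown here), and the module-level helpers the class relies on (save_image, export_to_obj_string, get_grid, EPS) are assumed importable.

import os
from argparse import Namespace
from PIL import Image

args = Namespace(gpu=False,                      # CPU only; render_video would require CUDA
                 checkpoint='checkpoint.pth',    # placeholder checkpoint path
                 detect_human_face=True,
                 render_video=False,
                 output_size=256)

demo = Demo(args)
status = demo.run(Image.open('face.jpg').convert('RGB'))   # placeholder image path
if status != -1:
    os.makedirs('results', exist_ok=True)
    demo.save_results('results')   # writes depth/normal/albedo images plus result.obj / result.mtl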
Beispiel #27
0
    video = mmcv.VideoReader('video.mp4')
    frames = [
        Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for frame in video
    ]

    print(len(frames))

    display.Video('video.mp4', width=640)

    frames_tracked = []
    for i, frame in enumerate(frames):
        print('\rTracking frame: {}'.format(i + 1), end='')

        # Detect faces
        boxes, _ = mtcnn.detect(frame)

        # Draw faces
        frame_draw = frame.copy()
        draw = ImageDraw.Draw(frame_draw)
        if boxes is not None:
            for box in boxes:
                draw.rectangle(box.tolist(), outline=(255, 0, 0), width=6)

        # Add to frame list
        # frames_tracked.append(frame_draw.resize((640, 360), Image.BILINEAR))
        frames_tracked.append(frame_draw)
    print('\nDone')

    dim = frames_tracked[0].size
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
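
The example is cut off right after the fourcc code is created. A small self-contained sketch of the remaining write-out step, following the same VideoWriter pattern used in Beispiel #18 above (function name and output path are placeholders):

import cv2
import numpy as np

def write_tracked_video(frames_tracked, out_path, fps=25.0):
    """Write a list of RGB PIL frames to an .mp4 file."""
    dim = frames_tracked[0].size                       # (width, height)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(out_path, fourcc, fps, dim)
    for frame in frames_tracked:
        writer.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
    writer.release()

# write_tracked_video(frames_tracked, 'video_tracked.mp4')   # placeholder output path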
Beispiel #28
0
class FaceDetect:
    def __init__(self, thresholds=[0.9, 0.9, 0.9], min_face_size=100):
        self.mtcnn = MTCNN(thresholds=thresholds,
                           select_largest=True,
                           post_process=False,
                           device='cuda:0',
                           min_face_size=min_face_size)

    def detect(self,
               img_ls,
               crop_size=None,
               mode='Extract_largest',
               save_faces=False,
               save_annotate=False,
               save_path='face_result'):
        """face detection

        Args:
            img_ls (list): list of array
            crop_size (tuple, optional): crop images with (left, top, right, bottom). Defaults to None.
            mode (str, optional): There're 3 modes, 'Detect', 'Detect_bool', and 'Extract'. 
                                    If you only want to know whether there're any faces, use 'Detect_bool' mode. 
                                    If you want to get boxes and probs of faces, use 'Detect'.
                                    If you want to get all information about faces, use 'Extract'.
                                    Defaults to 'Detect_bool'.
            face_num (int, optional): Number of faces to be extracted. Defaults to 1.
            save_faces (bool, optional): For 'Extract' mode. Defaults to False.
            save_annotate (bool, optional): For 'Extract' mode. Save images with annotations. Defaults to False.

        Returns:
            tuple: depends on the mode.

        """
        if crop_size:
            for i, img in enumerate(img_ls):
                img_ls[i] = img.crop(crop_size)

        try:
            boxes, probs = self.mtcnn.detect(img_ls)
        except Exception as e:
            print(
                f'{e} \n...add crop_size=(left, top, right, bottom) to make images the same'
            )

        if mode == 'Detect_bool':
            return isinstance(boxes, np.ndarray)
        elif mode == 'Detect':
            return boxes, probs
        elif 'Extract' in mode:
            faces = []
            annotates = []
            boxes = boxes.tolist()
            probs = probs.tolist()
            for id_, img in enumerate(img_ls):
                face_batch = []
                img_annotate = img.copy()
                draw = ImageDraw.Draw(img_annotate)
                box_all = boxes[id_]
                if mode == 'Extract_largest':
                    for i, box in enumerate(box_all):
                        left = max(0, box[0])
                        top = max(0, box[1])
                        right = min(np.array(img_ls[id_]).shape[1], box[2])
                        down = min(np.array(img_ls[id_]).shape[0], box[3])
                        box_all[i] = [left, top, right, down]
                    area = list(map(self._cal_area, box_all))
                    max_id = area.index(max(area))
                    box = box_all[max_id]
                    box_head = [
                        box[0] - box[0] / 8, box[1] - box[1] / 5,
                        box[2] + box[2] / 8, box[3] + box[3] / 10
                    ]
                    boxes[id_] = [box_head]
                    probs[id_] = [probs[id_][max_id]]

                    draw.rectangle(box_head, width=5)
                    if save_faces:
                        if not os.path.exists(save_path):
                            os.mkdir(save_path)
                        if not os.path.exists(os.path.join(save_path,
                                                           'faces')):
                            os.mkdir(os.path.join(save_path, 'faces'))
                        face_batch.append(
                            extract_face(img,
                                         box_head,
                                         save_path=os.path.join(
                                             save_path,
                                             f'detected_face_{id_}-{0}.png')))
                    else:
                        face_batch.append(extract_face(img, box_head))
                elif mode == 'Extract_all':
                    for i, box in enumerate(box_all):
                        box_head = [
                            box[0] - box[0] / 3, box[1] - box[1] / 3,
                            box[2] + box[2] / 83, box[3] + box[3] / 10
                        ]
                        box_all[i] = box_head
                        draw.rectangle(box_head, width=5)  # box.tolist()
                        if save_faces:
                            if not os.path.exists(save_path):
                                os.mkdir(save_path)
                            if not os.path.exists(
                                    os.path.join(save_path, 'faces')):
                                os.mkdir(os.path.join(save_path, 'faces'))
                            face_batch.append(
                                extract_face(
                                    img,
                                    box_head,
                                    save_path=os.path.join(
                                        save_path,
                                        f'detected_face_{id_}-{i}.png')))
                        else:
                            face_batch.append(extract_face(img, box_head))
                else:
                    print(f"Error: there's no mode called {mode}")
                faces.append(face_batch)
                annotates.append(np.asarray(img_annotate))
                if save_annotate:
                    if not os.path.exists(save_path):
                        os.mkdir(save_path)
                    if not os.path.exists(
                            os.path.join(save_path, 'annotations')):
                        os.mkdir(os.path.join(save_path, 'annotations'))
                    img_annotate.save(
                        os.path.join(save_path, f'annotated_faces_{id_}.png'))
            return np.asarray(boxes), probs, annotates, faces
        else:
            print(f"Error: there's no mode called {mode}")

    def _cal_area(self, ls):
        return (ls[2] - ls[0]) * (ls[3] - ls[1])
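
A hedged usage sketch for FaceDetect; it requires a CUDA device because the detector above is created with device='cuda:0', and the image paths are placeholders. All images passed in one call should share the same size (hence the crop_size option).

from PIL import Image

imgs = [Image.open(p).convert('RGB') for p in ['frame_0.jpg', 'frame_1.jpg']]   # placeholder paths

fd = FaceDetect(thresholds=[0.9, 0.9, 0.9], min_face_size=100)
has_face = fd.detect(imgs, mode='Detect_bool')     # True/False
boxes, probs = fd.detect(imgs, mode='Detect')      # raw boxes and scores per image
boxes, probs, annotated, faces = fd.detect(imgs, mode='Extract_largest',
                                           save_annotate=True, save_path='face_result')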
Beispiel #29
0
# assumed minimal setup for the loop below; the original beginning of this example is not shown
ind = 0
mtcnn = MTCNN()
for filename in glob.iglob('with_mask/*.jpg', recursive=True):
    image = cv2.imread(filename)
    try:
        boxes, probs = mtcnn.detect(image)
        if any(probs):  # if a face (or something face-like) is present in the frame
            for prob, box in zip(probs, boxes):
                if prob > 0.90:  # if the face was detected with high enough probability
                    # reached only when the face is clearly recognised
                    box = [int(v) for v in box]  # face coordinates as ints
                    image_new = image[box[1]:box[3],
                                      box[0]:box[2]]  # crop the face out of the photo
                    try:
                        image_new = cv2.resize(image_new, (200, 200))
                        cv2.imwrite(f'./face/with_mask/{ind}.jpg', image_new)
                        ind += 1
                        print(ind)
                    except Exception as e:
                        print(e)
    except:
        continue
Beispiel #30
0
class FaceDetector:
    def __init__(self, min_face_size, margin, prob_threshold):
        self.detector = MTCNN(keep_all=True,
                              post_process=False,
                              device='cuda:0',
                              select_largest=False,
                              min_face_size=min_face_size)
        self.margin = margin
        self.prob_threshold = prob_threshold

    def crop_face(self, img, box):
        max_x = img.shape[1] - 1
        max_y = img.shape[0] - 1
        x1, y1, x2, y2 = [int(round(c)) for c in box]
        x1 = max([x1 - self.margin, 0])
        y1 = max([y1 - self.margin, 0])
        x2 = min([x2 + self.margin, max_x])
        y2 = min([y2 + self.margin, max_y])
        face = img[y1:y2, x1:x2]
        new_box = [[x1, y1], [x2, y2]]

        return face, new_box

    def filter_faces(self, frame_data):
        faces = {}

        for frame_datum in frame_data:
            for face_data in frame_datum['faces']:
                face_id = face_data['id']

                if face_id in faces:
                    faces[face_id].append(face_data['prob'])
                else:
                    faces[face_id] = [face_data['prob']]

        num_frames = len(frame_data)
        face_ids_to_del = set()
        avg_probs = {}

        for face_id, probs in faces.items():
            if len(probs) < (num_frames / 2):
                print(
                    f'FaceDetector::filter_faces: Face with id {face_id} failed to appear in >= {num_frames / 2} frames.'
                )
                face_ids_to_del.add(face_id)
            avg_probs[face_id] = sum(probs) / len(probs)

        faces_remaining = len(faces) - len(face_ids_to_del)

        if faces_remaining > 2:
            print(
                'FaceDetector::filter_faces: More than 2 faces. Only keeping the two with the highest avg prob.'
            )
            avg_probs_sorted = sorted(avg_probs.items(),
                                      key=lambda x: x[1],
                                      reverse=True)

            for face_id, avg_prob in avg_probs_sorted[2:]:
                face_ids_to_del.add(face_id)

        filtered_frame_data = []
        for frame_datum in frame_data:
            faces_filtered = []

            for face_data in frame_datum['faces']:
                if face_data['id'] in face_ids_to_del:
                    continue
                else:
                    faces_filtered.append(face_data)

            frame_datum['faces'] = faces_filtered
            filtered_frame_data.append(frame_datum)

        return filtered_frame_data

    def detect(self, video, num_frames, filt=True):
        vc = cv2.VideoCapture(video)
        imgs = []

        for i in range(num_frames):
            success, img = vc.read()
            if success:
                imgs.append(img)
            else:
                print(
                    'FaceDetector::detect: cv2::VideoCapture::read call failed.'
                )
                return []

        imgs_pil = [Image.fromarray(i) for i in imgs]
        video_boxes, video_probs, video_landmarks = self.detector.detect(
            imgs_pil, landmarks=True)
        past_faces = []
        frame_data = []
        face_id = 0

        def search_past_faces(box, iou_threshold):
            i = 0
            for past_face in past_faces:
                iou = bb_iou(past_face['box'], box)
                if iou > iou_threshold:
                    return (True, i)
                else:
                    i += 1
            return (False, i)

        for frame, (frame_boxes, frame_probs, frame_landmarks) in enumerate(
                zip(video_boxes, video_probs, video_landmarks)):
            contrast_boosted = False
            img = imgs[frame]

            if frame_boxes is None:
                #                 print(f'FaceDetector::detect: No faces in frame {frame}. Boosting contrast.')
                img = boost_contrast([img], 3.0)[0]
                img_pil = Image.fromarray(img)
                frame_boxes, frame_probs, frame_landmarks = self.detector.detect(
                    img_pil, landmarks=True)
                contrast_boosted = True

                if frame_boxes is None:
                    #                     print(f'FaceDetector::detect: No faces after contrast boost. Proceeding to next frame.')
                    frame_data.append({
                        'frame': frame,
                        'faces': [],
                        'contrast_boosted': contrast_boosted
                    })
                    continue

            face_data = []

            for face_box, face_prob, face_landmarks in zip(
                    frame_boxes, frame_probs, frame_landmarks):
                if face_prob < self.prob_threshold:
                    continue

                face_cropped, new_box = self.crop_face(img, face_box)
                found, idx = search_past_faces(face_box, 0.5)

                if found:
                    past_faces[idx]['box'] = face_box
                    face_data.append({
                        'id': past_faces[idx]['id'],
                        'box': new_box,
                        'prob': face_prob,
                        'landmarks': face_landmarks,
                        'img': face_cropped
                    })
                else:
                    past_faces.append({'box': face_box, 'id': face_id})
                    face_data.append({
                        'id': face_id,
                        'box': new_box,
                        'prob': face_prob,
                        'landmarks': face_landmarks,
                        'img': face_cropped
                    })
                    face_id += 1

            frame_data.append({
                'frame': frame,
                'faces': face_data,
                'contrast_boosted': contrast_boosted
            })

        return frame_data if not filt else self.filter_faces(frame_data)
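
A hedged usage sketch for FaceDetector; it requires a CUDA device (the MTCNN instance above is created with device='cuda:0') and the helper functions bb_iou and boost_contrast from the surrounding module. The video path is a placeholder.

detector = FaceDetector(min_face_size=60, margin=16, prob_threshold=0.9)
frame_data = detector.detect('clip.mp4', num_frames=30)   # placeholder video path
for frame_datum in frame_data:
    for face in frame_datum['faces']:
        print(frame_datum['frame'], face['id'], float(face['prob']))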