def __init__(self, model_path):
        # Detection confidence threshold: detections below this value are ignored
        # (i.e. bboxes with confidence below self.min_confidence are filtered out when building detections)
        self.min_confidence = 0.25
        self.nms_max_overlap = 1.0  # non-maximum suppression overlap threshold (original value: 1.0)
        # NMS (with self.nms_max_overlap set to 1.0, all detections are kept)
        self.extractor = Extractor(model_path, use_cuda=True)

        max_cosine_distance = 0.2  # cosine-distance gating threshold; tuning this can help reduce ID switches
        # Maximum size of the per-track appearance gallery: a list of features from each of a track's appearances.
        # nn_budget bounds this list; e.g. if it is 10, only the features from the track's last 10 appearances are stored.
        nn_budget = 100
        metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance,
                                               nn_budget)
        self.tracker = Tracker(metric)
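A minimal sketch (not part of the original class) of how min_confidence and nms_max_overlap are typically applied when building detections, assuming numpy as np plus the Detection, xywh_to_tlwh and non_max_suppression helpers that the video functions below import:

def _filter_detections(bbox_xywh, confidences, features,
                       min_confidence=0.25, nms_max_overlap=1.0):
    # keep only boxes whose confidence exceeds the threshold
    bbox_tlwh = xywh_to_tlwh(bbox_xywh)
    detections = [
        Detection(bbox_tlwh[i], conf, features[i])
        for i, conf in enumerate(confidences) if conf > min_confidence
    ]
    # with nms_max_overlap == 1.0 this NMS pass keeps every detection
    boxes = np.array([d.tlwh for d in detections])
    scores = np.array([d.confidence for d in detections])
    keep = non_max_suppression(boxes, nms_max_overlap, scores)
    return [detections[i] for i in keep]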
Example #2
    def __init__(self, model_path):
        self.min_confidence = 0.3
        self.nms_max_overlap = 1.0

        self.extractor = Extractor(model_path, use_cuda=True)

        max_cosine_distance = 0.2
        nn_budget = 100
        n_init = 0    # consecutive detections required before a track is confirmed
        max_age = 30  # frames a track may go unmatched before it is deleted
        metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance,
                                               nn_budget)
        self.tracker = Tracker(metric, max_age=max_age, n_init=n_init)
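A hedged sketch of what max_cosine_distance and nn_budget control, assuming the reference deep_sort NearestNeighborDistanceMetric API (partial_fit / distance); the Tracker used here may wrap a modified copy:

import numpy as np

metric = NearestNeighborDistanceMetric("cosine", 0.2, 100)

# the metric keeps at most nn_budget (=100) appearance features per track id
gallery = np.random.rand(4, 128).astype(np.float32)
metric.partial_fit(gallery, np.array([1, 1, 2, 2]), active_targets=[1, 2])

# distance() returns, per track, the smallest cosine distance to each detection;
# costs above max_cosine_distance (=0.2) are treated as infeasible matches,
# which is why tightening this threshold can reduce ID switches.
new_features = np.random.rand(3, 128).astype(np.float32)
cost = metric.distance(new_features, targets=[1, 2])  # shape (2, 3)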
Example #3
 def __init__(self,
              model_path='yolov3/of_model/yolov3_model_python/',
              gpu_ids='0',
              model_name='resid',
              confidence_l=0.2,
              confidence_h=0.4,
              max_cosine_distance=0.2,
              max_iou_distance=0.7,
              save_feature=False,
              use_filter=False,
              init_extractor=True,
              max_age=30,
              std_Q_w=1e-1,
              std_Q_wv=1e-3,
              std_R_w=5e-2,
              cls_=0):
     self.confidence_l = confidence_l
     self.confidence_h = confidence_h
     self.iou_thresh_l = 0.24
     self.iou_thresh = 0.5
     self.nms_max_overlap = 1.0
     self.extractor = None
     self.height, self.width = None, None
     if init_extractor:
         self.extractor = Extractor(model_name=model_name,
                                    load_path=model_path,
                                    gpu_ids=gpu_ids,
                                    cls=cls_)
     max_iou = max_iou_distance
     nn_budget = 100
     metric = NearestNeighborDistanceMetric("cosine", max_cosine_distance,
                                            nn_budget)
     self.tracker = Tracker(metric,
                            max_iou_distance=max_iou,
                            max_age=max_age,
                            std_Q_w=std_Q_w,
                            std_Q_wv=std_Q_wv,
                            std_R_w=std_R_w)
     self.all_feature = None
     self.save_feature = save_feature
     self.count = 1
     self.result = []
     self.use_filter = use_filter
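A hedged sketch of the per-frame loop these trackers are driven by, mirroring the predict/update/read-out pattern in the recognize_from_video functions below (tlwh_to_xyxy is the sample's own helper):

def track_one_frame(tracker, detections, h, w):
    # propagate every track's state, then associate the new detections
    tracker.predict()
    tracker.update(detections)
    outputs = []
    for track in tracker.tracks:
        # skip tentative tracks and tracks not matched in this frame
        if not track.is_confirmed() or track.time_since_update > 1:
            continue
        x1, y1, x2, y2 = tlwh_to_xyxy(track.to_tlwh(), h, w)
        outputs.append([x1, y1, x2, y2, track.track_id])
    return outputs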
Example #4
def recognize_from_video():
    results = []
    idx_frame = 0

    # net initialize
    detector = init_detector(args.env_id)
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=args.env_id)

    # tracker class instance
    metric = NearestNeighborDistanceMetric(
        "cosine", MAX_COSINE_DISTANCE, NN_BUDGET
    )
    tracker = Tracker(
        metric,
        max_iou_distance=0.7,
        max_age=70,
        n_init=3
    )

    capture = webcamera_utils.get_capture(args.video)

    # create video writer
    if args.savepath is not None:
        writer = webcamera_utils.get_writer(
            args.savepath,
            int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
        )
    else:
        writer = None

    print('Start Inference...')
    while True:
        idx_frame += 1
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        # In order to use ailia.Detector, the input should have 4 channels.
        input_img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        h, w = frame.shape[0], frame.shape[1]

        # do detection
        detector.compute(input_img, THRESHOLD, IOU)
        bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        # select person class
        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # dilate the bbox slightly in case it is too small;
        # delete this line if using a better pedestrian detector
        bbox_xywh[:, 3:] *= 1.2
        cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(frame[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(frame, bbox_xyxy, identities)

            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

            results.append((idx_frame - 1, bbox_tlwh, identities))

        cv2.imshow('frame', frame)

        if writer is not None:
            writer.write(frame)

        if args.savepath is not None:
            write_results(args.savepath.split('.')[0] + '.txt', results, 'mot')
        else:
            write_results('result.txt', results, 'mot')

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()

    print(f'Save results to {args.savepath}')
    print('Script finished successfully.')
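A hedged sketch of the output that write_results(..., 'mot') is assumed to produce: one line per box in the MOT Challenge layout frame,id,x,y,w,h,conf,-1,-1,-1 (the real helper lives in the sample's utility module and may differ, e.g. in frame indexing):

def write_results_sketch(filename, results):
    # results: list of (frame_id, [tlwh boxes], [track ids]) tuples as collected above
    with open(filename, 'w') as f:
        for frame_id, tlwhs, track_ids in results:
            for (x, y, w, h), tid in zip(tlwhs, track_ids):
                f.write(f"{frame_id},{tid},{x},{y},{w},{h},1,-1,-1,-1\n")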
def recognize_from_video():
    try:
        print('[INFO] Webcam mode is activated')
        RECORD_TIME = 80
        capture = cv2.VideoCapture(int(args.video))
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    except ValueError:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    if FRAME_SKIP:
        action_recognize_fps = int(args.fps)
    else:
        action_recognize_fps = frame_rate

    if args.savepath != "":
        size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        writer = cv2.VideoWriter(args.savepath, fmt, action_recognize_fps,
                                 size)
    else:
        writer = None

    # pose estimation
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    if args.arch == "lw_human_pose":
        pose = ailia.PoseEstimator(MODEL_PATH,
                                   WEIGHT_PATH,
                                   env_id=env_id,
                                   algorithm=ALGORITHM)

        detector = None
    else:
        detector = ailia.Detector(DETECTOR_MODEL_PATH,
                                  DETECTOR_WEIGHT_PATH,
                                  len(COCO_CATEGORY),
                                  format=ailia.NETWORK_IMAGE_FORMAT_RGB,
                                  channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
                                  range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
                                  algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
                                  env_id=env_id)

        pose = ailia.Net(POSE_MODEL_PATH, POSE_WEIGHT_PATH, env_id=env_id)

    # tracker class instance
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=env_id)
    metric = NearestNeighborDistanceMetric("cosine", MAX_COSINE_DISTANCE,
                                           NN_BUDGET)
    tracker = Tracker(metric, max_iou_distance=0.7, max_age=70, n_init=3)

    # action recognition
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    model = ailia.Net(ACTION_MODEL_PATH, ACTION_WEIGHT_PATH, env_id=env_id)

    action_data = {}

    frame_nb = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    idx_frame = 0

    time_start = time.time()
    while True:
        time_curr = time.time()
        if args.video == '0' and time_curr - time_start > RECORD_TIME:
            break
        ret, frame = capture.read()

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if (not ret) or (frame_nb >= 1 and idx_frame >= frame_nb):
            break

        if FRAME_SKIP:
            mod = round(frame_rate / action_recognize_fps)
            if mod >= 1:
                if idx_frame % mod != 0:
                    idx_frame = idx_frame + 1
                    continue

        input_image, input_data = adjust_frame_size(
            frame,
            frame.shape[0],
            frame.shape[1],
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        if args.arch == "lw_human_pose":
            _ = pose.compute(input_data)
        else:
            detector.compute(input_data, THRESHOLD, IOU)

        # deepsort format
        h, w = input_image.shape[0], input_image.shape[1]
        if args.arch == "lw_human_pose":
            bbox_xywh, cls_conf, cls_ids = get_detector_result_lw_human_pose(
                pose, h, w)
        else:
            bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # dilate the bbox slightly in case it is too small;
        # delete this line if using a better pedestrian detector
        if args.arch == "pose_resnet":
            # bbox_xywh[:, 3:] *= 1.2  # may need to be removed in the future
            cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(input_image[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # action detection
        actions = []
        persons = []
        if len(outputs) > 0:
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            for i, box in enumerate(bbox_xyxy):
                id = identities[i]

                if id not in action_data:
                    action_data[id] = np.zeros(
                        (ailia.POSE_KEYPOINT_CNT - 1, TIME_RANGE, 3))

                # action recognition
                action, person = action_recognition(box, input_image, pose,
                                                    detector, model,
                                                    action_data[id])
                actions.append(action)
                persons.append(person)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(input_image, bbox_xyxy, identities, actions,
                               action_data, (0, 0))

            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

        # draw skeleton
        for person in persons:
            if person is not None:
                display_result(input_image, person)

        if writer is not None:
            writer.write(input_image)

            # show progress
            if idx_frame == 0:
                print()
            print("\r" + str(idx_frame + 1) + " / " + str(frame_nb), end="")
            if idx_frame == frame_nb - 1:
                print()

        cv2.imshow('frame', input_image)

        idx_frame = idx_frame + 1

    if writer is not None:
        writer.release()

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
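A hedged sketch of the bbox conversions both functions rely on; the real xywh_to_tlwh / xywh_to_xyxy helpers come from the sample's utility module, and these minimal versions assume a (center x, center y, width, height) input format, which matches how they are called above:

import numpy as np

def xywh_to_tlwh_sketch(bbox_xywh):
    # (cx, cy, w, h) -> (top-left x, top-left y, w, h)
    bbox_tlwh = np.asarray(bbox_xywh, dtype=float).copy()
    bbox_tlwh[:, 0] -= bbox_tlwh[:, 2] / 2.0
    bbox_tlwh[:, 1] -= bbox_tlwh[:, 3] / 2.0
    return bbox_tlwh

def xywh_to_xyxy_sketch(box, height, width):
    # (cx, cy, w, h) -> (x1, y1, x2, y2), clipped to the frame
    cx, cy, w, h = box
    x1 = max(int(cx - w / 2), 0)
    x2 = min(int(cx + w / 2), width - 1)
    y1 = max(int(cy - h / 2), 0)
    y2 = min(int(cy + h / 2), height - 1)
    return x1, y1, x2, y2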