Example 1
def main():
    # model files check and download
    print("=== ST-GCN model ===")
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
    print("=== OpenPose model ===")
    check_and_download_models(WEIGHT_POSE_PATH, MODEL_POSE_PATH,
                              REMOTE_POSE_PATH)

    # net initialize
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)

    if args.arch == "pyopenpose":
        pose = op.WrapperPython()
        params = dict(model_folder='.', model_pose='COCO')
        pose.configure(params)
        pose.start()
    else:
        pose = ailia.PoseEstimator(MODEL_POSE_PATH,
                                   WEIGHT_POSE_PATH,
                                   env_id=args.env_id,
                                   algorithm=POSE_ALGORITHM)
        if args.arch == "openpose":
            pose.set_threshold(0.1)

    if args.video is not None:
        # realtime mode
        recognize_realtime(args.video, pose, net)
    else:
        # offline mode
        recognize_from_file(args.input, pose, net)
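
The helper check_and_download_models comes from the repository's utilities and is not shown here. A minimal sketch of what it plausibly does, assuming remote_path is a base URL ending in '/' (the real helper may also verify file sizes):

import os
import urllib.request

def check_and_download_models(weight_path, model_path, remote_path):
    # fetch any model file that is not already present locally
    for path in (weight_path, model_path):
        if path is not None and not os.path.exists(path):
            print(f'Downloading {path}...')
            urllib.request.urlretrieve(
                remote_path + os.path.basename(path), path)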
Example 2
def recognize_from_image():
    # prepare input data
    src_img = cv2.imread(args.input)
    input_image = load_image(
        args.input,
        (IMAGE_HEIGHT, IMAGE_WIDTH),
        normalize_type='None'
    )
    input_data = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGRA)

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=env_id, algorithm=ALGORITHM
    )

    # inference
    print('Start inference...')
    if args.benchmark:
        print('BENCHMARK mode')
        for i in range(5):
            start = int(round(time.time() * 1000))
            _ = pose.compute(input_data)
            end = int(round(time.time() * 1000))
            print(f'\tailia processing time {end - start} ms')
    else:
        _ = pose.compute(input_data)
        
    # postprocessing
    count = pose.get_object_count()
    print(f'person_count={count}')
    display_result(src_img, pose)
    cv2.imwrite(args.savepath, src_img)
    print('Script finished successfully.')
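
display_result is defined elsewhere in each script. A hypothetical sketch of the 2D drawing step, assuming ailia's get_object_pose API with normalised keypoint coordinates (the 0.2 threshold is illustrative):

import cv2

def display_result(img, pose):
    # draw every sufficiently confident keypoint of every detected person
    h, w = img.shape[:2]
    for idx in range(pose.get_object_count()):
        person = pose.get_object_pose(idx)
        for point in person.points:
            if point.score > 0.2:
                cv2.circle(img, (int(point.x * w), int(point.y * h)),
                           3, (0, 255, 0), -1)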
Example 3
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(MODEL_PATH,
                               WEIGHT_PATH,
                               env_id=args.env_id,
                               algorithm=ALGORITHM)
    shape = pose.get_input_shape()
    print(shape)
    IMAGE_WIDTH = shape[3]
    IMAGE_HEIGHT = shape[2]

    capture = get_capture(args.video)

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        input_image, input_data = adjust_frame_size(
            frame,
            IMAGE_HEIGHT,
            IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose)
        cv2.imshow('frame', input_image)

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
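
adjust_frame_size comes from the repository's webcamera utilities. A sketch of its assumed behaviour: pad the frame to the model's aspect ratio, then return both the full-resolution image used for display and the resized model input:

import cv2
import numpy as np

def adjust_frame_size(frame, height, width):
    # pad to the target aspect ratio, keeping the frame in the top-left
    f_h, f_w = frame.shape[:2]
    scale = max(f_h / height, f_w / width)
    img = np.zeros((int(scale * height), int(scale * width), 3), np.uint8)
    img[:f_h, :f_w] = frame
    resized = cv2.resize(img, (width, height))
    return img, resized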
Example 4
def recognize_from_image():
    # net initialize
    pose = ailia.PoseEstimator(MODEL_PATH,
                               WEIGHT_PATH,
                               env_id=args.env_id,
                               algorithm=ALGORITHM)

    # input image loop
    for image_path in args.input:
        # prepare input data
        logger.info(image_path)
        src_img = cv2.imread(image_path)
        input_image = load_image(image_path, (IMAGE_HEIGHT, IMAGE_WIDTH),
                                 normalize_type='None')
        input_data = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGRA)

        # inference
        logger.info('Start inference...')
        if args.benchmark:
            logger.info('BENCHMARK mode')
            for i in range(5):
                start = int(round(time.time() * 1000))
                _ = pose.compute(input_data)
                end = int(round(time.time() * 1000))
                logger.info(f'\tailia processing time {end - start} ms')
        else:
            _ = pose.compute(input_data)

        # post-processing
        count = pose.get_object_count()
        logger.info(f'person_count={count}')
        display_result(src_img, pose)
        # cv2.imwrite(args.savepath, src_img)
        cv2.imwrite(get_savepath(args.savepath, image_path), src_img)
    logger.info('Script finished successfully.')
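
get_savepath maps the single savepath argument onto many input files. Its assumed behaviour, sketched (the real helper may also normalise extensions):

import os

def get_savepath(savepath, image_path):
    # if savepath is a directory, keep the input file's name inside it
    if os.path.isdir(savepath):
        return os.path.join(savepath, os.path.basename(image_path))
    return savepath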
Example 5
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )
    baseline = ailia.Net(
        BASELINE_MODEL_PATH, BASELINE_WEIGHT_PATH, env_id=args.env_id
    )
    baseline.set_input_shape((1, 32))

    capture = webcamera_utils.get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        logger.warning(
            'currently, video results cannot be output correctly...'
        )
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        writer = webcamera_utils.get_writer(args.savepath, f_h, f_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        input_image, input_data = webcamera_utils.adjust_frame_size(
            frame, IMAGE_HEIGHT, IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose, baseline)
        cv2.imshow('frame', input_image)

        # display 3d pose
        plt.pause(0.01)
        if not plt.get_fignums():
            break
        # # save results
        # if writer is not None:
        #     writer.write(res_img)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    logger.info('Script finished successfully.')
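
The 3D pose drawn through matplotlib comes from the baseline network. A hypothetical sketch of the lifting step implied by set_input_shape((1, 32)), i.e. 16 joints x (x, y) flattened; the (16, 3) output layout is an assumption:

import numpy as np

# points_2d would be filled from the 2D keypoints returned by `pose`
points_2d = np.zeros((1, 32), dtype=np.float32)
points_3d = baseline.predict(points_2d)[0].reshape(-1, 3)  # assumed (16, 3)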
Example 6
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    pose = ailia.PoseEstimator(MODEL_PATH,
                               WEIGHT_PATH,
                               env_id=env_id,
                               algorithm=ALGORITHM)
    baseline = ailia.Net(BASELINE_MODEL_PATH,
                         BASELINE_WEIGHT_PATH,
                         env_id=env_id)
    baseline.set_input_shape((1, 32))

    if args.video == '0':
        print('[INFO] Webcam mode is activated')
        capture = cv2.VideoCapture(0)
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    else:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    while True:
        ret, frame = capture.read()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if not ret:
            continue

        input_image, input_data = adjust_frame_size(
            frame,
            IMAGE_HEIGHT,
            IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose, baseline)
        cv2.imshow('frame', input_image)

        # display 3d pose
        plt.pause(0.01)

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
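
The plt.pause() call assumes a matplotlib figure already exists and that display_result updates it each frame. A minimal setup consistent with that (an assumption, not shown in the snippet):

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401, registers the 3d projection

plt.ion()  # interactive mode so pause() refreshes without blocking
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')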
Example 7
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(MODEL_PATH,
                               WEIGHT_PATH,
                               env_id=args.env_id,
                               algorithm=ALGORITHM)
    if args.threshold != THRESHOLD_DEFAULT:
        pose.set_threshold(args.threshold)

    capture = webcamera_utils.get_capture(args.video)
    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        save_h, save_w = webcamera_utils.calc_adjust_fsize(
            f_h, f_w, IMAGE_HEIGHT, IMAGE_WIDTH)
        writer = webcamera_utils.get_writer(args.savepath, save_h, save_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        input_image, input_data = webcamera_utils.adjust_frame_size(
            frame,
            IMAGE_HEIGHT,
            IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose)
        cv2.imshow('frame', input_image)

        # save results
        if writer is not None:
            writer.write(input_image)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    print('Script finished successfully.')
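
get_writer wraps cv2.VideoWriter. A minimal sketch, assuming an mp4 container as in Example 12 below (the real helper may derive codec and fps differently):

import cv2

def get_writer(savepath, height, width, fps=20):
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # note cv2.VideoWriter takes (width, height), not (height, width)
    return cv2.VideoWriter(savepath, fourcc, fps, (width, height))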
Example 8
def recognize_from_image():
    # net initialize
    pose = ailia.PoseEstimator(MODEL_PATH,
                               WEIGHT_PATH,
                               env_id=args.env_id,
                               algorithm=ALGORITHM)
    if args.detection_width != IMAGE_WIDTH or args.detection_height != IMAGE_HEIGHT:
        pose.set_input_shape(
            (1, 3, args.detection_height, args.detection_width))

    # input image loop
    for image_path in args.input:
        # prepare input data
        logger.info(image_path)
        # prepare input data
        src_img = cv2.imread(image_path)
        input_image = load_image(
            image_path,
            (args.detection_height, args.detection_width),
            normalize_type='None',
        )
        input_data = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGRA)

        # inference
        logger.info('Start inference...')
        if args.benchmark:
            logger.info('BENCHMARK mode')
            total_time = 0
            for i in range(args.benchmark_count):
                start = int(round(time.time() * 1000))
                _ = pose.compute(input_data)
                end = int(round(time.time() * 1000))
                if i != 0:
                    total_time = total_time + (end - start)
                logger.info(f'\tailia processing time {end - start} ms')
            logger.info(
                f'\taverage time {total_time / (args.benchmark_count-1)} ms')
        else:
            _ = pose.compute(input_data)

        # postprocessing
        count = pose.get_object_count()
        logger.info(f'person_count={count}')
        display_result(src_img, pose)
        savepath = get_savepath(args.savepath, image_path)
        logger.info(f'saved at : {savepath}')
        cv2.imwrite(savepath, src_img)
    logger.info('Script finished successfully.')
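
The benchmark loop above excludes the first run from the average, treating it as warm-up. The same idea as a reusable helper, sketched:

import time

def benchmark(fn, count=5):
    # time fn() count times; discard the first, typically slower, run
    times = []
    for _ in range(count):
        start = time.perf_counter()
        fn()
        times.append((time.perf_counter() - start) * 1000)
    return sum(times[1:]) / (len(times) - 1)

Usage: benchmark(lambda: pose.compute(input_data)).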
Example 9
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(MODEL_PATH,
                               WEIGHT_PATH,
                               env_id=args.env_id,
                               algorithm=ALGORITHM)
    if args.detection_width != IMAGE_WIDTH or args.detection_height != IMAGE_HEIGHT:
        pose.set_input_shape(
            (1, 3, args.detection_height, args.detection_width))

    capture = webcamera_utils.get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        writer = webcamera_utils.get_writer(args.savepath, f_h, f_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(frame)

        # postprocessing
        display_result(frame, pose)
        cv2.imshow('frame', frame)

        # save results
        if writer is not None:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)
            writer.write(frame)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    logger.info('Script finished successfully.')
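
Unlike the other video examples, this one feeds the raw BGRA frame straight to compute(), relying on the estimator to rescale internally to its configured input shape. A quick check of that shape, assuming the 4-tuple NCHW layout seen in Example 3:

n, c, h, w = pose.get_input_shape()
print(f'model input: {w}x{h}')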
Example 10
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    detector = ailia.Detector(MODEL_PATH,
                              WEIGHT_PATH,
                              len(HAND_CATEGORY),
                              format=ailia.NETWORK_IMAGE_FORMAT_RGB,
                              channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
                              range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
                              algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
                              env_id=env_id)

    hand = ailia.PoseEstimator(HAND_MODEL_PATH,
                               HAND_WEIGHT_PATH,
                               env_id=env_id,
                               algorithm=HAND_ALGORITHM)
    hand.set_threshold(0.1)

    if args.video == '0':
        print('[INFO] Webcam mode is activated')
        capture = cv2.VideoCapture(0)
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    else:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    while True:
        ret, frame = capture.read()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if not ret:
            continue

        img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        detector.compute(img, THRESHOLD, IOU)

        h, w = img.shape[0], img.shape[1]
        count = detector.get_object_count()
        for idx in range(count):
            # get detected hand
            obj = detector.get_object(idx)
            margin = 1.0
            cx = (obj.x + obj.w / 2) * w
            cy = (obj.y + obj.h / 2) * h
            cw = max(obj.w * w, obj.h * h) * margin
            fx = max(cx - cw / 2, 0)
            fy = max(cy - cw / 2, 0)
            fw = min(cw, w - fx)
            fh = min(cw, h - fy)
            top_left = (int(fx), int(fy))
            bottom_right = (int(fx + fw), int(fy + fh))

            # display detected hand
            color = hsv_to_rgb(0, 255, 255)
            cv2.rectangle(frame, top_left, bottom_right, color, 4)

            # crop detected hand region
            crop_img = img[top_left[1]:bottom_right[1],
                           top_left[0]:bottom_right[0], 0:4]
            if crop_img.shape[0] <= 0 or crop_img.shape[1] <= 0:
                continue

            # inference
            _ = hand.compute(crop_img.astype(np.uint8, order='C'))

            # postprocessing
            display_result(frame, hand, top_left, bottom_right)

        cv2.imshow('frame', frame)

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
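
hsv_to_rgb is another repository helper. One plausible implementation using OpenCV's HSV ranges (hue 0-179); the exact return convention is an assumption:

import cv2
import numpy as np

def hsv_to_rgb(h, s, v):
    # convert a single HSV triple to an integer colour tuple for drawing
    bgr = cv2.cvtColor(
        np.array([[[h, s, v]]], dtype=np.uint8), cv2.COLOR_HSV2BGR)[0][0]
    return (int(bgr[2]), int(bgr[1]), int(bgr[0]))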
Example 11
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    if args.env_id is not None:
        count = ailia.get_environment_count()
        if count > args.env_id:
            env_id = args.env_id
        else:
            print(f'specified env_id: {args.env_id} not found; using default')
    print(f'env_id: {env_id}')

    detector = ailia.Detector(MODEL_PATH,
                              WEIGHT_PATH,
                              len(HAND_CATEGORY),
                              format=ailia.NETWORK_IMAGE_FORMAT_RGB,
                              channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
                              range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
                              algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
                              env_id=env_id)

    hand = ailia.PoseEstimator(HAND_MODEL_PATH,
                               HAND_WEIGHT_PATH,
                               env_id=env_id,
                               algorithm=HAND_ALGORITHM)
    hand.set_threshold(0.1)

    ailia_input_w = detector.get_input_shape()[3]
    ailia_input_h = detector.get_input_shape()[2]

    capture = get_capture(args.video)
    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        save_h, save_w = calc_adjust_fsize(f_h, f_w, ailia_input_h,
                                           ailia_input_w)
        writer = get_writer(args.savepath, save_h, save_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        detector.compute(img, THRESHOLD, IOU)

        h, w = img.shape[0], img.shape[1]
        count = detector.get_object_count()
        for idx in range(count):
            # get detected hand
            obj = detector.get_object(idx)
            margin = 1.0
            cx = (obj.x + obj.w / 2) * w
            cy = (obj.y + obj.h / 2) * h
            cw = max(obj.w * w, obj.h * h) * margin
            fx = max(cx - cw / 2, 0)
            fy = max(cy - cw / 2, 0)
            fw = min(cw, w - fx)
            fh = min(cw, h - fy)
            top_left = (int(fx), int(fy))
            bottom_right = (int(fx + fw), int(fy + fh))

            # display detected hand
            color = hsv_to_rgb(0, 255, 255)
            cv2.rectangle(frame, top_left, bottom_right, color, 4)

            # crop detected hand region
            crop_img = img[top_left[1]:bottom_right[1],
                           top_left[0]:bottom_right[0], 0:4]
            if crop_img.shape[0] <= 0 or crop_img.shape[1] <= 0:
                continue

            # inference
            _ = hand.compute(crop_img.astype(np.uint8, order='C'))

            # postprocessing
            display_result(frame, hand, top_left, bottom_right)

        cv2.imshow('frame', frame)

        # save results
        if writer is not None:
            writer.write(frame)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    print('Script finished successfully.')
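
A worked example of the square-crop math used in the two hand examples above, with illustrative numbers:

# a detection obj = (x=0.25, y=0.25, w=0.20, h=0.30), normalised, in a
# 640x480 frame:
w, h = 640, 480
cx = (0.25 + 0.20 / 2) * w            # 224.0, crop centre x
cy = (0.25 + 0.30 / 2) * h            # 192.0, crop centre y
cw = max(0.20 * w, 0.30 * h) * 1.0    # max(128, 144) = 144.0, square side
# the crop is the 144x144 square centred on the hand, clamped to the frame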
Example 12
def recognize_from_video():
    try:
        print('[INFO] Webcam mode is activated')
        RECORD_TIME = 80
        capture = cv2.VideoCapture(int(args.video))
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    except ValueError:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    if FRAME_SKIP:
        action_recognize_fps = int(args.fps)
    else:
        action_recognize_fps = frame_rate

    if args.savepath != "":
        size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        writer = cv2.VideoWriter(args.savepath, fmt, action_recognize_fps,
                                 size)
    else:
        writer = None

    # pose estimation
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    if args.arch == "lw_human_pose":
        pose = ailia.PoseEstimator(MODEL_PATH,
                                   WEIGHT_PATH,
                                   env_id=env_id,
                                   algorithm=ALGORITHM)

        detector = None
    else:
        detector = ailia.Detector(DETECTOR_MODEL_PATH,
                                  DETECTOR_WEIGHT_PATH,
                                  len(COCO_CATEGORY),
                                  format=ailia.NETWORK_IMAGE_FORMAT_RGB,
                                  channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
                                  range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
                                  algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
                                  env_id=env_id)

        pose = ailia.Net(POSE_MODEL_PATH, POSE_WEIGHT_PATH, env_id=env_id)

    # tracker class instance
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=env_id)
    metric = NearestNeighborDistanceMetric("cosine", MAX_COSINE_DISTANCE,
                                           NN_BUDGET)
    tracker = Tracker(metric, max_iou_distance=0.7, max_age=70, n_init=3)

    # action recognition
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    model = ailia.Net(ACTION_MODEL_PATH, ACTION_WEIGHT_PATH, env_id=env_id)

    action_data = {}

    frame_nb = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    idx_frame = 0

    time_start = time.time()
    while True:
        time_curr = time.time()
        if args.video == '0' and time_curr - time_start > RECORD_TIME:
            break
        ret, frame = capture.read()

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if (not ret) or (frame_nb >= 1 and idx_frame >= frame_nb):
            break

        if FRAME_SKIP:
            mod = round(frame_rate / action_recognize_fps)
            if mod >= 1:
                if idx_frame % mod != 0:
                    idx_frame = idx_frame + 1
                    continue

        input_image, input_data = adjust_frame_size(
            frame,
            frame.shape[0],
            frame.shape[1],
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        if args.arch == "lw_human_pose":
            _ = pose.compute(input_data)
        else:
            detector.compute(input_data, THRESHOLD, IOU)

        # deepsort format
        h, w = input_image.shape[0], input_image.shape[1]
        if args.arch == "lw_human_pose":
            bbox_xywh, cls_conf, cls_ids = get_detector_result_lw_human_pose(
                pose, h, w)
        else:
            bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # dilate the bbox in case it is too small;
        # remove this step when using a better pedestrian detector
        if args.arch == "pose_resnet":
            # bbox_xywh[:, 3:] *= 1.2   #May need to be removed in the future
            cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(input_image[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # action detection
        actions = []
        persons = []
        if len(outputs) > 0:
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            for i, box in enumerate(bbox_xyxy):
                id = identities[i]

                if id not in action_data:
                    action_data[id] = np.zeros(
                        (ailia.POSE_KEYPOINT_CNT - 1, TIME_RANGE, 3))

                # action recognition
                action, person = action_recognition(box, input_image, pose,
                                                    detector, model,
                                                    action_data[id])
                actions.append(action)
                persons.append(person)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(input_image, bbox_xyxy, identities, actions,
                               action_data, (0, 0))

            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

        # draw skeleton
        for person in persons:
            if person is not None:
                display_result(input_image, person)

        if writer is not None:
            writer.write(input_image)

            # show progress
            if idx_frame == 0:
                print()
            print("\r" + str(idx_frame + 1) + " / " + str(frame_nb), end="")
            if idx_frame == frame_nb - 1:
                print()

        cv2.imshow('frame', input_image)

        idx_frame = idx_frame + 1

    if writer is not None:
        writer.release()

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
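
The box-format conversions (xywh_to_xyxy and friends) are assumed helpers from the tracking utilities. A sketch of xywh_to_xyxy consistent with how it is called above, assuming centre-format pixel boxes:

def xywh_to_xyxy(box, h, w):
    # (cx, cy, bw, bh) in pixels -> integer corners clamped to the frame
    cx, cy, bw, bh = box
    x1 = max(int(cx - bw / 2), 0)
    y1 = max(int(cy - bh / 2), 0)
    x2 = min(int(cx + bw / 2), w - 1)
    y2 = min(int(cy + bh / 2), h - 1)
    return x1, y1, x2, y2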