def main():
    # model files check and download
    print("=== ST-GCN model ===")
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
    print("=== OpenPose model ===")
    check_and_download_models(WEIGHT_POSE_PATH, MODEL_POSE_PATH, REMOTE_POSE_PATH)

    # net initialize
    net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
    if args.arch == "pyopenpose":
        pose = op.WrapperPython()
        params = dict(model_folder='.', model_pose='COCO')
        pose.configure(params)
        pose.start()
    else:
        pose = ailia.PoseEstimator(
            MODEL_POSE_PATH, WEIGHT_POSE_PATH,
            env_id=args.env_id, algorithm=POSE_ALGORITHM
        )
        if args.arch == "openpose":
            pose.set_threshold(0.1)

    if args.video is not None:
        # realtime mode
        recognize_realtime(args.video, pose, net)
    else:
        # offline mode
        recognize_from_file(args.input, pose, net)
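# A minimal entry-point sketch: the sample scripts in this repo conventionally
# guard main() as below so the module can be imported without side effects.
# This guard is assumed from the repo's usual convention, not copied from this file.
if __name__ == '__main__':
    main()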
def recognize_from_image():
    # prepare input data
    src_img = cv2.imread(args.input)
    input_image = load_image(
        args.input,
        (IMAGE_HEIGHT, IMAGE_WIDTH),
        normalize_type='None',
    )
    input_data = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGRA)

    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=env_id, algorithm=ALGORITHM
    )

    # inference
    print('Start inference...')
    if args.benchmark:
        print('BENCHMARK mode')
        for i in range(5):
            start = int(round(time.time() * 1000))
            _ = pose.compute(input_data)
            end = int(round(time.time() * 1000))
            print(f'\tailia processing time {end - start} ms')
    else:
        _ = pose.compute(input_data)

    # postprocessing
    count = pose.get_object_count()
    print(f'person_count={count}')
    display_result(src_img, pose)
    cv2.imwrite(args.savepath, src_img)
    print('Script finished successfully.')
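# A minimal sketch of the per-person keypoint readout that display_result
# builds on. get_object_pose() and the per-keypoint x/y/score fields follow
# the ailia PoseEstimator API; print_keypoints itself is a hypothetical
# helper for illustration, not a function defined in this repo.
def print_keypoints(pose):
    for idx in range(pose.get_object_count()):
        person = pose.get_object_pose(idx)
        for k in range(ailia.POSE_KEYPOINT_CNT):
            point = person.points[k]
            # coordinates are normalized; scale by the image size to draw
            print(f'person {idx} keypoint {k}: '
                  f'x={point.x:.3f} y={point.y:.3f} score={point.score:.3f}')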
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )
    shape = pose.get_input_shape()
    print(shape)
    # input shape is (N, C, H, W)
    IMAGE_WIDTH = shape[3]
    IMAGE_HEIGHT = shape[2]

    capture = get_capture(args.video)

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        input_image, input_data = adjust_frame_size(
            frame, IMAGE_HEIGHT, IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose)
        cv2.imshow('frame', input_image)

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
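# Illustrative stand-in for adjust_frame_size (the real helper lives in this
# repo's utils and may differ, e.g. in how it pads or centers): scale the
# frame to the model input size while keeping aspect ratio, returning both a
# display image and a model-sized input. All names here are assumptions.
import numpy as np

def adjust_frame_size_sketch(frame, input_h, input_w):
    f_h, f_w = frame.shape[:2]
    scale = min(input_h / f_h, input_w / f_w)
    resized = cv2.resize(frame, (int(f_w * scale), int(f_h * scale)))
    canvas = np.zeros((input_h, input_w, 3), dtype=np.uint8)
    canvas[:resized.shape[0], :resized.shape[1]] = resized
    return frame, canvas  # (display image, model input)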
def recognize_from_image():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )

    # input image loop
    for image_path in args.input:
        # prepare input data
        logger.info(image_path)
        src_img = cv2.imread(image_path)
        input_image = load_image(
            image_path, (IMAGE_HEIGHT, IMAGE_WIDTH), normalize_type='None'
        )
        input_data = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGRA)

        # inference
        logger.info('Start inference...')
        if args.benchmark:
            logger.info('BENCHMARK mode')
            for i in range(5):
                start = int(round(time.time() * 1000))
                _ = pose.compute(input_data)
                end = int(round(time.time() * 1000))
                logger.info(f'\tailia processing time {end - start} ms')
        else:
            _ = pose.compute(input_data)

        # post-processing
        count = pose.get_object_count()
        logger.info(f'person_count={count}')
        display_result(src_img, pose)
        cv2.imwrite(get_savepath(args.savepath, image_path), src_img)

    logger.info('Script finished successfully.')
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )
    baseline = ailia.Net(
        BASELINE_MODEL_PATH, BASELINE_WEIGHT_PATH, env_id=args.env_id
    )
    baseline.set_input_shape((1, 32))

    capture = webcamera_utils.get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        logger.warning(
            'currently, video results cannot be output correctly...'
        )
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        writer = webcamera_utils.get_writer(args.savepath, f_h, f_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        input_image, input_data = webcamera_utils.adjust_frame_size(
            frame, IMAGE_HEIGHT, IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose, baseline)
        cv2.imshow('frame', input_image)

        # display 3d pose
        plt.pause(0.01)
        if not plt.get_fignums():
            break

        # save results (disabled until video output works correctly)
        # if writer is not None:
        #     writer.write(res_img)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    logger.info('Script finished successfully.')
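# The 3D pose window that plt.pause() keeps alive must be created elsewhere in
# the script. A minimal sketch of such a setup (assumed for illustration, not
# this repo's actual plotting code):
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')  # 3D axes for the lifted skeleton
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.ion()   # interactive mode, so plt.pause() refreshes without blocking
plt.show()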
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=env_id, algorithm=ALGORITHM
    )
    baseline = ailia.Net(
        BASELINE_MODEL_PATH, BASELINE_WEIGHT_PATH, env_id=env_id
    )
    baseline.set_input_shape((1, 32))

    if args.video == '0':
        print('[INFO] Webcam mode is activated')
        capture = cv2.VideoCapture(0)
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    else:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    while True:
        ret, frame = capture.read()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if not ret:
            continue

        input_image, input_data = adjust_frame_size(
            frame, IMAGE_HEIGHT, IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose, baseline)
        cv2.imshow('frame', input_image)

        # display 3d pose
        plt.pause(0.01)

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )
    if args.threshold != THRESHOLD_DEFAULT:
        pose.set_threshold(args.threshold)

    capture = webcamera_utils.get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        save_h, save_w = webcamera_utils.calc_adjust_fsize(
            f_h, f_w, IMAGE_HEIGHT, IMAGE_WIDTH
        )
        writer = webcamera_utils.get_writer(args.savepath, save_h, save_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        input_image, input_data = webcamera_utils.adjust_frame_size(
            frame, IMAGE_HEIGHT, IMAGE_WIDTH,
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(input_data)

        # postprocessing
        display_result(input_image, pose)
        cv2.imshow('frame', input_image)

        # save results
        if writer is not None:
            writer.write(input_image)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    print('Script finished successfully.')
def recognize_from_image():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )
    if args.detection_width != IMAGE_WIDTH or args.detection_height != IMAGE_HEIGHT:
        pose.set_input_shape(
            (1, 3, args.detection_height, args.detection_width)
        )

    # input image loop
    for image_path in args.input:
        # prepare input data
        logger.info(image_path)
        src_img = cv2.imread(image_path)
        input_image = load_image(
            image_path,
            (args.detection_height, args.detection_width),
            normalize_type='None',
        )
        input_data = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGRA)

        # inference
        logger.info('Start inference...')
        if args.benchmark:
            logger.info('BENCHMARK mode')
            total_time = 0
            for i in range(args.benchmark_count):
                start = int(round(time.time() * 1000))
                _ = pose.compute(input_data)
                end = int(round(time.time() * 1000))
                # skip the first run: it absorbs one-time initialization cost
                if i != 0:
                    total_time = total_time + (end - start)
                logger.info(f'\tailia processing time {end - start} ms')
            logger.info(
                f'\taverage time {total_time / (args.benchmark_count - 1)} ms'
            )
        else:
            _ = pose.compute(input_data)

        # postprocessing
        count = pose.get_object_count()
        logger.info(f'person_count={count}')
        display_result(src_img, pose)

        savepath = get_savepath(args.savepath, image_path)
        logger.info(f'saved at : {savepath}')
        cv2.imwrite(savepath, src_img)

    logger.info('Script finished successfully.')
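# The benchmark above measures wall-clock milliseconds and excludes the first
# run from the average. A hypothetical variant using time.perf_counter(),
# which has finer resolution than time.time() (a sketch, not what the script
# uses):
import time

def benchmark(fn, data, runs=5):
    times = []
    for i in range(runs):
        start = time.perf_counter()
        fn(data)
        times.append((time.perf_counter() - start) * 1000)
    # average in ms, excluding the warm-up run
    return sum(times[1:]) / (len(times) - 1)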
def recognize_from_video():
    # net initialize
    pose = ailia.PoseEstimator(
        MODEL_PATH, WEIGHT_PATH, env_id=args.env_id, algorithm=ALGORITHM
    )
    if args.detection_width != IMAGE_WIDTH or args.detection_height != IMAGE_HEIGHT:
        pose.set_input_shape(
            (1, 3, args.detection_height, args.detection_width)
        )

    capture = webcamera_utils.get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        writer = webcamera_utils.get_writer(args.savepath, f_h, f_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)

        # inference
        _ = pose.compute(frame)

        # postprocessing
        display_result(frame, pose)
        cv2.imshow('frame', frame)

        # save results
        if writer is not None:
            # the writer expects 3-channel BGR frames
            frame = cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)
            writer.write(frame)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    logger.info('Script finished successfully.')
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    detector = ailia.Detector(
        MODEL_PATH, WEIGHT_PATH, len(HAND_CATEGORY),
        format=ailia.NETWORK_IMAGE_FORMAT_RGB,
        channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
        range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
        algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
        env_id=env_id,
    )
    hand = ailia.PoseEstimator(
        HAND_MODEL_PATH, HAND_WEIGHT_PATH,
        env_id=env_id, algorithm=HAND_ALGORITHM
    )
    hand.set_threshold(0.1)

    if args.video == '0':
        print('[INFO] Webcam mode is activated')
        capture = cv2.VideoCapture(0)
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    else:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    while True:
        ret, frame = capture.read()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if not ret:
            continue

        img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        detector.compute(img, THRESHOLD, IOU)

        h, w = img.shape[0], img.shape[1]
        count = detector.get_object_count()
        for idx in range(count):
            # get detected hand
            obj = detector.get_object(idx)
            margin = 1.0
            cx = (obj.x + obj.w / 2) * w
            cy = (obj.y + obj.h / 2) * h
            cw = max(obj.w * w, obj.h * h) * margin
            fx = max(cx - cw / 2, 0)
            fy = max(cy - cw / 2, 0)
            fw = min(cw, w - fx)
            fh = min(cw, h - fy)
            top_left = (int(fx), int(fy))
            bottom_right = (int(fx + fw), int(fy + fh))

            # display detected hand
            color = hsv_to_rgb(0, 255, 255)
            cv2.rectangle(frame, top_left, bottom_right, color, 4)

            # crop the detected hand region
            crop_img = img[
                top_left[1]:bottom_right[1],
                top_left[0]:bottom_right[0],
                0:4,
            ]
            if crop_img.shape[0] <= 0 or crop_img.shape[1] <= 0:
                continue

            # inference
            _ = hand.compute(crop_img.astype(np.uint8, order='C'))

            # postprocessing
            display_result(frame, hand, top_left, bottom_right)

        cv2.imshow('frame', frame)

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
def recognize_from_video():
    # net initialize
    env_id = ailia.get_gpu_environment_id()
    if args.env_id is not None:
        count = ailia.get_environment_count()
        if count > args.env_id:
            env_id = args.env_id
        else:
            print(f'specified env_id: {args.env_id} cannot be found')
    print(f'env_id: {env_id}')
    detector = ailia.Detector(
        MODEL_PATH, WEIGHT_PATH, len(HAND_CATEGORY),
        format=ailia.NETWORK_IMAGE_FORMAT_RGB,
        channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
        range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
        algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
        env_id=env_id,
    )
    hand = ailia.PoseEstimator(
        HAND_MODEL_PATH, HAND_WEIGHT_PATH,
        env_id=env_id, algorithm=HAND_ALGORITHM
    )
    hand.set_threshold(0.1)

    # detector input shape is (N, C, H, W)
    ailia_input_w = detector.get_input_shape()[3]
    ailia_input_h = detector.get_input_shape()[2]

    capture = get_capture(args.video)

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_PATH:
        f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        save_h, save_w = calc_adjust_fsize(
            f_h, f_w, ailia_input_h, ailia_input_w
        )
        writer = get_writer(args.savepath, save_h, save_w)
    else:
        writer = None

    while True:
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        detector.compute(img, THRESHOLD, IOU)

        h, w = img.shape[0], img.shape[1]
        count = detector.get_object_count()
        for idx in range(count):
            # get detected hand
            obj = detector.get_object(idx)
            margin = 1.0
            cx = (obj.x + obj.w / 2) * w
            cy = (obj.y + obj.h / 2) * h
            cw = max(obj.w * w, obj.h * h) * margin
            fx = max(cx - cw / 2, 0)
            fy = max(cy - cw / 2, 0)
            fw = min(cw, w - fx)
            fh = min(cw, h - fy)
            top_left = (int(fx), int(fy))
            bottom_right = (int(fx + fw), int(fy + fh))

            # display detected hand
            color = hsv_to_rgb(0, 255, 255)
            cv2.rectangle(frame, top_left, bottom_right, color, 4)

            # crop the detected hand region
            crop_img = img[
                top_left[1]:bottom_right[1],
                top_left[0]:bottom_right[0],
                0:4,
            ]
            if crop_img.shape[0] <= 0 or crop_img.shape[1] <= 0:
                continue

            # inference
            _ = hand.compute(crop_img.astype(np.uint8, order='C'))

            # postprocessing
            display_result(frame, hand, top_left, bottom_right)

        cv2.imshow('frame', frame)

        # save results
        if writer is not None:
            writer.write(frame)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    print('Script finished successfully.')
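# The square-crop math in the loop above can be read as a standalone helper.
# This is a sketch refactoring of the code above for clarity, not a function
# this repo actually defines:
def square_crop_region(obj, img_w, img_h, margin=1.0):
    # detector outputs are normalized; convert the box center to pixels
    cx = (obj.x + obj.w / 2) * img_w
    cy = (obj.y + obj.h / 2) * img_h
    # square side: the larger of the box's pixel width and height
    side = max(obj.w * img_w, obj.h * img_h) * margin
    fx = max(cx - side / 2, 0)
    fy = max(cy - side / 2, 0)
    fw = min(side, img_w - fx)
    fh = min(side, img_h - fy)
    return (int(fx), int(fy)), (int(fx + fw), int(fy + fh))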
def recognize_from_video():
    try:
        video_id = int(args.video)
        print('[INFO] Webcam mode is activated')
        RECORD_TIME = 80
        capture = cv2.VideoCapture(video_id)
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    except ValueError:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    if FRAME_SKIP:
        action_recognize_fps = int(args.fps)
    else:
        action_recognize_fps = frame_rate

    if args.savepath != "":
        size = (
            int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
            int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)),
        )
        fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        writer = cv2.VideoWriter(args.savepath, fmt, action_recognize_fps, size)
    else:
        writer = None

    # pose estimation
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    if args.arch == "lw_human_pose":
        pose = ailia.PoseEstimator(
            MODEL_PATH, WEIGHT_PATH, env_id=env_id, algorithm=ALGORITHM
        )
        detector = None
    else:
        detector = ailia.Detector(
            DETECTOR_MODEL_PATH, DETECTOR_WEIGHT_PATH, len(COCO_CATEGORY),
            format=ailia.NETWORK_IMAGE_FORMAT_RGB,
            channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
            range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
            algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
            env_id=env_id,
        )
        pose = ailia.Net(POSE_MODEL_PATH, POSE_WEIGHT_PATH, env_id=env_id)

    # tracker class instance
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=env_id)
    metric = NearestNeighborDistanceMetric(
        "cosine", MAX_COSINE_DISTANCE, NN_BUDGET
    )
    tracker = Tracker(metric, max_iou_distance=0.7, max_age=70, n_init=3)

    # action recognition
    model = ailia.Net(ACTION_MODEL_PATH, ACTION_WEIGHT_PATH, env_id=env_id)
    action_data = {}

    frame_nb = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    idx_frame = 0
    time_start = time.time()
    while True:
        time_curr = time.time()
        if args.video == '0' and time_curr - time_start > RECORD_TIME:
            break

        ret, frame = capture.read()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if (not ret) or (frame_nb >= 1 and idx_frame >= frame_nb):
            break

        if FRAME_SKIP:
            mod = round(frame_rate / action_recognize_fps)
            if mod >= 1:
                if idx_frame % mod != 0:
                    idx_frame = idx_frame + 1
                    continue

        input_image, input_data = adjust_frame_size(
            frame, frame.shape[0], frame.shape[1],
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        if args.arch == "lw_human_pose":
            _ = pose.compute(input_data)
        else:
            detector.compute(input_data, THRESHOLD, IOU)

        # deepsort format
        h, w = input_image.shape[0], input_image.shape[1]
        if args.arch == "lw_human_pose":
            bbox_xywh, cls_conf, cls_ids = get_detector_result_lw_human_pose(
                pose, h, w
            )
        else:
            bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]
        # bbox dilation just in case bbox too small,
        # delete this block if using a better pedestrian detector
        if args.arch == "pose_resnet":
            # bbox_xywh[:, 3:] *= 1.2  # may need to be removed in the future
            pass
        cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(input_image[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO: better to pass the whole batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf)
            if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=np.int32))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # action detection
        actions = []
        persons = []
        if len(outputs) > 0:
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            for i, box in enumerate(bbox_xyxy):
                id = identities[i]
                if id not in action_data:
                    action_data[id] = np.zeros(
                        (ailia.POSE_KEYPOINT_CNT - 1, TIME_RANGE, 3)
                    )
                # action recognition
                action, person = action_recognition(
                    box, input_image, pose, detector, model, action_data[id]
                )
                actions.append(action)
                persons.append(person)

        # draw boxes for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(
                input_image, bbox_xyxy, identities, actions, action_data, (0, 0)
            )
            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

        # draw skeleton
        for person in persons:
            if person is not None:
                display_result(input_image, person)

        if writer is not None:
            writer.write(input_image)

        # show progress
        if idx_frame == 0:
            print()
        print("\r" + str(idx_frame + 1) + " / " + str(frame_nb), end="")
        if idx_frame == frame_nb - 1:
            print()

        cv2.imshow('frame', input_image)
        idx_frame = idx_frame + 1

    if writer is not None:
        writer.release()
    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')
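# The tracking glue above leans on several box-format converters. A plausible
# xywh (center-based) -> xyxy (corner-based) conversion, clamped to the frame,
# might look like the sketch below; the repo's own xywh_to_xyxy may differ in
# its normalization conventions, so treat this as illustration only:
def xywh_to_xyxy_sketch(box, img_h, img_w):
    cx, cy, bw, bh = box
    x1 = max(int(cx - bw / 2), 0)
    y1 = max(int(cy - bh / 2), 0)
    x2 = min(int(cx + bw / 2), img_w - 1)
    y2 = min(int(cy + bh / 2), img_h - 1)
    return x1, y1, x2, y2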