def inference_image(model, parser): print("image") # preparing list_image_paths = glob(os.path.join(parser.input, "*.jpg")) preprocess = InferenceTransformation(320, 320) pose_parser = Pose(image_scale=0.125) hand_model = HandPoseDetector(static_model=True) average_time = 0 for ind, image_path in tqdm.tqdm(enumerate(list_image_paths)): origin_image = cv2.imread(image_path) height, width, _ = origin_image.shape ratio_scale = height / 320 add_width = (320 - int(320 * width / height)) // 2 current_time = cv2.getTickCount() process_image = preprocess(origin_image) image = preprocess_tensor(process_image.copy()) paf, heatmap = model(image) # paf = paf[0] # heatmap = heatmap[0] paf = paf.reshape((38, 40, 40)) heatmap = heatmap.reshape((19, 40, 40)) pose_parser.parser_pose(paf, heatmap) draw_pose(process_image, pose_parser.poses_list, pose_parser.hand_window, body_edges) hand_images = pose_parser.get_hand_head_images(origin_image, ratio_scale, add_width) if hand_images is not None: image, _ = hand_model(hand_images) current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency() # cv2.imwrite("./_image/temp_hand_%d.jpg"%ind, image) if hand_images is not None: process_image[-200:, :100] = cv2.resize(image, (100, 200)) average_time += current_time cv2.putText(process_image, 'parsing time: {}'.format(current_time), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255)) cv2.imwrite("./_image/img_with_pose_%d.jpg" % ind, process_image) print("avg time: %f" % (average_time / len(list_image_paths)))
def inference_image(model, parser): print("image") # preparing list_image_paths = glob(os.path.join(parser.input, "*.jpg")) preprocess = InferenceTransformation(368, 368) pose_parser = Pose(image_scale=0.125) hand_model = HandPoseDetector(static_model=True) face_model = FaceMeshDetector(static_model=True) average_time = 0 for ind, image_path in tqdm.tqdm(enumerate(list_image_paths)): origin_image = cv2.imread(image_path) height, width,_ = origin_image.shape ratio_scale = height / 368 add_width = (368 - int(368 * width / height))//2 current_time = cv2.getTickCount() process_image = preprocess(origin_image) image = torch.Tensor(preprocess_tensor(process_image.copy())).unsqueeze(0).cuda() paf, heatmap = model(image) paf = paf.detach().cpu().numpy()[0] heatmap = heatmap.detach().cpu().numpy()[0] pose_parser.parser_pose(paf, heatmap) draw_pose(process_image, pose_parser.poses_list, pose_parser.hand_window, body_edges) hand_images = pose_parser.get_hand_head_images(origin_image,ratio_scale, add_width) for hand_ind, image in enumerate(hand_images): image, _ = hand_model(image) # cv2.imwrite("./_image/temp_hand_%d.jpg"%ind, image) if hand_ind == 1: process_image[-100:,:100] = cv2.resize(image,(100,100)) elif hand_ind == 0: process_image[-100:,-100:] = cv2.resize(image,(100,100)) current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency() average_time += current_time cv2.putText(process_image, 'parsing time: {}'.format(current_time), (10, 20), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255)) cv2.imwrite("./_image/img_with_pose_%d.jpg" % ind, process_image) print("avg time: %f" % (average_time / len(list_image_paths)))
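# ---------------------------------------------------------------------------
# Reference sketch only, not the repo's actual preprocess_tensor: the call
# sites above assume that preprocess_tensor turns a BGR uint8 HxWx3 frame into
# a float32 CxHxW array that torch.Tensor(...).unsqueeze(0) can batch. The
# [0, 1] scaling below is an assumption; the real helper may normalize
# differently.
# ---------------------------------------------------------------------------
def _preprocess_tensor_sketch(frame):
    """Hypothetical stand-in documenting the contract assumed above."""
    scaled = frame.astype(np.float32) / 255.0  # assumed [0, 1] scaling
    return scaled.transpose(2, 0, 1)           # HWC -> CHW for the network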
def inference_video(model, parser, is_optical_flow=False):
    """Run pose inference on every *.mp4 in parser.input at 320x320.

    When is_optical_flow is set, the network runs only every 4th frame;
    in between, keypoints are propagated with Lucas-Kanade optical flow.
    """
    print("video")
    # Preparation
    list_video_paths = glob(os.path.join(parser.input, "*.mp4"))
    FIX_SIZE = 320
    preprocess = InferenceTransformation(FIX_SIZE, FIX_SIZE)
    mean_time = 0
    pose_parser = Pose(image_scale=0.125)
    hand_model = HandPoseDetector(static_model=True)
    face_model = FaceMeshDetector(static_model=False)  # currently unused
    # Lucas-Kanade parameters for sparse keypoint tracking between detections.
    lk_params = dict(winSize=(100, 100), maxLevel=10,
                     criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 20, 0.03))
    for ind, video_path in tqdm.tqdm(enumerate(list_video_paths)):
        vidcap = cv2.VideoCapture(video_path)
        success, image = vidcap.read()  # first frame is read only for its shape
        height, width, _ = image.shape
        ratio_scale = height / FIX_SIZE
        add_width = (FIX_SIZE - int(FIX_SIZE * width / height)) // 2
        count = 0
        video = cv2.VideoWriter('_image/video_with_hand_optical%d.avi' % ind,
                                cv2.VideoWriter_fourcc(*'DIVX'), 30, (FIX_SIZE, FIX_SIZE))
        old_gray = None
        num_frame = 0
        while success:
            success, origin_image = vidcap.read()
            if success:
                current_time = cv2.getTickCount()
                # origin_image = cv2.rotate(origin_image, cv2.ROTATE_90_COUNTERCLOCKWISE)
                process_image = preprocess(origin_image)
                if is_optical_flow and 0 < count < 4 and len(pose_parser.poses_list) > 0:
                    # Track the previously detected keypoints instead of
                    # re-running the network on this frame.
                    visible_mask = pose_parser.poses_list[0][:, 2] > 0
                    visible_point = pose_parser.poses_list[0][visible_mask][:, 0:2]  # coords only
                    current_gray = cv2.cvtColor(process_image, cv2.COLOR_BGR2GRAY)
                    visible_point = np.expand_dims(visible_point.astype(np.float32), axis=1)
                    p1, st, err = cv2.calcOpticalFlowPyrLK(old_gray, current_gray,
                                                           visible_point, None, **lk_params)
                    old_gray = current_gray.copy()
                    visible_point[st == 1] = p1[st == 1]  # keep successfully tracked points
                    visible_point = np.squeeze(visible_point, axis=1)
                    pose_parser.poses_list[0][np.where(visible_mask), :-1] = visible_point.copy()
                    pose_parser.get_hand_head_window()
                else:
                    # Full network pass: detect poses from PAFs and heatmaps.
                    image = torch.Tensor(preprocess_tensor(process_image.copy())).unsqueeze(0)
                    image = image.cuda()
                    paf, heatmap = model(image)
                    paf = paf.detach().cpu().numpy()[0]
                    heatmap = heatmap.detach().cpu().numpy()[0]
                    pose_parser.parser_pose(paf, heatmap)
                    old_gray = cv2.cvtColor(process_image, cv2.COLOR_BGR2GRAY)
                    count = 0
                draw_pose(process_image, pose_parser.poses_list, pose_parser.hand_window, body_edges)
                hand_images = pose_parser.get_hand_head_images(origin_image, ratio_scale, add_width)
                if hand_images is not None:
                    hand_images, _ = hand_model(hand_images)
                current_time = (cv2.getTickCount() - current_time) / cv2.getTickFrequency()
                if hand_images is not None:
                    process_image[-200:, :100] = cv2.resize(hand_images, (100, 200))
                # Exponential moving average of the per-frame latency.
                if mean_time == 0:
                    mean_time = current_time
                else:
                    mean_time = mean_time * 0.95 + current_time * 0.05
                cv2.putText(process_image, 'FPS: {}'.format(int(1 / mean_time * 10) / 10), (10, 20),
                            cv2.FONT_HERSHEY_COMPLEX, 1, (0, 0, 255))
                video.write(process_image)
                count += 1
                num_frame += 1
        vidcap.release()
        video.release()
        print(count)                      # frames since the last full detection
        print(int(1 / mean_time * 10) / 10)  # final smoothed FPS, one decimal
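# ---------------------------------------------------------------------------
# Minimal entry-point sketch (not part of the original file): shows how the
# inference helpers above might be wired to a CLI. The `--input` flag mirrors
# the `parser.input` attribute both functions read; `build_model()` is a
# hypothetical stand-in for however this repo constructs the pose network and
# loads its weights onto the GPU.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Pose inference on images or videos")
    cli.add_argument("--input", required=True, help="directory of *.jpg or *.mp4 files")
    cli.add_argument("--mode", choices=["image", "video"], default="image")
    cli.add_argument("--optical-flow", action="store_true",
                     help="track keypoints with LK optical flow between detections")
    args = cli.parse_args()

    model = build_model()  # hypothetical: build the network and load weights
    model.eval()

    if args.mode == "image":
        inference_image(model, args)
    else:
        inference_video(model, args, is_optical_flow=args.optical_flow)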