def get(self, read_image=True, extra=False):
    [joint3d, joint2d, hand_side, img_dir, K, DC] = tf.train.slice_input_producer(
        [self.joints3d, self.joints2d, self.hand_sides, self.img_dirs, self.Ks, self.DCs],
        shuffle=False)

    keypoint_xyz21 = joint3d
    keypoint_uv21 = joint2d

    if not self.use_wrist_coord:
        palm_coord = tf.expand_dims(0.5 * (keypoint_xyz21[0, :] + keypoint_xyz21[12, :]), 0)
        keypoint_xyz21 = tf.concat([palm_coord, keypoint_xyz21[1:21, :]], 0)
        palm_coord_uv = tf.expand_dims(0.5 * (keypoint_uv21[0, :] + keypoint_uv21[12, :]), 0)
        keypoint_uv21 = tf.concat([palm_coord_uv, keypoint_uv21[1:21, :]], 0)

    if self.coord_uv_noise:
        noise = tf.truncated_normal([21, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
        keypoint_uv21 += noise

    data_dict = {}
    data_dict['img_dir'] = img_dir
    data_dict['hand_side'] = tf.one_hot(tf.cast(hand_side, tf.uint8), depth=2,
                                        on_value=1.0, off_value=0.0, dtype=tf.float32)
    data_dict['K'] = K
    data_dict['DC'] = DC

    keypoint_vis21 = tf.ones([21], tf.bool)
    data_dict['keypoint_vis21'] = keypoint_vis21

    keypoint_xyz21 /= 100  # convert dome (centimeter) to meter
    if self.flip_2d:
        keypoint_xyz21 = tf.cond(
            hand_side,
            lambda: tf.concat([tf.expand_dims(-keypoint_xyz21[:, 0], axis=1),
                               keypoint_xyz21[:, 1:]], axis=1),
            lambda: keypoint_xyz21)
    kp_coord_xyz21 = keypoint_xyz21
    data_dict['keypoint_xyz21'] = keypoint_xyz21
    data_dict['keypoint_uv21_origin'] = keypoint_uv21
    data_dict['keypoint_uv21'] = keypoint_uv21

    kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
    kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
    index_root_bone_length = tf.sqrt(tf.reduce_sum(
        tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
    # data_dict['keypoint_scale'] = index_root_bone_length
    # data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11
    data_dict['keypoint_scale'] = hand_size_tf(kp_coord_xyz21_rel)
    data_dict['index_scale'] = index_root_bone_length
    data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / data_dict['keypoint_scale']

    # calculate viewpoint and coords in canonical coordinates
    cond_left = tf.logical_and(tf.cast(tf.ones_like(kp_coord_xyz21), tf.bool),
                               tf.logical_not(hand_side))
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
    if not self.flip_2d:
        kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                                 tf.logical_not(cond_left))
    data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
    data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

    if self.hand_crop:
        crop_center = keypoint_uv21[12, ::-1]
        # catch problem, when no valid kp available
        crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                              lambda: crop_center,
                              lambda: tf.constant([0.0, 0.0]))
        crop_center.set_shape([2])

        if self.crop_center_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
            crop_center += noise

        crop_scale_noise = tf.constant(1.0)
        if self.crop_scale_noise:
            crop_scale_noise = tf.squeeze(tf.exp(tf.truncated_normal([1], mean=0.0, stddev=0.1)))

        kp_coord_hw = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], 1)

        # determine size of crop (measure spatial extent of hw coords first)
        min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
        max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

        # find out larger distance wrt the center of crop
        crop_size_best = 2 * tf.maximum(max_coord - crop_center, crop_center - min_coord)
        crop_size_best = tf.reduce_max(crop_size_best)
        crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)
        crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                                 lambda: crop_size_best,
                                 lambda: tf.constant(200.0))
        crop_size_best.set_shape([])
        crop_size_best *= self.crop_size_zoom
        crop_size_best *= crop_scale_noise

        # calculate necessary scaling
        scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
        # scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
        data_dict['crop_scale'] = scale

        if self.flip_2d:
            crop_center = tf.cond(
                hand_side,
                lambda: tf.constant(self.image_size, dtype=tf.float32)
                        - tf.ones((2,), dtype=tf.float32) - crop_center,
                lambda: crop_center)
        data_dict['crop_center'] = crop_center

        if self.crop_offset_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
            crop_center += noise

        # Modify uv21 coordinates
        crop_center_float = tf.cast(crop_center, tf.float32)
        if self.flip_2d:
            keypoint_uv21_u = tf.cond(
                hand_side,
                lambda: -(keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2,
                lambda: (keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2)
        else:
            keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
        keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
        keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
        data_dict['keypoint_uv21'] = keypoint_uv21

    if read_image:
        img_file = tf.read_file(img_dir)
        image = tf.image.decode_image(img_file, channels=3)
        image = tf.image.pad_to_bounding_box(image, 0, 0, 1080, 1920)
        image.set_shape((1080, 1920, 3))
        image = tf.cast(image, tf.float32)
        if self.hand_crop:
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center,
                                          self.crop_size, scale)
            img_crop = img_crop / 255.0 - 0.5
            img_crop = tf.squeeze(img_crop)
            if self.flip_2d:
                img_crop = tf.cond(hand_side, lambda: img_crop[:, ::-1, :], lambda: img_crop)
            data_dict['image_crop'] = img_crop
        if self.flip_2d:
            image = tf.cond(hand_side, lambda: image[:, ::-1, :], lambda: image)
        data_dict['image'] = image / 255.0 - 0.5

    keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)
    scoremap_size = self.image_size
    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)
    scoremap = self.create_multiple_gaussian_map(keypoint_hw21, scoremap_size,
                                                 self.sigma, valid_vec=keypoint_vis21,
                                                 extra=extra)
    data_dict['scoremap'] = scoremap
    data_dict['scoremap_3d'] = create_multiple_gaussian_map_3d(
        data_dict['keypoint_xyz21_normed'], int(self.crop_size / 8), 5, extra=extra)

    names, tensors = zip(*data_dict.items())
    if self.shuffle:
        tensors = tf.train.shuffle_batch_join([tensors], batch_size=self.batch_size,
                                              capacity=100, min_after_dequeue=50,
                                              enqueue_many=False)
    else:
        tensors = tf.train.batch_join([tensors], batch_size=self.batch_size,
                                      capacity=100, enqueue_many=False)
    return dict(zip(names, tensors))
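
# Usage sketch for the queue-based reader above (TF1). The reader name and
# constructor flags below are illustrative assumptions, not confirmed by this
# file; the point is that get() returns a dict of batched tensors that only
# yield data once queue runners have been started:
#
#     reader = DomeReader(batch_size=8, shuffle=True, hand_crop=True)  # assumed ctor
#     data = reader.get(read_image=True)
#     with tf.Session() as sess:
#         coord = tf.train.Coordinator()
#         threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#         batch = sess.run(data)  # dict of numpy arrays, e.g. batch['image_crop']
#         coord.request_stop()
#         coord.join(threads)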
def __getitem__(self, index):
    data = self.data
    image = Image.open(os.path.join(self.image_path, '%.5d.png' % index)).convert('RGB')
    if self.transform is not None:
        image = self.transform(image)
    mask = Image.open(os.path.join(self.mask_path, '%.5d.png' % index))
    if self.mask_transform is not None:
        mask = self.mask_transform(mask)

    keypoint_xyz = torch.as_tensor(data[index]['xyz'], dtype=torch.float32)
    if not self.use_wrist_coord:
        # palm coord = midpoint of wrist and middle-finger root (unsqueeze, not
        # squeeze, so the [1, 3] rows concatenate correctly)
        palm_coord_l = torch.unsqueeze(0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0)
        palm_coord_r = torch.unsqueeze(0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0)
        keypoint_xyz = torch.cat([palm_coord_l, keypoint_xyz[1:21, :],
                                  palm_coord_r, keypoint_xyz[-20:, :]], 0)

    u = torch.as_tensor(data[index]['uv_vis'][:, 0], dtype=torch.float32)
    v = torch.as_tensor(data[index]['uv_vis'][:, 1], dtype=torch.float32)
    keypoint_uv = torch.stack((u, v), dim=1)
    if not self.use_wrist_coord:
        palm_coord_uv_l = torch.unsqueeze(0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0)
        palm_coord_uv_r = torch.unsqueeze(0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0)
        keypoint_uv = torch.cat([palm_coord_uv_l, keypoint_uv[1:21, :],
                                 palm_coord_uv_r, keypoint_uv[-20:, :]], 0)

    if self.coord_uv_noise:
        # add truncated-normal noise to the uv coordinates (cut off at 2 sigma,
        # mirroring tf.truncated_normal)
        noise = torch.nn.init.trunc_normal_(
            torch.empty(42, 2), mean=0.0, std=self.coord_uv_noise_sigma,
            a=-2.0 * self.coord_uv_noise_sigma, b=2.0 * self.coord_uv_noise_sigma)
        keypoint_uv += noise

    hand_parts = mask.to(torch.int32)
    hand_mask = mask > 1
    bg_mask = ~hand_mask
    hand_mask = torch.stack([bg_mask, hand_mask], 1)

    keypoint_vis = data[index]['uv_vis'][:, 2].astype(bool)
    if not self.use_wrist_coord:
        palm_vis_l = np.expand_dims(np.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
        palm_vis_r = np.expand_dims(np.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
        keypoint_vis = np.concatenate([palm_vis_l, keypoint_vis[1:21],
                                       palm_vis_r, keypoint_vis[-20:]], 0)

    one_map, zero_map = torch.ones_like(hand_parts), torch.zeros_like(hand_parts)
    cond_l = (hand_parts > one_map) & (hand_parts < one_map * 18)
    cond_r = hand_parts > one_map * 17
    hand_map_l = torch.where(cond_l, one_map, zero_map)
    hand_map_r = torch.where(cond_r, one_map, zero_map)
    num_px_left_hand = hand_map_l.sum()
    num_px_right_hand = hand_map_r.sum()

    # produce the 21-keypoint subset using the segmentation masks
    kp_coord_xyz_left = keypoint_xyz[:21, :]
    kp_coord_xyz_right = keypoint_xyz[-21:, :]
    cond_left = torch.logical_and(torch.ones_like(kp_coord_xyz_left, dtype=torch.bool),
                                  num_px_left_hand > num_px_right_hand)
    kp_coord_xyz21 = torch.where(cond_left, kp_coord_xyz_left, kp_coord_xyz_right)

    hand_side = torch.where(num_px_left_hand > num_px_right_hand,
                            torch.tensor(1, dtype=torch.int64),
                            torch.tensor(0, dtype=torch.int64))
    # one-hot encode the hand side
    temp = torch.zeros(2)
    temp[hand_side] = 1
    hand_side = temp

    keypoint_xyz21 = kp_coord_xyz21

    # make coords relative to root joint
    kp_coord_xyz_root = kp_coord_xyz21[0, :]  # palm coord
    kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
    index_root_bone_length = torch.sqrt(
        ((kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :]) ** 2).sum())
    keypoint_scale = index_root_bone_length
    keypoint_xyz21_normed = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

    # calculate viewpoint and coords in canonical coordinates
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(keypoint_xyz21_normed)
    kp_coord_xyz21_rel_can, rot_mat = torch.squeeze(kp_coord_xyz21_rel_can), torch.squeeze(rot_mat)
    kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                             torch.logical_not(cond_left))
    keypoint_xyz21_can = kp_coord_xyz21_rel_can
    rot_mat = torch.inverse(rot_mat)

    # subset of 21 for visibility
    keypoint_vis_left = keypoint_vis[:21].astype(np.int32)
    keypoint_vis_right = keypoint_vis[-21:].astype(np.int32)
    keypoint_vis21 = torch.where(cond_left[:, 0],
                                 torch.as_tensor(keypoint_vis_left),
                                 torch.as_tensor(keypoint_vis_right))

    # subset of 21 for UV coordinates
    keypoint_uv_left = keypoint_uv[:21, :]
    keypoint_uv_right = keypoint_uv[-21:, :]
    keypoint_uv21 = torch.where(cond_left[:, :2], keypoint_uv_left, keypoint_uv_right)

    # create scoremaps from the subset of 2D annotation
    keypoint_hw21 = torch.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)
    scoremap_size = self.image_size
    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)
    score_map = self.create_multiple_gaussian_map(keypoint_hw21, scoremap_size,
                                                  self.sigma, valid_vec=keypoint_vis21)
    if self.scoremap_dropout:
        # note: nn.Dropout takes the drop probability, whereas tf.nn.dropout's
        # second argument is the keep probability
        drop = torch.nn.Dropout(self.scoremap_dropout_prob)
        score_map = drop(score_map)

    if self.scale_to_size:
        # note: .size()/.resize() assume a PIL image here; a tensor input would
        # need F.interpolate instead
        s = image.size()
        image = image.resize(self.scale_target_size)
        scale = (self.scale_target_size[0] / float(s[0]),
                 self.scale_target_size[1] / float(s[1]))
        keypoint_uv21 = torch.stack([keypoint_uv21[:, 0] * scale[1],
                                     keypoint_uv21[:, 1] * scale[0]], 1)

    return image, hand_side, keypoint_xyz21, rot_mat
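
# Usage sketch: because __getitem__ returns an (image, hand_side,
# keypoint_xyz21, rot_mat) tuple, the dataset drops into a standard torch
# DataLoader. Class name, paths, and transforms below are illustrative
# assumptions; PILToTensor is chosen for the mask so the integer part labels
# survive (ToTensor would rescale them to [0, 1] and break the `> 1` tests):
#
#     dataset = HandDataset(image_path='./images', mask_path='./masks',
#                           transform=transforms.ToTensor(),
#                           mask_transform=transforms.PILToTensor())
#     loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
#     for image, hand_side, keypoint_xyz21, rot_mat in loader:
#         # image: [B, 3, H, W], hand_side: [B, 2] one-hot,
#         # keypoint_xyz21: [B, 21, 3], rot_mat: [B, 3, 3]
#         break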
def get(self):
    """ Provides input data to the graph. """
    # calculate size of each record (this lists what is contained in the db and how many bytes are occupied)
    record_bytes = 2

    encoding_bytes = 4
    kp_xyz_entries = 3 * self.num_kp
    record_bytes += encoding_bytes * kp_xyz_entries

    encoding_bytes = 4
    kp_uv_entries = 2 * self.num_kp
    record_bytes += encoding_bytes * kp_uv_entries

    cam_matrix_entries = 9
    record_bytes += encoding_bytes * cam_matrix_entries

    image_bytes = self.image_size[0] * self.image_size[1] * 3
    record_bytes += image_bytes

    hand_parts_bytes = self.image_size[0] * self.image_size[1]
    record_bytes += hand_parts_bytes

    kp_vis_bytes = self.num_kp
    record_bytes += kp_vis_bytes

    """ READ DATA ITEMS """
    # Start reader
    reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes)
    _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

    # decode to floats
    bytes_read = 0
    data_dict = dict()
    record_bytes_float32 = tf.decode_raw(value, tf.float32)

    # 1. Read keypoint xyz
    keypoint_xyz = tf.reshape(
        tf.slice(record_bytes_float32, [bytes_read // 4], [kp_xyz_entries]),
        [self.num_kp, 3])
    bytes_read += encoding_bytes * kp_xyz_entries

    # calculate palm coord
    if not self.use_wrist_coord:
        palm_coord_l = tf.expand_dims(0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0)
        palm_coord_r = tf.expand_dims(0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0)
        keypoint_xyz = tf.concat([palm_coord_l, keypoint_xyz[1:21, :],
                                  palm_coord_r, keypoint_xyz[-20:, :]], 0)

    data_dict['keypoint_xyz'] = keypoint_xyz

    # 2. Read keypoint uv
    keypoint_uv = tf.cast(
        tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4], [kp_uv_entries]),
            [self.num_kp, 2]), tf.int32)
    bytes_read += encoding_bytes * kp_uv_entries
    keypoint_uv = tf.cast(keypoint_uv, tf.float32)

    # calculate palm coord
    if not self.use_wrist_coord:
        palm_coord_uv_l = tf.expand_dims(0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0)
        palm_coord_uv_r = tf.expand_dims(0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0)
        keypoint_uv = tf.concat([palm_coord_uv_l, keypoint_uv[1:21, :],
                                 palm_coord_uv_r, keypoint_uv[-20:, :]], 0)

    if self.coord_uv_noise:
        noise = tf.truncated_normal([42, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
        keypoint_uv += noise

    data_dict['keypoint_uv'] = keypoint_uv

    # 3. Camera intrinsics
    cam_mat = tf.reshape(
        tf.slice(record_bytes_float32, [bytes_read // 4], [cam_matrix_entries]),
        [3, 3])
    bytes_read += encoding_bytes * cam_matrix_entries
    data_dict['cam_mat'] = cam_mat

    # decode to uint8
    bytes_read += 2
    record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

    # 4. Read image
    image = tf.reshape(
        tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
        [self.image_size[0], self.image_size[1], 3])
    image = tf.cast(image, tf.float32)
    bytes_read += image_bytes

    # subtract mean
    image = image / 255.0 - 0.5
    if self.hue_aug:
        image = tf.image.random_hue(image, self.hue_aug_max)
    data_dict['image'] = image

    # 5. Read mask
    hand_parts_mask = tf.reshape(
        tf.slice(record_bytes_uint8, [bytes_read], [hand_parts_bytes]),
        [self.image_size[0], self.image_size[1]])
    hand_parts_mask = tf.cast(hand_parts_mask, tf.int32)
    bytes_read += hand_parts_bytes
    data_dict['hand_parts'] = hand_parts_mask
    hand_mask = tf.greater(hand_parts_mask, 1)
    bg_mask = tf.logical_not(hand_mask)
    data_dict['hand_mask'] = tf.cast(tf.stack([bg_mask, hand_mask], 2), tf.int32)

    # 6. Read visibility
    keypoint_vis = tf.reshape(
        tf.slice(record_bytes_uint8, [bytes_read], [kp_vis_bytes]),
        [self.num_kp])
    keypoint_vis = tf.cast(keypoint_vis, tf.bool)
    bytes_read += kp_vis_bytes

    # calculate palm visibility
    if not self.use_wrist_coord:
        palm_vis_l = tf.expand_dims(tf.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
        palm_vis_r = tf.expand_dims(tf.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
        keypoint_vis = tf.concat([palm_vis_l, keypoint_vis[1:21],
                                  palm_vis_r, keypoint_vis[-20:]], 0)
    data_dict['keypoint_vis'] = keypoint_vis

    assert bytes_read == record_bytes, "Doesn't add up."

    """ DEPENDENT DATA ITEMS: SUBSET OF 21 KEYPOINTS """
    # figure out dominant hand by analysis of the segmentation mask
    one_map, zero_map = tf.ones_like(hand_parts_mask), tf.zeros_like(hand_parts_mask)
    cond_l = tf.logical_and(tf.greater(hand_parts_mask, one_map),
                            tf.less(hand_parts_mask, one_map * 18))
    cond_r = tf.greater(hand_parts_mask, one_map * 17)
    hand_map_l = tf.where(cond_l, one_map, zero_map)
    hand_map_r = tf.where(cond_r, one_map, zero_map)
    num_px_left_hand = tf.reduce_sum(hand_map_l)
    num_px_right_hand = tf.reduce_sum(hand_map_r)

    # PRODUCE the 21 subset using the segmentation masks
    # We only deal with the more prominent hand for each frame and discard the second set of keypoints
    kp_coord_xyz_left = keypoint_xyz[:21, :]
    kp_coord_xyz_right = keypoint_xyz[-21:, :]
    cond_left = tf.logical_and(tf.cast(tf.ones_like(kp_coord_xyz_left), tf.bool),
                               tf.greater(num_px_left_hand, num_px_right_hand))
    kp_coord_xyz21 = tf.where(cond_left, kp_coord_xyz_left, kp_coord_xyz_right)

    hand_side = tf.where(tf.greater(num_px_left_hand, num_px_right_hand),
                         tf.constant(0, dtype=tf.int32),
                         tf.constant(1, dtype=tf.int32))  # left hand = 0; right hand = 1
    data_dict['hand_side'] = tf.one_hot(hand_side, depth=2,
                                        on_value=1.0, off_value=0.0, dtype=tf.float32)

    data_dict['keypoint_xyz21'] = kp_coord_xyz21

    # make coords relative to root joint
    kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
    kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
    index_root_bone_length = tf.sqrt(tf.reduce_sum(
        tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
    data_dict['keypoint_scale'] = index_root_bone_length
    data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

    # calculate local coordinates
    kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
    data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

    # calculate viewpoint and coords in canonical coordinates
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
    kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                             tf.logical_not(cond_left))
    data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
    data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

    # Set of 21 for visibility
    keypoint_vis_left = keypoint_vis[:21]
    keypoint_vis_right = keypoint_vis[-21:]
    keypoint_vis21 = tf.where(cond_left[:, 0], keypoint_vis_left, keypoint_vis_right)
    data_dict['keypoint_vis21'] = keypoint_vis21

    # Set of 21 for UV coordinates
    keypoint_uv_left = keypoint_uv[:21, :]
    keypoint_uv_right = keypoint_uv[-21:, :]
    keypoint_uv21 = tf.where(cond_left[:, :2], keypoint_uv_left, keypoint_uv_right)
    data_dict['keypoint_uv21'] = keypoint_uv21

    """ DEPENDENT DATA ITEMS: HAND CROP """
    if self.hand_crop:
        crop_center = keypoint_uv21[12, ::-1]

        # catch problem, when no valid kp available (happens almost never)
        crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                              lambda: crop_center,
                              lambda: tf.constant([0.0, 0.0]))
        crop_center.set_shape([2])

        if self.crop_center_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
            crop_center += noise

        crop_scale_noise = tf.constant(1.0)
        if self.crop_scale_noise:
            crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2))

        # select visible coords only
        kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
        kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
        kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

        # determine size of crop (measure spatial extent of hw coords first)
        min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
        max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

        # find out larger distance wrt the center of crop
        crop_size_best = 2 * tf.maximum(max_coord - crop_center, crop_center - min_coord)
        crop_size_best = tf.reduce_max(crop_size_best)
        crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

        # catch problem, when no valid kp available
        crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                                 lambda: crop_size_best,
                                 lambda: tf.constant(200.0))
        crop_size_best.set_shape([])

        # calculate necessary scaling
        scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
        scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
        scale *= crop_scale_noise
        data_dict['crop_scale'] = scale

        if self.crop_offset_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
            crop_center += noise

        # Crop image
        img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center,
                                      self.crop_size, scale)
        data_dict['image_crop'] = tf.squeeze(img_crop)

        # Modify uv21 coordinates
        crop_center_float = tf.cast(crop_center, tf.float32)
        keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
        keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
        keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
        data_dict['keypoint_uv21'] = keypoint_uv21

        # Modify camera intrinsics
        scale = tf.reshape(scale, [1])
        scale_matrix = tf.dynamic_stitch(
            [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
            [scale, [0.0], [0.0],
             [0.0], scale, [0.0],
             [0.0], [0.0], [1.0]])
        scale_matrix = tf.reshape(scale_matrix, [3, 3])

        crop_center_float = tf.cast(crop_center, tf.float32)
        trans1 = crop_center_float[0] * scale - self.crop_size // 2
        trans2 = crop_center_float[1] * scale - self.crop_size // 2
        trans1 = tf.reshape(trans1, [1])
        trans2 = tf.reshape(trans2, [1])
        trans_matrix = tf.dynamic_stitch(
            [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
            [[1.0], [0.0], -trans2,
             [0.0], [1.0], -trans1,
             [0.0], [0.0], [1.0]])
        trans_matrix = tf.reshape(trans_matrix, [3, 3])

        data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, cam_mat))

    """ DEPENDENT DATA ITEMS: SCOREMAP FROM THE SUBSET OF 21 KEYPOINTS """
    # create scoremaps from the subset of 2D annotation
    keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

    scoremap_size = self.image_size
    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)

    scoremap = self.create_multiple_gaussian_map(keypoint_hw21, scoremap_size,
                                                 self.sigma, valid_vec=keypoint_vis21)

    if self.scoremap_dropout:
        # tf.nn.dropout's second argument is the keep probability; undo its
        # 1/keep_prob rescaling so the map magnitudes stay unchanged
        scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                 noise_shape=[1, 1, 21])
        scoremap *= self.scoremap_dropout_prob

    data_dict['scoremap'] = scoremap

    if self.scale_to_size:
        image, keypoint_uv21, keypoint_vis21 = (data_dict['image'],
                                                data_dict['keypoint_uv21'],
                                                data_dict['keypoint_vis21'])
        s = image.get_shape().as_list()
        image = tf.image.resize_images(image, self.scale_target_size)
        scale = (self.scale_target_size[0] / float(s[0]),
                 self.scale_target_size[1] / float(s[1]))
        keypoint_uv21 = tf.stack([keypoint_uv21[:, 0] * scale[1],
                                  keypoint_uv21[:, 1] * scale[0]], 1)

        # delete everything else because the scaling makes the data invalid anyway
        data_dict = dict()
        data_dict['image'] = image
        data_dict['keypoint_uv21'] = keypoint_uv21
        data_dict['keypoint_vis21'] = keypoint_vis21

    elif self.random_crop_to_size:
        tensor_stack = tf.concat(
            [data_dict['image'],
             tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
             tf.cast(data_dict['hand_mask'], tf.float32)], 2)
        s = tensor_stack.get_shape().as_list()
        tensor_stack_cropped = tf.random_crop(
            tensor_stack, [self.random_crop_size, self.random_crop_size, s[2]])

        # delete everything else because the random cropping makes the data invalid anyway
        data_dict = dict()
        data_dict['image'] = tensor_stack_cropped[:, :, :3]
        data_dict['hand_parts'] = tf.cast(tensor_stack_cropped[:, :, 3], tf.int32)
        data_dict['hand_mask'] = tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

    names, tensors = zip(*data_dict.items())
    if self.shuffle:
        tensors = tf.train.shuffle_batch_join([tensors], batch_size=self.batch_size,
                                              capacity=100, min_after_dequeue=50,
                                              enqueue_many=False)
    else:
        tensors = tf.train.batch_join([tensors], batch_size=self.batch_size,
                                      capacity=100, enqueue_many=False)
    return dict(zip(names, tensors))
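
# Sanity check of the fixed-length record layout above, assuming the RHD
# defaults (num_kp = 42, image_size = (320, 320)); the byte counts are
# illustrative of those defaults only:
#
#     num_kp, h, w = 42, 320, 320
#     record_bytes = 2                  # padding between float32 and uint8 blocks
#     record_bytes += 4 * 3 * num_kp    # keypoint xyz, float32   -> 504
#     record_bytes += 4 * 2 * num_kp    # keypoint uv, float32    -> 336
#     record_bytes += 4 * 9             # 3x3 camera matrix       -> 36
#     record_bytes += h * w * 3         # RGB image, uint8        -> 307200
#     record_bytes += h * w             # hand-parts mask, uint8  -> 102400
#     record_bytes += num_kp            # visibility flags, uint8 -> 42
#     assert record_bytes == 410520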
def get(self, augment=True, slight=False):
    [joint3d, calibK, calibR, calibt, hand_side] = tf.train.slice_input_producer(
        [self.joints3d, self.calibKs, self.calibRs, self.calibts, self.hand_sides],
        shuffle=False)

    if augment:
        # perform augmentation by rotation
        joint3d = random_rotate(joint3d, slight)
    joint2d, joint3d = project_tf(joint3d, calibK, calibR, calibt)

    keypoint_xyz21 = joint3d
    keypoint_uv21 = joint2d

    if not self.use_wrist_coord:
        palm_coord = tf.expand_dims(0.5 * (keypoint_xyz21[0, :] + keypoint_xyz21[12, :]), 0)
        keypoint_xyz21 = tf.concat([palm_coord, keypoint_xyz21[1:21, :]], 0)
        palm_coord_uv = tf.expand_dims(0.5 * (keypoint_uv21[0, :] + keypoint_uv21[12, :]), 0)
        keypoint_uv21 = tf.concat([palm_coord_uv, keypoint_uv21[1:21, :]], 0)

    if self.coord_uv_noise:
        noise = tf.truncated_normal([21, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
        keypoint_uv21 += noise

    data_dict = {}
    data_dict['hand_side'] = tf.one_hot(tf.cast(hand_side, tf.uint8), depth=2,
                                        on_value=1.0, off_value=0.0, dtype=tf.float32)

    keypoint_vis21 = tf.ones([21], tf.bool)
    data_dict['keypoint_vis21'] = keypoint_vis21

    kp_coord_xyz21 = keypoint_xyz21
    data_dict['keypoint_xyz21'] = keypoint_xyz21
    data_dict['keypoint_uv21'] = keypoint_uv21

    kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
    kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
    index_root_bone_length = tf.sqrt(tf.reduce_sum(
        tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
    # data_dict['keypoint_scale'] = index_root_bone_length
    # data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11
    data_dict['keypoint_scale'] = hand_size_tf(kp_coord_xyz21_rel)
    data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / data_dict['keypoint_scale']

    # calculate viewpoint and coords in canonical coordinates
    cond_left = tf.logical_and(tf.cast(tf.ones_like(kp_coord_xyz21), tf.bool),
                               tf.logical_not(hand_side))
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
    kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                             tf.logical_not(cond_left))
    data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
    data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

    if self.hand_crop:
        crop_center = keypoint_uv21[12, ::-1]
        # catch problem, when no valid kp available
        crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                              lambda: crop_center,
                              lambda: tf.constant([0.0, 0.0]))
        crop_center.set_shape([2])

        if self.crop_center_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
            crop_center += noise

        crop_scale_noise = tf.constant(1.0)
        if self.crop_scale_noise:
            crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=0.8, maxval=1.2))

        kp_coord_hw = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], 1)

        # determine size of crop (measure spatial extent of hw coords first)
        min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
        max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

        # find out larger distance wrt the center of crop
        crop_size_best = 2 * tf.maximum(max_coord - crop_center, crop_center - min_coord)
        crop_size_best = tf.reduce_max(crop_size_best)
        crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)
        crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                                 lambda: crop_size_best,
                                 lambda: tf.constant(200.0))
        crop_size_best.set_shape([])
        crop_size_best *= 1.25

        # calculate necessary scaling
        scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
        scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
        scale *= crop_scale_noise
        data_dict['crop_scale'] = scale

        if self.crop_offset_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
            crop_center += noise

        # Modify uv21 coordinates
        crop_center_float = tf.cast(crop_center, tf.float32)
        keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
        keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
        keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
        data_dict['keypoint_uv21'] = keypoint_uv21

    keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)
    scoremap_size = self.image_size
    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)
    scoremap = self.create_multiple_gaussian_map(keypoint_hw21, scoremap_size,
                                                 self.sigma, valid_vec=keypoint_vis21)
    data_dict['scoremap'] = scoremap

    names, tensors = zip(*data_dict.items())
    if self.shuffle:
        tensors = tf.train.shuffle_batch_join([tensors], batch_size=self.batch_size,
                                              capacity=100, min_after_dequeue=50,
                                              enqueue_many=False)
    else:
        tensors = tf.train.batch_join([tensors], batch_size=self.batch_size,
                                      capacity=100, enqueue_many=False)
    return dict(zip(names, tensors))
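
# project_tf is not defined in this file. A plausible pinhole model consistent
# with its (K, R, t) arguments is sketched below in numpy, purely to document
# the expected geometry; this is an assumption, not the repo's confirmed
# implementation (project_tf additionally returns the camera-frame 3D points,
# which the sketch omits):
#
#     def project_pinhole(xyz, K, R, t):
#         """Hypothetical: world xyz [21, 3] -> pixel uv [21, 2]."""
#         cam = xyz @ R.T + t.reshape(1, 3)   # world -> camera frame
#         uvw = cam @ K.T                     # apply intrinsics
#         return uvw[:, :2] / uvw[:, 2:3]     # perspective divide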