Example #1
    def get(self, read_image=True, extra=False):

        [joint3d, joint2d, hand_side, img_dir, K, DC] = tf.train.slice_input_producer(
            [self.joints3d, self.joints2d, self.hand_sides, self.img_dirs, self.Ks, self.DCs],
            shuffle=False)
        keypoint_xyz21 = joint3d
        keypoint_uv21 = joint2d
        if not self.use_wrist_coord:
            palm_coord = tf.expand_dims(0.5*(keypoint_xyz21[0, :] + keypoint_xyz21[12, :]), 0)
            keypoint_xyz21 = tf.concat([palm_coord, keypoint_xyz21[1:21, :]], 0)
            palm_coord_uv = tf.expand_dims(0.5*(keypoint_uv21[0, :] + keypoint_uv21[12, :]), 0)
            keypoint_uv21 = tf.concat([palm_coord_uv, keypoint_uv21[1:21, :]], 0)
        if self.coord_uv_noise:
            noise = tf.truncated_normal([21, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
            keypoint_uv21 += noise

        data_dict = {}
        data_dict['img_dir'] = img_dir
        data_dict['hand_side'] = tf.one_hot(tf.cast(hand_side, tf.uint8), depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)
        data_dict['K'] = K
        data_dict['DC'] = DC

        keypoint_vis21 = tf.ones([21,], tf.bool)
        data_dict['keypoint_vis21'] = keypoint_vis21

        keypoint_xyz21 /= 100  # convert dome coords from centimeters to meters
        if self.flip_2d:
            keypoint_xyz21 = tf.cond(
                hand_side,
                lambda: tf.concat([tf.expand_dims(-keypoint_xyz21[:, 0], axis=1),
                                   keypoint_xyz21[:, 1:]], axis=1),
                lambda: keypoint_xyz21)

        kp_coord_xyz21 = keypoint_xyz21
        data_dict['keypoint_xyz21'] = keypoint_xyz21
        data_dict['keypoint_uv21_origin'] = data_dict['keypoint_uv21'] = keypoint_uv21
        kp_coord_xyz_root = kp_coord_xyz21[0, :] # this is the palm coord
        kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
        # data_dict['keypoint_scale'] = index_root_bone_length
        # data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11
        # hand_size_tf (defined elsewhere) presumably yields an overall hand size
        # used for scale normalization instead of the single 12->11 bone length
        data_dict['keypoint_scale'] = hand_size_tf(kp_coord_xyz21_rel)
        data_dict['index_scale'] = index_root_bone_length
        data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / data_dict['keypoint_scale']

        # calculate viewpoint and coords in canonical coordinates; canonical_trafo
        # (defined elsewhere) rotates the normalized hand into a canonical frame,
        # and the inverse rotation is stored so canonical coords can be mapped back.
        # broadcast the scalar left/right flag to a per-keypoint boolean mask
        cond_left = tf.logical_and(tf.cast(tf.ones_like(kp_coord_xyz21), tf.bool),
                                   tf.logical_not(hand_side))
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        if not self.flip_2d:
            kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can, tf.logical_not(cond_left))
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        if self.hand_crop:
            crop_center = keypoint_uv21[12, ::-1]
            # fall back to the origin when no finite keypoint is available
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([2,])

            if self.crop_center_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(tf.exp(tf.truncated_normal([1], mean=0.0, stddev=0.1)))

            kp_coord_hw = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], 1)
            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

            # find the larger distance w.r.t. the center of the crop
            crop_size_best = 2*tf.maximum(max_coord - crop_center, crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)
            crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)), lambda: crop_size_best,
                                  lambda: tf.constant(200.0))
            crop_size_best.set_shape([])
            crop_size_best *= self.crop_size_zoom
            crop_size_best *= crop_scale_noise

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            # scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            data_dict['crop_scale'] = scale
            if self.flip_2d:
                crop_center = tf.cond(
                    hand_side,
                    lambda: tf.constant(self.image_size, dtype=tf.float32)
                            - tf.ones((2,), dtype=tf.float32) - crop_center,
                    lambda: crop_center)
            data_dict['crop_center'] = crop_center

            if self.crop_offset_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            if self.flip_2d:
                keypoint_uv21_u = tf.cond(hand_side,
                    lambda: -(keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2,
                    lambda: (keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2)
            else:
                keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
            keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21            

        if read_image:
            img_file = tf.read_file(img_dir)
            image = tf.image.decode_image(img_file, channels=3)
            image = tf.image.pad_to_bounding_box(image, 0, 0, 1080, 1920)
            image.set_shape((1080, 1920, 3))
            image = tf.cast(image, tf.float32)
            if self.hand_crop:
                img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale)
                img_crop = img_crop / 255.0 - 0.5
                img_crop = tf.squeeze(img_crop)
                if self.flip_2d:
                    img_crop = tf.cond(hand_side, lambda: img_crop[:, ::-1, :], lambda: img_crop)
                data_dict['image_crop'] = img_crop
            if self.flip_2d:
                image = tf.cond(hand_side, lambda: image[:, ::-1, :], lambda: image)
            data_dict['image'] = image / 255.0 - 0.5

        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

        scoremap_size = self.image_size
        
        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21,
                                                     extra=extra)

        data_dict['scoremap'] = scoremap
        
        # scoremap_3d: a volumetric analogue of the 2D scoremaps, with Gaussians at
        # the normalized 3D joints on a (crop_size/8)^3 grid
        data_dict['scoremap_3d'] = create_multiple_gaussian_map_3d(
            data_dict['keypoint_xyz21_normed'], int(self.crop_size / 8), 5, extra=extra)

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
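
A minimal sketch of driving this queue-based pipeline under TF 1.x. `DomeReader` is an assumed name for the class defining get() above; its constructor arguments are not shown in the snippet:

    import tensorflow as tf

    reader = DomeReader(...)   # assumed constructor, config omitted
    data = reader.get()        # dict of batched tensors

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        batch = sess.run(data)  # e.g. batch['image_crop'], batch['scoremap']
        coord.request_stop()
        coord.join(threads)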
Example #2
    def __getitem__(self, index):
        data = self.data
        image = Image.open(os.path.join(self.image_path,
                                        '%.5d.png' % index)).convert('RGB')
        #image = cv2.imread(os.path.join(self.image_path, '%.5d.png' % index), 0)
        if self.transform is not None:
            image = self.transform(image)

        mask = Image.open(os.path.join(self.mask_path, '%.5d.png' % index))
        #mask = Image.open(os.path.join(self.mask_path, '%.5d.png' % index), 0)
        if self.mask_transform is not None:
            mask = self.mask_transform(mask)

        # the raw annotation is a numpy array; convert once so the torch ops below work
        keypoint_xyz = torch.as_tensor(data[index]['xyz'], dtype=torch.float32)
        if not self.use_wrist_coord:
            # unsqueeze (not squeeze) so each palm coord is [1, 3] for the cat below
            palm_coord_l = torch.unsqueeze(
                0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0)
            palm_coord_r = torch.unsqueeze(
                0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0)
            keypoint_xyz = torch.cat([
                palm_coord_l, keypoint_xyz[1:21, :], palm_coord_r,
                keypoint_xyz[-20:, :]
            ], 0)

        keypoint_uv = data[index]['uv_vis']
        u = torch.tensor(keypoint_uv[:, 0])
        v = torch.tensor(keypoint_uv[:, 1])
        keypoint_uv = torch.stack((u, v), dim=1)
        if not self.use_wrist_coord:
            palm_coord_uv_l = torch.unsqueeze(
                0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0)
            palm_coord_uv_r = torch.unsqueeze(
                0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0)
            keypoint_uv = torch.cat([
                palm_coord_uv_l, keypoint_uv[1:21, :], palm_coord_uv_r,
                keypoint_uv[-20:, :]
            ], 0)
        if self.coord_uv_noise:
            # add truncated-normal noise (shape [42, 2]); trunc_normal_ resamples
            # beyond +/- 2 sigma, matching tf.truncated_normal
            sigma = self.coord_uv_noise_sigma
            noise = torch.nn.init.trunc_normal_(
                torch.empty(42, 2), mean=0.0, std=sigma, a=-2 * sigma, b=2 * sigma)
            keypoint_uv += noise

        hand_parts = mask.to(torch.int32)
        hand_mask = (mask > 1)
        bg_mask = ~hand_mask
        hand_mask = torch.stack([bg_mask, hand_mask], 1)

        keypoint_vis = data[index]['uv_vis'][:, 2].astype(bool)
        if not self.use_wrist_coord:
            # keypoint_vis is a numpy array here, so stay in numpy
            palm_vis_l = np.expand_dims(
                np.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
            palm_vis_r = np.expand_dims(
                np.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
            keypoint_vis = np.concatenate([
                palm_vis_l, keypoint_vis[1:21], palm_vis_r, keypoint_vis[-20:]
            ], 0)

        one_map, zero_map = torch.ones_like(mask), torch.zeros_like(mask)
        one_map = one_map.to(torch.int32)
        zero_map = zero_map.to(torch.int32)
        cond_l = torch.logical_and(hand_parts > one_map,
                                   hand_parts < one_map * 18)
        cond_r = hand_parts > one_map * 17
        hand_map_l = torch.where(cond_l, one_map, zero_map)
        hand_map_r = torch.where(cond_r, one_map, zero_map)
        num_px_left_hand = hand_map_l.sum()
        num_px_right_hand = hand_map_r.sum()

        # produce the 21 subset using the segmentation masks
        kp_coord_xyz_left = keypoint_xyz[:21, :]
        kp_coord_xyz_right = keypoint_xyz[-21:, :]

        cond_left = torch.logical_and(
            torch.ones_like(kp_coord_xyz_left).to(torch.bool),
            num_px_left_hand > num_px_right_hand)
        kp_coord_xyz21 = torch.where(cond_left, kp_coord_xyz_left,
                                     kp_coord_xyz_right)

        hand_side = torch.where((num_px_left_hand > num_px_right_hand),
                                torch.tensor(1, dtype=torch.int32),
                                torch.tensor(0, dtype=torch.int32))
        temp = torch.zeros(1, 2)
        temp[range(temp.shape[0]), hand_side] = 1
        hand_side = temp.reshape(temp.size()[-1])

        keypoint_xyz21 = kp_coord_xyz21

        # make coords relative to root joint
        kp_coord_xyz_root = kp_coord_xyz21[0, :]  # palm coord
        kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = torch.sqrt(
            ((kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])**2).sum())
        keypoint_scale = index_root_bone_length
        keypoint_xyz21_normed = kp_coord_xyz21_rel / index_root_bone_length  # normalized

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(
            keypoint_xyz21_normed)
        kp_coord_xyz21_rel_can, rot_mat = torch.squeeze(
            kp_coord_xyz21_rel_can), torch.squeeze(rot_mat)
        kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                                 torch.logical_not(cond_left))
        keypoint_xyz21_can = kp_coord_xyz21_rel_can
        rot_mat = torch.inverse(rot_mat)

        # set of 21 for visibility
        keypoint_vis_left = keypoint_vis[:21].astype(np.int32)
        keypoint_vis_right = keypoint_vis[-21:].astype(np.int32)
        keypoint_vis21 = torch.where(cond_left[:, 0],
                                     torch.tensor(keypoint_vis_left),
                                     torch.tensor(keypoint_vis_right))

        # set of 21 for UV coordinates
        keypoint_uv_left = keypoint_uv[:21, :]
        keypoint_uv_right = keypoint_uv[-21:, :]
        keypoint_uv21 = torch.where(cond_left[:, :2], keypoint_uv_left,
                                    keypoint_uv_right)

        # create scoremaps from the subset of 2D annotation
        keypoint_hw21 = torch.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]],
                                    -1)

        scoremap_size = self.image_size
        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        score_map = self.create_multiple_gaussian_map(keypoint_hw21,
                                                      scoremap_size,
                                                      self.sigma,
                                                      valid_vec=keypoint_vis21)
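        # note: torch.nn.Dropout(p) zeroes entries with probability p and rescales
        # the survivors by 1/(1-p) in train mode, so no manual rescaling is needed
        # (the TF pipeline in the next example compensates tf.nn.dropout's scaling by hand)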
        if self.scoremap_dropout:
            drop = torch.nn.Dropout(self.scoremap_dropout_prob)
            score_map = drop(score_map)

        if self.scale_to_size:
            s = image.size()
            image = image.resize(self.scale_target_size)
            scale = (self.scale_target_size[0] / float(s[0]),
                     self.scale_target_size[1] / float(s[1]))
            keypoint_uv21 = torch.stack([
                keypoint_uv21[:, 0] * scale[1], keypoint_uv21[:, 1] * scale[0]
            ], 1)

        return image, hand_side, keypoint_xyz21, rot_mat
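
A minimal consumption sketch for this Dataset; `RHDDataset` and its constructor arguments are assumed names not defined in the snippet:

    from torch.utils.data import DataLoader

    dataset = RHDDataset(...)  # the class defining __getitem__ above
    loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

    for image, hand_side, keypoint_xyz21, rot_mat in loader:
        break  # image is [8, 3, H, W] after a typical ToTensor() transform

Example #3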
    def get(self):
        """ Provides input data to the graph. """
        # calculate size of each record (this lists what is contained in the db and how many bytes are occupied)
        record_bytes = 2

        encoding_bytes = 4
        kp_xyz_entries = 3 * self.num_kp
        record_bytes += encoding_bytes * kp_xyz_entries

        encoding_bytes = 4
        kp_uv_entries = 2 * self.num_kp
        record_bytes += encoding_bytes * kp_uv_entries

        cam_matrix_entries = 9
        record_bytes += encoding_bytes * cam_matrix_entries

        image_bytes = self.image_size[0] * self.image_size[1] * 3
        record_bytes += image_bytes

        hand_parts_bytes = self.image_size[0] * self.image_size[1]
        record_bytes += hand_parts_bytes

        kp_vis_bytes = self.num_kp
        record_bytes += kp_vis_bytes
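        # e.g. with num_kp = 42 and image_size = (320, 320) (RHD-style inputs,
        # assumed here): 2 + 504 + 336 + 36 + 307200 + 102400 + 42 = 410520 bytes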
        """ READ DATA ITEMS"""
        # Start reader
        reader = tf.FixedLengthRecordReader(header_bytes=0,
                                            record_bytes=record_bytes)
        _, value = reader.read(
            tf.train.string_input_producer([self.path_to_db]))

        # decode to floats
        bytes_read = 0
        data_dict = dict()
        record_bytes_float32 = tf.decode_raw(value, tf.float32)

        # 1. Read keypoint xyz
        keypoint_xyz = tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4],
                     [kp_xyz_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes * kp_xyz_entries

        # calculate palm coord
        if not self.use_wrist_coord:
            palm_coord_l = tf.expand_dims(
                0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0)
            palm_coord_r = tf.expand_dims(
                0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0)
            keypoint_xyz = tf.concat([
                palm_coord_l, keypoint_xyz[1:21, :], palm_coord_r,
                keypoint_xyz[-20:, :]
            ], 0)

        data_dict['keypoint_xyz'] = keypoint_xyz

        # 2. Read keypoint uv
        keypoint_uv = tf.cast(
            tf.reshape(
                tf.slice(record_bytes_float32, [bytes_read // 4],
                         [kp_uv_entries]), [self.num_kp, 2]), tf.int32)
        bytes_read += encoding_bytes * kp_uv_entries

        keypoint_uv = tf.cast(keypoint_uv, tf.float32)

        # calculate palm coord
        if not self.use_wrist_coord:
            palm_coord_uv_l = tf.expand_dims(
                0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0)
            palm_coord_uv_r = tf.expand_dims(
                0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0)
            keypoint_uv = tf.concat([
                palm_coord_uv_l, keypoint_uv[1:21, :], palm_coord_uv_r,
                keypoint_uv[-20:, :]
            ], 0)

        if self.coord_uv_noise:
            noise = tf.truncated_normal([42, 2],
                                        mean=0.0,
                                        stddev=self.coord_uv_noise_sigma)
            keypoint_uv += noise

        data_dict['keypoint_uv'] = keypoint_uv

        # 3. Camera intrinsics
        cam_mat = tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4],
                     [cam_matrix_entries]), [3, 3])
        bytes_read += encoding_bytes * cam_matrix_entries
        data_dict['cam_mat'] = cam_mat

        # decode to uint8
        bytes_read += 2
        record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

        # 4. Read image
        image = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
            [self.image_size[0], self.image_size[1], 3])
        image = tf.cast(image, tf.float32)
        bytes_read += image_bytes

        # subtract mean
        image = image / 255.0 - 0.5
        if self.hue_aug:
            image = tf.image.random_hue(image, self.hue_aug_max)
        data_dict['image'] = image

        # 5. Read mask
        hand_parts_mask = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [hand_parts_bytes]),
            [self.image_size[0], self.image_size[1]])
        hand_parts_mask = tf.cast(hand_parts_mask, tf.int32)
        bytes_read += hand_parts_bytes
        data_dict['hand_parts'] = hand_parts_mask
        hand_mask = tf.greater(hand_parts_mask, 1)
        bg_mask = tf.logical_not(hand_mask)
        data_dict['hand_mask'] = tf.cast(tf.stack([bg_mask, hand_mask], 2),
                                         tf.int32)

        # 6. Read visibilty
        keypoint_vis = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [kp_vis_bytes]),
            [self.num_kp])
        keypoint_vis = tf.cast(keypoint_vis, tf.bool)
        bytes_read += kp_vis_bytes

        # calculate palm visibility
        if not self.use_wrist_coord:
            palm_vis_l = tf.expand_dims(
                tf.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
            palm_vis_r = tf.expand_dims(
                tf.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
            keypoint_vis = tf.concat([
                palm_vis_l, keypoint_vis[1:21], palm_vis_r, keypoint_vis[-20:]
            ], 0)
        data_dict['keypoint_vis'] = keypoint_vis

        assert bytes_read == record_bytes, "Doesn't add up."
        """ DEPENDENT DATA ITEMS: SUBSET of 21 keypoints"""
        # figure out dominant hand by analysis of the segmentation mask
        one_map, zero_map = tf.ones_like(hand_parts_mask), tf.zeros_like(
            hand_parts_mask)
        cond_l = tf.logical_and(tf.greater(hand_parts_mask, one_map),
                                tf.less(hand_parts_mask, one_map * 18))
        cond_r = tf.greater(hand_parts_mask, one_map * 17)
        hand_map_l = tf.where(cond_l, one_map, zero_map)
        hand_map_r = tf.where(cond_r, one_map, zero_map)
        num_px_left_hand = tf.reduce_sum(hand_map_l)
        num_px_right_hand = tf.reduce_sum(hand_map_r)

        # PRODUCE the 21 subset using the segmentation masks
        # We only deal with the more prominent hand for each frame and discard the second set of keypoints
        kp_coord_xyz_left = keypoint_xyz[:21, :]
        kp_coord_xyz_right = keypoint_xyz[-21:, :]

        cond_left = tf.logical_and(
            tf.cast(tf.ones_like(kp_coord_xyz_left), tf.bool),
            tf.greater(num_px_left_hand, num_px_right_hand))
        kp_coord_xyz21 = tf.where(cond_left, kp_coord_xyz_left,
                                  kp_coord_xyz_right)

        hand_side = tf.where(
            tf.greater(num_px_left_hand,
                       num_px_right_hand), tf.constant(0, dtype=tf.int32),
            tf.constant(1, dtype=tf.int32))  # left hand = 0; right hand = 1
        data_dict['hand_side'] = tf.one_hot(hand_side,
                                            depth=2,
                                            on_value=1.0,
                                            off_value=0.0,
                                            dtype=tf.float32)

        data_dict['keypoint_xyz21'] = kp_coord_xyz21

        # make coords relative to root joint
        kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
        kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(
            tf.reduce_sum(
                tf.square(kp_coord_xyz21_rel[12, :] -
                          kp_coord_xyz21_rel[11, :])))
        data_dict['keypoint_scale'] = index_root_bone_length
        data_dict[
            'keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

        # calculate local coordinates
        kp_coord_xyz21_local = bone_rel_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
        data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(
            kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                                 tf.logical_not(cond_left))
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        # Set of 21 for visibility
        keypoint_vis_left = keypoint_vis[:21]
        keypoint_vis_right = keypoint_vis[-21:]
        keypoint_vis21 = tf.where(cond_left[:, 0], keypoint_vis_left,
                                  keypoint_vis_right)
        data_dict['keypoint_vis21'] = keypoint_vis21

        # Set of 21 for UV coordinates
        keypoint_uv_left = keypoint_uv[:21, :]
        keypoint_uv_right = keypoint_uv[-21:, :]
        keypoint_uv21 = tf.where(cond_left[:, :2], keypoint_uv_left,
                                 keypoint_uv_right)
        data_dict['keypoint_uv21'] = keypoint_uv21
        """ DEPENDENT DATA ITEMS: HAND CROP """
        if self.hand_crop:
            crop_center = keypoint_uv21[12, ::-1]

            # catch the case when no valid kp is available (happens almost never)
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                                  lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([2])

            if self.crop_center_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(
                    tf.random_uniform([1], minval=1.0, maxval=1.2))

            # select visible coords only
            kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
            kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
            kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0),
                                   self.image_size)

            # find the larger distance w.r.t. the center of the crop
            crop_size_best = 2 * tf.maximum(max_coord - crop_center,
                                            crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0),
                                        500.0)

            # catch the case when no valid kp is available
            crop_size_best = tf.cond(
                tf.reduce_all(tf.is_finite(crop_size_best)),
                lambda: crop_size_best, lambda: tf.constant(200.0))
            crop_size_best.set_shape([])

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Crop image
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0),
                                          crop_center, self.crop_size, scale)
            data_dict['image_crop'] = tf.squeeze(img_crop)

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]
                               ) * scale + self.crop_size // 2
            keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]
                               ) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

            # Modify camera intrinsics
            scale = tf.reshape(scale, [1])
            scale_matrix = tf.dynamic_stitch([
                [0], [1], [2], [3], [4], [5], [6], [7], [8]
            ], [scale, [0.0], [0.0], [0.0], scale, [0.0], [0.0], [0.0], [1.0]])
            scale_matrix = tf.reshape(scale_matrix, [3, 3])

            crop_center_float = tf.cast(crop_center, tf.float32)
            trans1 = crop_center_float[0] * scale - self.crop_size // 2
            trans2 = crop_center_float[1] * scale - self.crop_size // 2
            trans1 = tf.reshape(trans1, [1])
            trans2 = tf.reshape(trans2, [1])
            trans_matrix = tf.dynamic_stitch(
                [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
                [[1.0], [0.0], -trans2, [0.0], [1.0], -trans1, [0.0], [0.0],
                 [1.0]])
            trans_matrix = tf.reshape(trans_matrix, [3, 3])

            data_dict['cam_mat'] = tf.matmul(trans_matrix,
                                             tf.matmul(scale_matrix, cam_mat))
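            # the updated intrinsics satisfy u_crop = (u - crop_center) * scale
            # + crop_size // 2, matching the uv21 remapping above:
            # cam_mat' = trans_matrix @ scale_matrix @ cam_mat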
        """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints"""
        # create scoremaps from the subset of 2D annotation
        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]],
                                 -1)

        scoremap_size = self.image_size

        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)

        if self.scoremap_dropout:
            scoremap = tf.nn.dropout(scoremap,
                                     self.scoremap_dropout_prob,
                                     noise_shape=[1, 1, 21])
            scoremap *= self.scoremap_dropout_prob

        data_dict['scoremap'] = scoremap

        if self.scale_to_size:
            image, keypoint_uv21, keypoint_vis21 = data_dict[
                'image'], data_dict['keypoint_uv21'], data_dict[
                    'keypoint_vis21']
            s = image.get_shape().as_list()
            image = tf.image.resize_images(image, self.scale_target_size)
            scale = (self.scale_target_size[0] / float(s[0]),
                     self.scale_target_size[1] / float(s[1]))
            keypoint_uv21 = tf.stack([
                keypoint_uv21[:, 0] * scale[1], keypoint_uv21[:, 1] * scale[0]
            ], 1)

            data_dict = dict(
            )  # delete everything else because the scaling makes the data invalid anyway
            data_dict['image'] = image
            data_dict['keypoint_uv21'] = keypoint_uv21
            data_dict['keypoint_vis21'] = keypoint_vis21

        elif self.random_crop_to_size:
            tensor_stack = tf.concat([
                data_dict['image'],
                tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32),
                               -1),
                tf.cast(data_dict['hand_mask'], tf.float32)
            ], 2)
            s = tensor_stack.get_shape().as_list()
            tensor_stack_cropped = tf.random_crop(
                tensor_stack,
                [self.random_crop_size, self.random_crop_size, s[2]])
            data_dict = dict(
            )  # delete everything else because the random cropping makes the data invalid anyway
            data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
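
Since the reader hard-codes the record layout, a plain-numpy decoder is a handy sanity check. This is a sketch under assumptions: num_kp = 42 and 320x320 images (RHD-style); the constants must match whatever wrote the file:

    import numpy as np

    NUM_KP, H, W = 42, 320, 320  # assumed, must match the db writer

    def decode_record(buf):
        """Decode one fixed-length record laid out as in get() above."""
        off = 0
        kp_xyz = np.frombuffer(buf, np.float32, NUM_KP * 3, off).reshape(NUM_KP, 3)
        off += 4 * NUM_KP * 3
        kp_uv = np.frombuffer(buf, np.float32, NUM_KP * 2, off).reshape(NUM_KP, 2)
        off += 4 * NUM_KP * 2
        cam_mat = np.frombuffer(buf, np.float32, 9, off).reshape(3, 3)
        off += 4 * 9 + 2  # +2 skips the padding bytes, mirroring the reader
        image = np.frombuffer(buf, np.uint8, H * W * 3, off).reshape(H, W, 3)
        off += H * W * 3
        hand_parts = np.frombuffer(buf, np.uint8, H * W, off).reshape(H, W)
        off += H * W
        kp_vis = np.frombuffer(buf, np.uint8, NUM_KP, off).astype(bool)
        return kp_xyz, kp_uv, cam_mat, image, hand_parts, kp_vis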
Example #4
    def get(self, augment=True, slight=False):
        [joint3d, calibK, calibR, calibt, hand_side] = tf.train.slice_input_producer(
            [self.joints3d, self.calibKs, self.calibRs, self.calibts, self.hand_sides],
            shuffle=False)
        if augment:
            # augment by a random rotation
            joint3d = random_rotate(joint3d, slight)
        joint2d, joint3d = project_tf(joint3d, calibK, calibR, calibt)
        keypoint_xyz21 = joint3d
        keypoint_uv21 = joint2d
        if not self.use_wrist_coord:
            palm_coord = tf.expand_dims(
                0.5 * (keypoint_xyz21[0, :] + keypoint_xyz21[12, :]), 0)
            keypoint_xyz21 = tf.concat([palm_coord, keypoint_xyz21[1:21, :]],
                                       0)
            palm_coord_uv = tf.expand_dims(
                0.5 * (keypoint_uv21[0, :] + keypoint_uv21[12, :]), 0)
            keypoint_uv21 = tf.concat([palm_coord_uv, keypoint_uv21[1:21, :]],
                                      0)
        if self.coord_uv_noise:
            noise = tf.truncated_normal([21, 2],
                                        mean=0.0,
                                        stddev=self.coord_uv_noise_sigma)
            keypoint_uv21 += noise

        data_dict = {}
        data_dict['hand_side'] = tf.one_hot(tf.cast(hand_side, tf.uint8),
                                            depth=2,
                                            on_value=1.0,
                                            off_value=0.0,
                                            dtype=tf.float32)

        keypoint_vis21 = tf.ones([21], tf.bool)
        data_dict['keypoint_vis21'] = keypoint_vis21

        kp_coord_xyz21 = keypoint_xyz21
        data_dict['keypoint_xyz21'] = keypoint_xyz21
        data_dict['keypoint_uv21'] = keypoint_uv21
        kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
        kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(
            tf.reduce_sum(
                tf.square(kp_coord_xyz21_rel[12, :] -
                          kp_coord_xyz21_rel[11, :])))
        # data_dict['keypoint_scale'] = index_root_bone_length
        # data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11
        data_dict['keypoint_scale'] = hand_size_tf(kp_coord_xyz21_rel)
        data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / data_dict[
            'keypoint_scale']

        # calculate viewpoint and coords in canonical coordinates
        cond_left = tf.logical_and(
            tf.cast(tf.ones_like(kp_coord_xyz21), tf.bool),
            tf.logical_not(hand_side))
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(
            kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                                 tf.logical_not(cond_left))
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        if self.hand_crop:
            crop_center = keypoint_uv21[12, ::-1]
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                                  lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([2])

            if self.crop_center_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(
                    tf.random_uniform([1], minval=0.8, maxval=1.2))

            kp_coord_hw = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]],
                                   1)
            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0),
                                   self.image_size)

            # find the larger distance w.r.t. the center of the crop
            crop_size_best = 2 * tf.maximum(max_coord - crop_center,
                                            crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0),
                                        500.0)
            crop_size_best = tf.cond(
                tf.reduce_all(tf.is_finite(crop_size_best)),
                lambda: crop_size_best, lambda: tf.constant(200.0))
            crop_size_best.set_shape([])
            crop_size_best *= 1.25

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]
                               ) * scale + self.crop_size // 2
            keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]
                               ) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]],
                                 -1)

        scoremap_size = self.image_size

        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)

        data_dict['scoremap'] = scoremap

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
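
project_tf and random_rotate are not defined in the snippet. As a rough guide, here is a plausible pinhole-projection sketch for what project_tf likely computes, under the usual convention x_cam = R @ x + t and uv = (K @ x_cam) / z (names and conventions assumed, not confirmed by the source):

    import tensorflow as tf

    def project_tf_sketch(joint3d, K, R, t):
        """joint3d: [21, 3] world coords -> ([21, 2] pixel uv, [21, 3] camera xyz)."""
        xyz_cam = tf.matmul(joint3d, R, transpose_b=True) + tf.reshape(t, [1, 3])
        uvw = tf.matmul(xyz_cam, K, transpose_b=True)  # homogeneous image coords
        uv = uvw[:, :2] / uvw[:, 2:]                   # perspective divide
        return uv, xyz_cam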