Example #1
def prepare_input(
    rgb: torch.Tensor,
    resize_res: int = 256,
    inp_res: int = 224,
    mean: torch.Tensor = 0.5 * torch.ones(3),
    std: torch.Tensor = 1.0 * torch.ones(3),
):
    """
    Process the video:
    1) Resize to [resize_res x resize_res]
    2) Center crop with [inp_res x inp_res]
    3) Color normalize using mean/std
    """
    iC, iF, iH, iW = rgb.shape
    rgb_resized = np.zeros((iF, resize_res, resize_res, iC))
    for t in range(iF):
        tmp = rgb[:, t, :, :]
        tmp = resize_generic(
            im_to_numpy(tmp), resize_res, resize_res, interp="bilinear", is_flow=False
        )
        rgb_resized[t] = tmp
    rgb = np.transpose(rgb_resized, (3, 0, 1, 2))
    # Center crop coords
    ulx = int((resize_res - inp_res) / 2)
    uly = int((resize_res - inp_res) / 2)
    # Crop to inp_res x inp_res
    rgb = rgb[:, :, uly : uly + inp_res, ulx : ulx + inp_res]
    rgb = to_torch(rgb).float()
    assert rgb.max() <= 1
    rgb = color_normalize(rgb, mean, std)
    return rgb
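
A minimal usage sketch for the function above (the input tensor and its sizes here are hypothetical; `prepare_input` also relies on the project's own `im_to_numpy`, `resize_generic`, `to_torch` and `color_normalize` helpers being importable):

import torch

# Hypothetical C x T x H x W clip with values already in [0, 1]
video = torch.rand(3, 16, 480, 640)
clip = prepare_input(video, resize_res=256, inp_res=224)
print(clip.shape)  # torch.Size([3, 16, 224, 224])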
Example #2
    def prepare_image(self, image):
        was_fixed_point = not image.is_floating_point()
        image = torch.empty_like(image, dtype=torch.float32).copy_(image)
        if was_fixed_point:
            image /= 255.0
        if image.shape[-2:] != self.input_shape:
            image = fit(image, self.input_shape, fit_mode='contain')
        image = color_normalize(image, self.data_info.rgb_mean,
                                self.data_info.rgb_stddev)
        return image
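
The fixed-point check above is the interesting part: integer images are promoted to float32 and rescaled to [0, 1] before normalization. A standalone sketch of just that step (the tensor shape is illustrative):

import torch

frame = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
was_fixed_point = not frame.is_floating_point()  # True for uint8 input
frame = frame.to(torch.float32)
if was_fixed_point:
    frame /= 255.0  # map [0, 255] to [0, 1]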
Example #3
def get_training_image(img_path,
                       bbox=None,
                       inp_res=256,
                       mean=(0.6419, 0.6292, 0.5994),
                       std=(0.2311, 0.2304, 0.2379)):
    img = load_image(img_path)

    if bbox is not None:
        x0, y0, x1, y1 = bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]
        c = np.array([(x0 + x1), (y0 + y1)]) / 2  # center
        s = np.sqrt((y1 - y0) * (x1 - x0)) / 60.0  # scale

    else:
        c = np.array([img.shape[2] / 2, img.shape[1] / 2])
        s = 5.0  # THIS HAS TO BE FIXED !!!

    r = 0  # rotation

    inp = crop(img, c, s, [inp_res, inp_res], rot=r)
    inp = color_normalize(inp, mean, std)

    meta = {'center': c, 'scale': s}

    return inp, meta
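
The center/scale convention used above is worth spelling out: the center is the bbox midpoint, and the scale is the side length of an equal-area square divided by 60. A quick worked example with a hypothetical box:

import numpy as np

(x0, y0), (x1, y1) = (40, 60), (360, 420)      # hypothetical bbox corners
c = np.array([x0 + x1, y0 + y1]) / 2           # center: [200., 240.]
s = np.sqrt((y1 - y0) * (x1 - x0)) / 60.0      # scale: ~5.66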
Example #4
    def _get_single_video(self, index, data_index, frame_ix):
        """Loads/augments/returns the video data
        :param index: Index w.r.t. the data loader
        :param data_index: Index w.r.t. the train/valid list
        :param frame_ix: A list of frame indices to sample from the video
        :return data: Dictionary of input/output and other metadata
        """
        # If the input is pose (Pose->Sign experiments)
        if hasattr(self, "input_type") and self.input_type == "pose":
            data = {
                "rgb": self._get_pose(data_index, frame_ix),
                "index": index,
                "data_index": data_index,
                "class": self._get_class(data_index, frame_ix),
                "class_names": self.class_names,
                "dataset": self.datasetname,
            }
            return data
        # Otherwise the input is RGB
        else:
            rgb = self._load_rgb(data_index, frame_ix)
            if getattr(self, "mask_rgb", False):
                rgb = self._mask_rgb(
                    rgb,
                    data_index,
                    frame_ix,
                    region=self.mask_rgb,
                    mask_type=self.mask_type,
                )

        if getattr(self, "gpu_collation", False):
            # Meta info
            data = {
                "rgb": rgb,
                "index": index,
                "data_index": data_index,
                "class": self._get_class(data_index, frame_ix),
                "class_names": self.class_names,
                "dataset": self.datasetname,
            }
            return data

        # Preparing RGB data
        if self.setname == "train":
            # Horizontal flip (note: should arguably be done after the bbox cropping)
            is_hflip = random.random() < self.hflip
            if is_hflip:
                rgb = torch.flip(rgb, dims=[2])
            # Color jitter
            rgb = im_color_jitter(rgb, num_in_frames=self.num_in_frames, thr=0.2)

        rgb = im_to_numpy(rgb)
        iH, iW, iC = rgb.shape

        if self.use_bbox:
            y0, x0, y1, x1 = self._get_bbox(data_index)
            y0 = max(0, int(y0 * iH))
            y1 = min(iH, int(y1 * iH))
            x0 = max(0, int(x0 * iW))
            x1 = min(iW, int(x1 * iW))
            if self.setname == "train" and is_hflip:
                x0 = iW - x0
                x1 = iW - x1
                x0, x1 = x1, x0
            rgb = rgb[y0:y1, x0:x1, :]
            rgb = resize_generic(
                rgb, self.resize_res, self.resize_res, interp="bilinear", is_flow=False,
            )
            iH, iW, iC = rgb.shape

        resol = self.resize_res  # e.g. 300 when inp_res is 256, 130 when inp_res is 112
        if self.setname == "train":
            # Augment the scaled resolution between:
            #     [1 - self.scale_factor, 1 + self.scale_factor)
            rand_scale = random.random()
            resol *= 1 - self.scale_factor + 2 * self.scale_factor * rand_scale
            resol = int(resol)
        if iW > iH:
            nH, nW = resol, int(resol * iW / iH)
        else:
            nH, nW = int(resol * iH / iW), resol
        # Resize to nH, nW resolution
        rgb = resize_generic(rgb, nH, nW, interp="bilinear", is_flow=False)

        # Crop
        if self.setname == "train":
            # Random crop coords
            ulx = random.randint(0, nW - self.inp_res)
            uly = random.randint(0, nH - self.inp_res)
        else:
            # Center crop coords
            ulx = int((nW - self.inp_res) / 2)
            uly = int((nH - self.inp_res) / 2)
        # Crop to inp_res x inp_res
        rgb = rgb[uly : uly + self.inp_res, ulx : ulx + self.inp_res]
        rgb = im_to_torch(rgb)
        rgb = im_to_video(rgb)
        rgb = color_normalize(rgb, self.mean, self.std)

        # Return
        data = {
            "rgb": rgb,
            "class": self._get_class(data_index, frame_ix),
            "index": index,
            "class_names": self.class_names,
            "dataset": self.datasetname,
        }

        return data
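
Two details in the training branch above deserve a note: the scale augmentation draws a factor uniformly from [1 - scale_factor, 1 + scale_factor), and the shorter image side is resized to that jittered resolution before cropping. A sketch of the scale draw in isolation (the scale_factor value is hypothetical):

import random

scale_factor = 0.25                # hypothetical value of self.scale_factor
rand_scale = random.random()       # uniform in [0, 1)
factor = 1 - scale_factor + 2 * scale_factor * rand_scale
# factor is uniform in [0.75, 1.25); the code then uses int(resol * factor)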
Example #5
    def gpu_collater(self, minibatch, concat_datasets=None):
        rgb = minibatch["rgb"]
        assert rgb.is_cuda, "expected tensor to be on the GPU"
        if self.setname == "train":
            is_hflip = random.random() < self.hflip
            if is_hflip:
                # horizontal axis is last
                rgb = torch.flip(rgb, dims=[-1])

        if self.setname == "train":
            rgb = im_color_jitter(rgb, num_in_frames=self.num_in_frames, thr=0.2)

        # For now, mimic the original pipeline. If it's still a bottleneck, we should
        # collapse the cropping, resizing etc. logic into a single sampling grid.
        iB, iC, iK, iH, iW = rgb.shape
        assert iK == self.num_in_frames, "unexpected number of frames per clip"

        bbox_yxyx = np.zeros((iB, 4), dtype=np.float32)
        for ii, data_index in enumerate(minibatch["data_index"]):
            # Default to the full frame; otherwise mixing use_bbox True and
            # False across two datasets fails
            bbox_yxyx[ii] = np.array([0, 0, 1, 1])
            if concat_datasets is not None:
                local_use_bbox = concat_datasets[minibatch["dataset"][ii]].use_bbox
            else:
                local_use_bbox = self.use_bbox
            if local_use_bbox:
                # Until we patch ConcatDataset, we need to pass the dataset object
                # explicitly to handle bbox selection
                if concat_datasets is not None:
                    get_bbox = concat_datasets[minibatch["dataset"][ii]]._get_bbox
                else:
                    get_bbox = self._get_bbox
                bbox_yxyx[ii] = get_bbox(data_index)

        # require that the original boxes lie inside the image
        bbox_yxyx[:, :2] = np.maximum(0, bbox_yxyx[:, :2])
        bbox_yxyx[:, 2:] = np.minimum(1, bbox_yxyx[:, 2:])

        if self.setname == "train":
            if is_hflip:
                flipped_xmin = 1 - bbox_yxyx[:, 3]
                flipped_xmax = 1 - bbox_yxyx[:, 1]
                bbox_yxyx[:, 1] = flipped_xmin
                bbox_yxyx[:, 3] = flipped_xmax

            # apply a random (isotropic) scale factor to box coordinates
            rand_scale = np.random.rand(iB, 1)
            rand_scale = 1 - self.scale_factor + 2 * self.scale_factor * rand_scale
            # Mimic the meaning of scale used in CPU pipeline
            rand_scale = 1 / rand_scale
            bbox_yxyx = scale_yxyx_bbox(bbox_yxyx, scale=rand_scale)

        # apply random/center cropping to match the proportions used in the original code
        # (the scaling is not quite identical, but close to it)
        if self.setname == "train":
            crop_box_sc = (self.inp_res / self.resize_res) * rand_scale
        else:
            crop_box_sc = self.inp_res / self.resize_res
        bbox_yxyx = scale_yxyx_bbox(bbox_yxyx, scale=crop_box_sc)

        # If training, jitter the location such that it still lies within the appropriate
        # region defined by the (optionally scaled) bounding box
        if self.setname == "train":
            crop_bbox_cenhw = bbox_format(bbox_yxyx, src="yxyx", dest="cenhw")
            cropped_hw = crop_bbox_cenhw[:, 2:]
            valid_offset_region_hw = ((1 - crop_box_sc) / crop_box_sc) * cropped_hw
            valid_offset_samples = np.random.rand(iB, 2)
            valid_rand_offsets = (valid_offset_samples - 0.5) * valid_offset_region_hw
            # apply offsets
            bbox_yxyx += np.tile(valid_rand_offsets, (1, 2))

        # TODO(Samuel): go back over:
        #  (1) the corner alignment logic to check we are doing the right thing here.
        #  (2) whether zero padding is appropriate for out-of-bounds handling
        # center in [-1, 1] coordinates
        bbox_yxyx = 2 * bbox_yxyx - 1
        grids = torch.zeros(
            iB, self.inp_res, self.inp_res, 2, device=rgb.device, dtype=rgb.dtype
        )

        for ii, bbox in enumerate(bbox_yxyx):
            yticks = torch.linspace(start=bbox[0], end=bbox[2], steps=self.inp_res)
            xticks = torch.linspace(start=bbox[1], end=bbox[3], steps=self.inp_res)
            grid_y, grid_x = torch.meshgrid(yticks, xticks)
            # The grid expects the ordering to be x then y
            grids[ii] = torch.stack((grid_x, grid_y), 2)

        # merge RGB and clip dimensions to use with grid sampler
        rgb = rgb.view(rgb.shape[0], 3 * self.num_in_frames, iH, iW)
        rgb = torch.nn.functional.grid_sample(
            rgb, grid=grids, mode="bilinear", align_corners=False, padding_mode="zeros",
        )
        # unflatten channel/clip dimension
        rgb = rgb.view(rgb.shape[0], 3, self.num_in_frames, self.inp_res, self.inp_res)
        rgb = color_normalize(rgb, self.mean, self.std)
        minibatch["rgb"] = rgb
        return minibatch
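
The core idea of `gpu_collater` is to express crop + resize as a single `torch.nn.functional.grid_sample` call: a normalized yxyx box is mapped to [-1, 1] and turned into a sampling grid. A minimal standalone sketch of that mapping (image size, crop box and output resolution are hypothetical):

import torch
import torch.nn.functional as F

img = torch.rand(1, 3, 256, 256)                  # N x C x H x W
y0, x0, y1, x1 = 0.25, 0.25, 0.75, 0.75           # normalized crop box
y0, x0, y1, x1 = (2 * v - 1 for v in (y0, x0, y1, x1))  # map to [-1, 1]
ys = torch.linspace(y0, y1, steps=128)
xs = torch.linspace(x0, x1, steps=128)
gy, gx = torch.meshgrid(ys, xs, indexing="ij")
grid = torch.stack((gx, gy), dim=2).unsqueeze(0)  # grid_sample expects (x, y)
crop = F.grid_sample(img, grid, mode="bilinear",
                     align_corners=False, padding_mode="zeros")
# crop has shape 1 x 3 x 128 x 128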
Example #6
    def __getitem__(self, index):
        sf = self.scale_factor
        rf = self.rot_factor
        if self.is_train:
            a = self.anno[self.train_list[index]]
        else:
            a = self.anno[self.valid_list[index]]

        img_path = os.path.join(self.img_folder, a['img_paths'])
        pts = torch.Tensor(a['joint_self'])
        # pts[:, 0:2] -= 1  # Convert pts to zero based

        # c = torch.Tensor(a['objpos']) - 1
        c = torch.Tensor(a['objpos'])
        s = a['scale_provided']

        # Adjust center/scale slightly to avoid cropping limbs
        if c[0] != -1:
            c[1] = c[1] + 15 * s
            s = s * 1.25

        # For single-person pose estimation with a centered/scaled figure
        nparts = pts.size(0)
        img = load_image(img_path)  # CxHxW

        r = 0
        if self.is_train:
            s = s * torch.randn(1).mul_(sf).add_(1).clamp(1 - sf, 1 + sf)[0]
            r = torch.randn(1).mul_(rf).clamp(
                -2 * rf, 2 * rf)[0] if random.random() <= 0.6 else 0

            # Flip
            if random.random() <= 0.5:
                img = fliplr(img)
                pts = shufflelr(pts, img.size(2), self.DATA_INFO.hflip_indices)
                c[0] = img.size(2) - c[0]

            # Color
            img[0, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)
            img[1, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)
            img[2, :, :].mul_(random.uniform(0.8, 1.2)).clamp_(0, 1)

        # Prepare image and groundtruth map
        inp = crop(img, c, s, self.inp_res, rot=r)
        inp = color_normalize(inp, self.DATA_INFO.rgb_mean,
                              self.DATA_INFO.rgb_stddev)

        # Generate ground truth
        tpts = pts.clone()
        target = torch.zeros(nparts, *self.out_res)
        target_weight = tpts[:, 2].clone().view(nparts, 1)

        for i in range(nparts):
            # if tpts[i, 2] > 0: # This is evil!!
            if tpts[i, 1] > 0:
                tpts[i, 0:2] = to_torch(
                    transform(tpts[i, 0:2] + 1, c, s, self.out_res, rot=r))
                target[i], vis = draw_labelmap(target[i],
                                               tpts[i] - 1,
                                               self.sigma,
                                               type=self.label_type)
                target_weight[i, 0] *= vis

        # Meta info
        if not isinstance(s, torch.Tensor):
            s = torch.tensor(s)  # torch.Tensor(float) raises; torch.tensor handles scalars

        meta = {
            'index': index,
            'center': c,
            'scale': s,
            'pts': pts,
            'tpts': tpts,
            'target_weight': target_weight
        }

        return inp, target, meta
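
`draw_labelmap` itself is not shown here, but for MPII-style pose targets it typically renders a 2D Gaussian around each transformed keypoint. A minimal sketch of that kind of target (the helper below is a hypothetical stand-in, not the actual implementation):

import torch

def gaussian_target(res_hw, pt_xy, sigma):
    # Hypothetical stand-in for draw_labelmap: a 2D Gaussian centered at pt_xy
    ys = torch.arange(res_hw[0], dtype=torch.float32).view(-1, 1)
    xs = torch.arange(res_hw[1], dtype=torch.float32).view(1, -1)
    d2 = (xs - pt_xy[0]) ** 2 + (ys - pt_xy[1]) ** 2
    return torch.exp(-d2 / (2 * sigma ** 2))

heatmap = gaussian_target((64, 64), (20.0, 30.0), sigma=1.0)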