Example #1
    def __call__(self, images):
        self.visual_backprop_anchors.clear()
        h = self.feature_extractor(images)
        self.visual_backprop_anchors.append(h)

        batch_size = len(h)
        transform_params = self.get_transform_params(h)

        boxes = F.spatial_transformer_grid(transform_params, self.out_size)

        expanded_images = F.broadcast_to(
            F.expand_dims(images, axis=1),
            (batch_size, self.num_bboxes_to_localize) + images.shape[1:])
        expanded_images = F.reshape(expanded_images,
                                    (-1, ) + expanded_images.shape[2:])
        rois = F.spatial_transformer_sampler(expanded_images, boxes)

        rois = F.reshape(
            rois, (batch_size, self.num_bboxes_to_localize, images.shape[1],
                   self.out_size.height, self.out_size.width))
        boxes = F.reshape(boxes, (batch_size, self.num_bboxes_to_localize, 2,
                                  self.out_size.height, self.out_size.width))

        # return shapes:
        # 1. batch_size, num_bboxes, num_channels, (out-)height, (out-)width
        # 2. batch_size, num_bboxes, 2, (out-)height, (out-)width
        return rois, boxes
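
The pattern in Example #1 samples several predicted boxes from a single input image by repeating each image once per box, so that the batch dimension of the images matches the batch dimension of the sampling grids. Below is a minimal sketch of that shape juggling with made-up sizes; the 16x16 output size and the identity transforms are assumptions for illustration only.

    import numpy as np
    import chainer.functions as F

    batch_size, num_boxes = 2, 3
    images = np.random.rand(batch_size, 3, 64, 64).astype(np.float32)

    # One (2, 3) affine matrix per predicted box; identity here for simplicity.
    theta = np.tile(np.array([[1.0, 0.0, 0.0],
                              [0.0, 1.0, 0.0]], dtype=np.float32),
                    (batch_size * num_boxes, 1, 1))
    grid = F.spatial_transformer_grid(theta, (16, 16))  # (B * num_boxes, 2, 16, 16)

    # Repeat each image once per box so both batch dimensions line up.
    expanded = F.reshape(
        F.broadcast_to(F.expand_dims(images, axis=1),
                       (batch_size, num_boxes) + images.shape[1:]),
        (-1,) + images.shape[1:])
    rois = F.spatial_transformer_sampler(expanded, grid)
    assert rois.shape == (batch_size * num_boxes, 3, 16, 16)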
Example #2
    def __call__(self, images, localizations):
        points = F.spatial_transformer_grid(localizations, self.target_shape)
        rois = F.spatial_transformer_sampler(images, points)

        # h = self.data_bn(rois)
        h = F.relu(self.bn0(self.conv0(rois)))
        h = F.average_pooling_2d(h, 2, stride=2)

        h = self.rs1(h)
        h = self.rs2(h)
        h = F.max_pooling_2d(h, 2, stride=2)
        h = self.rs3(h)
        self.vis_anchor = h

        h = F.average_pooling_2d(h, 5, stride=1)

        h = F.relu(self.fc1(h))

        # for each timestep of the localization net do the 'classification'
        h = F.reshape(h, (self.num_timesteps * 2 + 1, -1, self.fc1.out_size))
        overall_predictions = []
        for timestep in F.separate(h, axis=0):
            # go through 2 * num_labels + 1 timesteps because of the CTC loss
            lstm_predictions = []
            self.lstm.reset_state()
            for _ in range(self.num_labels):
                lstm_prediction = self.lstm(timestep)
                classified = self.classifier(lstm_prediction)
                lstm_predictions.append(classified)
            overall_predictions.append(lstm_predictions)

        return overall_predictions, rois, points
Example #3
    def crop_from_image(self,
                        image,
                        output_size=None,
                        use_spatial_transformer=True):
        if output_size is None:
            output_size = (self.width, self.height)

        if use_spatial_transformer:
            image_array = np.asarray(image).transpose(2, 0, 1).astype(np.float32)
            crop_transform = self.get_affine_transform_params(
                image.size).astype(np.float32)
            transform_grid = spatial_transformer_grid(
                crop_transform[np.newaxis, ...],
                (output_size[1], output_size[0]))
            cropped_image = spatial_transformer_sampler(
                image_array[np.newaxis, ...], transform_grid).data[0]
            cropped_image = cropped_image.astype(np.uint8)

            cropped_image = Image.fromarray(cropped_image.transpose(1, 2, 0))
        else:
            cropped_image = image.crop(self.to_aabb())
            cropped_image = cropped_image.resize(output_size, Image.BILINEAR)

        return cropped_image
Example #4
    def __call__(self, images):
        self.visual_backprop_anchors.clear()

        with cuda.Device(images.data.device):
            input_images = self.prepare_images(images.copy() * 255)
        h = self.feature_extractor(input_images)

        if self.train_imagenet:
            return h

        if images.shape[-2] > 224:
            h = self.res6(h)

            if images.shape[-2] > 300:
                h = self.res7(h)

        self.visual_backprop_anchors.append(h)
        h = _global_average_pooling_2d(h)

        transform_params = self.param_predictor(h)
        transform_params = rotation_dropout(F.reshape(transform_params,
                                                      (-1, 2, 3)),
                                            ratio=0.0)
        points = F.spatial_transformer_grid(transform_params, self.out_size)
        rois = F.spatial_transformer_sampler(images, points)

        if self.transform_rois_to_grayscale:
            assert rois.shape[1] == 3, "rois are not in RGB, can not convert them to grayscale"
            b, g, r = F.split_axis(rois, 3, axis=1)
            rois = 0.299 * r + 0.587 * g + 0.114 * b

        return rois, points
Example #5
    def apply_transform_params(self, image, transform_params):
        image = self.xp.tile(image[np.newaxis, ...],
                             (len(transform_params), 1, 1, 1))

        transform_grid = spatial_transformer_grid(transform_params,
                                                  self.image_size)
        cropped_image = spatial_transformer_sampler(image,
                                                    transform_grid).array
        return cropped_image
Example #6
    def __call__(self, images, localizations):
        points = F.spatial_transformer_grid(localizations, self.target_shape)
        rois = F.spatial_transformer_sampler(images, points)

        h = self.bn0(self.conv0(rois))
        h = F.average_pooling_2d(F.relu(h), 2, stride=2)

        h = self.rs1(h)
        h = self.rs2(h)
        h = F.max_pooling_2d(h, 2, stride=2)
        h = self.rs3(h)
        self.vis_anchor = h

        h = F.average_pooling_2d(h, 5, stride=1)

        if self.uses_original_data:
            # merge data of all 4 individual images in channel dimension
            batch_size, num_channels, height, width = h.shape
            h = F.reshape(h,
                          (batch_size // 4, 4 * num_channels, height, width))

        h = F.relu(self.fc1(h))

        # for each timestep of the localization net do the 'classification'
        h = F.reshape(h, (self.num_timesteps, -1, self.fc1.out_size))
        overall_predictions = []
        for timestep in F.separate(h, axis=0):
            # go through 2 * num_labels + 1 timesteps because of the CTC loss
            lstm_predictions = []
            self.lstm.reset_state()
            if self.use_blstm:
                self.blstm.reset_state()

            for _ in range(self.num_labels):
                lstm_prediction = self.lstm(timestep)
                lstm_predictions.append(lstm_prediction)

            if self.use_blstm:
                blstm_predictions = []
                for lstm_prediction in reversed(lstm_predictions):
                    blstm_prediction = self.blstm(lstm_prediction)
                    blstm_predictions.append(blstm_prediction)

                lstm_predictions = reversed(blstm_predictions)

            final_lstm_predictions = []
            for lstm_prediction in lstm_predictions:
                classified = self.classifier(lstm_prediction)
                final_lstm_predictions.append(F.expand_dims(classified,
                                                            axis=0))

            final_lstm_predictions = F.concat(final_lstm_predictions, axis=0)
            overall_predictions.append(final_lstm_predictions)

        return overall_predictions, rois, points
Example #7
    def __call__(self, images, localizations):
        points = F.spatial_transformer_grid(localizations, self.target_shape)
        rois = F.spatial_transformer_sampler(images, points)

        h = self.data_bn(rois)
        h = F.relu(self.bn0(self.conv0(h)))
        h = F.average_pooling_2d(h, 2, stride=2)

        h = self.rs1(h)
        h = self.rs2(h)
        h = F.max_pooling_2d(h, 2, stride=2)
        h = self.rs3(h)
        self.vis_anchor = h

        h = F.average_pooling_2d(h, 5, stride=1)

        h = F.relu(self.fc1(h))

        # for each timestep of the localization net do the 'classification'
        h = F.reshape(h, (self.num_timesteps, -1, self.fc1.out_size))
        overall_predictions = []
        for timestep in F.separate(h, axis=0):
            lstm_predictions = []
            self.lstm.reset_state()
            if self.use_blstm:
                self.blstm.reset_state()

            for _ in range(self.num_labels):
                lstm_prediction = self.lstm(timestep)
                lstm_predictions.append(lstm_prediction)

            if self.use_blstm:
                blstm_predictions = []
                for lstm_prediction in reversed(lstm_predictions):
                    blstm_prediction = self.blstm(lstm_prediction)
                    blstm_predictions.append(blstm_prediction)

                lstm_predictions = reversed(blstm_predictions)

            final_lstm_predictions = []
            for lstm_prediction in lstm_predictions:
                classified = self.classifier(lstm_prediction)
                final_lstm_predictions.append(F.expand_dims(classified,
                                                            axis=1))

            final_lstm_predictions = F.concat(final_lstm_predictions, axis=1)
            overall_predictions.append(final_lstm_predictions)

        return overall_predictions, rois, points
Example #8
    def check_forward(self, theta, output_shape):
        grid = functions.spatial_transformer_grid(theta, output_shape).data

        theta = cuda.to_cpu(theta)
        B = theta.shape[0]
        H, W = output_shape

        expected = []
        for b in range(B):
            for i in numpy.linspace(-1., 1., H):
                for j in numpy.linspace(-1., 1., W):
                    coord = numpy.array([j, i, 1])
                    expected.append(self.theta[b].dot(coord))
        expected = numpy.array(
            expected).reshape(B, H, W, 2).transpose(0, 3, 1, 2)
        testing.assert_allclose(grid, expected)
        self.assertEqual(grid.dtype, theta.dtype)
Example #9
    def check_forward(self, theta, output_shape):
        grid = functions.spatial_transformer_grid(theta, output_shape).data

        theta = cuda.to_cpu(theta)
        B = theta.shape[0]
        H, W = output_shape

        expected = []
        for b in range(B):
            for i in numpy.linspace(-1., 1., H):
                for j in numpy.linspace(-1., 1., W):
                    coord = numpy.array([j, i, 1])
                    expected.append(self.theta[b].dot(coord))
        expected = numpy.array(expected).reshape(B, H, W,
                                                 2).transpose(0, 3, 1, 2)
        testing.assert_allclose(grid, expected)
        self.assertEqual(grid.dtype, theta.dtype)
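
Examples #8 and #9 pin down the coordinate convention of spatial_transformer_grid: each output pixel's normalized coordinate (x, y) in [-1, 1] is mapped through the affine matrix theta to a normalized source coordinate, traversing rows over y and columns over x. Here is a standalone illustration of just that mapping; the theta below, a pure +0.5 shift along x, is a made-up value for demonstration.

    import numpy

    # Hypothetical transform: translate by +0.5 along x, leave y unchanged.
    theta = numpy.array([[1.0, 0.0, 0.5],
                         [0.0, 1.0, 0.0]], dtype=numpy.float32)

    # Same traversal order as the tests above: rows over y, columns over x.
    for y in numpy.linspace(-1.0, 1.0, 2):
        for x in numpy.linspace(-1.0, 1.0, 2):
            src = theta.dot(numpy.array([x, y, 1.0], dtype=numpy.float32))
            print((x, y), '->', tuple(src))  # e.g. (-1.0, -1.0) -> (-0.5, -1.0)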
Example #10
    def do_transformation_param_refinement_step(self, images, transformation_params):
        transformation_params = self.remove_homogeneous_coordinates(transformation_params)
        points = F.spatial_transformer_grid(transformation_params, self.target_shape)
        rois = F.spatial_transformer_sampler(images, points)

        # rerun parts of the feature extraction for producing a refined version of the transformation params
        h = self.bn0_1(self.conv0_1(rois))
        h = F.average_pooling_2d(F.relu(h), 2, stride=2)

        h = self.rs4(h)
        h = F.max_pooling_2d(h, 2, stride=2)

        h = self.rs5(h)
        h = F.max_pooling_2d(h, 2, stride=2)

        transformation_params = self.refinement_transform(h)
        transformation_params = F.reshape(transformation_params, (-1, 2, 3))
        transformation_params = rotation_dropout(transformation_params, ratio=self.dropout_ratio)
        return transformation_params
Example #11
    def __call__(self, encs, hiddens, batch_size, prev_image, num_masks, color_channels):
        """
            Learn through StatelessSTP.
            Args:
                encs: An array of computed transformation
                hiddens: An array of hidden layers
                batch_size: Size of mini batches
                prev_image: The image to transform
                num_masks: Number of masks to apply
                color_channels: Output color channels
            Returns:
                transformed: A list of masks to apply on the previous image
        """
        logger = logging.getLogger(__name__)
        
        enc0, enc1, enc2, enc3, enc4, enc5, enc6 = encs
        hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7 = hiddens

        xp = chainer.cuda.get_array_module(enc6.data)
        # STP specific
        enc7 = self.enc7(enc6)
        transformed = [F.sigmoid(enc7)]

        stp_input0 = F.reshape(hidden5, (int(batch_size), -1))
        stp_input1 = self.stp_input(stp_input0)
        stp_input1 = F.relu(stp_input1)
        identity_params = np.array([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]], dtype=np.float32)
        identity_params = np.repeat(identity_params, int(batch_size), axis=0)
        identity_params = variable.Variable(xp.array(identity_params))

        stp_transformations = []
        for i in range(num_masks-1):
            params = self.identity_params(stp_input1)
            params = params + identity_params
            params = F.reshape(params, (int(params.shape[0]), 2, 3))
            grid = F.spatial_transformer_grid(params, (prev_image.shape[2], prev_image.shape[3]))
            trans = F.spatial_transformer_sampler(prev_image, grid)
            stp_transformations.append(trans)

        transformed += stp_transformations

        return transformed, enc7
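
A detail worth noting in Example #11 is the identity bias: the regressed parameters are added to the flat identity transform [1, 0, 0, 0, 1, 0] before being reshaped to (batch_size, 2, 3), so an untrained network initially passes prev_image through almost unchanged. Below is a minimal sketch of just that trick; the zero `predicted` array is a stand-in for an untrained network's output, and all shapes are made up.

    import numpy as np
    import chainer.functions as F

    batch_size = 2
    predicted = np.zeros((batch_size, 6), dtype=np.float32)  # stand-in net output
    identity = np.tile(np.array([[1.0, 0.0, 0.0, 0.0, 1.0, 0.0]],
                                dtype=np.float32), (batch_size, 1))

    params = F.reshape(predicted + identity, (batch_size, 2, 3))
    prev_image = np.random.rand(batch_size, 3, 8, 8).astype(np.float32)
    grid = F.spatial_transformer_grid(params, prev_image.shape[2:])
    out = F.spatial_transformer_sampler(prev_image, grid)
    assert out.shape == prev_image.shape  # identity params: near pass-through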
Example #12
    def __call__(self, images, localizations):
        self.lstm.reset_state()
        if self.use_blstm:
            self.blstm.reset_state()

        points = [
            F.spatial_transformer_grid(localization, self.target_shape)
            for localization in localizations
        ]
        rois = [
            F.spatial_transformer_sampler(images, point) for point in points
        ]

        h = F.relu(self.bn0(self.conv0(rois[-1])))
        h = F.average_pooling_2d(h, 2, stride=2)

        h = self.rs1(h)
        h = self.rs2(h)
        h = F.max_pooling_2d(h, 2, stride=2)
        h = self.rs3(h)
        self.vis_anchor = h

        h = F.average_pooling_2d(h, 5, stride=1)

        h = F.relu(self.fc1(h))

        # each timestep of the localization contains one character prediction that needs to be classified
        overall_predictions = []
        h = F.reshape(h, (self.num_rois, -1, self.fc1.out_size))

        for timestep in F.separate(h, axis=0):
            lstm_state = self.lstm(timestep)

            prediction = self.classifier(lstm_state)
            overall_predictions.append(prediction)

        return overall_predictions, rois, points
Example #13
    def f(theta):
        return functions.spatial_transformer_grid(theta, output_shape)
Example #14
    def f(theta):
        return functions.spatial_transformer_grid(theta, output_shape)
Example #15
    def __call__(self, x):
        theta = self.affine_matrix(x)
        self.grid = F.spatial_transformer_grid(theta, x.shape[2:])
        return F.spatial_transformer_sampler(x, self.grid)
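
Example #15 is the textbook spatial transformer layer in its smallest form: regress theta from the input, build a grid of the same spatial size, and resample the input with it. Below is a self-contained sketch of how such a link could be assembled; the `L.Linear` head and all sizes are assumptions for illustration, not the original `affine_matrix`.

    import numpy as np
    import chainer
    import chainer.functions as F
    import chainer.links as L

    class SpatialTransformer(chainer.Chain):
        def __init__(self):
            super().__init__()
            with self.init_scope():
                # Hypothetical head regressing the 6 affine parameters
                # from the (lazily inferred) flattened input.
                self.affine_matrix = L.Linear(None, 6)

        def __call__(self, x):
            theta = F.reshape(self.affine_matrix(x), (-1, 2, 3))
            grid = F.spatial_transformer_grid(theta, x.shape[2:])
            return F.spatial_transformer_sampler(x, grid)

    x = np.random.rand(2, 1, 28, 28).astype(np.float32)
    y = SpatialTransformer()(x)
    assert y.shape == x.shape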