Example #1
    def test_rescaling(self):
        """ Test the rescaling method """
        # case 1: neither scale will cause an overflow
        gs = [
            chainer.Variable(np.random.normal(size=16)),
            chainer.Variable(np.random.normal(size=16)),
        ]
        gs[0].__dict__['loss_scale'] = 1
        gs[1].__dict__['loss_scale'] = 2

        ada_loss = AdaLossChainer()
        gs = ada_loss.rescaling(gs)

        # scale to the larger one
        self.assertEqual(gs[0].__dict__['loss_scale'], 2)

        # case 2: now we have overflow problem
        gs = [
            chainer.Variable(np.random.normal(size=16)),
            chainer.Variable(np.random.normal(size=16)),
        ]
        gs[0].__dict__['loss_scale'] = 65536
        gs[1].__dict__['loss_scale'] = 2
        gs = ada_loss.rescaling(gs)
        self.assertEqual(gs[0].__dict__['loss_scale'], 2)
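The behavior exercised above suggests that rescaling brings a list of gradients carrying different loss_scale values to one common scale, preferring the larger scale unless scaling up would overflow, in which case the smaller scale is used. Below is a minimal NumPy-only sketch of that policy; the helper name and the overflow check are assumptions, not the library's implementation.

import numpy as np

def rescale_to_common(grads, scales, overflow_threshold=65504.0):
    """Hypothetical sketch: bring all gradients to one common loss scale."""
    target = max(scales)
    # if lifting any gradient to the larger scale would overflow
    # (e.g. beyond the float16 maximum of ~65504), fall back to the smaller one
    if any(np.abs(g).max() * (target / s) > overflow_threshold
           for g, s in zip(grads, scales)):
        target = min(scales)
    return [g * (target / s) for g, s in zip(grads, scales)], target

# mirrors case 2 above: the huge scale (65536) is abandoned in favor of 2
grads = [np.random.normal(size=16), np.random.normal(size=16)]
_, scale = rescale_to_common(grads, [65536.0, 2.0])
print(scale)  # 2.0 under this sketch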
Example #2
    def test_element_wise_multiply(self):
        """ Element-wise multiplication """
        ada_loss = AdaLossChainer()

        g = chainer.Variable(np.random.normal(size=(2, 2)).astype('float32'))
        W = chainer.Variable(np.random.normal(size=(2, 2)).astype('float32'))
        r = ada_loss.get_element_wise_multiply(g, W)

        # expected sequence
        self.assertEqual(r[0], g.data[0, 0] * W.data[0, 0])
        self.assertEqual(r[1], g.data[0, 1] * W.data[1, 0])
        self.assertEqual(r[2], g.data[0, 0] * W.data[0, 1])
        self.assertEqual(r[3], g.data[0, 1] * W.data[1, 1])
        self.assertEqual(r[4], g.data[1, 0] * W.data[0, 0])
        self.assertEqual(r[5], g.data[1, 1] * W.data[1, 0])
        self.assertEqual(r[6], g.data[1, 0] * W.data[0, 1])
        self.assertEqual(r[7], g.data[1, 1] * W.data[1, 1])

        # manually insert zeros into g and W
        g.data[0, 0] = 0
        W.data[1, 1] = 0
        r = ada_loss.get_element_wise_multiply(g, W)
        self.assertEqual(len(r), 4)
        r = ada_loss.get_element_wise_multiply(g, W, filter_zero=False)
        self.assertEqual(len(r), 8)
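The assertions above imply that get_element_wise_multiply enumerates every scalar product g[i, k] * W[k, j] contributing to the matrix product g @ W, in row-major order of the output, and that filter_zero (the default) drops products that are exactly zero. A small NumPy sketch of that enumeration, inferred from the test rather than taken from the library:

import numpy as np

def element_wise_products(g, W, filter_zero=True):
    """Hypothetical sketch: list the scalar terms of the product g @ W."""
    n, kk = g.shape
    _, m = W.shape
    products = [g[i, k] * W[k, j]
                for i in range(n)      # output row
                for j in range(m)      # output column
                for k in range(kk)]    # contraction index
    if filter_zero:
        products = [p for p in products if p != 0]
    return products

g = np.arange(4, dtype=np.float32).reshape(2, 2)  # contains one zero entry
W = np.ones((2, 2), dtype=np.float32)
print(len(element_wise_products(g, W)))                     # 6 (two zero terms dropped)
print(len(element_wise_products(g, W, filter_zero=False)))  # 8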
Example #3
    def test_get_prev_scale(self):
        """ Check how prev_scale is implemented.
            Should extract correctly the loss_scale from the previous layer. """
        g = chainer.Variable(np.random.normal(size=1).astype('float32'))
        g.__dict__['loss_scale'] = 2.0

        ada_loss = AdaLossChainer()
        self.assertEqual(ada_loss.get_prev_scale(g), 2.0)
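Based on this test, get_prev_scale simply reads the loss_scale attribute that an upstream layer attached to the incoming gradient variable; a default of 1.0 when no attribute is present is a plausible assumption the test does not cover. A one-line sketch:

def prev_scale(grad_var):
    """Hypothetical: read the loss scale recorded on a gradient variable."""
    return getattr(grad_var, 'loss_scale', 1.0)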
Example #4
    def _test_scaled_grad(self, scale_val, dtype, prev_scale):
        g_data = np.random.normal(size=16).astype(dtype)
        g = chainer.Variable(g_data)
        scale = np.array(scale_val, dtype=dtype)

        ada_loss = AdaLossChainer(dtype=dtype)
        sg = ada_loss.get_scaled_gradient(g, scale, prev_scale=prev_scale)

        self.assertTrue(np.allclose(sg.array, g_data * scale_val))
        self.assertEqual(ada_loss.grad_loss_scale(sg), scale_val * prev_scale)
Example #5
    def test_get_scaled_gradient(self):
        """ Test the scaling """
        g = chainer.Variable(np.random.normal(size=1).astype('float32'))
        scale = 2.0

        # NOTE: float32 is necessary
        ada_loss = AdaLossChainer(dtype='float32')
        s_grad = ada_loss.get_scaled_gradient(g, scale)
        self.assertTrue(np.allclose(g.array * 2, s_grad.array))
        self.assertEqual(getattr(s_grad, 'loss_scale'), 2.0)
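Examples #4 and #5 together indicate that get_scaled_gradient multiplies the gradient by the new scale and records the accumulated loss scale, i.e. the new scale times the one inherited from the previous layer, on the resulting variable. A hedged sketch using a plain chainer.Variable; the helper is hypothetical and only mimics what the assertions check:

import numpy as np
import chainer

def scaled_gradient(g, scale, prev_scale=1.0):
    """Hypothetical sketch: scale a gradient and track the cumulative loss scale."""
    sg = chainer.Variable(g.array * scale)
    # the cumulative scale is what a later layer has to divide out again
    sg.__dict__['loss_scale'] = float(scale) * float(prev_scale)
    return sg

g = chainer.Variable(np.random.normal(size=16).astype('float32'))
sg = scaled_gradient(g, np.float32(2.0), prev_scale=4.0)
assert np.allclose(sg.array, g.array * 2.0)
assert sg.__dict__['loss_scale'] == 8.0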
Example #6
    def __init__(self,
                 eps=2e-5,
                 mean=None,
                 var=None,
                 decay=0.9,
                 axis=None,
                 ada_loss_cfg=None):
        super().__init__(eps=eps, mean=mean, var=var, decay=decay, axis=axis)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)
Example #7
    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=None,
                 stride=1,
                 pad=0,
                 nobias=False,
                 initialW=None,
                 initial_bias=None,
                 ada_loss_cfg=None,
                 **kwargs):
        super().__init__(in_channels,
                         out_channels,
                         ksize=ksize,
                         stride=stride,
                         pad=pad,
                         nobias=nobias,
                         initialW=initialW,
                         initial_bias=initial_bias,
                         **kwargs)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss_cfg = ada_loss_cfg
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)
Example #8
class AdaLossFixedBatchNormalization(
        batch_normalization.FixedBatchNormalization):
    """ Wrapping the fixed batch normalization function """
    def __init__(self, eps=2e-5, axis=None, ada_loss_cfg=None):
        super().__init__(eps=eps, axis=axis)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)

    def backward(self, indexes, grad_outputs):
        """ wrap around the original backward function """
        x, gamma, mean, var = self.get_retained_inputs()
        gy, = grad_outputs
        f = batch_normalization.FixedBatchNormalizationGrad(
            self.eps, self.expander, self.axis, self.inv_std, self.inv_var)
        gx, ggamma, gbeta, gmean, gvar = f.apply((x, gamma, mean, var, gy))

        prev_scale = self.ada_loss.get_prev_scale(gy)
        ggamma_ = self.ada_loss.get_unscaled_gradient(ggamma,
                                                      prev_scale,
                                                      dtype=ggamma.dtype)
        gbeta_ = self.ada_loss.get_unscaled_gradient(gbeta,
                                                     prev_scale,
                                                     dtype=gbeta.dtype)
        gmean_ = self.ada_loss.get_unscaled_gradient(gmean,
                                                     prev_scale,
                                                     dtype=gmean.dtype)
        gvar_ = self.ada_loss.get_unscaled_gradient(gvar,
                                                    prev_scale,
                                                    dtype=gvar.dtype)
        self.ada_loss.set_loss_scale(gx, prev_scale)  # pass along

        return gx, ggamma_, gbeta_, gmean_, gvar_
Example #9
class AdaLossBatchNormalization(batch_normalization.BatchNormalization):
    """ """
    def __init__(self,
                 eps=2e-5,
                 mean=None,
                 var=None,
                 decay=0.9,
                 axis=None,
                 ada_loss_cfg=None):
        super().__init__(eps=eps, mean=mean, var=var, decay=decay, axis=axis)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)

    def backward(self, indexes, grad_outputs):
        x, gamma = self.get_retained_inputs()
        gy, = grad_outputs

        if self.use_ideep:
            assert self.var is not None
            var = self.var
        else:
            var = None

        f = batch_normalization.BatchNormalizationGrad(
            self.eps, self.use_cudnn, self.mode, self.expander, self.axis,
            self.mean, var, self.inv_std, self.key_axis)
        gx, ggamma, gbeta = f.apply((x, gamma, gy))

        # update the loss scale
        prev_scale = self.ada_loss.get_prev_scale(gy)
        # NOTE: the numerical stability here?
        # NOTE: efficiency?
        ggamma_ = self.ada_loss.get_unscaled_gradient(ggamma,
                                                      prev_scale,
                                                      dtype=ggamma.dtype)
        gbeta_ = self.ada_loss.get_unscaled_gradient(gbeta,
                                                     prev_scale,
                                                     dtype=gbeta.dtype)
        self.ada_loss.set_loss_scale(gx, prev_scale)  # pass along

        return gx, ggamma_, gbeta_
Example #10
    def _test_get_loss_scale_by_approx_range(self, g_sigma, W_sigma, dtype):
        g_data = np.random.normal(scale=g_sigma, size=(32, 32)).astype(dtype)
        W_data = np.random.normal(scale=W_sigma, size=(32, 32)).astype(dtype)
        g = chainer.Variable(g_data)
        W = chainer.Variable(W_data)

        ada_loss = AdaLossChainer(dtype=dtype, debug_level=1)

        scale = ada_loss.get_loss_scale_by_approx_range(g, W)
        self.assertEqual(scale.dtype, ada_loss.full_dtype)

        nnz1 = np.count_nonzero(np.dot(g_data, W_data))
        nnz2 = np.count_nonzero(np.dot(scale * g_data, W_data))
        if scale > 1:  # scaling is effective
            self.assertTrue(nnz1 < nnz2)
            self.assertFalse(np.isinf(np.dot(scale * g_data, W_data)).any())
        elif scale == 1:  # scaling has no effect
            self.assertTrue(nnz1 == nnz2)
        else:  # prevent overflow
            self.assertTrue(np.isinf(np.dot(g_data, W_data)).any())
            self.assertFalse(np.isinf(np.dot(scale * g_data, W_data)).any())
Example #11
    def test_forward(self):
        dtype = np.float16
        x_data = np.random.normal(size=(2, 4)).astype(dtype)
        W_data = np.random.normal(size=(3, 4)).astype(dtype)
        b_data = np.random.normal(size=(3)).astype(dtype)

        x = chainer.Variable(x_data)
        W = chainer.Variable(W_data)
        b = chainer.Variable(b_data)
        y1 = F.linear(x, W, b=b)
        y2 = ada_loss_linear(x, W, b=b, ada_loss=AdaLossChainer())
        self.assertTrue(np.allclose(y1.array, y2.array))
Example #12
    def test_get_loss_scale_by_element_wise_range(self):
        """ Test how the loss scale works """
        u_max, u_min = 1e3, 1e-3
        ada_loss = AdaLossChainer(u_max=u_max, u_min=u_min)

        g = chainer.Variable(np.random.normal(size=(32, 32)).astype('float32'))
        W = chainer.Variable(np.random.normal(size=(32, 32)).astype('float32'))
        s = ada_loss.get_loss_scale_by_element_wise_range(g, W)

        # no overflow will happen
        self.assertTrue((np.dot(g.array, W.array) * s < u_max).all())

        # will overflow
        g = chainer.Variable(
            np.random.normal(scale=16, size=(32, 32)).astype('float32'))
        W = chainer.Variable(
            np.random.normal(scale=16, size=(32, 32)).astype('float32'))
        s = ada_loss.get_loss_scale_by_element_wise_range(g, W)

        self.assertLessEqual(s, 1.0)
        # no overflow will happen still
        self.assertTrue((np.dot(g.array, W.array) * s < u_max).all())
Example #13
    def test_get_mean_and_std_of_product(self):
        dtype = np.float16
        X_data = np.random.normal(size=16).astype(dtype)
        Y_data = np.random.normal(size=16).astype(dtype)
        X = chainer.Variable(X_data)
        Y = chainer.Variable(Y_data)

        ada_loss = AdaLossChainer(dtype=dtype)
        mu, sigma = ada_loss.get_mean_and_std_of_product(X, Y)

        X_mu, X_sigma = (X_data.astype(np.float32).mean(),
                         X_data.astype(np.float32).std())
        Y_mu, Y_sigma = (Y_data.astype(np.float32).mean(),
                         Y_data.astype(np.float32).std())

        self.assertEqual(mu.dtype, np.float32)
        self.assertEqual(sigma.dtype, np.float32)

        self.assertTrue(np.allclose(X_mu * Y_mu, mu))
        self.assertTrue(
            np.allclose(
                np.sqrt((X_sigma**2 + X_mu**2) * (Y_sigma**2 + Y_mu**2) -
                        (X_mu * Y_mu)**2).astype(np.float32), sigma))
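The closed form verified above is the standard identity for independent random variables, E[XY] = mu_X * mu_Y and Var(XY) = (sigma_X^2 + mu_X^2)(sigma_Y^2 + mu_Y^2) - (mu_X * mu_Y)^2, evaluated in float32 to avoid float16 accumulation error. A short NumPy check of the identity itself, assuming independent X and Y:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(loc=0.5, scale=2.0, size=1_000_000)
Y = rng.normal(loc=-1.0, scale=0.5, size=1_000_000)

mu = X.mean() * Y.mean()
sigma = np.sqrt((X.std()**2 + X.mean()**2) * (Y.std()**2 + Y.mean()**2)
                - (X.mean() * Y.mean())**2)

# empirical moments of the element-wise product agree with the closed form
print(mu, (X * Y).mean())    # both close to -0.5
print(sigma, (X * Y).std())  # both close to 2.25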
Example #14
    def test_backward(self):
        """ """
        x = chainer.Variable(
            np.random.normal(size=(1, 3, 4, 4)).astype('float16'))
        W = chainer.Variable(
            np.random.normal(size=(4, 3, 3, 3)).astype('float16'))
        y = loss_scaling(
            ada_loss_convolution_2d(
                x, W, ada_loss=AdaLossChainer(loss_scale_method='fixed')), 2.)
        y.grad = np.ones_like(y.array)
        y.backward()

        self.assertTrue(hasattr(x.grad_var, 'loss_scale'))
        self.assertTrue(hasattr(W.grad_var, 'loss_scale'))
        # scaled down
        self.assertEqual(getattr(W.grad_var, 'loss_scale'), 1.0)
Example #15
    def test_unscaled_gradient(self):
        """ Check whether the unscaled performs correctly. """
        g_data = np.random.normal(size=16).astype('float32')
        g = chainer.Variable(g_data)

        ada_loss = AdaLossChainer(dtype='float32', debug_level=1)
        ug = ada_loss.get_unscaled_gradient(g, 2.0)
        self.assertTrue(np.allclose(ug.array * 2.0, g_data))

        # float16
        g_data = np.random.normal(size=16).astype('float16') * 100
        g = chainer.Variable(g_data)
        ada_loss = AdaLossChainer(dtype='float16', debug_level=1)
        ug = ada_loss.get_unscaled_gradient(g, np.array(2.0, dtype='float32'))
        self.assertTrue(np.allclose(ug.array * 2.0, g_data))
        # cause overflow
        loss_scale = 1e-6
        with self.assertRaises(ValueError):
            ada_loss.get_unscaled_gradient(
                g, np.array(loss_scale, dtype='float32'))
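The test above suggests that get_unscaled_gradient divides the gradient by the given loss scale and, at a raised debug_level, rejects results that are no longer finite in the working dtype (dividing the float16 gradient by 1e-6 overflows, hence the expected ValueError). A minimal sketch of such a check; the helper is an assumption, not the library's code:

import numpy as np

def unscaled_gradient(g_array, loss_scale, check_overflow=True):
    """Hypothetical sketch: undo a loss scale and detect overflow."""
    ug = g_array / np.asarray(loss_scale, dtype=np.float32)
    ug = ug.astype(g_array.dtype)  # back to the working dtype (e.g. float16)
    if check_overflow and not np.isfinite(ug).all():
        raise ValueError('unscaling overflowed the working dtype')
    return ug

g = (np.random.normal(size=16) * 100).astype('float16')
unscaled_gradient(g, 2.0)       # fine
try:
    unscaled_gradient(g, 1e-6)  # |g| / 1e-6 exceeds the float16 max (~65504)
except ValueError as e:
    print(e)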
Example #16
    def forward(self, x):
        """Compute localization and classification from a batch of images.

        This method computes two variables, :obj:`mb_locs` and :obj:`mb_confs`.
        :func:`self.coder.decode` converts these variables to bounding box
        coordinates and confidence scores.
        These variables are also used in training SSD.

        Args:
            x (chainer.Variable): A variable holding a batch of images.
                The images are preprocessed by :meth:`_prepare`.

        Returns:
            tuple of chainer.Variable:
            This method returns two variables, :obj:`mb_locs` and
            :obj:`mb_confs`.

            * **mb_locs**: A variable of float arrays of shape \
                :math:`(B, K, 4)`, \
                where :math:`B` is the number of samples in the batch and \
                :math:`K` is the number of default bounding boxes.
            * **mb_confs**: A variable of float arrays of shape \
                :math:`(B, K, n\_fg\_class + 1)`.
        """

        ys = self.extractor(x)
        ys = list(ys)
        # for i in range(len(ys)):
        #     ys[i] = F.cast(ys[i], 'float32')
        # TODO: refactor this. Instead of hardcoding, use AdaLossScaled

        y = ys[0]  # fall back to the raw feature map when no cast is needed
        if self.dtype != np.float32:
            if not isinstance(self.extractor.conv1_1, AdaLossConvolution2D):
                y = F.cast(ys[0], 'float32')
            else:
                if self.type_cast_ada_loss is None:
                    self.type_cast_ada_loss = AdaLossChainer(
                        **self.extractor.conv1_1.ada_loss_cfg)
                y = ada_loss_cast(ys[0], 'float32', self.type_cast_ada_loss)

        ys[0] = self.norm(y)
        ys = tuple(ys)
        return self.multibox(ys)
Example #17
    def __init__(self,
                 in_size,
                 out_size=None,
                 nobias=False,
                 initialW=None,
                 initial_bias=None,
                 ada_loss_cfg=None,
                 **kwargs):
        """ """
        super().__init__(in_size,
                         out_size=out_size,
                         nobias=nobias,
                         initialW=initialW,
                         initial_bias=initial_bias)

        # TODO: refactor
        # To be passed to the ada loss function:
        if ada_loss_cfg is None:
            ada_loss_cfg = kwargs

        self.ada_loss = AdaLossChainer(**ada_loss_cfg)
Example #18
    def test_backward(self):
        dtype = np.float16
        x_data = np.random.normal(size=(2, 4)).astype(dtype)
        W_data = np.random.normal(size=(3, 4)).astype(dtype)
        b_data = np.random.normal(size=(3)).astype(dtype)
        g_data = np.random.normal(size=(2, 3)).astype(dtype)

        x = chainer.Variable(x_data)
        W = chainer.Variable(W_data)
        b = chainer.Variable(b_data)

        # no loss scaling
        y1 = F.linear(x, W, b=b)
        y1.grad = g_data
        y1.backward()

        W_grad1 = W.grad
        x_grad1 = x.grad
        b_grad1 = b.grad

        x = chainer.Variable(x_data)
        W = chainer.Variable(W_data)
        b = chainer.Variable(b_data)
        # with loss scaling
        y2 = loss_scaling(
            ada_loss_linear(x,
                            W,
                            b=b,
                            ada_loss=AdaLossChainer(loss_scale_method='fixed',
                                                    fixed_loss_scale=2.0)),
            2.0)
        y2.grad = g_data
        y2.backward()

        self.assertTrue(np.allclose(x.grad, x_grad1 * 4))
        self.assertTrue(np.allclose(W.grad, W_grad1))
        self.assertTrue(np.allclose(b.grad, b_grad1))
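The factors in these assertions come from how the scales compose: loss_scaling(..., 2.0) multiplies the incoming output gradient by 2, and the fixed loss scale of 2.0 inside ada_loss_linear appears to scale the gradient propagated to x by another 2, so x.grad carries a total factor of 4, while the parameter gradients are unscaled back before being stored. A plain NumPy sketch of that accounting, using the fact that F.linear computes y = x @ W.T + b:

import numpy as np

x = np.random.normal(size=(2, 4)).astype(np.float32)
W = np.random.normal(size=(3, 4)).astype(np.float32)
g = np.random.normal(size=(2, 3)).astype(np.float32)  # output gradient

dx = g @ W          # unscaled gradient w.r.t. x
dW = g.T @ x        # unscaled gradient w.r.t. W
db = g.sum(axis=0)  # unscaled gradient w.r.t. b

total_scale = 2.0 * 2.0                # loss_scaling factor * fixed loss scale
dx_scaled = (total_scale * g) @ W      # what reaches x.grad in the test
assert np.allclose(dx_scaled, dx * 4)  # hence x.grad == x_grad1 * 4
# dW and db are divided by the accumulated scale again, which matches
# W.grad == W_grad1 and b.grad == b_grad1 in the assertions above.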
Example #19
    def test_get_mean_and_std(self):
        """ """
        dtype = np.float16
        x_data = np.random.normal(size=16).astype(dtype)
        x = chainer.Variable(x_data)
        ada_loss = AdaLossChainer(dtype=dtype)
        mu, sigma = ada_loss.get_mean_and_std(x)

        self.assertTrue(np.allclose(mu, x_data.astype(np.float32).mean()))
        self.assertTrue(np.allclose(sigma, x_data.astype(np.float32).std()))
        self.assertEqual(mu.dtype, np.float32)
        self.assertEqual(sigma.dtype, np.float32)

        # test numerical issue
        ada_loss = AdaLossChainer(dtype=dtype, debug_level=1)
        with self.assertRaises(AssertionError):
            x_data[0] = np.nan
            mu, sigma = ada_loss.get_mean_and_std(x)
Example #20
    def test_power_of_two_in_get_loss_scale(self):
        """ Check the switch of power_of_two """
        # turn ON
        dtype = np.float16
        ada_loss = AdaLossChainer(dtype=dtype,
                                  loss_scale_method='element_wise_range',
                                  use_bound=False)
        g = chainer.Variable(np.array([[1e-5]], dtype=dtype))
        W = chainer.Variable(np.array([[1e-4]], dtype=dtype))
        s = ada_loss.get_loss_scale(g, W)
        self.assertEqual(s, 32)

        # turn OFF
        ada_loss = AdaLossChainer(dtype=dtype,
                                  loss_scale_method='element_wise_range',
                                  power_of_two=False,
                                  use_bound=False)
        g = chainer.Variable(np.array([[1e-5]], dtype=dtype))
        W = chainer.Variable(np.array([[1e-4]], dtype=dtype))
        s = ada_loss.get_loss_scale(g, W)
        self.assertFalse(s == 32)
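The power_of_two switch exercised here presumably rounds the computed loss scale to a power of two (32 in this case), which keeps scaling exact in floating point because only the exponent changes. A one-liner sketch of such rounding, written as rounding down; whether the library rounds down or to the nearest power is an assumption:

import numpy as np

def round_to_power_of_two(scale):
    """Hypothetical: round a loss scale down to the nearest power of two."""
    return float(2.0 ** np.floor(np.log2(scale)))

print(round_to_power_of_two(47.3))  # 32.0
print(round_to_power_of_two(32.0))  # 32.0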
Example #21
    def forward(self, xs):
        """Compute loc and conf from feature maps

        This method computes :obj:`mb_locs` and :obj:`mb_confs`
        from given feature maps.

        Args:
            xs (iterable of chainer.Variable): An iterable of feature maps.
                The number of feature maps must be same as the number of
                :obj:`aspect_ratios`.

        Returns:
            tuple of chainer.Variable:
            This method returns two :obj:`chainer.Variable`: :obj:`mb_locs` and
            :obj:`mb_confs`.

            * **mb_locs**: A variable of float arrays of shape \
                :math:`(B, K, 4)`, \
                where :math:`B` is the number of samples in the batch and \
                :math:`K` is the number of default bounding boxes.
            * **mb_confs**: A variable of float arrays of shape \
                :math:`(B, K, n\_fg\_class + 1)`.

        """

        mb_locs = []
        mb_confs = []

        dtype = chainer.global_config.dtype

        for i, x in enumerate(xs):
            # TODO: can we avoid referring to AdaLossBranch here? Maybe turn it
            # into a general forward function?
            x1, x2 = AdaLossBranch().apply((x, ))
            loc = getattr(self, 'loc_{}'.format(i))
            mb_loc = loc(x1)
            mb_loc = self.post_loc(mb_loc)

            conf = getattr(self, 'conf_{}'.format(i))
            mb_conf = conf(x2)
            mb_conf = self.post_conf(mb_conf)

            if dtype != np.float32:
                if not isinstance(loc, AdaLossConvolution2D):
                    mb_loc = F.cast(mb_loc, 'float32')
                    mb_conf = F.cast(mb_conf, 'float32')
                else:
                    if self.tc_locs[i] is None:
                        self.tc_locs[i] = AdaLossChainer(**loc.ada_loss_cfg)
                    if self.tc_confs[i] is None:
                        self.tc_confs[i] = AdaLossChainer(**loc.ada_loss_cfg)
                    mb_loc = ada_loss_cast(mb_loc, 'float32', self.tc_locs[i])
                    mb_conf = ada_loss_cast(mb_conf,
                                            'float32',
                                            self.tc_confs[i],
                                            lognormal=True)

            mb_locs.append(mb_loc)
            mb_confs.append(mb_conf)

        mb_locs = self.concat_locs(mb_locs)
        mb_confs = self.concat_confs(mb_confs)

        return mb_locs, mb_confs
Example #22
    def forward(self, x):
        """Compute an image-wise score from a batch of images

        Args:
            x (chainer.Variable): A variable with 4D image array.

        Returns:
            chainer.Variable:
            An image-wise score. Its channel size is :obj:`self.n_class`.

        """
        # h = F.local_response_normalization(x, 5, 1, 1e-4 / 5., 0.75)
        # h, indices1 = F.max_pooling_2d(
        #     F.relu(self.conv1_bn(self.conv1(h))), 2, 2, return_indices=True)
        # h, indices2 = F.max_pooling_2d(
        #     F.relu(self.conv2_bn(self.conv2(h))), 2, 2, return_indices=True)
        # h, indices3 = F.max_pooling_2d(
        #     F.relu(self.conv3_bn(self.conv3(h))), 2, 2, return_indices=True)
        # h, indices4 = F.max_pooling_2d(
        #     F.relu(self.conv4_bn(self.conv4(h))), 2, 2, return_indices=True)
        # h = self._upsampling_2d(h, indices4)
        # h = self.conv_decode4_bn(self.conv_decode4(h))
        # h = self._upsampling_2d(h, indices3)
        # h = self.conv_decode3_bn(self.conv_decode3(h))
        # h = self._upsampling_2d(h, indices2)
        # h = self.conv_decode2_bn(self.conv_decode2(h))
        # h = self._upsampling_2d(h, indices1)
        # h = self.conv_decode1_bn(self.conv_decode1(h))

        h = self.lrn(x)

        h, indices1 = self.conv1_pool(
            self.conv1_relu(self.conv1_bn(self.conv1(h))))
        h, indices2 = self.conv2_pool(
            self.conv2_relu(self.conv2_bn(self.conv2(h))))
        h, indices3 = self.conv3_pool(
            self.conv3_relu(self.conv3_bn(self.conv3(h))))
        h, indices4 = self.conv4_pool(
            self.conv4_relu(self.conv4_bn(self.conv4(h))))

        h = self.upsampling4(h, indices4)
        h = self.conv_decode4_bn(self.conv_decode4(h))
        h = self.upsampling3(h, indices3)
        h = self.conv_decode3_bn(self.conv_decode3(h))
        h = self.upsampling2(h, indices2)
        h = self.conv_decode2_bn(self.conv_decode2(h))
        h = self.upsampling1(h, indices1)
        h = self.conv_decode1_bn(self.conv_decode1(h))

        h = self.conv_classifier(h)

        # TODO: refactor this. Instead of hardcoding, use AdaLossScaled
        if self.dtype != np.float32:
            if not isinstance(self.conv1_bn, AdaLossBatchNormalization):
                h = F.cast(h, 'float32')
            else:
                if self.type_cast_ada_loss is None:
                    self.type_cast_ada_loss = AdaLossChainer(
                        **self.conv1_bn.ada_loss_cfg)
                h = ada_loss_cast(h,
                                  'float32',
                                  self.type_cast_ada_loss,
                                  lognormal=True)

        return h
Example #23
    def __init__(self, eps=2e-5, axis=None, ada_loss_cfg=None):
        super().__init__(eps=eps, axis=axis)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)