def test_rescaling(self):
    """ Test the rescaling method. """
    # case 1: neither scale will cause overflow
    gs = [
        chainer.Variable(np.random.normal(size=16)),
        chainer.Variable(np.random.normal(size=16)),
    ]
    gs[0].__dict__['loss_scale'] = 1
    gs[1].__dict__['loss_scale'] = 2

    ada_loss = AdaLossChainer()
    gs = ada_loss.rescaling(gs)
    # rescaled to the larger one
    self.assertEqual(gs[0].__dict__['loss_scale'], 2)

    # case 2: the larger scale would overflow
    gs = [
        chainer.Variable(np.random.normal(size=16)),
        chainer.Variable(np.random.normal(size=16)),
    ]
    gs[0].__dict__['loss_scale'] = 65536
    gs[1].__dict__['loss_scale'] = 2

    gs = ada_loss.rescaling(gs)
    # rescaled down to the smaller one to avoid overflow
    self.assertEqual(gs[0].__dict__['loss_scale'], 2)
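
# For reference, a minimal sketch of the rescaling rule this test asserts
# (not the library's implementation; `fp16_max_scale` and the function name
# are assumptions for illustration): gradients meeting at a multi-input node
# are aligned to one common loss scale, preferring the larger scale unless
# it would overflow float16, in which case the smaller one wins.
import numpy as np


def rescale_to_common(gs, fp16_max_scale=32768.0):
    """Align a list of gradient Variables to one common loss scale."""
    scales = [g.__dict__['loss_scale'] for g in gs]
    target = max(scales)
    if target >= fp16_max_scale:
        # the larger scale risks overflow: fall back to the smaller one
        target = min(scales)
    for g, s in zip(gs, scales):
        g.array *= target / s  # bring the gradient data onto the target scale
        g.__dict__['loss_scale'] = target
    return gs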
def test_element_wise_multiply(self):
    """ Element-wise multiplication. """
    ada_loss = AdaLossChainer()
    g = chainer.Variable(np.random.normal(size=(2, 2)).astype('float32'))
    W = chainer.Variable(np.random.normal(size=(2, 2)).astype('float32'))
    r = ada_loss.get_element_wise_multiply(g, W)

    # expected sequence: the partial products g[i, k] * W[k, j] of the
    # matrix product g W, enumerated output entry by output entry
    self.assertEqual(r[0], g.data[0, 0] * W.data[0, 0])
    self.assertEqual(r[1], g.data[0, 1] * W.data[1, 0])
    self.assertEqual(r[2], g.data[0, 0] * W.data[0, 1])
    self.assertEqual(r[3], g.data[0, 1] * W.data[1, 1])
    self.assertEqual(r[4], g.data[1, 0] * W.data[0, 0])
    self.assertEqual(r[5], g.data[1, 1] * W.data[1, 0])
    self.assertEqual(r[6], g.data[1, 0] * W.data[0, 1])
    self.assertEqual(r[7], g.data[1, 1] * W.data[1, 1])

    # manually insert zeros into g and W: the four products touching a
    # zero entry are filtered out by default
    g.data[0, 0] = 0
    W.data[1, 1] = 0
    r = ada_loss.get_element_wise_multiply(g, W)
    self.assertEqual(len(r), 4)
    r = ada_loss.get_element_wise_multiply(g, W, filter_zero=False)
    self.assertEqual(len(r), 8)
def test_get_prev_scale(self):
    """ Check how prev_scale is implemented. It should correctly extract
    the loss_scale attached by the previous layer. """
    g = chainer.Variable(np.random.normal(size=1).astype('float32'))
    g.__dict__['loss_scale'] = 2.0

    ada_loss = AdaLossChainer()
    self.assertEqual(ada_loss.get_prev_scale(g), 2.0)
def _test_scaled_grad(self, scale_val, dtype, prev_scale):
    g_data = np.random.normal(size=16).astype(dtype)
    g = chainer.Variable(g_data)
    scale = np.array(scale_val, dtype=dtype)

    ada_loss = AdaLossChainer(dtype=dtype)
    sg = ada_loss.get_scaled_gradient(g, scale, prev_scale=prev_scale)
    self.assertTrue(np.allclose(sg.array, g_data * scale_val))
    # the accumulated loss scale is the product of the new scale and the
    # scale inherited from the previous layer
    self.assertEqual(ada_loss.grad_loss_scale(sg), scale_val * prev_scale)
def test_get_scaled_gradient(self):
    """ Test the scaling. """
    g = chainer.Variable(np.random.normal(size=1).astype('float32'))
    scale = 2.0

    # NOTE: float32 is necessary
    ada_loss = AdaLossChainer(dtype='float32')
    s_grad = ada_loss.get_scaled_gradient(g, scale)
    self.assertTrue(np.allclose(g.array * 2, s_grad.array))
    self.assertEqual(getattr(s_grad, 'loss_scale'), 2.0)
def __init__(self,
             eps=2e-5,
             mean=None,
             var=None,
             decay=0.9,
             axis=None,
             ada_loss_cfg=None):
    super().__init__(eps=eps, mean=mean, var=var, decay=decay, axis=axis)

    if ada_loss_cfg is None:
        ada_loss_cfg = {}
    self.ada_loss = AdaLossChainer(**ada_loss_cfg)
def __init__(self,
             in_channels,
             out_channels,
             ksize=None,
             stride=1,
             pad=0,
             nobias=False,
             initialW=None,
             initial_bias=None,
             ada_loss_cfg=None,
             **kwargs):
    super().__init__(in_channels,
                     out_channels,
                     ksize=ksize,
                     stride=stride,
                     pad=pad,
                     nobias=nobias,
                     initialW=initialW,
                     initial_bias=initial_bias,
                     **kwargs)

    if ada_loss_cfg is None:
        ada_loss_cfg = {}
    self.ada_loss_cfg = ada_loss_cfg
    self.ada_loss = AdaLossChainer(**ada_loss_cfg)
class AdaLossFixedBatchNormalization(
        batch_normalization.FixedBatchNormalization):
    """ Wrap the fixed batch normalization function with adaptive loss
    scaling. """

    def __init__(self, eps=2e-5, axis=None, ada_loss_cfg=None):
        super().__init__(eps=eps, axis=axis)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)

    def backward(self, indexes, grad_outputs):
        """ Wrap around the original backward function. """
        x, gamma, mean, var = self.get_retained_inputs()
        gy, = grad_outputs

        f = batch_normalization.FixedBatchNormalizationGrad(
            self.eps, self.expander, self.axis, self.inv_std, self.inv_var)
        gx, ggamma, gbeta, gmean, gvar = f.apply((x, gamma, mean, var, gy))

        # parameter and statistics gradients are unscaled back to their
        # true magnitude before reaching the optimizer ...
        prev_scale = self.ada_loss.get_prev_scale(gy)
        ggamma_ = self.ada_loss.get_unscaled_gradient(ggamma,
                                                      prev_scale,
                                                      dtype=ggamma.dtype)
        gbeta_ = self.ada_loss.get_unscaled_gradient(gbeta,
                                                     prev_scale,
                                                     dtype=gbeta.dtype)
        gmean_ = self.ada_loss.get_unscaled_gradient(gmean,
                                                     prev_scale,
                                                     dtype=gmean.dtype)
        gvar_ = self.ada_loss.get_unscaled_gradient(gvar,
                                                    prev_scale,
                                                    dtype=gvar.dtype)
        # ... while the input gradient keeps its scale and passes it along
        self.ada_loss.set_loss_scale(gx, prev_scale)

        return gx, ggamma_, gbeta_, gmean_, gvar_
class AdaLossBatchNormalization(batch_normalization.BatchNormalization):
    """ Wrap the batch normalization function with adaptive loss scaling. """

    def __init__(self,
                 eps=2e-5,
                 mean=None,
                 var=None,
                 decay=0.9,
                 axis=None,
                 ada_loss_cfg=None):
        super().__init__(eps=eps, mean=mean, var=var, decay=decay, axis=axis)

        if ada_loss_cfg is None:
            ada_loss_cfg = {}
        self.ada_loss = AdaLossChainer(**ada_loss_cfg)

    def backward(self, indexes, grad_outputs):
        x, gamma = self.get_retained_inputs()
        gy, = grad_outputs

        if self.use_ideep:
            assert self.var is not None
            var = self.var
        else:
            var = None

        f = batch_normalization.BatchNormalizationGrad(
            self.eps, self.use_cudnn, self.mode, self.expander, self.axis,
            self.mean, var, self.inv_std, self.key_axis)
        gx, ggamma, gbeta = f.apply((x, gamma, gy))

        # update the loss scale
        prev_scale = self.ada_loss.get_prev_scale(gy)

        # NOTE: the numerical stability here?
        # NOTE: efficiency?
        ggamma_ = self.ada_loss.get_unscaled_gradient(ggamma,
                                                      prev_scale,
                                                      dtype=ggamma.dtype)
        gbeta_ = self.ada_loss.get_unscaled_gradient(gbeta,
                                                     prev_scale,
                                                     dtype=gbeta.dtype)
        self.ada_loss.set_loss_scale(gx, prev_scale)  # pass along

        return gx, ggamma_, gbeta_
def _test_get_loss_scale_by_approx_range(self, g_sigma, W_sigma, dtype):
    g_data = np.random.normal(scale=g_sigma, size=(32, 32)).astype(dtype)
    W_data = np.random.normal(scale=W_sigma, size=(32, 32)).astype(dtype)
    g = chainer.Variable(g_data)
    W = chainer.Variable(W_data)

    ada_loss = AdaLossChainer(dtype=dtype, debug_level=1)
    scale = ada_loss.get_loss_scale_by_approx_range(g, W)
    self.assertEqual(scale.dtype, ada_loss.full_dtype)

    nnz1 = np.count_nonzero(np.dot(g_data, W_data))
    nnz2 = np.count_nonzero(np.dot(scale * g_data, W_data))
    if scale > 1:  # scaling is effective: fewer values underflow to zero
        self.assertTrue(nnz1 < nnz2)
        self.assertFalse(np.isinf(np.dot(scale * g_data, W_data)).any())
    elif scale == 1:  # scaling has no effect
        self.assertTrue(nnz1 == nnz2)
    else:  # scale < 1 prevents overflow
        self.assertTrue(np.isinf(np.dot(g_data, W_data)).any())
        self.assertFalse(np.isinf(np.dot(scale * g_data, W_data)).any())
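
# The test above pins down only the observable behaviour of the approx-range
# method. As a rough illustration (a sketch under assumptions, not the
# library's actual rule), a scale can be derived from a normal approximation
# of the output of g @ W: estimate the mean and standard deviation of one
# output element, then choose a power-of-two scale that keeps a few standard
# deviations inside the float16 range. All names below are illustrative.
import numpy as np

FP16_MAX = 65504.0  # largest finite float16


def approx_range_scale(g, W, n_std=3.0):
    """Sketch: pick a loss scale from a normal approximation of g @ W."""
    k = g.shape[1]  # reduction length: each output element sums k products
    # moments of a single product entry, assuming independence
    var_prod = ((g.var() + g.mean()**2) * (W.var() + W.mean()**2) -
                (g.mean() * W.mean())**2)
    mu = k * g.mean() * W.mean()
    sigma = np.sqrt(k * var_prod)
    bound = abs(mu) + n_std * sigma   # approximate max |output| magnitude
    scale = FP16_MAX / (2.0 * bound)  # keep a 2x safety margin
    return float(2.0 ** np.floor(np.log2(scale)))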
def test_forward(self):
    dtype = np.float16

    x_data = np.random.normal(size=(2, 4)).astype(dtype)
    W_data = np.random.normal(size=(3, 4)).astype(dtype)
    b_data = np.random.normal(size=(3, )).astype(dtype)

    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    b = chainer.Variable(b_data)

    y1 = F.linear(x, W, b=b)
    y2 = ada_loss_linear(x, W, b=b, ada_loss=AdaLossChainer())
    self.assertTrue(np.allclose(y1.array, y2.array))
def test_get_loss_scale_by_element_wise_range(self):
    """ Test how the loss scale works. """
    u_max, u_min = 1e3, 1e-3
    ada_loss = AdaLossChainer(u_max=u_max, u_min=u_min)

    g = chainer.Variable(np.random.normal(size=(32, 32)).astype('float32'))
    W = chainer.Variable(np.random.normal(size=(32, 32)).astype('float32'))
    s = ada_loss.get_loss_scale_by_element_wise_range(g, W)
    # no overflow will happen
    self.assertTrue((np.dot(g.array, W.array) * s < u_max).all())

    # would overflow without scaling down
    g = chainer.Variable(
        np.random.normal(scale=16, size=(32, 32)).astype('float32'))
    W = chainer.Variable(
        np.random.normal(scale=16, size=(32, 32)).astype('float32'))
    s = ada_loss.get_loss_scale_by_element_wise_range(g, W)
    self.assertLessEqual(s, 1.0)
    # still no overflow after scaling
    self.assertTrue((np.dot(g.array, W.array) * s < u_max).all())
def test_get_mean_and_std_of_product(self):
    dtype = np.float16

    X_data = np.random.normal(size=16).astype(dtype)
    Y_data = np.random.normal(size=16).astype(dtype)
    X = chainer.Variable(X_data)
    Y = chainer.Variable(Y_data)

    ada_loss = AdaLossChainer(dtype=dtype)
    mu, sigma = ada_loss.get_mean_and_std_of_product(X, Y)

    X_mu, X_sigma = (X_data.astype(np.float32).mean(),
                     X_data.astype(np.float32).std())
    Y_mu, Y_sigma = (Y_data.astype(np.float32).mean(),
                     Y_data.astype(np.float32).std())

    self.assertEqual(mu.dtype, np.float32)
    self.assertEqual(sigma.dtype, np.float32)
    self.assertTrue(np.allclose(X_mu * Y_mu, mu))
    self.assertTrue(
        np.allclose(
            np.sqrt((X_sigma**2 + X_mu**2) * (Y_sigma**2 + Y_mu**2) -
                    (X_mu * Y_mu)**2).astype(np.float32), sigma))
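
# The expected values above follow from the standard moments of a product of
# independent random variables X and Y (a worked derivation, for reference):
#
#   E[XY]     = E[X] E[Y] = mu_X * mu_Y
#   E[(XY)^2] = E[X^2] E[Y^2] = (sigma_X^2 + mu_X^2) * (sigma_Y^2 + mu_Y^2)
#
#   Var(XY)   = E[(XY)^2] - E[XY]^2
#             = (sigma_X^2 + mu_X^2) * (sigma_Y^2 + mu_Y^2) - (mu_X * mu_Y)^2
#
# which is exactly the expression checked by the final assertion.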
def test_backward(self):
    """ Test the backward pass of the loss-scaled 2D convolution. """
    x = chainer.Variable(
        np.random.normal(size=(1, 3, 4, 4)).astype('float16'))
    W = chainer.Variable(
        np.random.normal(size=(4, 3, 3, 3)).astype('float16'))
    y = loss_scaling(
        ada_loss_convolution_2d(
            x, W, ada_loss=AdaLossChainer(loss_scale_method='fixed')), 2.)
    y.grad = np.ones_like(y.array)
    y.backward()

    self.assertTrue(hasattr(x.grad_var, 'loss_scale'))
    self.assertTrue(hasattr(W.grad_var, 'loss_scale'))
    # the weight gradient has been scaled back down
    self.assertEqual(getattr(W.grad_var, 'loss_scale'), 1.0)
def test_unscaled_gradient(self):
    """ Check whether unscaling performs correctly. """
    g_data = np.random.normal(size=16).astype('float32')
    g = chainer.Variable(g_data)

    ada_loss = AdaLossChainer(dtype='float32', debug_level=1)
    ug = ada_loss.get_unscaled_gradient(g, 2.0)
    self.assertTrue(np.allclose(ug.array * 2.0, g_data))

    # float16
    g_data = np.random.normal(size=16).astype('float16') * 100
    g = chainer.Variable(g_data)
    ada_loss = AdaLossChainer(dtype='float16', debug_level=1)
    ug = ada_loss.get_unscaled_gradient(g, np.array(2.0, dtype='float32'))
    self.assertTrue(np.allclose(ug.array * 2.0, g_data))

    # unscaling by a tiny loss scale causes overflow
    loss_scale = 1e-6
    with self.assertRaises(ValueError):
        ada_loss.get_unscaled_gradient(g,
                                       np.array(loss_scale, dtype='float32'))
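
# The overflow case is plain arithmetic: unscaling divides by the loss scale,
# and float16 tops out at about 65504, so a gradient of magnitude ~100
# divided by 1e-6 (i.e. multiplied by 1e6) cannot be represented. A quick
# illustrative check:
import numpy as np

# 100 / 1e-6 = 1e8 lands on inf in float16, which the debug check
# (debug_level=1) surfaces as a ValueError.
assert np.isinf(np.float16(100) / np.float16(1e-6))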
def forward(self, x):
    """Compute localization and classification from a batch of images.

    This method computes two variables, :obj:`mb_locs` and :obj:`mb_confs`.
    :func:`self.coder.decode` converts these variables to bounding box
    coordinates and confidence scores. These variables are also used in
    training SSD.

    Args:
        x (chainer.Variable): A variable holding a batch of images.
            The images are preprocessed by :meth:`_prepare`.

    Returns:
        tuple of chainer.Variable:
        This method returns two variables, :obj:`mb_locs` and
        :obj:`mb_confs`.

        * **mb_locs**: A variable of float arrays of shape \
            :math:`(B, K, 4)`, \
            where :math:`B` is the number of samples in the batch and \
            :math:`K` is the number of default bounding boxes.
        * **mb_confs**: A variable of float arrays of shape \
            :math:`(B, K, n\_fg\_class + 1)`.
    """
    ys = self.extractor(x)
    ys = list(ys)

    # TODO: refactor this. Instead of hardcoding, use AdaLossScaled
    if self.dtype != np.float32:
        if not isinstance(self.extractor.conv1_1, AdaLossConvolution2D):
            y = F.cast(ys[0], 'float32')
        else:
            if self.type_cast_ada_loss is None:
                self.type_cast_ada_loss = AdaLossChainer(
                    **self.extractor.conv1_1.ada_loss_cfg)
            y = ada_loss_cast(ys[0], 'float32', self.type_cast_ada_loss)
        ys[0] = self.norm(y)

    ys = tuple(ys)
    return self.multibox(ys)
def __init__(self,
             in_size,
             out_size=None,
             nobias=False,
             initialW=None,
             initial_bias=None,
             ada_loss_cfg=None,
             **kwargs):
    super().__init__(in_size,
                     out_size=out_size,
                     nobias=nobias,
                     initialW=initialW,
                     initial_bias=initial_bias)

    # TODO: refactor
    # To be passed to the ada loss function:
    if ada_loss_cfg is None:
        ada_loss_cfg = kwargs
    self.ada_loss = AdaLossChainer(**ada_loss_cfg)
def test_backward(self):
    dtype = np.float16

    x_data = np.random.normal(size=(2, 4)).astype(dtype)
    W_data = np.random.normal(size=(3, 4)).astype(dtype)
    b_data = np.random.normal(size=(3, )).astype(dtype)
    g_data = np.random.normal(size=(2, 3)).astype(dtype)

    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    b = chainer.Variable(b_data)

    # no loss scaling
    y1 = F.linear(x, W, b=b)
    y1.grad = g_data
    y1.backward()
    W_grad1 = W.grad
    x_grad1 = x.grad
    b_grad1 = b.grad

    x = chainer.Variable(x_data)
    W = chainer.Variable(W_data)
    b = chainer.Variable(b_data)

    # with loss scaling
    y2 = loss_scaling(
        ada_loss_linear(x,
                        W,
                        b=b,
                        ada_loss=AdaLossChainer(loss_scale_method='fixed',
                                                fixed_loss_scale=2.0)), 2.0)
    y2.grad = g_data
    y2.backward()

    # the input gradient accumulates both the external scale (2.0) and the
    # layer's fixed scale (2.0), hence the factor of 4; the parameter
    # gradients are unscaled back before being returned
    self.assertTrue(np.allclose(x.grad, x_grad1 * 4))
    self.assertTrue(np.allclose(W.grad, W_grad1))
    self.assertTrue(np.allclose(b.grad, b_grad1))
def test_get_mean_and_std(self):
    """ Test the extraction of mean and std in full precision. """
    dtype = np.float16

    x_data = np.random.normal(size=16).astype(dtype)
    x = chainer.Variable(x_data)

    ada_loss = AdaLossChainer(dtype=dtype)
    mu, sigma = ada_loss.get_mean_and_std(x)
    self.assertTrue(np.allclose(mu, x_data.astype(np.float32).mean()))
    self.assertTrue(np.allclose(sigma, x_data.astype(np.float32).std()))
    # statistics are computed in float32 regardless of the input dtype
    self.assertEqual(mu.dtype, np.float32)
    self.assertEqual(sigma.dtype, np.float32)

    # test numerical issue: NaN input should trip the debug check
    ada_loss = AdaLossChainer(dtype=dtype, debug_level=1)
    with self.assertRaises(AssertionError):
        x_data[0] = np.nan
        mu, sigma = ada_loss.get_mean_and_std(x)
def test_power_of_two_in_get_loss_scale(self):
    """ Check the switch of power_of_two. """
    dtype = np.float16

    # turn ON (power_of_two is on by default)
    ada_loss = AdaLossChainer(dtype=dtype,
                              loss_scale_method='element_wise_range',
                              use_bound=False)
    g = chainer.Variable(np.array([[1e-5]], dtype=dtype))
    W = chainer.Variable(np.array([[1e-4]], dtype=dtype))
    s = ada_loss.get_loss_scale(g, W)
    self.assertEqual(s, 32)

    # turn OFF
    ada_loss = AdaLossChainer(dtype=dtype,
                              loss_scale_method='element_wise_range',
                              power_of_two=False,
                              use_bound=False)
    g = chainer.Variable(np.array([[1e-5]], dtype=dtype))
    W = chainer.Variable(np.array([[1e-4]], dtype=dtype))
    s = ada_loss.get_loss_scale(g, W)
    self.assertFalse(s == 32)
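
# Rounding the scale to a power of two is attractive because multiplying a
# float by a power of two touches only the exponent bits: scaling and
# unscaling are exactly invertible and add no rounding error. A minimal
# sketch of such a rounding step (illustrative, not the library's code):
import numpy as np


def to_power_of_two(scale):
    """Round a positive scale down to the nearest power of two."""
    return float(2.0 ** np.floor(np.log2(scale)))


assert to_power_of_two(52.1) == 32.0  # e.g. a raw scale of 52.1 becomes 32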
def forward(self, xs):
    """Compute loc and conf from feature maps.

    This method computes :obj:`mb_locs` and :obj:`mb_confs` from given
    feature maps.

    Args:
        xs (iterable of chainer.Variable): An iterable of feature maps.
            The number of feature maps must be the same as the number of
            :obj:`aspect_ratios`.

    Returns:
        tuple of chainer.Variable:
        This method returns two :obj:`chainer.Variable`: :obj:`mb_locs`
        and :obj:`mb_confs`.

        * **mb_locs**: A variable of float arrays of shape \
            :math:`(B, K, 4)`, \
            where :math:`B` is the number of samples in the batch and \
            :math:`K` is the number of default bounding boxes.
        * **mb_confs**: A variable of float arrays of shape \
            :math:`(B, K, n\_fg\_class + 1)`.
    """
    mb_locs = []
    mb_confs = []

    dtype = chainer.global_config.dtype
    for i, x in enumerate(xs):
        # TODO: can we avoid referring to AdaLossBranch here? Maybe turn
        # it into a general forward function?
        x1, x2 = AdaLossBranch().apply((x, ))

        loc = getattr(self, 'loc_{}'.format(i))
        mb_loc = loc(x1)
        mb_loc = self.post_loc(mb_loc)

        conf = getattr(self, 'conf_{}'.format(i))
        mb_conf = conf(x2)
        mb_conf = self.post_conf(mb_conf)

        if dtype != np.float32:
            if not isinstance(loc, AdaLossConvolution2D):
                mb_loc = F.cast(mb_loc, 'float32')
                mb_conf = F.cast(mb_conf, 'float32')
            else:
                if self.tc_locs[i] is None:
                    self.tc_locs[i] = AdaLossChainer(**loc.ada_loss_cfg)
                if self.tc_confs[i] is None:
                    self.tc_confs[i] = AdaLossChainer(**loc.ada_loss_cfg)
                mb_loc = ada_loss_cast(mb_loc, 'float32', self.tc_locs[i])
                mb_conf = ada_loss_cast(mb_conf,
                                        'float32',
                                        self.tc_confs[i],
                                        lognormal=True)

        mb_locs.append(mb_loc)
        mb_confs.append(mb_conf)

    mb_locs = self.concat_locs(mb_locs)
    mb_confs = self.concat_confs(mb_confs)

    return mb_locs, mb_confs
def forward(self, x):
    """Compute an image-wise score from a batch of images.

    Args:
        x (chainer.Variable): A variable with 4D image array.

    Returns:
        chainer.Variable: An image-wise score. Its channel size is
        :obj:`self.n_class`.
    """
    h = self.lrn(x)
    h, indices1 = self.conv1_pool(
        self.conv1_relu(self.conv1_bn(self.conv1(h))))
    h, indices2 = self.conv2_pool(
        self.conv2_relu(self.conv2_bn(self.conv2(h))))
    h, indices3 = self.conv3_pool(
        self.conv3_relu(self.conv3_bn(self.conv3(h))))
    h, indices4 = self.conv4_pool(
        self.conv4_relu(self.conv4_bn(self.conv4(h))))
    h = self.upsampling4(h, indices4)
    h = self.conv_decode4_bn(self.conv_decode4(h))
    h = self.upsampling3(h, indices3)
    h = self.conv_decode3_bn(self.conv_decode3(h))
    h = self.upsampling2(h, indices2)
    h = self.conv_decode2_bn(self.conv_decode2(h))
    h = self.upsampling1(h, indices1)
    h = self.conv_decode1_bn(self.conv_decode1(h))
    h = self.conv_classifier(h)

    # TODO: refactor this. Instead of hardcoding, use AdaLossScaled
    if self.dtype != np.float32:
        if not isinstance(self.conv1_bn, AdaLossBatchNormalization):
            h = F.cast(h, 'float32')
        else:
            if self.type_cast_ada_loss is None:
                self.type_cast_ada_loss = AdaLossChainer(
                    **self.conv1_bn.ada_loss_cfg)
            h = ada_loss_cast(h,
                              'float32',
                              self.type_cast_ada_loss,
                              lognormal=True)
    return h
def __init__(self, eps=2e-5, axis=None, ada_loss_cfg=None):
    super().__init__(eps=eps, axis=axis)

    if ada_loss_cfg is None:
        ada_loss_cfg = {}
    self.ada_loss = AdaLossChainer(**ada_loss_cfg)