def copydata(self, var):
    """Copy the data array of ``var`` into this variable.

    The copy works regardless of where the two arrays live, including
    between the host and a GPU device.

    If only ``var`` has an uninitialized data array, ``var`` is first
    initialized with the shape of ``self`` and its fresh data is then
    copied. If only ``self`` is uninitialized, ``self`` is first initialized
    with the shape of the source data. If both are uninitialized, this
    method does nothing.

    Args:
        var (Variable): Source variable.

    """
    source = var.data
    target = self.data

    if source is None and target is None:
        return
    if source is None:
        var.initialize(self.shape)
        source = var.data
    elif target is None:
        self.initialize(source.shape)
        target = self.data

    source_xp = cuda.get_array_module(source)
    target_xp = cuda.get_array_module(target)

    if target_xp is source_xp:
        # Same array module on both sides: a plain copyto is enough.
        target_xp.copyto(target, source)
        return
    if target_xp is numpy:
        # Non-NumPy source into a NumPy target: fetch it with ``get()`` first.
        target_xp.copyto(target, source.get())
        return
    # NumPy source into a non-NumPy target.
    target.set(source)
def train(self, x):
    """Run one training iteration on the batch ``x``.

    Three updates are performed in sequence: the encoder/decoder pair on the
    reconstruction loss, the discriminator on the LSGAN loss of generated
    and real samples, and the generator on the LSGAN loss of generated
    samples.

    Args:
        x: Input batch; its first axis is used as the batch size.

    """
    def generate():
        # Draw uniform noise of shape (batch, self.dim) and decode it.
        xp = cuda.get_array_module(x)
        noise = xp.random.rand(x.shape[0], self.dim).astype(xp.float32)
        z = Variable(cuda.to_gpu(noise, self.device))
        return self.decoder(self.generator0(z))

    # Encoder/Decoder
    h = self.encoder(x)
    x_rec = self.decoder(h)
    l_rec = self.recon_loss(x, x_rec)
    self.cleargrads()
    l_rec.backward()
    self.optimizer_enc.update()
    self.optimizer_dec.update()

    # Discriminator
    h = Variable(h.data)  # disconnect
    x_gen = generate()
    d_x_gen = self.discriminator(x_gen)
    d_x_real = self.discriminator(x)
    l_dis = self.lsgan_loss(d_x_gen, d_x_real)
    self.cleargrads()
    l_dis.backward()
    self.optimizer_dis.update()

    # Generator
    x_gen = generate()
    d_x_gen = self.discriminator(x_gen)
    # The encoded result is not used below; the call is kept as-is.
    h_gen = self.encoder(x_gen)
    l_gen = self.lsgan_loss(d_x_gen)
    self.cleargrads()
    l_gen.backward()
    self.optimizer_gen.update()
def _preprocess_const(x, value):
    """Validate ``value`` against the array ``x`` and coerce it to x's dtype.

    Args:
        x: Reference array.
        value: Scalar or array to combine with ``x``.

    Returns:
        The result of ``utils.force_type(x.dtype, value)``.

    Raises:
        TypeError: If ``value`` is an array from a different array module
            than ``x``.
        ValueError: If broadcasting ``x`` with ``value`` changes the shape
            of ``x``.

    """
    xp = cuda.get_array_module(x)

    is_array = not numpy.isscalar(value)
    if is_array and cuda.get_array_module(value) != xp:
        # TODO(unno): We can transfer arrays automatically
        raise TypeError('Cannot mix cupy.ndarray and numpy.ndarray')

    broadcast_shape = xp.broadcast(x, value).shape
    if broadcast_shape != x.shape:
        raise ValueError('Failed to broadcast arrays')

    return utils.force_type(x.dtype, value)
def check_backward_consistency_regression(self, x_data, gy_data,
                                          use_cudnn=True):
    """Compare N-d max pooling gradients with the 2-d implementation.

    The check only applies when ``self.dims`` has exactly two entries;
    otherwise it returns immediately.

    Args:
        x_data: Input array.
        gy_data: Upstream gradient array.
        use_cudnn (bool): Forwarded to both pooling functions.

    """
    # Regression test to two-dimensional max pooling layer.
    if len(self.dims) != 2:
        return

    xp = cuda.get_array_module(x_data)
    pooling_kwargs = dict(stride=self.stride, pad=self.pad,
                          use_cudnn=use_cudnn, cover_all=self.cover_all)

    # Backward computation for N-dimensional max pooling layer.
    x_nd = chainer.Variable(xp.array(x_data))
    func_nd = functions.MaxPoolingND(self.ndim, self.ksize, **pooling_kwargs)
    y_nd = func_nd(x_nd)
    y_nd.grad = gy_data
    y_nd.backward()

    # Backward computation for two-dimensional max pooling layer.
    x_2d = chainer.Variable(xp.array(x_data))
    func_2d = functions.MaxPooling2D(self.ksize, **pooling_kwargs)
    y_2d = func_2d(x_2d)
    y_2d.grad = gy_data
    y_2d.backward()

    # Test that the two result gradients are close enough.
    testing.assert_allclose(x_nd.grad, x_2d.grad)
def _bbox_transform_inv(boxes, deltas):
    """Apply center/size regression deltas to boxes.

    Columns 0..3 of ``boxes`` are read as ``(x1, y1, x2, y2)``; ``deltas``
    is read in groups of four columns ``(dx, dy, dw, dh)``.

    Args:
        boxes: 2-d array of boxes.
        deltas: 2-d array of deltas with a multiple of four columns.

    Returns:
        Array with the shape and dtype of ``deltas`` holding the predicted
        ``(x1, y1, x2, y2)`` for every group of four columns. With zero
        boxes, an empty ``(0, deltas.shape[1])`` array is returned.

    """
    xp = get_array_module(boxes)

    if boxes.shape[0] == 0:
        return xp.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    # Box sizes and centers (the +1 treats coordinates as inclusive).
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    # Column vectors so they broadcast against the strided delta slices.
    widths_col = widths[:, xp.newaxis]
    heights_col = heights[:, xp.newaxis]

    pred_ctr_x = deltas[:, 0::4] * widths_col + ctr_x[:, xp.newaxis]
    pred_ctr_y = deltas[:, 1::4] * heights_col + ctr_y[:, xp.newaxis]
    pred_w = xp.exp(deltas[:, 2::4]) * widths_col
    pred_h = xp.exp(deltas[:, 3::4]) * heights_col

    pred_boxes = xp.zeros(deltas.shape, dtype=deltas.dtype)
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  # x1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h  # y1
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w  # x2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h  # y2
    return pred_boxes
def backward(self, inputs, grad_outputs):
    """Compute gradients w.r.t. the input, the weight and (if given) bias.

    The upstream gradient is scattered into the pre-maximum activation
    gradient ``gz`` at the positions stored in ``self.argmax``; the input
    and weight gradients are then tensor contractions with ``gz``.

    Args:
        inputs: ``(x, W)`` or ``(x, W, b)``.
        grad_outputs: Tuple whose first element is the upstream gradient.

    Returns:
        ``(gx, gW)`` or, when three inputs are given, ``(gx, gW, gb)``.

    """
    xp = cuda.get_array_module(*inputs)
    gy = grad_outputs[0]
    x = _as_mat(inputs[0])
    W = inputs[1]

    # gradient of z = xW + b
    gz = xp.zeros((gy.shape[0], W.shape[1], gy.shape[1]), x.dtype)
    if xp == numpy:
        rows = xp.arange(len(gy))[:, None]
        cols = xp.arange(gy.shape[1])
        gz[rows, self.argmax, cols] = gy
    else:
        gz_r = xp.rollaxis(gz, 1)
        scatter = cuda.elementwise(
            'T gy, S argmax, int32 n', 'raw T gz',
            'gz[argmax * n + i] = gy',
            'maxout_bwd'
        )
        scatter(gy, self.argmax, gz_r.size // len(gz_r), gz_r)

    gx = xp.tensordot(gz, W, ((1, 2), (1, 2))).reshape(inputs[0].shape)
    gW = xp.tensordot(x, gz, (0, 0))

    if len(inputs) != 3:
        return gx, gW
    gb = gz.sum(axis=0)
    return gx, gW, gb
def update_parameter_by_meta_learner(
        self, model_params, loss, x_l0, x_l1, y_l):
    """Update model parameters via the meta-learners, then train them.

    For every entry of ``model_params`` the flattened gradient is fed to
    the meta-learner with the same position index, and the result is
    subtracted from the parameter and stored in ``self.model_params``. The
    model is then evaluated on ``x_l0`` with the updated parameters and the
    softmax cross entropy against ``y_l`` is backpropagated to update the
    meta-learners.

    Args:
        model_params: Mapping from parameter name to parameter; its
            iteration order must match ``self.meta_learners``.
        loss: Not used by this method.
        x_l0: Input passed to ``self.model`` for the main objective.
        x_l1: Not used by this method.
        y_l: Target labels for the softmax cross entropy.

    """
    # NOTE(review): the original indentation was lost; the extent of the
    # ``with`` block below is reconstructed -- confirm against the source.
    # Forward meta-learner
    namedparams = model_params
    for i, elm in enumerate(namedparams.items()):  # parameter-loop
        k, p = elm
        with cuda.get_device_from_id(self.device):
            shape = p.shape
            xp = cuda.get_array_module(p.data)
            x = p.grad
            # Flatten the gradient to 1-d before feeding the meta-learner.
            grad = xp.reshape(x, (np.prod(shape), ))
            meta_learner = self.meta_learners[i]
            g = meta_learner(Variable(grad))  # forward
            # Reshape the meta-learner output back and subtract it from p.
            w = p - F.reshape(g, shape)
            self.model_params[k] = w

    # Train meta-learner with main objective
    y_pred = self.model(x_l0, self.model_params)
    loss_ce = F.softmax_cross_entropy(y_pred, y_l)
    self.cleargrads()  # need to clear W'grad due to loss_rec.backward
    for meta_learner in self.meta_learners:
        meta_learner.cleargrads()
    loss_ce.backward(retain_grad=True)
    for opt in self.opt_meta_learners:
        opt.update()
    loss_ce.unchain_backward()  #TODO: here is a proper place to unchain?
def _offset2grid(offset, kh, kw, sy, sx, ph, pw, h, w):
    """Turn per-position kernel offsets into normalized sampling coordinates.

    Args:
        offset: 4-d array of shape ``(n, 2 * kh * kw, out_h, out_w)``; the
            first half of axis 1 is read as x offsets, the second half as
            y offsets.
        kh, kw: Kernel height and width.
        sy, sx: Vertical and horizontal stride.
        ph, pw: Vertical and horizontal padding.
        h, w: Input height and width.

    Returns:
        Concatenation (along axis 1) of the normalized x and y coordinates,
        of shape ``(n, 2 * kh * kw, out_h, out_w)``.

    """
    n, khkw2, out_h, out_w = offset.shape
    half = khkw2 // 2
    xp = cuda.get_array_module(offset)

    # Top-left position of every output location in input pixels.
    row_starts = xp.arange(0, sy * out_h, sy, dtype=numpy.float32)
    col_starts = xp.arange(0, sx * out_w, sx, dtype=numpy.float32)
    ys, xs = xp.meshgrid(row_starts, col_starts, indexing='ij', copy=False)

    # Position of every kernel tap relative to the top-left corner.
    filter_offset_x = xp.tile(xp.arange(kw, dtype=numpy.float32), kh)
    filter_offset_y = xp.repeat(xp.arange(kh, dtype=numpy.float32), kw)

    x_coord = (offset[:, :half] + xs[None, None] +
               filter_offset_x[None, :, None, None])
    y_coord = (offset[:, half:] + ys[None, None] +
               filter_offset_y[None, :, None, None])

    # The values of this variable is clipped in range [-1, 1].
    # The coordinate (-1, -1) corresponds to the upper-left
    # corner of the input image.
    x_coord = (x_coord / (w + 2 * pw - 1) - 0.5) * 2
    y_coord = (y_coord / (h + 2 * ph - 1) - 0.5) * 2

    # Shape of the result is (n, 2 * kh * kw, out_h, out_w)
    return concat.concat([x_coord, y_coord], axis=1)
def backward(self, inputs, grad_outputs):
    """Compute gradients w.r.t. ``context`` and ``weight``.

    Only column ``self.index`` of the two-column weight gradient is
    non-zero.

    Args:
        inputs: ``(context, weight)``.
        grad_outputs: Tuple whose first element is the upstream gradient.

    Returns:
        ``(g_context, g_weight)``.

    """
    context, weight = inputs
    xp = cuda.get_array_module(context)
    gy = grad_outputs[0]

    # Selected weight column as a column vector for broadcasting.
    selected = weight[:, self.index].reshape(-1, 1)

    g_weight = xp.zeros((context.shape[0], 2), dtype=xp.float32)
    g_weight[:, self.index] = xp.sum(gy * context, axis=1)
    return gy * selected, g_weight
def check_backward_consistency_regression(self, x_data, gy_data):
    """Compare N-d unpooling gradients with the 2-d implementation.

    The check only applies when ``self.dims`` has exactly two entries;
    otherwise it returns immediately.

    Args:
        x_data: Input array.
        gy_data: Upstream gradient array.

    """
    # Regression test to two-dimensional unpooling layer.
    ndim = len(self.dims)
    if ndim != 2:
        return

    xp = cuda.get_array_module(x_data)
    unpooling_kwargs = dict(stride=self.stride, pad=self.pad,
                            cover_all=self.cover_all)

    # Backward computation for N-dimensional unpooling layer.
    x_nd = chainer.Variable(xp.array(x_data))
    func_nd = functions.UnpoolingND(ndim, self.ksize, **unpooling_kwargs)
    y_nd = func_nd(x_nd)
    y_nd.grad = gy_data
    y_nd.backward()

    # Backward computation for two-dimensional unpooling layer.
    x_2d = chainer.Variable(xp.array(x_data))
    func_2d = functions.Unpooling2D(self.ksize, **unpooling_kwargs)
    y_2d = func_2d.apply((x_2d,))[0]
    y_2d.grad = gy_data
    y_2d.backward()

    # Test that the two result gradients are close enough.
    tolerances = self.check_backward_options
    testing.assert_allclose(
        x_nd.grad, x_2d.grad,
        atol=tolerances['atol'], rtol=tolerances['rtol'])
def addgrad(self, var):
    """Accumulates the gradient array from given source variable.

    This method just runs ``self.grad += var.grad``, except that the
    accumulation is even done across the host and different devices.

    Args:
        var (Variable): Source variable.

    Raises:
        ValueError: If the gradient of either ``var`` or ``self`` is not
            set.

    """
    src = var._grad
    dst = self._grad
    if src is None:
        raise ValueError('Source gradient is not set.')
    if dst is None:
        raise ValueError('Target gradient is not set.')

    xp = cuda.get_array_module(dst)
    if xp is numpy:
        # Host destination: bring the source to the host if needed.
        dst += cuda.to_cpu(src)
    elif isinstance(src, numpy.ndarray):
        # Host source, device destination: send it to the destination's
        # device.
        dst += cuda.to_gpu(src, device=dst)
    else:
        dst_dev = dst.device
        if dst_dev == src.device:
            dst += src
        else:
            # Different devices: copy the source while the destination
            # device is current, then accumulate.
            with dst_dev:
                dst += xp.copy(src)
def backward(self, inputs, grads):
    """Split the upstream gradient into array and picked-value gradients.

    For every row ``i`` the entry at column ``indices[i]`` of the upstream
    gradient is returned as the gradient of the third input, and is zeroed
    in the gradient returned for the first input.

    The upstream gradient array is not modified: it may be shared with
    other variables, so the zeroing is done on a copy.

    Args:
        inputs: Three inputs; the second one holds the per-row indices.
        grads: Tuple whose first element is the upstream gradient.

    Returns:
        ``(gx, None, gv)``; the indices input receives no gradient.

    """
    xp = cuda.get_array_module(*inputs)
    _, indices, _ = inputs
    gy = grads[0]

    rows = xp.arange(len(indices))
    # Advanced indexing already yields a fresh array for the picked values.
    gv = gy[rows, indices]

    gx = gy.copy()
    gx[rows, indices] = 0
    return gx, None, gv
def backward(self, x_orig, gy):
    """Backward pass of batch normalization in batch-mean mode.

    Accumulates the parameter gradients into ``self.gbeta`` and
    ``self.ggamma`` and returns the input gradient.

    Args:
        x_orig: Tuple holding the original input array.
        gy: Tuple holding the upstream gradient.

    Returns:
        One-element tuple with the input gradient, shaped like
        ``x_orig[0]``.

    """
    # TODO(beam2d): Support backprop on inference mode
    assert self.use_batch_mean and not self.is_finetune

    ldim, cdim, rdim = self._internal_shape(x_orig[0])
    grad = gy[0].reshape(ldim, cdim, rdim)
    inv_m = 1. / (ldim * rdim)

    # Parameter gradients, accumulated into the stored buffers.
    gbeta = grad.sum(axis=(0, 2), keepdims=True)
    self.gbeta += gbeta
    ggamma = (grad * self.x_hat).sum(axis=(0, 2), keepdims=True)
    self.ggamma += ggamma

    if cuda.get_array_module(*x_orig) == numpy:
        scale = self.gamma / self.std
        gx = scale * (grad - (self.x_hat * ggamma + gbeta) * inv_m)
    else:
        kernel = cuda.elementwise(
            'T gy, T gbeta, T ggamma, T x_hat, T gamma, T std, T inv_m',
            'T gx',
            'gx = gamma / std * (gy - (x_hat * ggamma + gbeta) * inv_m)',
            'bn_bwd')
        gx = kernel(
            grad, gbeta, ggamma, self.x_hat, self.gamma, self.std, inv_m)

    return gx.reshape(x_orig[0].shape),
def __call__(self, x, test=False):
    """Apply the network to ``x``.

    Gaussian noise (scale 0.15) is added to ``x.data`` in place before the
    convolutional stack is applied.

    Args:
        x: Input variable.
        test (bool): Forwarded to the batch-normalization layers; dropout
            is enabled when it is ``False``.

    Returns:
        Output of the final linear layer.

    """
    def conv_stack(h, layers):
        # (conv -> act -> bn) for every (conv, bn) pair, in order.
        for conv, bn in layers:
            h = bn(self.act(conv(h), 0.1), test)
        return h

    # add gaussian noise
    xp = cuda.get_array_module(x.data)
    with cuda.get_device(self.device):
        noise = xp.random.randn(*x.shape) * 0.15
        x.data += noise

    # (conv -> act -> bn) x 3 -> maxpool -> dropout
    h = conv_stack(x, ((self.conv0, self.bn_conv0),
                       (self.conv1, self.bn_conv1),
                       (self.conv2, self.bn_conv2)))
    h = F.max_pooling_2d(h, (2, 2))  # 32 -> 16
    h = F.dropout(h, 0.5, not test)

    # (conv -> act -> bn) x 3 -> maxpool -> dropout
    h = conv_stack(h, ((self.conv3, self.bn_conv3),
                       (self.conv4, self.bn_conv4),
                       (self.conv5, self.bn_conv5)))
    h = F.max_pooling_2d(h, (2, 2))  # 16 -> 8
    h = F.dropout(h, 0.5, not test)

    # conv -> act -> bn -> (nin -> act -> bn) x 2
    h = conv_stack(h, ((self.conv6, self.bn_conv6),  # 8 -> 6
                       (self.conv7, self.bn_conv7),
                       (self.conv8, self.bn_conv8)))

    h = F.average_pooling_2d(h, (6, 6))
    return self.linear(h)
def backward(self, inputs, grad_outputs):
    """Backward pass of batch normalization.

    With five inputs (fixed statistics) gradients for the mean and variance
    inputs are returned as well; otherwise the gradient is computed through
    the batch statistics.

    Args:
        inputs: ``(x, gamma, beta)`` or
            ``(x, gamma, beta, mean, var)``.
        grad_outputs: Tuple whose first element is the upstream gradient.

    Returns:
        ``(gx, ggamma, gbeta)`` or ``(gx, ggamma, gbeta, gmean, gvar)``.

    """
    x, gamma = inputs[:2]
    gy = grad_outputs[0]

    head_ndim = gamma.ndim + 1
    # Index that inserts singleton axes so gamma-shaped arrays broadcast
    # against x.
    expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)
    m = gamma.dtype.type(x.size // gamma.size)
    reduce_axes = (0,) + tuple(range(head_ndim, x.ndim))

    gbeta = gy.sum(axis=reduce_axes)
    ggamma = (gy * self.x_hat).sum(axis=reduce_axes)

    if len(inputs) == 5:
        var = inputs[4]
        gs = gamma / self.std
        gmean = -gs * gbeta
        gvar = -0.5 * gamma / var * ggamma
        gx = gs[expander] * gy
        return gx, ggamma, gbeta, gmean, gvar

    xp = cuda.get_array_module(x)
    if xp is numpy:
        scale = (gamma / self.std)[expander]
        gx = scale * (
            gy - (self.x_hat * ggamma[expander] + gbeta[expander]) / m)
    else:
        inv_m = numpy.float32(1) / m
        kernel = cuda.elementwise(
            'T gy, T x_hat, T gamma, T std, T ggamma, T gbeta, T inv_m',
            'T gx',
            'gx = (gamma / std) * (gy - (x_hat * ggamma + gbeta) * inv_m)',
            'bn_bwd')
        gx = kernel(gy, self.x_hat, gamma[expander], self.std[expander],
                    ggamma[expander], gbeta[expander], inv_m)
    return gx, ggamma, gbeta
def backward(self, x, gy):
    """Return ``gy[0] / cos(x[0]) ** 2`` as a one-element tuple.

    The intermediate array is updated in place to avoid extra allocations.

    Args:
        x: Tuple holding the input array.
        gy: Tuple holding the upstream gradient.

    """
    xp = cuda.get_array_module(*x)
    grad = utils.force_array(xp.cos(x[0]))
    xp.square(grad, out=grad)       # cos(x)^2
    xp.reciprocal(grad, out=grad)   # 1 / cos(x)^2
    grad *= gy[0]
    return grad,
def __init__(self, initializer=None, shape=None, name=None):
    """Create a parameter from an initializer and an optional shape.

    Args:
        initializer: ``None`` (NaN initializer), a scalar (constant
            initializer), an array, or an initializer object.
        shape: Shape of the parameter. When omitted, an array initializer
            becomes the data directly and any other initializer leaves the
            parameter uninitialized.
        name: Name forwarded to the base class.

    """
    if initializer is None:
        initializer = constant.NaN()
    elif numpy.isscalar(initializer):
        initializer = constant.Constant(initializer)

    given_array = isinstance(initializer, (numpy.ndarray, cuda.ndarray))

    if shape is not None:
        # parameter initialized with a given shape
        if given_array:
            xp = cuda.get_array_module(initializer)
            initializer = constant.Constant(initializer)
        else:
            xp = numpy
        data = initializers.generate_array(initializer, shape, xp)
        grad = xp.full_like(data, numpy.nan)
        super(Parameter, self).__init__(data, name=name, grad=grad)
    elif given_array:
        # parameter initialized by the initial array
        super(Parameter, self).__init__(initializer, name=name)
    else:
        # uninitialized parameter
        super(Parameter, self).__init__(name=name)
        self.initializer = initializer
        dtype = getattr(initializer, 'dtype', numpy.float32)
        self._grad_initializer = constant.NaN(dtype)

    self.update_rule = None
def backward(self, inputs, gy):
    """Backward pass of a per-row normalization with scale and shift.

    Uses ``self.x_hat``, ``self.x_mu``, ``self.inv_std`` and ``self.var``
    (presumably cached by the forward pass) to propagate the gradient
    through the normalization over axis 1.

    Args:
        inputs: ``(x, gamma, beta)``.
        gy: Tuple whose first element is the upstream gradient.

    Returns:
        ``(g_x, g_gamma, g_beta)``.

    """
    xp = cuda.get_array_module(*inputs)
    x, gamma, beta = inputs
    grad = gy[0]
    n_units = x.shape[1]

    # Scale / shift parameters.
    g_beta = grad.sum(axis=0)
    g_gamma = xp.sum(grad * self.x_hat, axis=0)

    # Through x_hat = x_mu * inv_std.
    g_x_hat = grad * gamma[None, ]
    g_inv_std = xp.sum(g_x_hat * self.x_mu, axis=1, keepdims=True)
    g_x_mu_direct = g_x_hat * self.inv_std

    # Through inv_std -> std -> var -> squared deviation.
    g_std = g_inv_std * (- 1. / self.var)
    g_var = g_std * 0.5 * self.inv_std
    g_sq_dev = _broadcast_to(xp, g_var * 1. / n_units, x.shape)
    g_x_mu_via_var = g_sq_dev * 2 * self.x_mu

    # Through x_mu = x - mu.
    g_x_direct = g_x_mu_direct + g_x_mu_via_var
    g_mu = xp.sum(g_x_direct, axis=1, keepdims=True) * (- 1.)
    g_x_via_mu = _broadcast_to(xp, g_mu * 1. / n_units, x.shape)

    g_x = g_x_direct + g_x_via_mu
    return g_x, g_gamma, g_beta,
def variable_repr(var):
    """Return the string representation of a variable.

    Args:
        var (~chainer.Variable): Input Variable.

    Returns:
        str: ``variable(...)`` or ``variable <name>(...)`` wrapping the
        array contents, ``None`` for a missing array, or an explicit shape
        for empty arrays other than shape ``(0,)``.

    .. seealso:: numpy.array_repr

    """
    xp = cuda.get_array_module(var)
    # Non-NumPy arrays are fetched with ``get()`` before formatting.
    arr = var.data if xp is numpy else var.data.get()

    prefix = 'variable ' + var.name if var.name else 'variable'

    if arr is None:
        body = 'None'
    elif arr.size > 0 or arr.shape == (0,):
        body = numpy.array2string(arr, None, None, None, ', ', prefix + '(')
    else:
        # show zero-length shape unless it is (0,)
        body = '[], shape=%s' % (repr(arr.shape),)

    return '%s(%s)' % (prefix, body)
def forward(self, x):
    """Sum ``x[0]`` over ``self.axis``.

    The input shape, dtype and array module are stored on ``self``; no
    inputs are retained.

    Args:
        x: Tuple holding the input array.

    Returns:
        One-element tuple with the summed array.

    """
    self.retain_inputs(())
    source = x[0]
    self._in_shape = source.shape
    self._in_dtype = source.dtype
    self._xp = cuda.get_array_module(*x)

    total = source.sum(axis=self.axis, keepdims=self.keepdims)
    return self._xp.asarray(total),
def forward(self, inputs):
    """Run one LSTM step.

    Computes the pre-activation ``z = x W^T + h_tm1 V^T + q U^T (+ b)``
    into ``self.z`` and then the new cell and hidden states into ``self.c``
    and ``self.h``.

    Args:
        inputs: ``(x, h_tm1, c_tm1, q)``.

    Returns:
        ``(self.h, self.c)``.

    """
    xp = cuda.get_array_module(*inputs)
    x, h_tm1, c_tm1, q = inputs
    batchsize = x.shape[0]
    f32 = np.dtype('float32')

    # Output buffers for the pre-activations, cell and hidden state.
    self.z = xp.empty((batchsize, self.out_size * 4), dtype=f32)
    self.c = xp.empty((batchsize, self.out_size), dtype=f32)
    self.h = xp.empty((batchsize, self.out_size), dtype=f32)

    if xp is np:
        self.z = np.dot(x, self.W.T, out=self.z)
        self.z += np.dot(h_tm1, self.V.T)
        self.z += np.dot(q, self.U.T)
        if not self.nobias:
            self.z += self.b
        _lstm_forward_cpu(z=self.z, c_tm1=c_tm1, c=self.c, h=self.h,
                          out_size=self.out_size)
    else:
        self.z = cp.dot(x, self.W.T, out=self.z)
        gpu.utils.dot_add(A=h_tm1, B=self.V, C=self.z, transb=True)
        gpu.utils.dot_add(A=q, B=self.U, C=self.z, transb=True)
        if not self.nobias:
            gpu.utils.addVec2Mat(self.z, self.b)
        _lstm_forward_gpu(z=self.z, c_tm1=c_tm1, c=self.c, h=self.h,
                          out_size=self.out_size)

    return self.h, self.c
def forward(self, x):
    """Crop a window out of every sample.

    For sample ``k`` the window spans ``self.i1[k]`` (inclusive) to
    ``self.i2[k]`` (exclusive) on the two trailing axes.

    Args:
        x: Tuple holding the 4-d input array.

    Returns:
        One-element tuple with a float32 array of shape
        ``(n, c, self.size, self.size)``.

    """
    xp = cuda.get_array_module(*x)
    source = x[0]
    n, c = source.shape[:2]

    cropped = xp.zeros((n, c, self.size, self.size), dtype=numpy.float32)
    for k in range(n):
        top, left = self.i1[k, 0], self.i1[k, 1]
        bottom, right = self.i2[k, 0], self.i2[k, 1]
        cropped[k] = source[k, :, top:bottom, left:right]
    return cropped,
def check_proposal_target_creator(
        self, bbox, label, roi, proposal_target_creator):
    """Check types, shapes and label counts of the creator's outputs.

    Args:
        bbox: Ground-truth boxes.
        label: Ground-truth labels.
        roi: Proposed regions; also decides the expected array module.
        proposal_target_creator: Callable under test.

    """
    xp = cuda.get_array_module(roi)
    outputs = proposal_target_creator(roi, bbox, label)

    # Test types
    for out in outputs:
        self.assertIsInstance(out, xp.ndarray)

    sample_roi, gt_roi_loc, gt_roi_label = [
        cuda.to_cpu(out) for out in outputs]

    # Test shapes
    self.assertEqual(sample_roi.shape, (self.n_sample, 4))
    self.assertEqual(gt_roi_loc.shape, (self.n_sample, 4))
    self.assertEqual(gt_roi_label.shape, (self.n_sample,))

    # Test foreground and background labels
    np.testing.assert_equal(np.sum(gt_roi_label >= 0), self.n_sample)
    n_pos = np.sum(gt_roi_label >= 1)
    n_neg = np.sum(gt_roi_label == 0)
    self.assertLessEqual(n_pos, self.n_sample * self.pos_ratio)
    self.assertLessEqual(n_neg, self.n_sample - n_pos)
def backward(self, indexes, grad_outputs):
    """Compute the gradients for the requested inputs.

    Only samples whose ``self.dist_hinge`` entry is positive contribute.

    Args:
        indexes: Indices (0: anchor, 1: positive, 2: negative) of the
            inputs whose gradients are requested.
        grad_outputs: One-element tuple with the upstream gradient.

    Returns:
        list: Gradients for the requested inputs, in anchor, positive,
        negative order.

    """
    anchor, positive, negative = self.get_retained_inputs()
    n_samples, x_dim = anchor.shape[0], anchor.shape[1]
    xp = cuda.get_array_module(anchor)

    # 1.0 where the hinge is active, 0.0 elsewhere, shaped like the inputs.
    hinge = xp.repeat(self.dist_hinge[:, None], x_dim, axis=1)
    mask = xp.array(hinge > 0, dtype=numpy.float32)

    gy, = grad_outputs
    g = gy / n_samples if self.reduce == 'mean' else gy[:, None]
    scale = 2 * chainer.functions.broadcast_to(g, mask.shape) * mask

    grads = []
    if 0 in indexes:
        grads.append(scale * (negative - positive))
    if 1 in indexes:
        grads.append(scale * (positive - anchor))
    if 2 in indexes:
        grads.append(scale * (anchor - negative))
    return grads
def check_backward(self, x_data, W_data, b_data, y_grad):
    """Run a numerical gradient check of the 2-d convolution function.

    When ``self.c_contiguous`` is false, all arrays are first converted to
    non-C-contiguous layouts.

    Args:
        x_data: Input array.
        W_data: Filter array.
        b_data: Bias array or ``None``.
        y_grad: Upstream gradient array.

    """
    xp = cuda.get_array_module(x_data)

    if not self.c_contiguous:
        x_data = xp.asfortranarray(x_data)
        W_data = xp.asfortranarray(W_data)
        y_grad = xp.asfortranarray(y_grad)
        for arr in (x_data, W_data, y_grad):
            self.assertFalse(arr.flags.c_contiguous)
        if b_data is not None:
            # A strided view over a buffer twice as long is
            # non-contiguous.
            padded = xp.empty((len(b_data) * 2,), dtype=self.b.dtype)
            padded[::2] = b_data
            b_data = padded[::2]
            self.assertFalse(b_data.flags.c_contiguous)

    if b_data is None:
        args = (x_data, W_data)
    else:
        args = (x_data, W_data, b_data)

    with chainer.using_config('use_cudnn', self.use_cudnn), \
            chainer.using_config('cudnn_deterministic',
                                 self.cudnn_deterministic):
        func = convolution_2d.Convolution2DFunction(
            self.stride, self.pad, self.cover_all)
        gradient_check.check_backward(
            func, args, y_grad, **self.check_backward_options)
def backward(self, xs, gy):
    """Split the upstream gradient back into one piece per input.

    Args:
        xs: Tuple of input arrays.
        gy: Tuple holding the upstream gradient.

    Returns:
        ``gy`` itself when there is at most one input; otherwise the pieces
        of ``gy[0]`` split along ``self.axis`` at the input boundaries.

    """
    leading = xs[:-1]
    if not leading:
        return gy

    xp = cuda.get_array_module(*xs)
    # Cumulative sizes of all inputs but the last give the split points.
    boundaries = numpy.array(
        [x.shape[self.axis] for x in leading]).cumsum()
    return xp.split(gy[0], boundaries, axis=self.axis)
def forward(self, inputs):
    """Apply a per-channel (depthwise) 2-d convolution.

    Every input channel is convolved with its own ``D`` filters, giving
    ``C * D`` output channels.

    Args:
        inputs: ``(x, W)`` or ``(x, W, b)``. ``W`` has shape
            ``(D, C, KY, KX)``.

    Returns:
        One-element tuple with the output of shape ``(B, C * D, IY, IX)``.

    """
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]

    # Pass the array itself: unpacking it would hand one argument per
    # sample to get_array_module (and none at all for an empty batch).
    xp = cuda.get_array_module(x)
    if xp is numpy:
        self.col = conv.im2col_cpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)
    else:
        self.col = conv.im2col_gpu(
            x, kh, kw, self.sy, self.sx, self.ph, self.pw)

    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]  # (D, C, KY, KX)
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))

    # (C, B*IY*IX, KY*KX), (C, KY*KX, D)-> (C, B*IY*IX, D)
    y = _matmul(c_, w_, xp).astype(x.dtype, copy=False)

    # (C, B*IY*IX, D) -> (B, C*D, IY, IX)
    y = y.reshape((C, B, IY * IX, D)).transpose(1, 0, 3, 2) \
        .reshape((B, C * D, IY, IX))

    if b is not None:
        y += b[None, :, None, None]
    return y,
def check_forward(self, t_data, xs_data):
    """Compare the CTC loss with a recursively computed reference value.

    Note that the softmax below is written through a swapped-axes view of
    ``self.x`` and therefore updates ``self.x`` in place.

    Args:
        t_data: Label array.
        xs_data: Sequence of per-time-step input arrays.

    """
    x = tuple(chainer.Variable(x_data) for x_data in xs_data)
    t = chainer.Variable(t_data)
    loss = functions.connectionist_temporal_classification(x, t, 2)
    loss_value = float(loss.data)

    # compute expected value by recursive computation.
    xp = cuda.get_array_module(self.x)
    xt = xp.swapaxes(self.x, 0, 1)
    batch_size = xt.shape[0]
    for b in range(batch_size):
        for step in range(xt.shape[1]):
            exps = numpy.exp(xt[b][step])
            xt[b][step] = exps / numpy.sum(exps)

    last_step = self.x.shape[0] - 1
    loss_expect = 0
    for b in range(batch_size):
        label = self.l[b]
        label_len = label.shape[0]
        likelihood = (
            self.alpha(xt[b], label, last_step, label_len - 1) +
            self.alpha(xt[b], label, last_step, label_len - 2))
        loss_expect += -math.log(likelihood)
    loss_expect /= batch_size

    self.assertAlmostEqual(loss_expect, loss_value, places=5)
def __call__(self, rule, param):
    """Clip the gradient of ``param`` in place.

    Values are limited to ``[self.lower_bound, self.upper_bound]``.
    Parameters without a gradient are left untouched.

    Args:
        rule: Not used by this method.
        param: Parameter whose ``grad`` is clipped.

    """
    gradient = param.grad
    if gradient is None:
        return

    xp = cuda.get_array_module(gradient)
    with cuda.get_device_from_array(gradient):
        xp.clip(gradient, self.lower_bound, self.upper_bound, out=gradient)
def backward(self, inputs, grad_outputs):
    """Accumulate the upstream gradient into the rows of ``W`` used by x.

    Entries of ``x`` equal to ``self.ignore_label`` contribute nothing.

    Args:
        inputs: ``(x, W)``: the ID array and the embedding matrix.
        grad_outputs: Tuple whose first element is the upstream gradient.

    Returns:
        ``(None, gW)``; the ID input receives no gradient.

    """
    xp = cuda.get_array_module(*inputs)
    x, W = inputs
    gy = grad_outputs[0]
    gW = xp.zeros_like(W)

    if xp is numpy:
        # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
        # too slow.
        flat_gy = gy.reshape(x.size, -1)
        for ix, igy in six.moves.zip(x.ravel(), flat_gy):
            if ix == self.ignore_label:
                continue
            gW[ix] += igy
        return None, gW

    ids = xp.expand_dims(x, -1)
    n_out = gW.shape[1]
    if self.ignore_label is None:
        kernel = cuda.elementwise(
            'T gy, int32 x, int32 n_out', 'raw T gW',
            'int w_ind[] = {x, i % n_out}; atomicAdd(&gW[w_ind], gy)',
            'embed_id_bwd')
        kernel(gy, ids, n_out, gW)
    else:
        kernel = cuda.elementwise(
            'T gy, int32 x, int32 n_out, int32 ignore', 'raw T gW',
            ''' if (x != ignore) { int w_ind[] = {x, i % n_out}; atomicAdd(&gW[w_ind], gy); } ''',
            'embed_id_bwd_ignore_label')
        kernel(gy, ids, n_out, self.ignore_label, gW)
    return None, gW
def forward(self, inputs):
    """Forward pass of batch normalization in training mode.

    Normalizes ``x`` with mini-batch statistics, caches the batch mean and
    inverse standard deviation on ``self`` and updates the running
    statistics. Uses cuDNN when ``self.mode.can_use_cudnn(xp)`` allows it.

    Args:
        inputs: ``(x, gamma, beta)``.

    Returns:
        One-element tuple with the normalized output.

    """
    # Only x and gamma are retained.
    self.retain_inputs((0, 1))
    x, gamma, beta = inputs
    xp = cuda.get_array_module(x)
    # Lazily create zero-filled running statistics shaped like gamma.
    if self.running_mean is None:
        self.running_mean = xp.zeros_like(gamma)
        self.running_var = xp.zeros_like(gamma)
    self.mode = _BNMode(x, gamma)

    # expander inserts singleton dimensions to gamma and beta so that they
    # can be broadcasted with x.
    head_ndim = gamma.ndim + 1
    expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim)
    self.expander = expander
    # Axes reduced when computing statistics: batch plus trailing axes.
    self.axis = (0, ) + tuple(range(head_ndim, x.ndim))
    self.use_cudnn = self.mode.can_use_cudnn(xp)

    if self.use_cudnn:
        x = cuda.cupy.ascontiguousarray(x)
        gamma = cuda.cupy.ascontiguousarray(gamma)
        beta = cuda.cupy.ascontiguousarray(beta)
        dtype = x.dtype
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(_as4darray(x))
        derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
        cudnn_mode = self.mode.get_cudnn_mode()
        libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                          x_desc.value, cudnn_mode)
        # Host-side scalars 1 and 0 passed to cuDNN through ctypes.
        one = numpy.array(1, dtype=dtype).ctypes
        zero = numpy.array(0, dtype=dtype).ctypes
        y = cuda.cupy.empty_like(x)
        # Factor used in the moving average
        factor = 1 - self.decay

        if self.mean is None:
            # Output cache to speed up backward pass.
            self.mean = xp.empty_like(gamma)
            # Output cache to speed up backward pass.
            self.inv_std = xp.empty_like(gamma)
        # Note: cuDNN computes the mini-batch mean and variance
        # internally. We can simply (optionally) pass
        # it the running-average mean and variance arrays.
        # Note: This API seems to set the inverse of the standard deviation
        # (instead of variance) to resultSaveInvVariance argument. The
        # current implementation of our BN depends on this behavior so that
        # we can reduce the number of reduction kernels.
        libcudnn.batchNormalizationForwardTraining(
            handle, cudnn_mode, one.data, zero.data,
            x_desc.value, x.data.ptr, x_desc.value,
            y.data.ptr, derivedBnDesc.value, gamma.data.ptr,
            beta.data.ptr, factor, self.running_mean.data.ptr,
            self.running_var.data.ptr, self.eps,
            self.mean.data.ptr, self.inv_std.data.ptr)
    else:
        gamma = gamma[expander]
        beta = beta[expander]
        self.mean = x.mean(axis=self.axis)
        var = x.var(axis=self.axis)
        # eps is added in place, so every later use of ``var`` (including
        # the running-variance update below) sees var + eps.
        var += self.eps
        self.inv_std = var**(-0.5)
        y = _apply_bn_fwd(xp, x, self.mean[expander],
                          self.inv_std[expander], gamma, beta)
        # Update running statistics
        m = x.size // gamma.size
        adjust = m / max(m - 1., 1.)  # unbiased estimation
        self.running_mean *= self.decay
        self.running_mean += (1 - self.decay) * self.mean
        self.running_var *= self.decay
        self.running_var += (1 - self.decay) * adjust * var
    return y,
def backward_preprocess(self, function, in_data, out_grad):
    """Record the array module of the backward data and run ``_preprocess``.

    Args:
        function: Function about to run backward (not used here).
        in_data: Tuple of input arrays.
        out_grad: Tuple of upstream gradient arrays.

    """
    arrays = in_data + out_grad
    self.xp = cuda.get_array_module(*arrays)
    self._preprocess()
def init_state(self, param):
    """Create zero-filled ``msg`` and ``msdx`` state arrays for ``param``.

    Both arrays are shaped like ``param.data`` and allocated while the
    device of ``param.data`` is current.

    Args:
        param: Parameter the state belongs to.

    """
    data = param.data
    xp = cuda.get_array_module(data)
    with cuda.get_device_from_array(data):
        for key in ('msg', 'msdx'):
            self.state[key] = xp.zeros_like(data)
def connectionist_temporal_classification(
        x, t, blank_symbol, input_length=None, label_length=None):
    """Connectionist Temporal Classification loss function.

    Connectionist Temporal Classification(CTC) [Graves2006]_ is a loss
    function of sequence labeling where the alignment between the inputs
    and target is unknown. See also [Graves2012]_

    Args:
        x (sequence of Variable): RNN output at each time. ``x`` must be a
            list of :class:`~chainer.Variable` s. Each element of ``x``,
            ``x[i]`` is a :class:`~chainer.Variable` representing output of
            RNN at time ``i``.
        t (Variable): Expected label sequence.
        blank_symbol (int): Index of blank_symbol.
            This value must be non-negative.
        input_length (Variable): Length of valid sequence for each of mini
            batch ``x`` (optional). If omitted, all of ``x`` is regarded as
            valid input.
        label_length (Variable): Length of valid sequence for each of mini
            batch ``t`` (optional). If omitted, all of ``t`` is regarded as
            valid input. It is defaulted independently of
            ``input_length``.

    Returns:
        Variable: A variable holding a scalar value of the CTC loss.

    Raises:
        TypeError: If ``x`` is not a sequence or ``blank_symbol`` is not an
            ``int``.

    .. note::
       You need to input ``x`` without applying to activation functions(e.g.
       softmax function), because this function applies softmax functions
       to ``x`` before calculating CTC loss to avoid numerical limitations.
       You also need to apply softmax function to forwarded values before
       you decode it.

    .. note::
       This function is differentiable only by ``x``.

    .. note::
       This function supports (batch, sequence, 1-dimensional input)-data.

    .. [Graves2006] Alex Graves, Santiago Fernandez,\
    Faustino Gomez, Jurgen Schmidhuber,\
    `Connectionist Temporal Classification: Labelling Unsegmented\
    Sequence Data with Recurrent Neural Networks\
    <ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf>`_

    .. [Graves2012] Alex Graves,\
    `Supervised Sequence Labelling with Recurrent Neural Networks\
    <http://www.cs.toronto.edu/~graves/preprint.pdf>`_

    """
    # ``collections.Sequence`` was removed in Python 3.10; fall back to the
    # top-level module only where ``collections.abc`` does not exist.
    try:
        from collections import abc as collections_abc
    except ImportError:
        import collections as collections_abc

    if not isinstance(x, collections_abc.Sequence):
        raise TypeError('x must be a list of Variables')
    if not isinstance(blank_symbol, int):
        raise TypeError('blank_symbol must be non-negative integer.')
    assert blank_symbol >= 0
    assert blank_symbol < x[0].data.shape[1]
    # This implementation only supports 1-dimensional data.
    # TODO(jnishi): Support d(>1)-dimensional inputs.
    assert(len(x[0].data.shape) == 2)

    # Default each length on its own so that a caller-supplied value is
    # never overwritten and a missing one is always filled in.
    if input_length is None or label_length is None:
        xp = cuda.get_array_module(x[0].data)
        if input_length is None:
            input_length = chainer.Variable(
                xp.full((len(x[0].data),), len(x), dtype=numpy.int32),
                volatile='auto')
        if label_length is None:
            label_length = chainer.Variable(
                xp.full((len(t.data),), len(t.data[0]), dtype=numpy.int32),
                volatile='auto')

    # Batch size check.
    assert len(x[0].data) == len(t.data)
    assert len(x[0].data) == len(input_length.data)
    assert len(x[0].data) == len(label_length.data)

    # Length check.
    assert len(x) >= max(input_length.data)
    assert len(t.data[0]) >= max(label_length.data)

    return ConnectionistTemporalClassification(blank_symbol)(
        input_length, label_length, t, *x)
def backward(self, retain_grad=False):
    """Runs error backpropagation (a.k.a. backprop) from this variable.

    On backprop, :meth:`Function.backward` is called on each
    :class:`Function` object appearing in the backward graph starting from
    this variable. The backward graph is represented by backward references
    from variable nodes to their creators, and from functions to their
    input variable nodes. The backprop stops at all root nodes. Some
    functions set ``None`` as gradients of some inputs, where further
    backprop does not take place at such inputs.

    This method uses :data:`grad` as the initial error array. User can
    manually set a gradient array before calling this method. If
    :data:`data` contains only one element (i.e., it is scalar) and
    :data:`grad` is ``None``, then this method automatically complements
    1.0 as the initial error. This is useful on starting backprop from
    some scalar loss value.

    Args:
        retain_grad (bool): If ``True``, the gradient arrays of all
            intermediate variables are kept. Otherwise, :data:`grad` of the
            intermediate variables are set to ``None`` on appropriate
            timing, which may reduce the maximum memory consumption.

            In most cases of training some models, the purpose of backprop
            is to compute gradients of parameters, not of all variables,
            and therefore it is recommended to set this flag ``False``.

    Raises:
        RuntimeError: In debug mode, if a computed gradient contains NaN.

    """
    # Nothing to propagate through when no function created this variable.
    if self.creator is None:
        return
    # Remember a device object so it can be re-selected below; a missing
    # device (status 38) is tolerated, any other runtime error is re-raised.
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []      # heap of functions waiting to run backward
    seen_set = set()     # functions already pushed onto the heap
    seen_vars = set()    # ids of non-leaf variables already given a grad
    need_copy = set()    # ids of variables whose grad array is still shared

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self.grad is None:
        with cuda.get_device_from_array(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)

    def add_cand(cand):
        # Push each function once; len(seen_set) breaks ties between
        # functions of equal rank.
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator)

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        outputs = [y() for y in func.outputs]  # access via weak ref

        in_data = tuple([x.data for x in func.inputs])
        out_grad = tuple([None if y is None else y.grad for y in outputs])
        # Global hooks, extended by the function's local hooks if any.
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        cuda.get_device_from_array(*(in_data + out_grad)).use()
        for hook in hooks:
            hook.backward_preprocess(func, in_data, out_grad)
        # Expose the output arrays to the function during backward.
        func.output_data = tuple(
            [None if y is None else y.data for y in outputs])
        gxs = func.backward(in_data, out_grad)
        assert len(gxs) == len(in_data)
        if not getattr(func, '_retain_after_backward', False):
            func.output_data = None
        for hook in hooks:
            hook.backward_postprocess(func, in_data, out_grad)

        if is_debug:
            for gx in gxs:
                if gx is None:
                    continue
                cuda.get_device_from_array(gx).use()
                if cuda.get_array_module(gx).isnan(gx).any():
                    msg = 'NaN is detected on backward computation'
                    raise RuntimeError(msg)

        # Drop gradients of intermediate outputs, but never this
        # variable's own.
        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self.node:
                    y.grad = None
        for x, gx in zip(func.inputs, gxs):
            if gx is None:
                continue
            if not x.requires_grad:
                continue

            _check_grad_type(func, x, gx)

            # Accumulate the gradient to x. It is a bit tricky to handle
            # branches and parameter gradient accumulation correctly.
            id_x = id(x)
            if x.creator is None:  # leaf
                if x._grad is None:
                    x.grad = gx
                    need_copy.add(id_x)
                else:
                    cuda.get_device_from_array(gx).use()
                    if id_x in need_copy:
                        x.grad = utils.force_array(x._grad + gx)  # copy
                        need_copy.remove(id_x)
                    else:
                        x._grad += gx
            else:  # not a leaf
                add_cand(x.creator)
                if id_x not in seen_vars:  # 1st visit
                    x.grad = gx
                    seen_vars.add(id_x)
                    need_copy.add(id_x)
                else:
                    cuda.get_device_from_array(gx).use()
                    if id_x in need_copy:  # 2nd visit
                        x.grad = utils.force_array(gx + x._grad)  # copied
                        need_copy.remove(id_x)
                    else:  # 3rd or later visit
                        x._grad += gx
        del gxs  # to reduce memory usage
        # NOTE(review): indentation was reconstructed; this re-selection is
        # assumed to run once per processed function -- confirm.
        if initial_device is not None:
            initial_device.use()
def f(self, xs):
    """Return the element-wise exponential of ``xs[0]`` as a 1-tuple."""
    array_module = cuda.get_array_module(*xs)
    result = array_module.exp(xs[0])
    return result,
def _zeros_like(x):
    """Return a zero-filled array like ``x`` from x's own array module."""
    return cuda.get_array_module(x).zeros_like(x)
def _full_like(x, val):
    """Return an array like ``x`` filled with ``val``, from x's module."""
    return cuda.get_array_module(x).full_like(x, val)
def backward_postprocess(self, function, in_data, out_grad):
    """Finish the measurement started by ``backward_preprocess``.

    Asserts that the array module is the one recorded in ``self.xp`` and
    hands the function's name to ``_postprocess`` with ``bwd=True``.

    Args:
        function: Function that ran backward.
        in_data: Tuple of input arrays.
        out_grad: Tuple of upstream gradient arrays.

    """
    arrays = in_data + out_grad
    xp = cuda.get_array_module(*arrays)
    assert xp == self.xp
    name = function_namer(function, in_data)
    self._postprocess(name, bwd=True)
def forward_preprocess(self, function, in_data):
    """Record the array module of the inputs and run ``_preprocess``.

    Args:
        function: Function about to run forward (not used here).
        in_data: Tuple of input arrays.

    """
    array_module = cuda.get_array_module(*in_data)
    self.xp = array_module
    self._preprocess()
def check_orthogonality(self, w):
    """Initialize ``w`` and check that every element is close to 2.

    Args:
        w: Array filled in place by ``self.initializer``.

    """
    self.initializer(w)
    xp = cuda.get_array_module(w)
    expected = xp.ones((), dtype=numpy.float32) * 2
    testing.assert_allclose(w, expected)
def forward_postprocess(self, function, in_data):
    """Finish the measurement started by ``forward_preprocess``.

    Asserts that the array module is the one recorded in ``self.xp`` and
    hands the function's name to ``_postprocess``.

    Args:
        function: Function that ran forward.
        in_data: Tuple of input arrays.

    """
    array_module = cuda.get_array_module(*in_data)
    assert array_module == self.xp
    name = function_namer(function, in_data)
    self._postprocess(name)
def backward(self, x, grad_outputs):
    """Return an all-zero gradient shaped like ``x[0]`` as a 1-tuple.

    Args:
        x: Tuple holding the input array.
        grad_outputs: Upstream gradients (not used).

    """
    array_module = cuda.get_array_module(*x)
    zero_grad = array_module.zeros_like(x[0])
    return zero_grad,
def check_orthogonality(self, w):
    """Initialize ``w`` and check that its rows are orthonormal.

    ``w`` is flattened to 2-d and the matrix of pairwise row dot products
    is compared against the identity.

    Args:
        w: Array filled in place by ``self.initializer``.

    """
    self.initializer(w)
    xp = cuda.get_array_module(w)
    n_rows = len(w)
    rows = w.reshape(n_rows, -1)
    gram = xp.tensordot(rows, rows, (1, 1))
    testing.assert_allclose(gram, xp.identity(n_rows))
def __call__(self, roi, bbox, label, mask, levels,
             loc_normalize_mean=(0., 0., 0., 0.),
             loc_normalize_std=(0.1, 0.1, 0.2, 0.2),
             mask_size=14, binary_mask=True):
    """Sample RoIs and build box, label and mask (or keypoint) targets.

    The ground-truth boxes are appended to ``roi`` as additional candidates.
    Foreground RoIs are those whose best IoU with a ground-truth box is at
    least ``self.pos_iou_thresh``; background RoIs are those whose best IoU
    lies in ``[self.neg_iou_thresh_lo, self.neg_iou_thresh_hi)``. At most
    ``self.n_sample`` RoIs are drawn at random (without replacement), with a
    foreground budget of ``round(self.n_sample * self.pos_ratio)``.

    All computation runs on the CPU with NumPy; when ``roi`` was not a NumPy
    array the results are moved back with ``cuda.to_gpu``.

    Args:
        roi: Candidate RoIs, one row per RoI. The in-code comments treat the
            first four columns as ``(y0, x0, y1, x1)``.
        bbox: Ground-truth boxes, one row per box.
        label: Ground-truth class per box. It is shifted by one so that 0 can
            denote the background.
        mask: With ``binary_mask=True``, a 3-D array indexed as
            ``(box, y, x)``. With ``binary_mask=False``, keypoints per box;
            per the original comment the shape is ``(N, 17, 3)`` holding
            ``(y, x, v)`` -- NOTE(review): confirm against the caller.
        levels: Feature level of every row of ``roi``.
        loc_normalize_mean: Mean subtracted from the encoded box offsets.
        loc_normalize_std: Divisor applied to the encoded box offsets.
        mask_size (int): Side length of the square mask target.
        binary_mask (bool): ``True`` builds resized binary masks, ``False``
            builds per-keypoint class labels.

    Returns:
        tuple: ``(sample_roi, sample_levels, gt_roi_loc, gt_roi_label,
        gt_roi_mask)``. ``gt_roi_mask`` only covers the foreground RoIs.
    """
    xp = cuda.get_array_module(roi)
    roi = cuda.to_cpu(roi)
    bbox = cuda.to_cpu(bbox)
    label = cuda.to_cpu(label)
    mask = cuda.to_cpu(mask)
    levels = cuda.to_cpu(levels)

    roi = np.concatenate((roi, bbox), axis=0)
    # assign feature levels of ground truth boxes
    bbox_levels = map_rois_to_fpn_levels(np, bbox)
    levels = np.concatenate([levels, bbox_levels])

    pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
    iou = bbox_iou(roi, bbox)
    gt_assignment = iou.argmax(axis=1)
    max_iou = iou.max(axis=1)
    # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
    # The label with value 0 is the background.
    gt_roi_label = label[gt_assignment] + 1

    # Select foreground RoIs as those with >= pos_iou_thresh IoU.
    pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(
            pos_index, size=pos_roi_per_this_image, replace=False)

    # Select background RoIs as those within
    # [neg_iou_thresh_lo, neg_iou_thresh_hi).
    neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                         (max_iou >= self.neg_iou_thresh_lo))[0]
    neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(
        min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(
            neg_index, size=neg_roi_per_this_image, replace=False)

    # The indices that we're selecting (both positive and negative).
    # Foreground entries come first, so row ``i`` of ``sample_roi`` matches
    # the i-th entry of ``pos_index`` for ``i < pos_roi_per_this_image``.
    keep_index = np.append(pos_index, neg_index)
    gt_roi_label = gt_roi_label[keep_index]
    gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]
    sample_levels = levels[keep_index]

    # Compute offsets and scales to match sampled RoIs to the GTs.
    gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
    gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)) /
                  np.array(loc_normalize_std, np.float32))

    # https://engineer.dena.jp/2017/12/chainercvmask-r-cnn.html
    gt_roi_mask = []
    _, h, w = mask.shape
    if binary_mask:
        for i, idx in enumerate(gt_assignment[pos_index]):
            # Crop the ground-truth mask to the RoI, clamped to the image.
            A = mask[idx,
                     np.max((int(sample_roi[i, 0]), 0)):
                     np.min((int(sample_roi[i, 2]), h)),
                     np.max((int(sample_roi[i, 1]), 0)):
                     np.min((int(sample_roi[i, 3]), w))]
            gt_roi_mask.append(
                cv2.resize(A, (mask_size, mask_size)).astype(np.int32))
    else:
        for i, idx in enumerate(gt_assignment[pos_index]):
            # remind: shape of keypoints is (N, 17, 3), N is number of bbox,
            # 17 is number of keypoints, 3 is (x, y, v)
            # v=0: unlabeled, v=1, labeled but invisible,
            # v=2 labeled and visible
            # bbox's (y0, x0), (y1, x1)
            y0, x0, y1, x1 = list(map(int, sample_roi[i, :4]))
            # Work on a copy: ``mask[idx]`` is a view, and rescaling it in
            # place would corrupt the caller's keypoints and transform them
            # again for every further RoI assigned to the same ground truth.
            kp = mask[idx].copy()  # shape is (17, 3)
            # convert keypoints coordinate (y, x) into mask coordinate
            # system [0, mask_size]x[0, mask_size]
            kp[:, :2] = (kp[:, :2] - [y0, x0]) / \
                [max(y1 - y0, 1), max(x1 - x0, 1)] * mask_size
            # The cell index inside the mask_size x mask_size grid is used
            # as the class label (softmax cross entropy is applied later).
            # A label of -1 is ignored.
            keypoint_labels = np.zeros(kp.shape[0], dtype=np.int32)
            for j, r in enumerate(kp):
                y, x, v = list(map(int, r))
                if (v == 2 and 0 <= y and y < mask_size and
                        0 <= x and x < mask_size):
                    keypoint_labels[j] = y * mask_size + x
                else:
                    keypoint_labels[j] = -1
            gt_roi_mask.append(keypoint_labels)

    gt_roi_mask = xp.array(gt_roi_mask)

    if xp != np:
        sample_roi = cuda.to_gpu(sample_roi)
        gt_roi_loc = cuda.to_gpu(gt_roi_loc)
        gt_roi_label = cuda.to_gpu(gt_roi_label)
        gt_roi_mask = cuda.to_gpu(gt_roi_mask)
        sample_levels = cuda.to_gpu(sample_levels)
    return sample_roi, sample_levels, gt_roi_loc, gt_roi_label, gt_roi_mask
def main():
    """Write noise-averaged input-gradient images for CIFAR-10 samples.

    For every selected validation image the image is tiled ``--nb_trials``
    times, Gaussian noise scaled by 2.5% of the image's value range is added
    to each copy, and a gradient of ones is back-propagated from
    ``model.get_feature`` to the input. The input gradients are averaged
    over the trials, reduced with a channel-wise max of absolute values, and
    saved next to the masked and original image as ``grads/<index>.png``.
    """
    parser = argparse.ArgumentParser(description='Chainer CIFAR example:')
    parser.add_argument('resume')
    parser.add_argument('--nb_trials', type=int, default=50)
    parser.add_argument('--model', default='c5')
    parser.add_argument('--gpu', '-g', type=int, default=0)
    parser.add_argument('--nb_valid', type=int, default=10000)
    parser.add_argument('--seed', type=int, default=1701)
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    start = time.time()
    logger.initialize("grad_"+args.model)
    logger.info(vars(args))
    np.random.seed(args.seed)
    save_dir = logger.get_savedir()
    logger.info("Written to {}".format(save_dir))
    logger.info('GPU: {}'.format(args.gpu))
    train_all, test = get_cifar10()

    if args.debug:
        valid = train_all[200:400]
    else:
        valid_choice = np.random.choice(range(len(train_all)),
                                        args.nb_valid, replace=False)
        # Use a set for membership: ``idx in ndarray`` scans the whole array
        # for every sample. The samples keep their dataset order.
        chosen = set(valid_choice.tolist())
        valid = [x for idx, x in enumerate(train_all) if idx in chosen]
    print(len(valid))

    model = get_model(args.model, args.gpu, args.resume)

    # Get one image per iteration
    valid_iter = chainer.iterators.SerialIterator(
        valid, 1, repeat=False, shuffle=False)
    if not os.path.exists("grads"):
        os.makedirs("grads")

    # Inference-mode layers, but keep the graph so gradients can flow to x.
    chainer.config.train = False
    chainer.config.enable_backprop = True
    for idx, tup in enumerate(valid_iter):
        print(idx)
        img = tup[0][0]
        # Tile image to calculate all the trials at once
        inp = np.tile(img.copy()[np.newaxis, ...], (args.nb_trials, 1, 1, 1))
        sigma = (inp.max() - inp.min()) * 0.025  # noise level
        model.cleargrads()
        # Add noise to every image
        inp = inp + np.random.randn(*inp.shape).astype(np.float32) * sigma
        x = Variable(cuda.to_gpu(inp, args.gpu))
        xp = cuda.get_array_module(x)
        pred = model.get_feature(x, False)
        # Seed the backward pass with ones on every output element.
        pred.grad = xp.ones(pred.shape, dtype=np.float32)
        pred.backward()
        mean_grad = cuda.to_cpu(xp.mean(x.grad.copy(), axis=0))
        mean_grad = np.max(np.abs(mean_grad), axis=0)
        mean_grad = color.gray2rgb(mean_grad)
        mean_grad = clip_image(mean_grad)
        orig_img = np.transpose(img, (1, 2, 0))
        masked = orig_img * mean_grad
        out = np.concatenate((mean_grad, masked, orig_img), axis=1)
        plt.imsave("grads/{:05d}.png".format(idx), out)
        model.cleargrads()
    print(time.time()-start)
def n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    activation, use_bi_direction, **kwargs):
    """n_step_rnn_base(n_layers, dropout_ratio, hx, ws, bs, xs, activation, use_bi_direction)

    Base function for Stack RNN/BiRNN functions.

    This function is used at :func:`chainer.functions.n_step_birnn` and
    :func:`chainer.functions.n_step_rnn`.
    This function's behavior depends on the following arguments,
    ``activation`` and ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing two matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 1`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represents biases for i-th layer.
            Each ``bs[i]`` is a list containing two vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this function supports variable length sequences.
            When sequences have different lengths, sort sequences in
            descending order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-directional RNN.

    Returns:
        tuple: This function returns a tuple containing two elements,
        ``hy`` and ``ys``.

        - ``hy`` is an updated hidden states whose shape is same as ``hx``.
        - ``ys`` is a list of :class:`~chainer.Variable` . Each element
          ``ys[t]`` holds hidden states of the last layer corresponding
          to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
          mini-batch size for time ``t``, and ``N`` is size of hidden
          units. Note that ``B_t`` is the same value as ``xs[t]``.

    Raises:
        ValueError: If ``activation`` is neither ``'tanh'`` nor ``'relu'``.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA

    # Reject the removed ``train`` / ``use_cudnn`` keywords with a hint, then
    # make sure no other unexpected keyword was passed.
    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    activation_list = ['tanh', 'relu']
    if activation not in activation_list:
        candidate = ','.join(activation_list)
        raise ValueError('Invalid activation: "%s". Please select from [%s]'
                         % (activation, candidate))

    xp = cuda.get_array_module(hx)

    # Fast path: a non-NumPy array module together with a cuDNN configuration
    # accepted by ``should_use_cudnn`` delegates to the fused NStep* functions.
    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, ),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            # Bi-directional RNN
            if activation == 'tanh':
                rnn = NStepBiRNNTanh(n_layers, states)
            elif activation == 'relu':
                rnn = NStepBiRNNReLU(n_layers, states)
        else:
            # Uni-directional RNN
            if activation == 'tanh':
                rnn = NStepRNNTanh(n_layers, states)
            elif activation == 'relu':
                rnn = NStepRNNReLU(n_layers, states)

        # The first output is the final hidden state, the rest are per-step
        # outputs.
        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        # Fallback path: unroll the recurrence step by step with ordinary
        # functions.
        direction = 2 if use_bi_direction else 1
        # One initial hidden state per (layer, direction) pair, with the
        # leading length-1 axis removed.
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        # Index 0 holds the input-to-hidden parameters, index 1 the
        # hidden-to-hidden ones.
        xws = [_stack_weight([w[0]]) for w in ws]
        hws = [_stack_weight([w[1]]) for w in ws]
        xbs = [_stack_weight([b[0]]) for b in bs]
        hbs = [_stack_weight([b[1]]) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            # Defined inside the loop so it reads the current ``layer`` and
            # ``xs_next``; it is always called before they change.
            def _one_directional_loop(di):
                # di=0, forward RNN
                # di=1, backward RNN
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    # Only the first ``batch`` rows take part in this step;
                    # the remaining rows are carried over unchanged.
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    # Dropout is applied to the inputs of every layer except
                    # the first one.
                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    rnn_in = (linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) +
                              linear.linear(h, hws[layer_idx],
                                            hbs[layer_idx]))
                    if activation == 'tanh':
                        h_bar = tanh.tanh(rnn_in)
                    elif activation == 'relu':
                        h_bar = relu.relu(rnn_in)

                    # Re-attach the rows that were set aside above.
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward RNN
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward RNN
                h, h_backward = _one_directional_loop(di=1)
                # Restore time order before pairing with the forward outputs.
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           six.moves.zip(h_forward, h_backward)]
                hy.append(h)
            else:
                # Uni-directional RNN
                xs_next = h_forward

        # Outputs of the last layer are the sequence outputs.
        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
def add_noise(h, test, sigma=0.2):
    """Add Gaussian noise scaled by ``sigma`` to ``h`` unless ``test`` is set.

    Args:
        h: Object exposing a ``data`` array; returned untouched when ``test``
            is truthy.
        test: When truthy no noise is added.
        sigma: Multiplier applied to the standard-normal samples.

    Returns:
        ``h`` itself in test mode, otherwise ``h`` plus the scaled noise.
    """
    array_module = cuda.get_array_module(h.data)
    if not test:
        noise = array_module.random.randn(*h.data.shape)
        return h + sigma * noise
    return h
def forward(self, inputs):
    """Insert a new axis at ``self.axis`` into the single input array.

    Returns a 1-tuple with the expanded array.
    """
    (array,) = inputs
    array_module = cuda.get_array_module(array)
    expanded = array_module.expand_dims(array, self.axis)
    return (expanded,)
def _compute_core(self, *inputs):
    """Compute occlusion scores by sliding a constant window over the target.

    The model is evaluated once on the unmodified inputs. Then, for every
    window position along ``self.slide_axis`` (window extents given by
    ``self.size``), the target array is reset to its original values, the
    window is overwritten with 0, the model is re-evaluated, and the
    difference ``original score - occluded score`` is added to the score
    array at the window position.

    When ``self.target_extractor`` is ``None`` the occlusion is written
    directly into ``inputs[0].array`` (which is left in its last occluded
    state afterwards). Otherwise a temporary process named
    ``'/saliency/mask_target_var'`` is registered on the extractor for each
    evaluation; it is removed again even when the evaluation raises.

    Args:
        *inputs: Arguments forwarded to ``self.eval_fun``.

    Returns:
        tuple: A 1-tuple holding the float32 occlusion score array. Its
        shape is the batch size on axis 0, the target's extent on each slide
        axis, and 1 elsewhere.
    """
    # Usually, backward() is not necessary for calculating occlusion
    with chainer.using_config('enable_backprop', self.enable_backprop):
        original_result = self.eval_fun(*inputs)
    target_var = self.get_target_var(inputs)
    original_target_array = target_var.array.copy()
    original_score = self.get_output_var(original_result)
    xp = cuda.get_array_module(target_var.array)

    value = 0.  # fill with `value`
    target_dim = target_var.ndim
    batch_size = target_var.shape[0]
    occlusion_window_shape = [1] * target_dim
    occlusion_window_shape[0] = batch_size
    for axis, size in zip(self.slide_axis, self.size):
        occlusion_window_shape[axis] = size
    occlusion_scores_shape = [1] * target_dim
    occlusion_scores_shape[0] = batch_size
    for axis, _ in zip(self.slide_axis, self.size):
        occlusion_scores_shape[axis] = target_var.shape[axis]
    occlusion_window = xp.ones(
        occlusion_window_shape, dtype=target_var.dtype) * value
    occlusion_scores = xp.zeros(occlusion_scores_shape, dtype=xp.float32)

    def _extract_index(slide_axis, size, start_indices):
        """Build the index tuple selecting one window; other axes are full."""
        colon = slice(None)
        index = [colon] * target_dim
        for axis, size, start in zip(slide_axis, size, start_indices):
            index[axis] = slice(start, start + size, 1)
        return tuple(index)

    # Number of valid window start positions along every slide axis.
    end_list = [target_var.data.shape[axis] - size + 1
                for axis, size in zip(self.slide_axis, self.size)]

    for start in itertools.product(
            *[six.moves.range(end) for end in end_list]):
        occlude_index = _extract_index(self.slide_axis, self.size, start)

        if self.target_extractor is None:
            inputs[0].array = original_target_array.copy()
            inputs[0].array[occlude_index] = occlusion_window

            with chainer.using_config('enable_backprop',
                                      self.enable_backprop):
                occluded_result = self.eval_fun(*inputs)
        else:
            def mask_target_var(hook, args, _target_var):
                _target_var.array = original_target_array.copy()
                _target_var.array[occlude_index] = occlusion_window

            self.target_extractor.add_process(
                '/saliency/mask_target_var', mask_target_var)
            try:
                with chainer.using_config('enable_backprop',
                                          self.enable_backprop):
                    occluded_result = self.eval_fun(*inputs)
            finally:
                # Always unregister the masking process so that a failing
                # evaluation does not leave it attached to the extractor.
                self.target_extractor.delete_process(
                    '/saliency/mask_target_var')

        occluded_score = self.get_output_var(occluded_result)
        score_diff_var = original_score - occluded_score  # (bs, 1)
        # expand_dim for ch_axis
        # NOTE(review): this reshape needs the score to have as many
        # elements as the window, which looks like it only holds for
        # windows of extent 1 -- confirm.
        score_diff = xp.reshape(score_diff_var.array,
                                occlusion_window_shape)
        occlusion_scores[occlude_index] += score_diff
    outputs = (occlusion_scores,)
    return outputs
def __call__(self, array):
    """Overwrite every element of ``array`` in place with ``self.fill_value``.

    The fill value is first converted with the array module of ``array``.
    """
    array_module = cuda.get_array_module(array)
    fill = array_module.asarray(self.fill_value)
    array[...] = fill
def backward(self, retain_grad=False):
    """Runs error backpropagation (a.k.a. backprop) from this variable.

    On backprop, :meth:`Function.backward` is called on each
    :class:`Function` object appearing in the backward graph starting from
    this variable. The backward graph is represented by backward references
    from variables to their creators, and from functions to their inputs.
    The backprop stops at all root variables. Some functions set ``None``
    as gradients of some inputs, where further backprop does not take place
    at such input variables.

    This method uses :data:`grad` as the initial error array. User can
    manually set a gradient array before calling this method. If
    :data:`data` contains only one element (i.e., it is scalar) and
    :data:`grad` is ``None``, then this method automatically complements
    1.0 as the initial error. This is useful on starting backprop from
    some scalar loss value.

    Args:
        retain_grad (bool): If ``True``, the gradient arrays of all
            intermediate variables are kept. Otherwise, :data:`grad` of the
            intermediate variables are set to ``None`` on appropriate
            timing, which may reduce the maximum memory consumption.

            In most cases of training some models, the purpose of backprop
            is to compute gradients of parameters, not of variables, so it
            is recommended to set this flag ``False``.

    """
    if self.creator is None:
        return
    # Remember the current CUDA device so it can be re-selected after every
    # function has been processed.
    initial_device = None
    if cuda.available:
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []  # heap of functions waiting to be back-propagated
    seen_set = set()  # functions already pushed onto the heap
    seen_vars = set()  # ids of non-leaf variables that got a gradient
    need_copy = set()  # ids whose grad still aliases the array it received

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self.grad is None:
        with cuda.get_device(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator)

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        outputs = [y() for y in func.outputs]  # access via weak ref

        in_data = tuple([x.data for x in func.inputs])
        out_grad = ()
        # if enable grad accumulate
        # This path is taken only when ``mkld.enable_acc_gradF`` agrees, the
        # first input is 4-dimensional and every input is a NumPy array.
        if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data):
            out_grad_tmp = tuple([None if y is None else y.grad for y in outputs])
            acc_grad_tuple = tuple([None if y is None else y.acc_grad for y in outputs])
            for grad_tmp, acc_grad in zip(out_grad_tmp, acc_grad_tuple):
                if len(acc_grad) == 0:
                    # no need accumulate, just return grad
                    out_grad += (grad_tmp,)
                else:
                    """
                        acc_grad's length is not 0, means need to do grad accumulate
                        call native MKLDNN sum primitive
                    """
                    # Sum the recorded gradients and the current one into a
                    # freshly allocated array ``y``.
                    y = numpy.empty((grad_tmp.shape), dtype=grad_tmp.dtype)
                    acc_grad += (grad_tmp,)
                    mkldnn_sum = mkldnn.Sum_F32()
                    mkldnn_sum.sum4d_gx(acc_grad, y)
                    out_grad += (y,)
        else:
            out_grad = tuple([None if y is None else y.grad for y in outputs])

        # Merge global hooks with the hooks local to this function, if any.
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)

        cuda.get_device(*(in_data + out_grad)).use()
        for hook in six.itervalues(hooks):
            hook.backward_preprocess(func, in_data, out_grad)
        # A 2-D convolution whose first input is a leaf and which sits in a
        # chain is flagged for the MKLDNN optimisation.
        if isinstance(func, chainer.functions.connection.convolution_2d.Convolution2DFunction):
            _x = func.inputs[0]
            if _x.creator is None and func.in_chain is True:
                func.mkldnn_opt = True
        # Run the co-simulation backward first; ``gxs`` is verified against
        # its result below.
        cosim_output = func.backward_cpu_cosim(in_data, out_grad)
        gxs = func.backward(in_data, out_grad)
        assert len(gxs) == len(in_data)
        func.cpu_cosim_verify_result(gxs, cosim_output)
        for hook in six.itervalues(hooks):
            hook.backward_postprocess(func, in_data, out_grad)

        if is_debug:
            for gx in gxs:
                if gx is None:
                    continue
                cuda.get_device(gx).use()
                if cuda.get_array_module(gx).isnan(gx).any():
                    msg = 'NaN is detected on backward computation'
                    raise RuntimeError(msg)

        # Drop gradients of intermediate outputs; the variable backprop
        # started from keeps its own gradient.
        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self:
                    y.grad = None
        for x, gx in zip(func.inputs, gxs):
            if gx is None:
                continue

            _check_grad_type(func, x, gx)

            # Accumulate the gradient to x. It is a bit tricky to handle
            # branches and parameter gradient accumulation correctly.
            # NOTE(review): in both 2nd-visit branches below,
            # ``need_copy.remove`` is placed at the level of the 2nd-visit
            # branch rather than inside its inner ``else`` -- confirm.
            id_x = id(x)
            if x.creator is None:  # leaf
                if x._grad is None:  # 1st visit
                    x.grad = gx
                    need_copy.add(id_x)
                else:
                    cuda.get_device(gx).use()
                    if id_x in need_copy:  # 2nd visit
                        if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data):
                            # When gradient accumulation is enabled the sum
                            # is deferred: only record the gradient here.
                            x.acc_grad += (gx,)
                        else:
                            x.grad = utils.force_array(x.grad + gx)  # copy
                        need_copy.remove(id_x)  # remove from list in 2nd visit
                    else:
                        if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data):
                            # When gradient accumulation is enabled the sum
                            # is deferred: only record the gradient here.
                            if len(x.acc_grad) > 0:
                                # means 3rd or later visit for variable x
                                x.acc_grad += (gx,)
                            else:
                                # means this variable is W or b
                                x._grad += gx
                        else:
                            x._grad += gx  # 3rd or later visit
            else:  # not a leaf
                add_cand(x.creator)
                if id_x not in seen_vars:  # 1st visit
                    x.grad = gx
                    seen_vars.add(id_x)
                    need_copy.add(id_x)
                else:
                    cuda.get_device(gx).use()
                    if id_x in need_copy:  # 2nd visit
                        if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data):
                            # When gradient accumulation is enabled the sum
                            # is deferred: only record the gradient here.
                            x.acc_grad += (gx,)
                        else:
                            x._grad = utils.force_array(gx + x._grad)  # copied
                        need_copy.remove(id_x)
                    else:  # 3rd or later visit
                        if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data):
                            # When gradient accumulation is enabled the sum
                            # is deferred: only record the gradient here.
                            x.acc_grad += (gx,)
                        else:
                            x._grad += gx
        del gxs  # to reduce memory usage
        if initial_device is not None:
            initial_device.use()
def total_variation2(self, x, tau=None):
    """Return the mean absolute difference of neighbours along axes 2 and 3.

    The result is ``average(|x[:, :, 1:, :] - x[:, :, :-1, :]|)`` plus
    ``average(|x[:, :, :, 1:] - x[:, :, :, :-1]|)``.

    Args:
        x: Input indexed with four indices.
        tau: Unused; kept so existing callers can still pass it.

    Returns:
        The sum of the two averaged absolute differences.
    """
    dx = x[:, :, 1:, :] - x[:, :, :-1, :]
    dy = x[:, :, :, 1:] - x[:, :, :, :-1]
    return F.average(F.absolute(dx)) + F.average(F.absolute(dy))
def forward(self, xs):
    """Stack the input arrays with ``dstack`` and return them as a 1-tuple.

    No input is retained for the backward pass; only the input shapes are
    stored in ``self._in_shapes``.
    """
    self.retain_inputs(())
    shapes = []
    for array in xs:
        shapes.append(array.shape)
    self._in_shapes = shapes
    array_module = cuda.get_array_module(*xs)
    stacked = array_module.dstack(xs)
    return (stacked,)
def _backward_main(self, retain_grad):
    """Run backpropagation from this variable through its creator graph.

    Function nodes are processed in order of decreasing ``rank`` using a
    heap. Gradients are tracked in a local ``grads`` dict keyed by variable
    node and are also written to the ``_grad_var`` of the corresponding
    variable when that variable is still available.

    Args:
        retain_grad (bool): When falsy, the gradients of the outputs of each
            processed function (except this variable's own node) are reset
            to ``None`` once that function has been handled.
    """
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return
    # Remember the current CUDA device (only relevant for CuPy data) so it
    # can be re-selected after each processed function.
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []  # heap of function nodes waiting to be processed
    seen_set = set()  # function nodes already pushed onto the heap
    grads = {}  # variable node -> current gradient variable

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self._grad_var is None:
        with cuda.get_device_from_array(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)

    def get_grad(node):
        # Prefer the gradient tracked locally; fall back to the one stored
        # on the node itself.
        if node is None:
            return None
        if node in grads:
            return grads[node]
        return node.grad_var

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = [
            i for i, x in enumerate(inputs) if x.requires_grad
        ]
        # Nothing to propagate when no input requires a gradient.
        if not target_input_indexes:
            continue
        outputs = [y() for y in func.outputs]  # access via weak ref

        in_data = tuple([x.data for x in inputs])
        out_grad = tuple([get_grad(y) for y in outputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in out_grad])
        # Merge global hooks with the hooks local to this function, if any.
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        cuda.get_device_from_array(*in_data).use()
        for hook in hooks:
            hook.backward_preprocess(func, in_data, out_grad_data)

        # Collect the current input gradients.
        #
        # Note (Tokui): When the same variable is passed to multiple input
        # slots (e.g. an expression like ``f(x, x)``), it makes the
        # gradient accumulation complicated since the back-propagated
        # gradients w.r.t. the first and second argument should be
        # accumulated to the current gradient w.r.t. the same variable.
        # In this case, the current implementation passes the current
        # gradient only to the first occurrence of the variable in the
        # input tuple and passes ``None`` to the rest of the occurrences.
        # For example, when the input variables are ``(x, x)``, the
        # input gradient passed to the ``backward_accumulate`` method is
        # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
        # See also the docstring of ``FunctionNode.backward_accumulate``.
        target_inputs = [inputs[i] for i in target_input_indexes]
        in_grad = []
        for i, index_i in enumerate(target_input_indexes):
            x = inputs[index_i]
            if x in target_inputs[:i]:
                # Pass ``None`` for duplicated input variables except for
                # the first occurrence (see the comment above).
                gx = None
            elif x in grads:
                gx = grads[x]
            elif x.creator_node is None:
                x._check_old_style_gradient()
                # accumulate the gradient only if the node is a leaf
                gx = x.grad_var
            else:
                gx = None
            in_grad.append(gx)

        gxs = func.backward_accumulate(
            target_input_indexes, out_grad, in_grad)

        assert len(gxs) == len(in_grad)
        for hook in hooks:
            hook.backward_postprocess(func, in_data, out_grad_data)

        if is_debug:
            # Only floating-point gradients are checked for NaN.
            for gx in gxs:
                if gx is None:
                    continue
                gx_data = gx.data
                if gx_data.dtype.kind == 'f':
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        raise RuntimeError(
                            'NaN is detected on backward computation of '
                            '{}'.format(func.label))

        # Drop gradients of intermediate outputs; the node backprop started
        # from keeps its own gradient.
        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self.node:
                    grads[y] = None
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = None

        for i, gx in enumerate(gxs):
            if gx is None:
                continue

            x = target_inputs[i]
            if not x.requires_grad:
                continue

            _check_grad_type(func, x, gx.data)

            if x in target_inputs[:i]:
                # Accumulate the duplicated gradients here. See the comment
                # above the code that builds ``in_grad``.
                cur_gx = grads[x]
                grads[x] = gx if cur_gx is None else gx + cur_gx
            else:
                grads[x] = gx

            # Mirror the gradient onto the variable if it still exists.
            x_var = x.get_variable_or_none()
            if x_var is not None:
                x_var._grad_var = grads[x]

            # Continue towards the function that produced this input.
            if x.creator_node is not None:
                add_cand(x.creator_node)

        del gxs  # to reduce memory usage
        if initial_device is not None:
            initial_device.use()
def __init__(self, q_values, q_values_formatter=lambda x: x):
    """Store Q-values together with their array module and action count.

    Args:
        q_values (chainer.Variable): Q-values; the number of actions is read
            from axis 1 of its data array.
        q_values_formatter: Callable stored as ``self.q_values_formatter``;
            defaults to the identity.
    """
    assert isinstance(q_values, chainer.Variable)
    values_array = q_values.data
    self.xp = cuda.get_array_module(values_array)
    self.q_values = q_values
    self.n_actions = values_array.shape[1]
    self.q_values_formatter = q_values_formatter
def forward(self, xs):
    """Stack the input arrays with ``dstack`` and return them as a 1-tuple."""
    array_module = cuda.get_array_module(*xs)
    stacked = array_module.dstack(xs)
    return (stacked,)
def backward(self, x, gy):
    """Combine the two halves of the output gradient into the input gradient.

    The output gradient is split in two along ``self.axis``. The first half
    is passed where the input is positive, and the second half is subtracted
    where the negated input is positive.

    Returns:
        tuple: A 1-tuple holding the gradient with respect to the input.
    """
    (inp,) = x
    array_module = cuda.get_array_module(inp)
    (grad_out,) = gy
    halves = array_module.split(grad_out, 2, axis=self.axis)
    grad_first, grad_second = halves
    positive_part = grad_first * (inp > 0)
    negative_part = grad_second * (-inp > 0)
    return (positive_part - negative_part,)
def __call__(self, rule, param):
    """Clip the gradient of ``param`` in place to the configured bounds.

    Every element of ``param.grad`` is limited to the interval
    ``[self.lower_bound, self.upper_bound]``; the result is written back
    into the same array.

    Args:
        rule: Unused here.
        param: Object exposing a ``grad`` array. Nothing happens when
            ``param.grad`` is ``None``.
    """
    grad = param.grad
    if grad is None:
        # No gradient is available, so there is nothing to clip; calling
        # ``clip`` on ``None`` would raise instead.
        return
    xp = cuda.get_array_module(grad)
    with cuda.get_device_from_array(grad):
        xp.clip(grad, self.lower_bound, self.upper_bound, out=grad)
def __call__(self, *inputs):
    """Run forward propagation and link the results into the backward graph.

    Inputs that are not :class:`Variable` objects are wrapped in one with
    ``volatile=flag.AUTO``. Function hooks are invoked around
    :meth:`forward`, which runs with the device of the input arrays
    selected. In debug mode the call stack is recorded and floating-point
    outputs are checked for NaN.

    Whether backward references are created depends on the aggregated
    volatile flag of the inputs: never for ``'on'``, always for ``'off'``,
    otherwise according to the thread-local ``default_backprop`` setting
    (``True`` when unset).

    Args:
        inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or
            :class:`cupy.ndarray` objects. The volatile flags of all input
            variables must agree.

    Returns:
        One :class:`Variable` object or a tuple of multiple
        :class:`Variable` objects.

    Raises:
        RuntimeError: In debug mode, when a floating-point output contains
            NaN.
    """
    wrapped = []
    for arg in inputs:
        if not isinstance(arg, chainer.Variable):
            arg = chainer.Variable(arg, volatile=flag.AUTO)
        wrapped.append(arg)
    inputs = wrapped

    in_data = tuple(var.data for var in inputs)
    if chainer.is_debug():
        self._stack = traceback.extract_stack()

    if self.type_check_enable:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks != 0:
        # Copy before merging so the global hook registry stays untouched.
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    for hook in six.itervalues(hooks):
        hook.forward_preprocess(self, in_data)

    # Forward prop
    with cuda.get_device(*in_data):
        outputs = self.forward(in_data)
        assert type(outputs) == tuple
    for hook in six.itervalues(hooks):
        hook.forward_postprocess(self, in_data)

    if chainer.is_debug():
        for out in outputs:
            if (out.dtype.kind == 'f' and
                    cuda.get_array_module(out).isnan(out).any()):
                raise RuntimeError('NaN is detected on forward computation')

    out_v = flag.aggregate_flags([var.volatile for var in inputs])
    ret = tuple(variable.Variable(out, volatile=out_v) for out in outputs)

    if out_v == 'on':
        build_graph = False
    elif out_v == 'off':
        build_graph = True
    else:
        build_graph = getattr(_thread_local, 'default_backprop', True)

    if build_graph:
        # Topological ordering
        if inputs:
            self.rank = max(var.rank for var in inputs)
        else:
            self.rank = 0
        # Backward edges
        for out_var in ret:
            out_var.set_creator(self)
        self.inputs = inputs
        # Forward edges (must be weak references)
        self.outputs = tuple(weakref.ref(out_var) for out_var in ret)

    return ret[0] if len(ret) == 1 else ret