def backward_gpu(self, x, gy):
    e1 = array.as_mat(x[0])
    e2 = array.as_mat(x[1])
    gy, = gy

    kern_add = cuda.reduce(
        'T in0, T in1, T in2', 'T out',
        'in0 * in1 * in2', 'a + b', 'out += a', 0,
        'bilinear_product_add')
    kern = cuda.reduce(
        'T in0, T in1, T in2', 'T out',
        'in0 * in1 * in2', 'a + b', 'out = a', 0,
        'bilinear_product')

    e1_b = e1[:, :, numpy.newaxis, numpy.newaxis]  # ij
    e2_b = e2[:, numpy.newaxis, :, numpy.newaxis]  # ik
    gy_b = gy[:, numpy.newaxis, numpy.newaxis, :]  # il
    W_b = self.W[numpy.newaxis, :, :, :]  # jkl

    # 'ij,ik,il->jkl'
    kern_add(e1_b, e2_b, gy_b, self.gW, axis=0)

    if not self.nobias:
        self.gV1 += e1.T.dot(gy)
        self.gV2 += e2.T.dot(gy)
        self.gb += gy.sum(axis=0)

    # 'ik,jkl,il->ij'
    ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))
    # 'ij,jkl,il->ik'
    ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))
    if not self.nobias:
        ge1 += gy.dot(self.V1.T)
        ge2 += gy.dot(self.V2.T)
    return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs[:2]
    log_y = super(AdaptiveSoftmaxCrossEntropy, self).forward(inputs)[0]
    self.y = cupy.exp(log_y)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label', 'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff,
          self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = softmax_log(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if getattr(self, "normalize", True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        "S t, raw T log_y, int32 n_channel, raw T coeff",
        "T out",
        "t == -1 ? T(0) : log_y[_j * n_channel + t]",
        "a + b",
        "out = a * -coeff[0]",
        "0",
        "crossent_fwd",
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return (ret,)
def backward(self, inputs, grad_outputs):
    e1 = array.as_mat(inputs[0])
    e2 = array.as_mat(inputs[1])
    W = inputs[2]
    gy = grad_outputs[0]

    xp = cuda.get_array_module(*inputs)
    if xp is numpy:
        gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
        ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
        ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
    else:
        kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                           'in0 * in1 * in2', 'a + b', 'out = a', 0,
                           'bilinear_product')

        e1_b = e1[:, :, None, None]  # ij
        e2_b = e2[:, None, :, None]  # ik
        gy_b = gy[:, None, None, :]  # il
        W_b = W[None, :, :, :]  # jkl

        gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
        ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
        ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'
    ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW

    if len(inputs) == 6:
        V1, V2, b = inputs[3:]
        gV1 = e1.T.dot(gy)
        gV2 = e2.T.dot(gy)
        gb = gy.sum(0)
        ge1 += gy.dot(V1.T)
        ge2 += gy.dot(V2.T)
        ret += gV1, gV2, gb
    return ret
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = log_softmax._log_softmax(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
def numerical_grad(f, inputs, grad_outputs, eps=1e-3):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    Args:
        f (function): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays): Tuple of arrays that are treated as
            output gradients.
        eps (float): Epsilon value of finite differences.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    assert eps > 0
    inputs = tuple(inputs)
    grad_outputs = tuple(grad_outputs)
    gpu = any(isinstance(x, cuda.ndarray) for x in inputs + grad_outputs)
    cpu = any(isinstance(x, numpy.ndarray) for x in inputs + grad_outputs)

    if gpu and cpu:
        raise RuntimeError(
            'Do not mix GPU and CPU arrays in `numerical_grad`')

    if gpu:
        xp = cuda.cupy
        numerical_grad_kernel = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel'
        )
    else:
        xp = numpy
    grads = [xp.zeros_like(x) for x in inputs]

    with configuration.using_config('type_check', False):
        for x, gx in six.moves.zip(inputs, grads):
            for i in numpy.ndindex(x.shape):
                orig = x[i].copy()  # hold original value
                x[i] = orig + eps
                ys1 = _copy_arrays(f())
                x[i] = orig - eps
                ys2 = _copy_arrays(f())
                x[i] = orig
                for y1, y2, gy in six.moves.zip(ys1, ys2, grad_outputs):
                    if gy is not None:
                        if (gpu and isinstance(y1, cuda.ndarray) and
                                isinstance(y2, cuda.ndarray) and
                                isinstance(gy, cuda.ndarray)):
                            numerical_grad_kernel(y1, y2, gy, eps, gx[i])
                        else:
                            dot = ((y1 - y2) * gy).sum()
                            gx[i] += dot / (2 * eps)

    return grads
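A minimal gradient-check sketch using `numerical_grad` (the sigmoid example, shapes, and tolerances below are illustrative assumptions, not part of the original code):

import numpy
from chainer import functions as F
from chainer import gradient_check  # provides numerical_grad

x = numpy.random.uniform(-1, 1, (3, 4)).astype(numpy.float32)
gy = numpy.ones((3, 4), dtype=numpy.float32)

def f():
    # forward computation that reads the (perturbed) array `x` in place
    return F.sigmoid(x).data,

gx_num, = gradient_check.numerical_grad(f, (x,), (gy,))

# analytic gradient of sigmoid: y * (1 - y) * gy
y = F.sigmoid(x).data
numpy.testing.assert_allclose(gx_num, y * (1 - y) * gy, atol=1e-3, rtol=1e-3)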
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = softmax_log(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    # Per-sample class weights: look the weights up with the labels instead
    # of overwriting the integer label array, which truncates float weights
    # and can remap classes whose index equals an earlier weight value.
    class_weights = cupy.asarray(self.class_weights, dtype=x.dtype)
    tw = class_weights[cupy.maximum(t, 0)]

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, T tw, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t] * tw', 'a + b',
        'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, tw, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
def forward_gpu(self, inputs):
    x0, x1 = inputs
    # legacy cuda.reduce signature: (argument list, map expression,
    # reduce expression, identity, kernel name, output dtype)
    ret = cuda.reduce(
        'const float* x0, const float* x1',
        '(x0[i] - x1[i]) * (x0[i] - x1[i])',
        'a+b', '0', 'mse_fwd', numpy.float32)(x0, x1)
    ret /= x0.size
    return ret,
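For comparison, a sketch of the same mean squared error reduction written against the newer ReductionKernel-style `cuda.reduce` signature used by the other snippets in this collection; the function and kernel names here are my own:

import numpy
from chainer import cuda

def mse_forward_gpu(x0, x1):
    inv_size = numpy.float32(1.0 / x0.size)
    return cuda.reduce(
        'T x0, T x1, T inv_size', 'T out',
        '(x0 - x1) * (x0 - x1)',   # map: squared difference per element
        'a + b',                   # reduce: sum
        'out = a * inv_size',      # post-map: divide by the element count
        '0', 'mse_fwd_modern'
    )(x0, x1, inv_size)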
def forward_gpu(self, inputs):
    x, t = inputs
    self.y, = sigmoid.Sigmoid(self.use_cudnn).forward_gpu((x,))
    loss = -cuda.reduce(
        'int* t, float* x',
        'x[i] * (t[i] - (x[i] >= 0)) - log1pf(expf(-fabsf(x[i])))',
        'a+b', '0', 'sigmoid_crossent_fwd', numpy.float32)(t, x)
    return loss / t.shape[0],
def _popcount():
    return cuda.reduce(
        'T x', 'T y',
        '2*__popc(x)-32',  # __popc counts the set bits of the 32-bit word x
        'a+b', 'y = a', '0', 'popcount')
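A CPU reference for what this kernel computes, assuming (my reading of the `__popc`/`-32` arithmetic) that the input is an int32 array of packed bits:

import numpy

def popcount_reference(x):
    """Sum of 2 * popcount(word) - 32 over an int32 array of packed bits."""
    bits = numpy.unpackbits(numpy.ascontiguousarray(x).view(numpy.uint8))
    return 2 * int(bits.sum()) - 32 * x.size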
def forward_gpu(self, inputs):
    x, t = inputs
    self.y, = Softmax(self.use_cudnn).forward_gpu((x,))
    ret = cuda.reduce(
        'int* t, float* y, int n_channel',
        '-log(y[i * n_channel + t[i]])',
        'a+b', '0', 'crossent_fwd', numpy.float32)(t, self.y, self.y.shape[1])
    ret /= t.size
    return ret,
def forward_gpu(self, inputs):
    x, t = inputs
    self.y, = sigmoid.Sigmoid(self.use_cudnn).forward_gpu((x,))
    loss = cuda.reduce(
        'T x, S t, T inv_cnt', 'T out',
        'x * (t - (x >= 0)) - log1p(exp(-fabs(x)))',
        'a + b', 'out = a * inv_cnt', 0,
        'sigmoid_crossent_fwd')(x, t, -1.0 / t.shape[0])
    return loss,
def backward(self, inputs, grad_outputs): # # preprocess # x, gamma = inputs[:2] gy, gl = grad_outputs head_ndim = gamma.ndim + 1 expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim) m = gamma.dtype.type(x.size // gamma.size) axis = (0, ) + tuple(range(head_ndim, x.ndim)) xp = cuda.get_array_module(x) if len(inputs) == 6: assert not chainer.config.train # we do not have to consider Lipschitz constant var = inputs[5] + self.eps gs = gamma * self.std_inv gbeta = gy.sum(axis=axis) ggamma = (gy * self.x_hat).sum(axis=axis) gmean = -gs * gbeta gvar = -0.5 * gamma / var * ggamma gx = gs[expander] * gy return gx, ggamma, gbeta, None, gmean, gvar assert chainer.config.train gbeta = gy.sum(axis=axis) ggamma = cuda.reduce('T gy, T x_hat', 'T out', 'gy * x_hat', 'a + b', 'out = a', '0', 'conv_bn_ggamma')(gy, self.x_hat, axis=axis, keepdims=False) if gl is not None: assert getattr(chainer.config, 'lmt', False) cuda.elementwise( 'T gl, T u, T u_mid, T std_inv', 'T ggamma', ''' ggamma += gl * u * u_mid * std_inv; ''', 'conv_bn_ggamma2')(gl, self.u.reshape(self.std_inv.shape), self.u_mid.reshape(self.std_inv.shape), self.std_inv, ggamma) inv_m = numpy.float32(1) / m if xp is numpy: gx = (gamma * self.std_inv)[expander] * ( gy - (self.x_hat * ggamma[expander] + gbeta[expander]) / m) else: # in LMT, ggamma is changed and this automatically corrects gx gx = cuda.elementwise( 'T gy, T x_hat, T gamma, T std_inv, T ggamma, T gbeta, \ T inv_m', 'T gx', 'gx = (gamma * std_inv) * (gy - (x_hat * ggamma + gbeta) * \ inv_m)', 'conv_bn_bwd')(gy, self.x_hat, gamma[expander], self.std_inv[expander], ggamma[expander], gbeta[expander], inv_m) if gl is not None: return gx, ggamma, gbeta, (gl * self.u_mid.T * self.v).reshape( inputs[3].shape) else: return gx, ggamma, gbeta, None
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label', 'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff,
          self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def normalize(arr, eps):
    """Normalize the input array in place and return its L2 norm.

    From https://github.com/pfnet-research/sngan_projection/blob/master/source/functions/max_sv.py#L5

    :param arr: numpy ndarray or cupy ndarray
    :param eps: epsilon for numerical stability
    :return: norm of the input array
    """
    norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)',
                       0, 'norm_sn')(arr)
    cuda.elementwise('T norm, T eps', 'T x', 'x /= (norm + eps)',
                     'div_sn')(norm, eps, arr)
    return norm
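A rough CPU equivalent of the `norm_sn`/`div_sn` kernel pair above, for reference only:

import numpy

def normalize_cpu(arr, eps):
    # same effect as the kernels above, on a NumPy array
    norm = numpy.sqrt((arr * arr).sum())
    arr /= norm + eps  # in-place division, mirroring the elementwise kernel
    return norm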
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    self.y, = softmax.Softmax(self.use_cudnn).forward((x,))
    n_unit = int(numpy.prod(self.y.shape[2:]))
    if getattr(self, 'normalize', True):
        count = t.shape[0] * n_unit
    else:
        count = t.shape[0]
    # move the class axis to the end (rollaxis needs the array's ndim here,
    # not len(self.y), which is the batch size)
    y = cupy.rollaxis(self.y, 1, self.y.ndim)
    ret = cuda.reduce(
        'S t, raw T y, int32 n_channel, T inv_count', 'T out',
        'log(y[_j * n_channel + t])', 'a + b', 'out = a * inv_count',
        '0', 'crossent_fwd')(t, y, y.shape[-1], -1.0 / count)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    self.y, = softmax.Softmax(self.use_cudnn).forward((x,))
    if getattr(self, 'normalize', True):
        count = x.size // x.shape[1]
    else:
        count = x.shape[0]
    y = cupy.rollaxis(self.y, 1, self.y.ndim)
    ret = cuda.reduce(
        'S t, raw T y, int32 n_channel, T inv_count', 'T out',
        'log(y[_j * n_channel + t])', 'a + b', 'out = a * inv_count',
        '0', 'crossent_fwd')(t, y.reduced_view(), y.shape[-1], -1.0 / count)
    return ret,
def forward_gpu(self, inputs): x, t = inputs max_length = cuda.reduce( 'int* t, int* begins', 'begins[t[i] + 1] - begins[t[i]]', 'max(a,b)', '0', 'binary_hierarchical_softmax_max_length', numpy.int32 )(t, self.begins) max_length = cuda.to_cpu(max_length)[()] length = max_length * x.shape[0] ls = cuda.empty((length,), dtype=numpy.float32) n_in = x.shape[1] wxy = cuda.empty((length,), dtype=numpy.float32) cuda.elementwise( '''float* ls, float* wxy, const float* x, const float* w, const int* ts, const int* paths, const float* codes, const int* begins, int c, int max_length''', ''' int ind = i / max_length; int offset = i - ind * max_length; int t = ts[ind]; int begin = begins[t]; int length = begins[t + 1] - begins[t]; if (offset < length) { int p = begin + offset; int node = paths[p]; x = &x[ind * c]; float wx = 0; for (int j = 0; j < c; ++j) { wx += w[node * c + j] * x[j]; } wxy[i] = wx * codes[p]; ls[i] = log(1 + exp(-wxy[i])); } else { ls[i] = 0; } ''', 'binary_hierarchical_softmax_forward' )(ls, wxy, x, self.W, t, self.paths, self.codes, self.begins, n_in, max_length) self.max_length = max_length self.wxy = wxy return cuda.gpuarray.sum(ls),
def forward_gpu(self, inputs): cupy = cuda.cupy x, t, W = inputs max_length = cuda.reduce( 'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]', 'max(a, b)', 'out = a', '0', 'binary_hierarchical_softmax_max_length')(t, self.begins) max_length = cuda.to_cpu(max_length)[()] length = max_length * x.shape[0] ls = cupy.empty((length,), dtype=numpy.float32) n_in = x.shape[1] wxy = cupy.empty_like(ls) cuda.elementwise( '''raw T x, raw T w, raw int32 ts, raw int32 paths, raw T codes, raw int32 begins, int32 c, int32 max_length''', 'T ls, T wxy', ''' int ind = i / max_length; int offset = i - ind * max_length; int t = ts[ind]; int begin = begins[t]; int length = begins[t + 1] - begins[t]; if (offset < length) { int p = begin + offset; int node = paths[p]; T wx = 0; for (int j = 0; j < c; ++j) { int w_ind[] = {node, j}; int x_ind[] = {ind, j}; wx += w[w_ind] * x[x_ind]; } wxy = wx * codes[p]; ls = log(1 + exp(-wxy)); } else { ls = 0; } ''', 'binary_hierarchical_softmax_forward' )(x, W, t, self.paths, self.codes, self.begins, n_in, max_length, ls, wxy) self.max_length = max_length self.wxy = wxy return ls.sum(),
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    log_y = softmax_log(x, self.use_cudnn)
    self.y = cupy.exp(log_y)
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? 0 : log_y[_j * n_channel + t]', 'a + b',
        'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
def backward(self, inputs, grad_outputs): x, gamma = inputs[:2] gy, gl = grad_outputs head_ndim = gamma.ndim + 1 expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim) m = gamma.dtype.type(x.size // gamma.size) axis = (0, ) + tuple(range(head_ndim, x.ndim)) xp = cuda.get_array_module(x) if len(inputs) == 5: assert not chainer.config.train # we do not have to consider Lipschitz constant var = inputs[4] + self.eps gs = gamma * self.std_inv gbeta = gy.sum(axis=axis) ggamma = (gy * self.x_hat).sum(axis=axis) gmean = -gs * gbeta gvar = -0.5 * gamma / var * ggamma gx = gs[expander] * gy return gx, ggamma, gbeta, gmean, gvar assert configuration.config.train gbeta = gy.sum(axis=axis) ggamma = cuda.reduce('T gy, T x_hat', 'T out', 'gy * x_hat', 'a + b', 'out = a', '0', 'bn_ggamma')(gy, self.x_hat, axis=axis, keepdims=False) if gl is not None: assert getattr(chainer.config, 'lmt', False) ggamma[self.index] += gl.reshape( tuple()) * self.std_inv[self.index] inv_m = numpy.float32(1) / m if xp is numpy: gx = (gamma * self.std_inv)[expander] * ( gy - (self.x_hat * ggamma[expander] + gbeta[expander]) / m) else: gx = cuda.elementwise( 'T gy, T x_hat, T gamma, T std_inv, T ggamma, T gbeta, \ T inv_m', 'T gx', 'gx = (gamma * std_inv) * (gy - (x_hat * ggamma + gbeta) * \ inv_m)', 'bn_bwd')(gy, self.x_hat, gamma[expander], self.std_inv[expander], ggamma[expander], gbeta[expander], inv_m) return gx, ggamma, gbeta
def __init__(self, epsilon=1e-5, stability=1e0):
    """
    Args:
        epsilon: How close the perturbed point is taken in the
            perturbation calculation.
        stability: Added to the denominator to stabilize...
    """
    self.epsilon = epsilon
    self.stability_term = stability
    self.init = True
    self.calc_inner_product_sum = cuda.reduce(
        'T u, T v', 'T sum_uv',
        'u * v',        # map: elementwise product
        'a + b',        # reduce: sum
        'sum_uv = a',   # post-reduction map
        '0', 'calc_inner_product_sum'
    )
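A sketch of how the memoized reduction above can be invoked (standalone version; the array sizes are illustrative):

import cupy
from chainer import cuda

kern = cuda.reduce('T u, T v', 'T sum_uv', 'u * v', 'a + b',
                   'sum_uv = a', '0', 'calc_inner_product_sum')

u = cupy.random.randn(1024).astype(cupy.float32)
v = cupy.random.randn(1024).astype(cupy.float32)
sum_uv = kern(u, v)  # same value as cupy.dot(u, v)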
def forward_gpu(self, inputs):
    x, t = inputs
    self.y, = softmax.Softmax(self.use_cudnn).forward_gpu((x,))
    n_unit = int(numpy.prod(self.y.shape[2:]))
    # the map_expr is equivalent to the pseudo code -log(y[n, c, m]),
    # where n = i / n_unit, c = t[i], and m = i % n_unit
    ret = cuda.reduce(
        'int* t, float* y, int n_channel, int n_unit',
        '-log(y[n_unit * ((i / n_unit) * n_channel + t[i])'
        ' + (i % n_unit)])',
        'a+b', '0', 'crossent_fwd', numpy.float32
    )(t, self.y, self.y.shape[1], n_unit)
    if getattr(self, 'normalize', True):
        n_unit = int(numpy.prod(self.y.shape[2:]))
        count = t.shape[0] * n_unit
    else:
        count = t.shape[0]
    ret /= count
    return ret,
def backward(self, inputs, grad_outputs):
    e1 = array.as_mat(inputs[0])
    e2 = array.as_mat(inputs[1])
    W = inputs[2]

    if not type_check.same_types(*inputs):
        raise ValueError(
            'numpy and cupy must not be used together\n'
            'type(W): {0}, type(e1): {1}, type(e2): {2}'.format(
                type(W), type(e1), type(e2)))

    gy = grad_outputs[0]

    xp = cuda.get_array_module(*inputs)
    if xp is numpy:
        gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
        ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
        ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
    else:
        kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                           'in0 * in1 * in2', 'a + b', 'out = a', 0,
                           'bilinear_product')

        e1_b = e1[:, :, None, None]  # ij
        e2_b = e2[:, None, :, None]  # ik
        gy_b = gy[:, None, None, :]  # il
        W_b = W[None, :, :, :]  # jkl

        gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
        ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
        ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'
    ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW

    if len(inputs) == 6:
        V1, V2, b = inputs[3:]
        gV1 = e1.T.dot(gy)
        gV2 = e2.T.dot(gy)
        gb = gy.sum(0)
        ge1 += gy.dot(V1.T)
        ge2 += gy.dot(V2.T)
        ret += gV1, gV2, gb
    return ret
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    self.y, = softmax.Softmax(self.use_cudnn).forward((x,))
    if getattr(self, 'normalize', True):
        count = float((t != self.ignore_label).sum())
    else:
        count = t.shape[0]
    self.count = count

    if count == 0:
        return cupy.zeros((), dtype=x.dtype),

    y = cupy.rollaxis(self.y, 1, self.y.ndim)
    ret = cuda.reduce(
        'S t, raw T y, int32 n_channel, T inv_count', 'T out',
        't == -1 ? 0 : log(y[_j * n_channel + t])', 'a + b',
        'out = a * inv_count', '0', 'crossent_fwd'
    )(t, y.reduced_view(), y.shape[-1], -1.0 / count)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = cupy.log(x)
    if self.cache_score:
        self.y = x
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? 0 : log_y[_j * n_channel + t]', 'a + b',
        'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    log_y = cupy.log(x + 1e-5)
    self.y = x
    if self.debug:
        ipdb.set_trace()
    if getattr(self, 'normalize', True):
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff, raw T weights',
        'T out',
        't == -1 ? 0 : log_y[_j * n_channel + t] * weights[t]', 'a + b',
        'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff,
      self.weights.reduced_view())
    return ret,
def forward(self, inputs): cs, ls, alpha_hat, beta_hat, kappa_hat, kappa_prev = inputs # cs : one-hot-encoding vectors whose shape is (W, U) # U: maximal length of character sequences in a batch # W: number of characters used in a data # # ls : a vector containing lengths of character sequences in a batch # # alpha, beta, kappa: length K vectors. shape = (batchsize, K) # batchsize, W, U = cs.shape K = alpha_hat.shape[1] if isinstance(cs, numpy.ndarray): self.alpha = numpy.exp(alpha_hat).reshape((batchsize, K, 1)) self.beta = numpy.exp(beta_hat).reshape((batchsize, K, 1)) self.kappa = (kappa_prev + numpy.exp(kappa_hat)).reshape( (batchsize, K, 1)) us = numpy.arange(U).astype(numpy.float32).reshape((1, 1, U)) self.phai_mat = self.alpha * numpy.exp( -self.beta * (self.kappa - us)**2) # --> (batchsize, K, U) ws = numpy.matmul( cs, self.phai_mat.sum(axis=1).reshape(batchsize, U, 1) ) # (batchsize, W, U) x (batchsize, U, 1)--> (batchsize, W, 1) if ls.sum() > 0: #ls is not None: max_phai_idx = numpy.sum(self.phai_mat, axis=1).argmax( axis=1 ) # (batchsize, K, U) --> (batchsize, U) --> (batchsize, 1) eow = numpy.where(max_phai_idx > ls, max_phai_idx, -1) # (batchsize, 1) else: eow = numpy.zeros((batchsize, U)) #None else: self.alpha, self.beta, self.kappa = cuda.elementwise( 'T a_hat, T b_hat, T ka_hat, T ka_prev', 'T a, T b, T ka', ''' a = exp(a_hat); b = exp(b_hat); ka = ka_prev + exp(ka_hat); ''', 'softwindow_fwd1')(alpha_hat, beta_hat, kappa_hat, kappa_prev) us = cuda.cupy.arange(U).astype(cuda.cupy.float32).reshape( (1, 1, U)) self.phai_mat = cuda.elementwise( 'T a, T b, T k, T u', 'T ph', ''' ph = a * exp(- b *(k - u)*(k - u)); ''', 'softwindow_fwd2' )( self.alpha.reshape(batchsize, K, 1), self.beta.reshape(batchsize, K, 1), self.kappa.reshape(batchsize, K, 1), us #cuda.cupy.arange(U).astype(cuda.cupy.float32).reshape((1, 1, U)) ) #phais = self.phai_mat.sum(axis=1).reshape(batchsize, U, 1) phais = cuda.reduce( 'T x', 'T y', 'x', 'a+b', 'y=a', '0', 'softwindow_fwd3', )(self.phai_mat, axis=1) if ls.sum() > 0: # ls is not None: max_phai_idx = cuda.cupy.argmax(phais, axis=1, keepdims=True) phais = phais.reshape(batchsize, U, 1) ws = cuda.cupy.empty((batchsize, W, 1)).astype(cuda.cupy.float32) _batch_matmul_gpu(cs, phais, out=ws) if ls.sum() > 0: # ls is not None: eow = cuda.cupy.where(max_phai_idx > ls, max_phai_idx, -1) else: eow = cuda.cupy.zeros((batchsize, U)) #None return ws.reshape(batchsize, W), self.kappa.reshape( (batchsize, K)), eow
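For reference, the quantity the kernels in this `forward` compute is the soft attention window of Graves' handwriting-synthesis model; a plain NumPy sketch follows, with shapes taken from the comments above and a helper name of my own:

import numpy

def soft_window_reference(cs, alpha, beta, kappa):
    # cs: (batchsize, W, U) one-hot characters; alpha, beta, kappa: (batchsize, K)
    batchsize, W, U = cs.shape
    us = numpy.arange(U, dtype=numpy.float32).reshape(1, 1, U)
    # phi[b, u] = sum_k alpha[b, k] * exp(-beta[b, k] * (kappa[b, k] - u)**2)
    phi = (alpha[:, :, None]
           * numpy.exp(-beta[:, :, None] * (kappa[:, :, None] - us) ** 2)).sum(axis=1)
    # w[b, :] = sum_u phi[b, u] * cs[b, :, u]
    return numpy.einsum('bwu,bu->bw', cs, phi)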
def forward(self, inputs): xp = cuda.get_array_module(*inputs) xnext, eow, e_hat, pi_hat, mux_hat, muy_hat, sgmx_hat, sgmy_hat, rho_hat = inputs batchsize, M = pi_hat.shape x1 = xnext[:, 0].reshape((batchsize, 1)) x2 = xnext[:, 1].reshape((batchsize, 1)) x3 = xnext[:, 2].reshape((batchsize, 1)) if isinstance(mux_hat, numpy.ndarray): self.x = xnext self.eos = 1. / (1. + numpy.exp(e_hat)) #_sigmoid(e_hat) self.pi_ = numpy.exp(pi_hat) / numpy.exp(pi_hat).sum( axis=1).reshape((batchsize, 1)) self.mux = mux_hat self.muy = muy_hat self.sgmx = numpy.exp(sgmx_hat) self.sgmy = numpy.exp(sgmy_hat) self.rho_ = numpy.tanh(rho_hat) if x3.sum() >= 0.0: #xnext is not None: # training & validation #x1 = xnext[:,0].reshape((batchsize, 1)) #x2 = xnext[:,1].reshape((batchsize, 1)) #x3 = xnext[:,2].reshape((batchsize, 1)) dx1 = (x1 - self.mux) / self.sgmx dx2 = (x2 - self.muy) / self.sgmy self.Zs = dx1 * dx1 + dx2 * dx2 - 2. * self.rho_ * dx1 * dx2 Ns = numpy.exp(-0.5 * self.Zs / (1. - self.rho_**2)) / ( 2. * 3.1415927 * self.sgmx * self.sgmy * numpy.sqrt(1. - self.rho_**2) + 1e-10) gamma_hats = self.pi_ * Ns sum_gamma_hats = gamma_hats.sum(axis=1).reshape( (batchsize, 1)) + 1e-10 self.gammas = gamma_hats / sum_gamma_hats loss_t = -numpy.log(sum_gamma_hats) - x3 * numpy.log( self.eos) - (1. - x3) * numpy.log(1. - self.eos) idx = numpy.where(x3 == 2)[0] self.update_or_not = numpy.ones_like(x3) self.update_or_not[idx, 0] = 0.0 loss_t = loss_t * self.update_or_not self.xnext = xnext # Prediction in training xnext_h = numpy.copy(xnext) with chainer.no_backprop_mode(): myux_min_h = mux_hat.min(axis=1).reshape((batchsize, 1)) myux_max_h = mux_hat.max(axis=1).reshape((batchsize, 1)) myuy_min_h = muy_hat.min(axis=1).reshape((batchsize, 1)) myuy_max_h = muy_hat.max(axis=1).reshape((batchsize, 1)) protect_mask = numpy.ones((batchsize, 1)) while protect_mask.sum() > 0: z1_h = numpy.random.uniform(size=batchsize).reshape( (batchsize, 1)) z2_ = numpy.random.uniform(size=batchsize).reshape( (batchsize, 1)) x1_h = myux_min_h + (myux_max_h - myux_min_h) * z1_h x2_h = myuy_min_h + (myuy_max_h - myuy_min_h) * z2_ dx1_h = (x1_h - self.mux) / self.sgmx dx2_h = (x2_h - self.muy) / self.sgmy self.Zs_h = dx1_h * dx1_h + dx2_h * dx2_h - 2. * self.rho_ * dx1_h * dx2_h Ns = numpy.exp( -0.5 * self.Zs_h / (1. - self.rho_**2)) / ( 2. * 3.1415927 * self.sgmx * self.sgmy * numpy.sqrt(1. - self.rho_**2) + 1e-10) gamma_hats_h = self.pi_ * Ns sum_gamma_hats = gamma_hats_h.sum(axis=1) # Pr(x|ys) us_h = numpy.random.uniform(size=batchsize) idx = numpy.where(sum_gamma_hats > us_h)[0] xnext_h[idx, 0] += (x1_h * protect_mask)[idx, 0] xnext_h[idx, 1] += (x2_h * protect_mask)[idx, 0] protect_mask[idx, 0] = 0.0 #xnext[:, 2] = self.eos[:, 0] #xnext[:, 2] = numpy.where(eow < 0, xnext[:, 2], 2.) #xnext_h[:, 2] = self.eos[:, 0] #mask = eow < 0 #if not mask.all(): # xnext_h[:, 2] = 2.0 #xnext[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.) 
xnext_h[:, 2] = xp.where(self.eos[:, 0] > 0.10, 1.0, 0.0) self.xnext = xnext_h else: # prediction xnext = numpy.zeros((batchsize, 3)) myux_min = mux_hat.min(axis=1).reshape((batchsize, 1)) myux_max = mux_hat.max(axis=1).reshape((batchsize, 1)) myuy_min = muy_hat.min(axis=1).reshape((batchsize, 1)) myuy_max = muy_hat.max(axis=1).reshape((batchsize, 1)) protect_mask = numpy.ones((batchsize, 1)) while protect_mask.sum() > 0: z1 = numpy.random.uniform(size=batchsize).reshape( (batchsize, 1)) z2 = numpy.random.uniform(size=batchsize).reshape( (batchsize, 1)) x1 = myux_min + (myux_max - myux_min) * z1 x2 = myuy_min + (myuy_max - myuy_min) * z2 dx1 = (x1 - self.mux) / self.sgmx dx2 = (x2 - self.muy) / self.sgmy self.Zs = dx1 * dx1 + dx2 * dx2 - 2. * self.rho_ * dx1 * dx2 Ns = numpy.exp(-0.5 * self.Zs / (1. - self.rho_**2)) / ( 2. * 3.1415927 * self.sgmx * self.sgmy * numpy.sqrt(1. - self.rho_**2) + 1e-10) gamma_hats = self.pi_ * Ns sum_gamma_hats = gamma_hats.sum(axis=1) # Pr(x|ys) us = numpy.random.uniform(size=batchsize) idx = numpy.where(sum_gamma_hats > us)[0] xnext[idx, 0] += (x1 * protect_mask)[idx, 0] xnext[idx, 1] += (x2 * protect_mask)[idx, 0] protect_mask[idx, 0] = 0.0 #xnext[:, 2] = self.eos[:, 0] #xnext[:, 2] = numpy.where(eow < 0, xnext[:, 2], 2.) xnext[:, 2] = self.eos[:, 0] mask = eow < 0 if not mask.all(): xnext[:, 2] = 2.0 #xnext[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.) self.xnext = xnext #loss_t = None loss_t = xp.zeros((batchsize, 1)).astype(xp.float32) self.Zs = None else: self.mux = mux_hat self.muy = muy_hat self.pi_hat = pi_hat - pi_hat.max(axis=1).reshape(batchsize, 1) sum_exp_pi = cuda.reduce( 'T x', # input params 'T y', # output params 'exp(x)', # map 'a+b', # reduce 'y=a', # post-reduction map '1e-10', # identity value 'mdout_sumexp' # kernel name )(self.pi_hat, axis=1) self.eos = 1. / (1. + cuda.cupy.exp(e_hat)) if x3.sum() >= 0.0: #xnext is not None: # training & validation gamma_hats, self.Zs, self.pi_, self.sgmx, self.sgmy, self.rho_ = cuda.elementwise( 'T x1, T x2, T pi_hat, T mux_, T muy_, T sgmx_hat, T sgmy_hat, T rho_hat, T sum_exp_pi', # input 'T gammas, T Zs, T pi_, T sgmx_, T sgmy_, T rho_', # output ''' pi_ = exp(pi_hat)/sum_exp_pi; sgmx_ = exp(sgmx_hat) + 1e-10; sgmy_ = exp(sgmy_hat) + 1e-10; rho_ = tanh(rho_hat); T rho2 = 1. - rho_*rho_ + 1e-10; T dx1 = (x1 - mux_)/sgmx_; T dx2 = (x2 - muy_)/sgmy_; Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2; T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2)); gammas = pi_ * Ns; ''', 'mdout_fwd1', )(x1, x2, self.pi_hat, mux_hat, muy_hat, sgmx_hat, sgmy_hat, rho_hat, sum_exp_pi.reshape((batchsize, 1))) sum_gamma_hats = gamma_hats.sum(axis=1).reshape( (batchsize, 1)) + 1e-10 self.gammas = gamma_hats / sum_gamma_hats loss_t = cuda.elementwise( 'T sum_, T x3, T eos', 'T loss', ''' loss = -log(sum_) - x3 * log(eos) - (1. 
- x3) * log(1.-eos); ''', 'mdout_fwd2', )(sum_gamma_hats, x3, self.eos) self.update_or_not = xp.where(x3 == 2., 0.0, 1.0).astype(xp.float32) loss_t = loss_t * self.update_or_not self.xnext = xnext # Prediction in training with chainer.no_backprop_mode(): self.sgmx_h = xp.where(self.sgmx < 0.0015, 0.0015, self.sgmx) self.sgmy_h = xp.where(self.sgmy < 0.0015, 0.0015, self.sgmy) muxs = xp.empty((batchsize, M, M)).astype(xp.float32) muys = xp.empty((batchsize, M, M)).astype(xp.float32) _batch_matmul_gpu(mux_hat.reshape((batchsize, M, 1)), xp.ones((batchsize, 1, M)).astype(xp.float32), out=muxs) _batch_matmul_gpu(muy_hat.reshape((batchsize, M, 1)), xp.ones((batchsize, 1, M)).astype(xp.float32), out=muys) gamma_hats_at_components = cuda.elementwise( 'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_', # input 'T gammas', # output ''' T rho2 = 1. - rho_*rho_ + 1e-10; T dx1 = (x1 - mux_)/sgmx_; T dx2 = (x2 - muy_)/sgmy_; T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2; T Ns = exp( -0.5*Zs /rho2)/(2. * 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2)); gammas = pi_ * Ns; ''', 'mdout_fwd5', )(muxs, muys, self.pi_.reshape((batchsize, 1, M)), mux_hat.reshape((batchsize, 1, M)), muy_hat.reshape((batchsize, 1, M)), self.sgmx_h.reshape((batchsize, 1, M)), self.sgmy_h.reshape((batchsize, 1, M)), self.rho_.reshape((batchsize, 1, M))) sum_gamma_hats_at_components = gamma_hats_at_components.sum( axis=2) # (batchsize, M) p_maxs = sum_gamma_hats_at_components.max(axis=1).reshape( (batchsize, 1)) # (batchsize, 1) myux_min_h = mux_hat.min(axis=1).reshape( (batchsize, 1, 1)) - 0.01 myux_max_h = mux_hat.max(axis=1).reshape( (batchsize, 1, 1)) + 0.01 myuy_min_h = muy_hat.min(axis=1).reshape( (batchsize, 1, 1)) - 0.01 myuy_max_h = muy_hat.max(axis=1).reshape( (batchsize, 1, 1)) + 0.01 xnext_h = xp.zeros((batchsize, 3)).astype(xp.float32) protect_mask = xp.ones((batchsize, 1)).astype(xp.float32) n_samples = 32768 * 2 #16384 #8192 #4096 #2048 #1024 #512 x1_h = xp.copy(x1) x2_h = xp.copy(x2) while protect_mask.sum() > 0: # sampling n (=n_samples) samples in parallel at a step z1_h = xp.random.uniform(size=batchsize * n_samples).reshape( (batchsize, n_samples, 1)) z2_h = xp.random.uniform(size=batchsize * n_samples).reshape( (batchsize, n_samples, 1)) x1__h = (myux_min_h + (myux_max_h - myux_min_h) * z1_h).astype( xp.float32) # (batchsize, n_samples, 1) x2__h = (myuy_min_h + (myuy_max_h - myuy_min_h) * z2_h).astype( xp.float32) # (batchsize, n_samples, 1) gamma_hats_h = cuda.elementwise( 'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_', # input 'T gammas', # output ''' T rho2 = 1. - rho_*rho_ + 1e-10; T dx1 = (x1 - mux_)/sgmx_; T dx2 = (x2 - muy_)/sgmy_; T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2; T Ns = exp( -0.5*Zs /rho2)/(2. 
* 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2)); gammas = pi_ * Ns; ''', 'mdout_fwd4', )(x1__h, x2__h, self.pi_.reshape((batchsize, 1, M)), mux_hat.reshape((batchsize, 1, M)), muy_hat.reshape((batchsize, 1, M)), self.sgmx_h.reshape((batchsize, 1, M)), self.sgmy_h.reshape((batchsize, 1, M)), self.rho_.reshape((batchsize, 1, M))) sum_gamma_hats_h = gamma_hats_h.sum(axis=2) us_h = xp.random.uniform( size=batchsize * n_samples).reshape( (batchsize, n_samples)) * p_maxs update_mask__h = xp.where( sum_gamma_hats_h > us_h, 1.0, 0.0).astype(xp.float32).reshape( (batchsize, n_samples)) update_mask_h = update_mask__h.max(axis=1).reshape( (batchsize, 1)) sample_idx_h = update_mask__h.argmax(axis=1).reshape( (batchsize, 1)) for bb in xrange(batchsize): this_midx = sample_idx_h[bb, 0] x1_h[bb:bb + 1, 0] = x1__h[bb:bb + 1, this_midx:this_midx + 1, 0] x2_h[bb:bb + 1, 0] = x2__h[bb:bb + 1, this_midx:this_midx + 1, 0] xnext_h[:, 0] += (x1_h * protect_mask * update_mask_h)[:, 0] xnext_h[:, 1] += (x2_h * protect_mask * update_mask_h)[:, 0] protect_mask -= protect_mask * update_mask_h xnext_h[:, 2:] = xp.where(self.eos[:, 0:1] > 0.10, 1.0, 0.0) #xnext_h[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.) self.xnext = xnext_h #loss_t = xp.zeros((batchsize, 1)).astype(xp.float32) #self.Zs = None else: # prediction (sampling from probability distribution) # pi, sgmx, sgmy, rho <-- pi_hat, sgmx_hat, sgmy_hat, rho_hat self.pi_, self.sgmx, self.sgmy, self.rho_ = cuda.elementwise( 'T pi_hat, T sgmx_hat, T sgmy_hat, T rho_hat, T sum_exp_pi', # input 'T pi_, T sgmx_, T sgmy_, T rho_', # output ''' pi_ = exp(pi_hat)/sum_exp_pi; sgmx_ = exp(sgmx_hat) + 1e-10; sgmy_ = exp(sgmy_hat) + 1e-10; rho_ = tanh(rho_hat); ''', 'mdout_fwd3', )(self.pi_hat, sgmx_hat, sgmy_hat, rho_hat, sum_exp_pi.reshape((batchsize, 1))) # because variances of gaussians are very small, sampling is virtually impossible, we set lower boundary for variances! self.sgmx = xp.where(self.sgmx < 0.0015, 0.0015, self.sgmx) self.sgmy = xp.where(self.sgmy < 0.0015, 0.0015, self.sgmy) #print(self.sgmx.min(), self.sgmy.min()) # get the (aproximated) maximum p value of M-mixture gaussian distributions. # Here I assume that the maximum p value is taken at a center of a gaussian component in the mixture. # First, calculate p-values at each center of gaussian components, # and the maximum of these p-values is considered as the upper boundary of the M-mixture gaussian distributions # prepare x1 and x2 matrices like # [ [mux0, mux0, ...., mux0], # [mux1, mux1, ...., mux1], # ... # [muxn, muxn, ...., muxn]] where n = batchsize muxs = xp.empty((batchsize, M, M)).astype(xp.float32) muys = xp.empty((batchsize, M, M)).astype(xp.float32) _batch_matmul_gpu(mux_hat.reshape((batchsize, M, 1)), xp.ones( (batchsize, 1, M)).astype(xp.float32), out=muxs) _batch_matmul_gpu(muy_hat.reshape((batchsize, M, 1)), xp.ones( (batchsize, 1, M)).astype(xp.float32), out=muys) # N_i((mux[j], muy[j])) for i = 0, 1, ..., M and j = 0, 1, ..., M gamma_hats_at_components = cuda.elementwise( 'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_', # input 'T gammas', # output ''' T rho2 = 1. - rho_*rho_ + 1e-10; T dx1 = (x1 - mux_)/sgmx_; T dx2 = (x2 - muy_)/sgmy_; T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2; T Ns = exp( -0.5*Zs /rho2)/(2. 
* 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2)); gammas = pi_ * Ns; ''', 'mdout_fwd5', )(muxs, muys, self.pi_.reshape((batchsize, 1, M)), mux_hat.reshape((batchsize, 1, M)), muy_hat.reshape((batchsize, 1, M)), self.sgmx.reshape((batchsize, 1, M)), self.sgmy.reshape((batchsize, 1, M)), self.rho_.reshape((batchsize, 1, M))) # p[j] = sum(N_i((mux[j], muy[j])) for i = 0, 1, ..., M sum_gamma_hats_at_components = gamma_hats_at_components.sum( axis=2) # (batchsize, M) # max(p[0], p[1], ..., p[M]) for each batch p_maxs = sum_gamma_hats_at_components.max(axis=1).reshape( (batchsize, 1)) # (batchsize, 1) #print(p_maxs.reshape((1, batchsize))) myux_min = mux_hat.min(axis=1).reshape( (batchsize, 1, 1)) - 0.01 myux_max = mux_hat.max(axis=1).reshape( (batchsize, 1, 1)) + 0.01 myuy_min = muy_hat.min(axis=1).reshape( (batchsize, 1, 1)) - 0.01 myuy_max = muy_hat.max(axis=1).reshape( (batchsize, 1, 1)) + 0.01 xnext = xp.zeros((batchsize, 3)).astype(xp.float32) protect_mask = xp.ones((batchsize, 1)).astype(xp.float32) n_samples = 32768 * 2 #16384 #8192 #4096 #2048 #1024 #512 while protect_mask.sum() > 0: # sampling n (=n_samples) samples in parallel at a step z1 = xp.random.uniform(size=batchsize * n_samples).reshape( (batchsize, n_samples, 1)) z2 = xp.random.uniform(size=batchsize * n_samples).reshape( (batchsize, n_samples, 1)) x1_ = (myux_min + (myux_max - myux_min) * z1).astype( xp.float32) # (batchsize, n_samples, 1) x2_ = (myuy_min + (myuy_max - myuy_min) * z2).astype( xp.float32) # (batchsize, n_samples, 1) gamma_hats = cuda.elementwise( 'T x1, T x2, T pi_, T mux_, T muy_, T sgmx_, T sgmy_, T rho_', # input 'T gammas', # output ''' T rho2 = 1. - rho_*rho_ + 1e-10; T dx1 = (x1 - mux_)/sgmx_; T dx2 = (x2 - muy_)/sgmy_; T Zs = dx1*dx1 + dx2*dx2- 2.*rho_*dx1*dx2; T Ns = exp( -0.5*Zs /rho2)/(2. 
* 3.1415927 * sgmx_ * sgmy_ * sqrt(rho2)); gammas = pi_ * Ns; ''', 'mdout_fwd4', )(x1_, x2_, self.pi_.reshape((batchsize, 1, M)), mux_hat.reshape((batchsize, 1, M)), muy_hat.reshape((batchsize, 1, M)), self.sgmx.reshape((batchsize, 1, M)), self.sgmy.reshape((batchsize, 1, M)), self.rho_.reshape((batchsize, 1, M))) sum_gamma_hats_ = gamma_hats.sum(axis=2) """ sum_gamma_hats = sum_gamma_hats_.max(axis=1).reshape((batchsize, 1)) sample_idx = sum_gamma_hats_.argmax(axis=1).reshape((batchsize, 1)) for bb in xrange(batchsize): this_midx = sample_idx[bb, 0] x1[bb:bb+1, 0] = x1_[bb:bb+1, this_midx:this_midx+1, 0] x2[bb:bb+1, 0] = x2_[bb:bb+1, this_midx:this_midx+1, 0] us = xp.random.uniform(size=batchsize).reshape((batchsize, 1)) * p_maxs update_mask = xp.where(sum_gamma_hats > us, 1.0, 0.0).astype(xp.float32).reshape((batchsize, 1)) xnext[:, 0] += (x1*protect_mask*update_mask)[:, 0] xnext[:, 1] += (x2*protect_mask*update_mask)[:, 0] protect_mask -= protect_mask * update_mask """ """ us_ = xp.random.uniform(size=batchsize* n_samples).reshape((batchsize, n_samples)) * p_maxs update_mask_ = xp.where(sum_gamma_hats_ > us_, 1.0, 0.0).astype(xp.float32).reshape((batchsize, n_samples)) x1 = x1_.reshape((batchsize, n_samples)) * update_mask_ x2 = x2_.reshape((batchsize, n_samples)) * update_mask_ for i in xrange(n_samples): xnext[:, 0] += (x1_[:,i, :]*protect_mask)[:, 0] xnext[:, 1] += (x2_[:,i, :]*protect_mask)[:, 0] #print(protect_mask.shape, update_mask_[:, i:(i+1)].shape) protect_mask -= protect_mask * update_mask_[:, i:(i+1)] """ us_ = xp.random.uniform( size=batchsize * n_samples).reshape( (batchsize, n_samples)) * p_maxs update_mask_ = xp.where(sum_gamma_hats_ > us_, 1.0, 0.0).astype(xp.float32).reshape( (batchsize, n_samples)) update_mask = update_mask_.max(axis=1).reshape( (batchsize, 1)) sample_idx = update_mask_.argmax(axis=1).reshape( (batchsize, 1)) for bb in xrange(batchsize): this_midx = sample_idx[bb, 0] x1[bb:bb + 1, 0] = x1_[bb:bb + 1, this_midx:this_midx + 1, 0] x2[bb:bb + 1, 0] = x2_[bb:bb + 1, this_midx:this_midx + 1, 0] xnext[:, 0] += (x1 * protect_mask * update_mask)[:, 0] xnext[:, 1] += (x2 * protect_mask * update_mask)[:, 0] protect_mask -= protect_mask * update_mask xnext[:, 2:] = self.eos[:, 0:1] xnext[:, 2:] = xp.where(eow < 0, self.eos[:, 0:1], 2.) self.xnext = xnext loss_t = xp.zeros((batchsize, 1)).astype(xp.float32) self.Zs = None return loss_t, self.xnext, self.eos, self.pi_, self.mux, self.muy, self.sgmx, self.sgmy, self.rho_,
def _l2normalize(v, eps=1e-12):
    norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)',
                       0, 'norm_sn')
    div = cuda.elementwise('T x, T norm, T eps', 'T out',
                           'out = x / (norm + eps)', 'div_sn')
    return div(v, norm(v), eps)
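For context, one power-iteration step for spectral normalization built on `_l2normalize`; the names `W`, `u`, `v` and the assumed shapes (a 2-D cupy weight matrix and a persistent 1 x out_dim vector) are mine, not from the snippet:

import cupy

# repeating this update converges u, v toward the leading singular vectors
# of W, so sigma approximates the spectral norm of W
v = _l2normalize(cupy.dot(u, W))              # (1, in_dim)
u = _l2normalize(cupy.dot(v, W.T))            # (1, out_dim)
sigma = float(cupy.dot(cupy.dot(u, W), v.T))  # approx. largest singular value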
def backward(self, inputs, grad_outputs): xp = cuda.get_array_module(*inputs) cs, ls, alpha_hat, beta_hat, kappa_hat, kappa_prev = inputs batchsize, W, U = cs.shape K = alpha_hat.shape[1] gw, gk = grad_outputs[0:2] # (batchsize, W) ga_hat = xp.empty_like(alpha_hat) gb_hat = xp.empty_like(beta_hat) gk_hat = xp.empty_like(kappa_hat) gk_prev = xp.empty_like(kappa_prev) #gk) gc = xp.empty_like(cs) # Consider the case that either gradient is not given if gw is None: gw = 0 if gk is None: gk = 0 if xp is numpy: gwc = numpy.matmul(gw.reshape(batchsize, 1, W), cs) # (batchsize, 1, U) emat = self.phai_mat * gwc # (batchsize, K, U) ga_hat[:] = emat.sum(axis=2) us = numpy.arange(U).astype(numpy.float32).reshape((1, 1, U)) diff = us - self.kappa b = self.beta.reshape((batchsize, K)) gb_hat[:] = -b * (emat * diff**2).sum(axis=2) gk_prev[:] = gk + 2. * b * (emat * diff).sum(axis=2) gk_hat[:] = numpy.exp(kappa_hat) * gk_prev else: gwc = cuda.cupy.empty((batchsize, 1, U)).astype(cuda.cupy.float32) #for i in xrange(batchsize): # gwc[i] = (gw.reshape(batchsize, 1, W))[i].dot(cs[i]) # (1, W).(W, U) --> (1, U) _batch_matmul_gpu(gw.reshape(batchsize, 1, W), cs, out=gwc) #emat = self.phai_mat * gwc emat = cuda.elementwise( 'T phai, T gwc', 'T emat', ''' emat = phai * gwc; ''', 'softwindow_bw1', )(self.phai_mat, gwc) #ga_hat[:] = emat.sum(axis=2) ga_hat[:] = cuda.reduce( 'T x', 'T y', 'x', 'a+b', 'y=a', '0', 'softwindow_bw2', )(emat, axis=2) us = cuda.cupy.arange(U).astype(cuda.cupy.float32).reshape( (1, 1, U)) diff = us - self.kappa.reshape(batchsize, K, 1) b = self.beta.reshape(batchsize, K) tmp2, tmp1 = cuda.elementwise( 'T emat, T diff', 'T ed2, T ed1', ''' ed1 = emat * diff; ed2 = ed1 * diff; ''', 'softwindow_bw3')(emat, diff) sum1 = cuda.reduce( 'T x', 'T y', 'x', 'a+b', 'y=a', '0', 'softwindow_bw4', )(tmp1, axis=2) sum2 = cuda.reduce( 'T x', 'T y', 'x', 'a+b', 'y=a', '0', 'softwindow_bw5', )(tmp2, axis=2) gb_hat[:] = -b * sum2 gk_prev[:] = gk + 2. * b * sum1 #gb_hat[:] = - b * (emat * diff**2).sum(axis=2) #gk_prev[:]= gk + 2. * b * (emat * diff).sum(axis=2) #gk_hat[:] = cuda.cupy.exp(kappa_hat)*gk_prev gk_hat = cuda.elementwise( 'T k_hat, T gk_prev', 'T gk_hat', ''' gk_hat = exp(k_hat)*gk_prev; ''', 'softwindow_bw6')(kappa_hat, gk_prev) return None, None, ga_hat, gb_hat, gk_hat, gk_prev,
def forward_gpu(self, inputs): from chainer.cuda import cupy mean_x, cov_x, t = inputs dim = len(mean_x[0]) self._make_samples(t) self._pos_indexes = self.samples[:,0] self._neg_indexes = self.samples[:,1] self._m_pos = self.M.take(self._pos_indexes, axis=0) self._c_pos = self.C.take(self._pos_indexes, axis=0) self._m_neg = self.M.take(self._neg_indexes, axis=0) self._c_neg = self.C.take(self._neg_indexes, axis=0) if self._covariance_type == CovarianceType.diagonal: kern_trace = cuda.reduce( 'T Ci, T Cj', 'T tr', 'Cj / Ci', 'a + b', 'tr = a', 0, 'trace') tr_p = kern_trace(cov_x, self._c_pos, axis=1) tr_n = kern_trace(cov_x, self._c_neg, axis=1) kern_det = cuda.reduce( 'T Ci, T Cj', 'T det', '__logf(Cj) - __logf(Ci)', 'a + b', 'det = a', 0, 'determinant') det_p = kern_det(cov_x, self._c_pos, axis=1) det_n = kern_det(cov_x, self._c_neg, axis=1) kern_fac = cuda.reduce( 'T Mi, T Mj, T Ci', 'T out', '__powf(abs(Mi - Mj), 2.0) / Ci', 'a + b', 'out = a', 0, 'factor') fac_p = kern_fac(mean_x, self._m_pos, cov_x, axis=1) fac_n = kern_fac(mean_x, self._m_neg, cov_x, axis=1) self._kl_pos, self._kl_neg, loss = cuda.elementwise( 'T f_p, T f_n, T tr_p, T tr_n, T det_p, T det_n, S ip, S in, \ float32 m, int32 dim', 'T kl_p, T kl_n, T L', ''' if (ip == in) { kl_p = 0.0; kl_n = 0.0; L = m; } else { kl_p = -0.5 * (tr_p + f_p - dim - det_p); kl_n = -0.5 * (tr_n + f_n - dim - det_n); L = max(0.0, m - kl_p + kl_n); } ''', 'loss_function_diagonal' )(fac_p, fac_n, tr_p, tr_n, det_p, det_n, self._pos_indexes, self._neg_indexes, self._margin, dim) elif self._covariance_type == CovarianceType.spherical: kern_sq_err_sum = cuda.reduce( 'T in0, T in1', 'T out', '__powf(abs(in0 - in1), 2.0)', 'a + b', 'out = a', 0, 'residual_sum_of_squares') sq_p = kern_sq_err_sum(mean_x, self._m_pos, axis=1)[cupy.newaxis, :].T sq_n = kern_sq_err_sum(mean_x, self._m_neg, axis=1)[cupy.newaxis, :].T self._kl_pos, self._kl_neg, loss = cuda.elementwise( 'T sq_p, T sq_n, T cx, T cp, T cn, S ip, S in, float32 m, int32 dim', 'T kl_p, T kl_n, T L', ''' if (ip == in) { kl_p = 0.0; kl_n = 0.0; L = m; } else { T tr_p = dim * cp / cx; T tr_n = dim * cn / cx; T det_p = dim * __logf(cp / cx); T det_n = dim * __logf(cn / cx); kl_p = -0.5 * (tr_p + sq_p / cx - dim - det_p); kl_n = -0.5 * (tr_n + sq_n / cx - dim - det_n); L = max(0.0, m - kl_p + kl_n); } ''', 'loss_function_spherical' )(sq_p, sq_n, cov_x, self._c_pos, self._c_neg, self._pos_indexes[cupy.newaxis, :].T, self._neg_indexes[cupy.newaxis, :].T, self._margin, dim) sum_loss = cuda.cupy.sum(loss) return sum_loss,
def forward(self, inputs): # # preprocessing # xp = cuda.get_array_module(*inputs) x, gamma, beta = inputs[:3] if configuration.config.train: if self.running_mean is None: self.running_mean = xp.zeros_like(beta, dtype=xp.float32) self.running_var = xp.zeros_like(gamma, dtype=xp.float32) else: self.running_mean = xp.array(self.running_mean) self.running_var = xp.array(self.running_var) elif len(inputs) == 6: self.fixed_mean = inputs[4] self.fixed_var = inputs[5] head_ndim = beta.ndim + 1 expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim) # # start of forward path # if configuration.config.train: axis = (0, ) + tuple(range(head_ndim, x.ndim)) mean = x.mean(axis=axis) var = cuda.reduce('S x, T mean, T alpha', 'T out', '(x - mean) * (x - mean)', 'a + b', 'out = alpha * a', '0', 'conv_bn_var')(x, mean[expander], x.shape[1] / x.size, axis=axis, keepdims=False) else: mean = self.fixed_mean var = self.fixed_var if xp is numpy: raise NotImplementedError() else: self.std_inv = cuda.elementwise( 'T var, T eps', 'T std_inv', ''' std_inv = 1 / sqrt(var + eps); ''', 'conv_bn_std_inv')(var, self.eps) self.x_hat, y = cuda.elementwise( 'T x, T mean, T std_inv, T gamma, T beta', 'T x_hat, T y', ''' x_hat = (x - mean) * std_inv; y = gamma * x_hat + beta; ''', 'conv_bn_fwd')(x, mean[expander], self.std_inv[expander], gamma[expander], beta[expander]) # # end of forward path # # # calculation of lipschitz constant # if chainer.config.train and getattr(chainer.config, 'lmt', False): # # power iteration for a matrix Diag(\gamma_i/\sigma_i)W # # u <= Diag(\gamma_i/\sigma_i) u # v <= W u # u_mid <= W^T v # u <= Diag(\gamma_i/\sigma_i)^T v # W = inputs[3].reshape((inputs[3].shape[0], -1)) tmp_l = gamma * self.std_inv self.u *= tmp_l self.v = self.u.dot(W) # normalize for back propagation normalize(self.v, eps=1e-20) # do not normalize u_mid self.u_mid = self.v.dot(W.T) self.u[:] = self.u_mid * tmp_l # normalize for back propagation nu = normalize(self.u, eps=1e-20) # spectral norm is approximated by the norm of a vector u l = nu.reshape((1, )) else: # not used l = xp.ones((1, ), dtype=xp.float32) # # calculate running average of statistics # if configuration.config.train: self.running_mean *= self.decay self.running_mean += mean * (1 - self.decay) self.running_var *= self.decay self.running_var += var * (1 - self.decay) return y, l