def forward(self, x, gy):
    xp = cuda.get_array_module(x)

    col = im2col_array(x, self.kernel_size, self.stride, self.pad,
                       to_matrix=False)
    gW = xp.tensordot(gy, col, ((0, 2, 3), (0, 4, 5)))
    return gW
def im2col_array(img, kernel_size, stride, pad, to_matrix=True):
    N, C, H, W = img.shape
    KH, KW = pair(kernel_size)
    SH, SW = pair(stride)
    PH, PW = pair(pad)
    OH = get_conv_outsize(H, KH, SH, PH)
    OW = get_conv_outsize(W, KW, SW, PW)

    xp = cuda.get_array_module(img)
    if xp != np:
        col = _im2col_gpu(img, kernel_size, stride, pad)
    else:
        img = np.pad(img,
                     ((0, 0), (0, 0), (PH, PH + SH - 1), (PW, PW + SW - 1)),
                     mode='constant', constant_values=(0,))
        col = np.ndarray((N, C, KH, KW, OH, OW), dtype=img.dtype)

        for j in range(KH):
            j_lim = j + SH * OH
            for i in range(KW):
                i_lim = i + SW * OW
                col[:, :, j, i, :, :] = img[:, :, j:j_lim:SH, i:i_lim:SW]

    if to_matrix:
        col = col.transpose((0, 4, 5, 1, 2, 3)).reshape((N * OH * OW, -1))

    return col
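# Quick shape check for im2col_array above (a sketch; it assumes the pair /
# get_conv_outsize / cuda helpers used there are importable, as in DeZero's
# functions_conv module). A (1, 3, 7, 7) image with a 5x5 kernel, stride 1,
# pad 0 gives OH = OW = (7 - 5) // 1 + 1 = 3, so the matrix form has shape
# (N*OH*OW, C*KH*KW) = (9, 75).
import numpy as np

img = np.random.randn(1, 3, 7, 7).astype(np.float32)
col = im2col_array(img, kernel_size=5, stride=1, pad=0, to_matrix=True)
print(col.shape)  # (9, 75)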
def _init_W(self, x):
    self.in_size = x.shape[1]
    xp = cuda.get_array_module(x)
    I, O = self.in_size, self.out_size
    W_data = xp.random.randn(I, O).astype(np.float32) * np.sqrt(1 / I)
    self.W.data = W_data
def forward(self, x, W, b):
    xp = cuda.get_array_module(x)

    Weight = W
    SH, SW = self.stride
    PH, PW = self.pad
    C, OC, KH, KW = Weight.shape
    N, C, H, W = x.shape
    if self.outsize is None:
        out_h = get_deconv_outsize(H, KH, SH, PH)
        out_w = get_deconv_outsize(W, KW, SW, PW)
    else:
        out_h, out_w = pair(self.outsize)
    img_shape = (N, OC, out_h, out_w)

    gcol = xp.tensordot(Weight, x, (0, 1))
    gcol = xp.rollaxis(gcol, 3)
    y = col2im_array(gcol, img_shape, (KH, KW), self.stride, self.pad,
                     to_matrix=False)
    # b, k, h, w
    if b is not None:
        self.no_bias = True
        y += b.reshape((1, b.size, 1, 1))
    return y
def forward(self, x):
    if self.W.data is None:
        self.in_size = x.shape[1]
        xp = cuda.get_array_module(x)
        self._init_W(xp)

    y = F.linear(x, self.W, self.b)
    return y
def forward(self, x, gamma, beta):
    assert x.ndim == 2 or x.ndim == 4

    x_ndim = x.ndim
    if x_ndim == 4:
        N, C, H, W = x.shape
        # (N, C, H, W) -> (N*H*W, C)
        x = x.transpose(0, 2, 3, 1).reshape(-1, C)

    xp = cuda.get_array_module(x)

    if dezero.Config.train:
        mean = x.mean(axis=0)
        var = x.var(axis=0)
        inv_std = 1 / xp.sqrt(var + self.eps)
        xc = (x - mean) * inv_std

        m = x.size // gamma.size
        s = m - 1. if m - 1. > 1. else 1.
        adjust = m / s  # unbiased estimation
        self.avg_mean *= self.decay
        self.avg_mean += (1 - self.decay) * mean
        self.avg_var *= self.decay
        self.avg_var += (1 - self.decay) * adjust * var
        self.inv_std = inv_std
    else:
        inv_std = 1 / xp.sqrt(self.avg_var + self.eps)
        xc = (x - self.avg_mean) * inv_std
    y = gamma * xc + beta

    if x_ndim == 4:
        # (N*H*W, C) -> (N, C, H, W)
        y = y.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    return y
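# Sanity check (plain NumPy, not the DeZero API) for the `adjust` factor
# above: the running average stores adjust * var = m / (m - 1) * biased_var,
# i.e. the unbiased variance estimate that inference mode then uses.
import numpy as np

x = np.random.randn(8, 3).astype(np.float32)
m = x.size // 3                    # elements averaged per channel
biased = x.var(axis=0)             # divides by m
unbiased = x.var(axis=0, ddof=1)   # divides by m - 1
print(np.allclose(biased * m / (m - 1), unbiased))  # True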
def forward(self, x, t):
    xp = cuda.get_array_module(t.data)
    N = x.shape[0]
    log_z = utils.logsumexp(x, axis=1)
    log_p = x - log_z
    log_p = log_p[xp.arange(N), t.ravel()]
    y = -log_p.sum() / xp.float32(N)
    return y
def forward(self, x):
    if self.W.data is None:
        self.in_channels = x.shape[1]
        xp = cuda.get_array_module(x)
        self._init_W(xp)
    y = F.conv2d_simple(x, self.W, self.b, self.stride, self.pad)
    return y
def dropout(x, dropout_ratio=0.5):
    x = as_variable(x)

    if dezero.Config.train:
        xp = cuda.get_array_module(x)
        mask = xp.random.rand(*x.shape) > dropout_ratio
        scale = xp.array(1.0 - dropout_ratio).astype(x.dtype)
        y = x * mask / scale
        return y
    else:
        return x
def __call__(self, x):
    if self.W.data is None:
        self.in_channels = x.shape[1]
        xp = cuda.get_array_module(x)
        self._init_W(xp)
    y = F.conv2d(x, self.W, self.b, self.stride, self.pad)
    return y
def forward(self, gy):
    xp = cuda.get_array_module(gy)
    gx = xp.zeros(self.in_shape, dtype=gy.dtype)
    # accumulate into duplicate indices: np.add.at on CPU,
    # cupyx-style scatter_add on GPU
    if xp is np:
        np.add.at(gx, self.slices, gy)
    else:
        xp.scatter_add(gx, self.slices, gy)
    return gx
def logsumexp(x, axis=1):
    xp = cuda.get_array_module(x)
    m = x.max(axis=axis, keepdims=True)
    y = x - m
    xp.exp(y, out=y)
    s = y.sum(axis=axis, keepdims=True)
    xp.log(s, out=s)
    m += s
    return m
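# Why logsumexp subtracts the row max first (a plain-NumPy illustration, not
# part of DeZero): exp overflows for large logits, while the shifted form
# stays finite and is equal by algebra:
# log(sum(exp(x))) = m + log(sum(exp(x - m))).
import numpy as np

x = np.array([[1000.0, 1001.0, 1002.0]])
naive = np.log(np.exp(x).sum(axis=1))  # overflows: [inf]
m = x.max(axis=1, keepdims=True)
stable = m + np.log(np.exp(x - m).sum(axis=1, keepdims=True))
print(naive, stable.ravel())  # [inf] [1002.40760596]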
def _init_W(self, x):
    self.in_channels = x.shape[1]
    xp = cuda.get_array_module(x)
    C, OC = self.in_channels, self.out_channels
    KH, KW = pair(self.kernel_size)
    # scale by 1/sqrt(fan_in), where fan_in = C * KH * KW
    W_data = xp.random.randn(OC, C, KH, KW).astype(np.float32) * np.sqrt(
        1 / (C * KH * KW))
    self.W.data = W_data
def forward(self, x):
    # lazily initialize W on the first forward pass, once in_size is known
    if self.W.data is None:
        self.in_size = x.shape[1]
        xp = cuda.get_array_module(x)
        self._init_W(xp)
    y = F.linear(x, self.W, self.b)
    return y
def update_one(self, param):
    xp = cuda.get_array_module(param)
    v_key = id(param)
    if v_key not in self.vs:
        self.vs[v_key] = xp.zeros_like(param.data)

    v = self.vs[v_key]
    v *= self.momentum
    v -= self.lr * param.grad.data
    param.data += v
def forward(self, gy):
    xp = cuda.get_array_module(gy)
    gx = xp.zeros(self.in_shape, dtype=gy.dtype)
    if xp is np:
        np.add.at(gx, self.slices, gy)
    else:
        xp.scatter_add(gx, self.slices, gy)
    return gx
def softmax_cross_entropy_simple(x, t):
    x, t = as_variable(x), as_variable(t)
    N = x.shape[0]

    p = softmax(x)
    p = clip(p, 1e-15, 1.0)  # avoid log(0)
    log_p = log(p)
    xp = cuda.get_array_module(t.data)
    tlog_p = log_p[xp.arange(N), t.data]
    y = -1 * sum(tlog_p) / N
    return y
def dropout(x, dropout_ratio=0.5):
    x = as_variable(x)

    if dezero.Config.train:
        xp = cuda.get_array_module(x)
        mask = xp.random.rand(*x.shape) > dropout_ratio
        scale = 1.0 - dropout_ratio
        y = x * mask / scale
        return y
    else:
        return x
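# Inverted-dropout sanity check in plain NumPy (an illustration, not the
# DeZero function): dividing by (1 - ratio) at train time keeps the expected
# activation equal to the input, so test time needs no extra scaling.
import numpy as np

x = np.ones(100000)
ratio = 0.5
mask = np.random.rand(*x.shape) > ratio
y = x * mask / (1.0 - ratio)
print(y.mean())  # ~1.0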
def __call__(self, x):
    if self.W.data is None:
        self.in_size = x.shape[1]

        xp = cuda.get_array_module(x)
        I, O = self.in_size, self.out_size
        W_data = xp.random.randn(I, O).astype(np.float32) * np.sqrt(1 / I)
        self.W.data = W_data

    y = F.linear(x, self.W, self.b)
    return y
def backward(self, gy):
    x, t = self.inputs
    N, CLS_NUM = x.shape

    gy *= 1 / N
    y = softmax(x)
    # convert to one-hot
    xp = cuda.get_array_module(t.data)
    t_onehot = xp.eye(CLS_NUM, dtype=t.dtype)[t.data]
    y = (y - t_onehot) * gy
    return y
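# Numerical check (plain NumPy, independent of DeZero) of the closed form
# used in backward above: for softmax + cross-entropy averaged over a batch,
# d(loss)/d(logits) = (softmax(x) - onehot(t)) / N.
import numpy as np

def ce_loss(x, t):
    p = np.exp(x - x.max(axis=1, keepdims=True))
    p /= p.sum(axis=1, keepdims=True)
    return -np.log(p[np.arange(len(t)), t]).mean()

x = np.random.randn(2, 4)
t = np.array([1, 3])
eps = 1e-5
num = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    x1, x2 = x.copy(), x.copy()
    x1[idx] += eps
    x2[idx] -= eps
    num[idx] = (ce_loss(x1, t) - ce_loss(x2, t)) / (2 * eps)

p = np.exp(x - x.max(axis=1, keepdims=True))
p /= p.sum(axis=1, keepdims=True)
analytic = (p - np.eye(4)[t]) / len(t)
print(np.allclose(num, analytic, atol=1e-6))  # True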
def _init_params(self, x):
    xp = cuda.get_array_module(x)
    D = x.shape[1]
    if self.avg_mean.data is None:
        self.avg_mean.data = xp.zeros(D, dtype=x.dtype)
    if self.avg_var.data is None:
        self.avg_var.data = xp.ones(D, dtype=x.dtype)
    if self.gamma.data is None:
        self.gamma.data = xp.ones(D, dtype=x.dtype)
    if self.beta.data is None:
        self.beta.data = xp.zeros(D, dtype=x.dtype)
def forward(self, x, W, b):
    xp = cuda.get_array_module(x)

    KH, KW = W.shape[2:]
    col = im2col_array(x, (KH, KW), self.stride, self.pad, to_matrix=False)

    y = xp.tensordot(col, W, ((1, 2, 3), (1, 2, 3)))
    if b is not None:
        y += b
    y = xp.rollaxis(y, 3, 1)
    # y = np.transpose(y, (0, 3, 1, 2))
    return y
def update_one(self, param):
    xp = cuda.get_array_module(param.data)

    h_key = id(param)
    if h_key not in self.hs:
        self.hs[h_key] = xp.zeros_like(param.data)

    lr = self.lr
    eps = self.eps
    grad = param.grad.data
    h = self.hs[h_key]

    h += grad * grad
    param.data -= lr * grad / (xp.sqrt(h) + eps)
def numerical_grad(f, x, *args, **kwargs): """数値微分で勾配を求める Parameters ---------- f : DeZero function DeZeroの関数やレイヤ x : ndarray or dezero.Variable 勾配を求める変数 args : 可変長引数 f(x, y) のように、入力する変数が x 以外にある場合はここで与える kwargs : キーワード引数 f(x, key=y) のように、入力する変数が x 以外にある場合はここで与える Returns ------- grad : ndarray """ eps = 1e-4 x = x.data if isinstance(x, Variable) else x xp = cuda.get_array_module(x) if xp is not np: np_x = cuda.as_numpy(x) else: np_x = x grad = xp.zeros_like(x) it = np.nditer(np_x, flags=['multi_index'], op_flags=['readwrite']) while not it.finished: idx = it.multi_index tmp_val = x[idx].copy() x[idx] = tmp_val + eps y1 = f(x, *args, **kwargs) # f(x+h) if isinstance(y1, Variable): y1 = y1.data y1 = y1.copy() x[idx] = tmp_val - eps y2 = f(x, *args, **kwargs) # f(x-h) if isinstance(y2, Variable): y2 = y2.data y2 = y2.copy() diff = (y1 - y2).sum() grad[idx] = diff / (2 * eps) x[idx] = tmp_val it.iternext() return grad
def update_one(self, param):
    xp = cuda.get_array_module(param.data)

    key = id(param)
    if key not in self.ms:
        self.ms[key] = xp.zeros_like(param.data)
        self.vs[key] = xp.zeros_like(param.data)

    m, v = self.ms[key], self.vs[key]
    beta1, beta2, eps = self.beta1, self.beta2, self.eps
    grad = param.grad.data

    m += (1 - beta1) * (grad - m)
    v += (1 - beta2) * (grad * grad - v)
    param.data -= self.lr * m / (xp.sqrt(v) + eps)
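# Note on the moving-average form above: m += (1 - beta1) * (grad - m) is
# algebraically m = beta1 * m + (1 - beta1) * grad, the standard Adam first
# moment; likewise for v. This variant applies self.lr directly and omits
# the bias-correction factors 1 / (1 - beta1**t) and 1 / (1 - beta2**t)
# from Kingma & Ba (2015), so the earliest steps are biased toward zero.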
def numerical_grad(f, x, *args, **kwargs): """Computes numerical gradient by finite differences. Args: f (callable): A function which gets `Variable`s and returns `Variable`s. x (`ndarray` or `dezero.Variable`): A target `Variable` for computing the gradient. *args: If `f` needs variables except `x`, you can specify with this argument. **kwargs: If `f` needs keyword variables, you can specify with this argument. Returns: `ndarray`: Gradient. """ eps = 1e-4 x = x.data if isinstance(x, Variable) else x xp = cuda.get_array_module(x) if xp is not np: np_x = cuda.as_numpy(x) else: np_x = x grad = xp.zeros_like(x) it = np.nditer(np_x, flags=['multi_index'], op_flags=['readwrite']) while not it.finished: idx = it.multi_index tmp_val = x[idx].copy() x[idx] = tmp_val + eps y1 = f(x, *args, **kwargs) # f(x+h) if isinstance(y1, Variable): y1 = y1.data y1 = y1.copy() x[idx] = tmp_val - eps y2 = f(x, *args, **kwargs) # f(x-h) if isinstance(y2, Variable): y2 = y2.data y2 = y2.copy() diff = (y1 - y2).sum() grad[idx] = diff / (2 * eps) x[idx] = tmp_val it.iternext() return grad
def update_one(self, param):
    xp = cuda.get_array_module(param.data)

    key = id(param)
    if key not in self.msg:
        self.msg[key] = xp.zeros_like(param.data)
        self.msdx[key] = xp.zeros_like(param.data)

    msg, msdx = self.msg[key], self.msdx[key]
    rho = self.rho
    eps = self.eps
    grad = param.grad.data

    msg *= rho
    msg += (1 - rho) * grad * grad
    dx = xp.sqrt((msdx + eps) / (msg + eps)) * grad
    msdx *= rho
    msdx += (1 - rho) * dx * dx
    param.data -= dx
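# For reference, the AdaDelta rule implemented above (Zeiler, 2012):
#   E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
#   dx_t      = sqrt((E[dx^2]_{t-1} + eps) / (E[g^2]_t + eps)) * g_t
#   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
#   w_{t+1}   = w_t - dx_t
# There is no global learning rate: the ratio of the two running RMS values
# sets a per-parameter step size.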
def forward(self, gy):
    xp = cuda.get_array_module(gy)

    N, C, OH, OW = gy.shape
    N, C, H, W = self.input_shape
    KH, KW = pair(self.kernel_size)

    gcol = xp.zeros((N * C * OH * OW * KH * KW), dtype=self.dtype)

    indexes = (self.indexes.ravel()
               + xp.arange(0, self.indexes.size * KH * KW, KH * KW))
    gcol[indexes] = gy.ravel()
    gcol = gcol.reshape(N, C, OH, OW, KH, KW)
    gcol = xp.swapaxes(gcol, 2, 4)
    gcol = xp.swapaxes(gcol, 3, 5)

    gx = col2im_array(gcol, (N, C, H, W), self.kernel_size, self.stride,
                      self.pad, to_matrix=False)
    return gx
def col2im(col, img_shape, kernel_size, stride, pad):
    xp = cuda.get_array_module(col)
    if xp != np:
        img = _col2im_gpu(col, img_shape, kernel_size, stride, pad)
        return img

    n, c, h, w = img_shape
    kh, kw = _pair(kernel_size)
    sh, sw = _pair(stride)
    ph, pw = _pair(pad)
    oh = get_conv_outsize(h, kh, sh, ph)
    ow = get_conv_outsize(w, kw, sw, pw)

    img = np.zeros((n, c, h + 2 * ph + sh - 1, w + 2 * pw + sw - 1),
                   dtype=col.dtype)
    for j in range(kh):
        j_lim = j + sh * oh
        for i in range(kw):
            i_lim = i + sw * ow
            img[:, :, j:j_lim:sh, i:i_lim:sw] += col[:, :, j, i, :, :]
    return img[:, :, ph:h + ph, pw:w + pw]
def col2im_array(col, img_shape, kernel_size, stride, pad, to_matrix=True):
    N, C, H, W = img_shape
    KH, KW = pair(kernel_size)
    SH, SW = pair(stride)
    PH, PW = pair(pad)
    OH = get_conv_outsize(H, KH, SH, PH)
    OW = get_conv_outsize(W, KW, SW, PW)

    if to_matrix:
        col = col.reshape(N, OH, OW, C, KH, KW).transpose(0, 3, 4, 5, 1, 2)

    xp = cuda.get_array_module(col)
    if xp != np:
        img = _col2im_gpu(col, SH, SW, PH, PW, H, W)
        return img
    else:
        img = np.zeros((N, C, H + 2 * PH + SH - 1, W + 2 * PW + SW - 1),
                       dtype=col.dtype)
        for j in range(KH):
            j_lim = j + SH * OH
            for i in range(KW):
                i_lim = i + SW * OW
                img[:, :, j:j_lim:SH, i:i_lim:SW] += col[:, :, j, i, :, :]
        return img[:, :, PH:H + PH, PW:W + PW]
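# Round-trip sketch (assumes the im2col_array and col2im_array defined
# above): col2im_array *sums* overlapping patch contributions, so im2col
# followed by col2im multiplies each pixel by the number of windows covering
# it. With stride equal to the kernel size the windows do not overlap and
# the round trip is exact.
import numpy as np

img = np.ones((1, 1, 4, 4), dtype=np.float32)
col = im2col_array(img, 2, 2, 0, to_matrix=False)
out = col2im_array(col, img.shape, 2, 2, 0, to_matrix=False)
print(np.allclose(out, img))  # True: non-overlapping 2x2 windows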