def forward_cpu(self, inputs):
    if ((self.dy == 1 and self.dx == 1)
            and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        self._use_ideep = True

    self.retain_inputs((0, 1))  # only retain x and W
    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    self._calc_out_size(x, W)

    if self.groups > 1:
        # Grouped convolution implementation
        return self._forward_grouped_convolution(x, W, b)
    elif (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self._forward_ideep(x, W, b)
    else:
        return self._forward_cpu_core(x, W, b)
def forward(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        return self._forward_ideep(inputs)

    y = inputs[0] * self.mask
    return y,
def forward(self, x):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)):
        return self._forward_ideep(x)

    if self.mask is not None:
        y = x[0] * self.mask
    else:
        scale = x[0].dtype.type(1. / (1 - self.dropout_ratio))
        xp = cuda.get_array_module(*x)
        if xp == numpy:
            flag = xp.random.rand(*x[0].shape) >= self.dropout_ratio
            self.mask = scale * flag
            y = x[0] * self.mask
        else:
            rand = xp.random.rand(*x[0].shape, dtype=numpy.float32)
            self.mask, y = cuda.elementwise(
                'T x, R r, T scale, T ratio', 'T mask, T y',
                '''
                mask = (r >= ratio) * scale;
                y = x * mask;
                ''',
                'dropout_fwd',
            )(x[0], rand, scale, self.dropout_ratio)
    return y,
def forward(self, inputs):
    self._config_use_ideep = chainer.config.use_ideep
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    if len(inputs) == 3:
        x, W, b = inputs
    else:
        (x, W), b = inputs, None

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if (isinstance(x, numpy.ndarray) and
            not (x.flags.c_contiguous or x.flags.f_contiguous) and
            1 in x.shape):
        x = numpy.ascontiguousarray(x)

    y = x.dot(W.T).astype(x.dtype, copy=False)
    if b is not None:
        y += b
    self.retain_inputs((0, 1))  # b is not retained
    return y,
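# The forward implementations collected here share the same dispatch idiom:
# take the iDeep (MKL-DNN) branch only when the ``use_ideep`` config is
# 'auto' or stronger and every input is an iDeep-compatible array.  A minimal
# usage sketch of how that switch gets flipped from user code (assumes
# Chainer v4+ with ideep4py installed; the link sizes and data are made up):
import numpy as np
import chainer
import chainer.links as L
from chainer.backends import intel64

model = L.Linear(4, 3)
x = np.random.rand(2, 4).astype(np.float32)
with chainer.using_config('use_ideep', 'auto'):
    if intel64.should_use_ideep('>=auto'):
        model.to_intel64()   # convert W and b to iDeep arrays
    y = model(x)             # LinearFunction.forward takes the iDeep branch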
def forward(self, inputs):
    self.retain_inputs((0, 1))
    c_prev, x = inputs
    a, i, f, o = _extract_gates(x)
    batch = len(x)

    if isinstance(x, chainer.get_cpu_array_types()):
        if intel64.should_use_ideep('>=auto'):
            xp = intel64.ideep.get_array_module(x)
        else:
            xp = numpy
        a = xp.tanh(a)
        i = _sigmoid(i, xp)
        f = _sigmoid(f, xp)
        o = _sigmoid(o, xp)

        c_next = numpy.empty_like(c_prev)
        c_next[:batch] = a * i + f * c_prev[:batch]
        h = o * xp.tanh(c_next[:batch])
    else:
        c_next = cuda.cupy.empty_like(c_prev)
        h = cuda.cupy.empty_like(c_next[:batch])
        cuda.elementwise(
            'T c_prev, T a, T i_, T f, T o', 'T c, T h',
            '''
                COMMON_ROUTINE;
                c = aa * ai + af * c_prev;
                h = ao * tanh(c);
            ''',
            'lstm_fwd', preamble=_preamble)(
                c_prev[:batch], a, i, f, o, c_next[:batch], h)

    c_next[batch:] = c_prev[batch:]
    self.retain_outputs((0,))
    return c_next, h
def forward(self, inputs):
    xp = backend.get_array_module(*inputs)
    c_prev, x, c_next, gc, gh = inputs
    batch = len(x)

    gx = xp.empty_like(x)
    ga, gi, gf, go = _extract_gates(gx)

    # Consider the case that either gradient is not given
    if gc is None:
        gc_update = 0
        gc_rest = 0
    else:
        gc_update = gc[:batch]
        gc_rest = gc[batch:]
    if gh is None:
        gh = 0

    a, i, f, o = _extract_gates(x)
    if xp is numpy:
        if intel64.should_use_ideep('>=auto'):
            xp = intel64.ideep.get_array_module(x)
        tanh_a = xp.tanh(a)
        sig_i = _sigmoid(i, xp)
        sig_f = _sigmoid(f, xp)
        sig_o = _sigmoid(o, xp)

        co = xp.tanh(c_next[:batch])
        gc_prev = numpy.empty_like(c_prev)
        # multiply f later
        gc_prev[:batch] = gh * sig_o * _grad_tanh(co) + gc_update
        gc = gc_prev[:batch]
        ga[:] = gc * sig_i * _grad_tanh(tanh_a)
        gi[:] = gc * tanh_a * _grad_sigmoid(sig_i)
        gf[:] = gc * c_prev[:batch] * _grad_sigmoid(sig_f)
        go[:] = gh * co * _grad_sigmoid(sig_o)
        gc_prev[:batch] *= sig_f  # multiply f here
        gc_prev[batch:] = gc_rest
    else:
        gc_prev = xp.empty_like(c_prev)
        cuda.elementwise(
            'T c_prev, T c, T gc, T gh, T a, T i_, T f, T o',
            'T gc_prev, T ga, T gi, T gf, T go',
            '''
                COMMON_ROUTINE;
                T co = tanh(c);
                T temp = gh * ao * grad_tanh(co) + gc;
                ga = temp * ai * grad_tanh(aa);
                gi = temp * aa * grad_sigmoid(ai);
                gf = temp * c_prev * grad_sigmoid(af);
                go = gh * co * grad_sigmoid(ao);
                gc_prev = temp * af;
            ''',
            'lstm_bwd', preamble=_preamble)(
                c_prev[:batch], c_next[:batch], gc_update, gh, a, i, f, o,
                gc_prev[:batch], ga, gi, gf, go)
        gc_prev[batch:] = gc_rest

    return gc_prev, gx
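# The two LSTM kernels above depend on a few module-level helpers
# (_extract_gates, _sigmoid, _grad_sigmoid, _grad_tanh) that are not part of
# this excerpt.  A reconstructed sketch of what they typically look like in
# Chainer's functions/activation/lstm.py (illustrative, not copied verbatim
# from any particular release):
import numpy


def _extract_gates(x):
    # x packs the four gates a, i, f, o along the channel axis
    r = x.reshape((len(x), x.shape[1] // 4, 4) + x.shape[2:])
    return [r[:, :, i] for i in range(4)]


def _sigmoid(x, xp=numpy):
    half = x.dtype.type(0.5)
    return xp.tanh(x * half) * half + half


def _grad_sigmoid(x):
    # derivative of sigmoid expressed through its output value
    return x * (1 - x)


def _grad_tanh(x):
    # derivative of tanh expressed through its output value
    return 1 - x * x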
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        return self.forward_ideep(inputs)

    gy, = inputs
    gx = gy * (self.b > 0)
    return utils.force_array(gx, dtype=gy.dtype),
def forward(self, xs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(xs, (4,))):
        # iDeep implementation
        return self._forward_ideep(xs)

    # Generic implementation
    xp = cuda.get_array_module(*xs)
    return xp.concatenate(xs, self.axis),
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        return self.forward_ideep(inputs)

    x, = inputs
    y = numpy.maximum(x, 0, dtype=x.dtype)
    self.retain_outputs((0,))
    return utils.force_array(y),
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self.forward_ideep(inputs)

    x, = inputs
    self.retain_outputs((0,))
    return utils.force_array(numpy.maximum(x, 0, dtype=x.dtype)),
def forward_cpu(self, gy):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(gy)):
        return self._forward_ideep(gy)

    h, w = self._in_shape[2:]
    gcol = numpy.tile(gy[0][:, :, None, None],
                      (1, 1, self.kh, self.kw, 1, 1))
    gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    gx /= self.kh * self.kw
    return gx,
def forward_cpu(self, x):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)):
        return self._forward_ideep(x)

    self._in_shape = x[0].shape
    self._in_dtype = x[0].dtype
    col = conv.im2col_cpu(x[0], self.kh, self.kw, self.sy, self.sx,
                          self.ph, self.pw)
    y = col.mean(axis=(2, 3))
    return y,
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        return self.forward_ideep(inputs)

    gy, = inputs
    gy = gy.copy()
    if self.slope >= 0:
        gy[self.y < 0] *= self.slope
    else:
        gy[self.x < 0] *= self.slope
    return gy,
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        return self.forward_ideep(inputs)

    x, = inputs
    y = x.copy()
    y[x < 0] *= self.slope
    if self.slope >= 0:
        self.retain_outputs((0,))
    else:
        self.retain_inputs((0,))
    return y,
def forward_cpu(self, x):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)
            and self.mask is None):
        return self._forward_ideep(x)

    if self.mask is not None:
        y = x[0] * self.mask
    else:
        scale = x[0].dtype.type(1. / (1 - self.dropout_ratio))
        flag = numpy.random.rand(*x[0].shape) >= self.dropout_ratio
        self.mask = scale * flag
        y = x[0] * self.mask
    return y,
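# Quick sanity sketch (plain NumPy, made-up sizes) of the inverted dropout
# used above: surviving activations are scaled by 1 / (1 - ratio) so that the
# expected value of the output matches the input at test time.
import numpy as np

ratio = 0.5
x = np.ones((1000, 100), dtype=np.float32)
scale = x.dtype.type(1. / (1 - ratio))
mask = scale * (np.random.rand(*x.shape) >= ratio)
y = x * mask
# with these sizes the mean is close to 1.0, i.e. E[y] ~= x
print(y.mean())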
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        return self.forward_ideep(inputs)

    x, = inputs
    y = x.copy()
    y[x < 0] *= self.slope
    if self.slope >= 0:
        self.retain_outputs((0, ))
    else:
        self.retain_inputs((0, ))
    return y,
def forward(self, inputs):
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4,))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    self._xp = backend.get_array_module(x)
    indices_or_sections = self.indices_or_sections
    ret = self._xp.split(x, indices_or_sections, self.axis)
    if self._xp == numpy and not _numpy_split_ok:
        ret = _fix_numpy_split(ret, x, indices_or_sections, self.axis)
    self._shapes = [r.shape for r in ret]
    return tuple(ret)
def forward(self, xs):
    self.len = len(xs)
    if len(xs) == 1:
        return xs

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(xs)):
        y = intel64.ideep.multi_add(xs)
    else:
        # The output should be a new array. Add the first 2 arrays
        # and get the result y. Then add the rest arrays to y.
        y = xs[0] + xs[1]
        for x in xs[2:]:
            y += x

    return utils.force_array(y),
def forward(self, inputs):
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4, ))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    self._xp = backend.get_array_module(x)
    indices_or_sections = self.indices_or_sections
    ret = self._xp.split(x, indices_or_sections, self.axis)
    if self._xp == numpy and not _numpy_split_ok:
        ret = _fix_numpy_split(ret, x, indices_or_sections, self.axis)
    self._shapes = [r.shape for r in ret]
    return tuple(ret)
def forward_cpu(self, inputs):
    self.retain_inputs((0, 1))  # retain only x and W
    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        self._use_ideep = True

    if self.groups > 1:
        return self._forward_grouped_convolution(x, W, b)
    else:
        return self._forward_cpu_core(x, W, b)
def forward_cpu(self, x):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x, (4,))):
        self._use_ideep = True
        return self._forward_ideep(x)

    half_n = self.n // 2
    x2 = numpy.square(x[0])
    sum_part = x2.copy()
    for i in six.moves.range(1, half_n + 1):
        sum_part[:, i:] += x2[:, :-i]
        sum_part[:, :-i] += x2[:, i:]
    self.unit_scale = self.k + self.alpha * sum_part
    self.scale = self.unit_scale ** -self.beta
    self.y = x[0] * self.scale
    return self.y,
def forward(self, inputs):
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4,))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    if isinstance(self.indices_or_sections, collections.Iterable):
        cdimx = x.shape[self.axis]
        ind = list(self.indices_or_sections)
        ind.append(cdimx)

    self._xp = cuda.get_array_module(x)
    ret = tuple(self._xp.split(x, self.indices_or_sections, self.axis))
    self._shapes = [r.shape for r in ret]
    return ret
def forward(self, inputs):
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4, ))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    self._xp = cuda.get_array_module(x)
    if self.indices is not None:
        indices_or_sections = self.indices
    else:
        indices_or_sections = self.sections
    ret = tuple(self._xp.split(x, indices_or_sections, self.axis))
    self._shapes = [r.shape for r in ret]
    return ret
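# The generic branch of the split functions above simply defers to
# ``xp.split``.  A tiny NumPy-only illustration of the two forms of
# ``indices_or_sections`` handled here (shapes and values are made up):
import numpy as np

x = np.arange(12).reshape(3, 4)
sections = np.split(x, 2, axis=1)      # 2 equal parts, each of shape (3, 2)
indices = np.split(x, [1, 3], axis=1)  # split at columns 1 and 3:
                                       # shapes (3, 1), (3, 2), (3, 1)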
def forward_cpu(self, x):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x, (4, ))):
        self._use_ideep = True
        return self._forward_ideep(x)

    half_n = self.n // 2
    x2 = numpy.square(x[0])
    sum_part = x2.copy()
    for i in six.moves.range(1, half_n + 1):
        sum_part[:, i:] += x2[:, :-i]
        sum_part[:, :-i] += x2[:, i:]
    self.unit_scale = self.k + self.alpha * sum_part
    self.scale = self.unit_scale**-self.beta
    self.y = x[0] * self.scale
    return self.y,
def forward(self, inputs):
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4, ))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    if isinstance(self.indices_or_sections, collections.Iterable):
        cdimx = x.shape[self.axis]
        ind = list(self.indices_or_sections)
        ind.append(cdimx)

    self._xp = cuda.get_array_module(x)
    ret = tuple(self._xp.split(x, self.indices_or_sections, self.axis))
    self._shapes = [r.shape for r in ret]
    return ret
def forward_cpu(self, inputs):
    if (self.groups == 1 and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self._forward_ideep(inputs)

    self.retain_inputs((0, 1))  # retain only x and W
    self.retain_outputs((0, ))

    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    if self.groups > 1:
        return self._forward_grouped_convolution(x, W, b)
    else:
        return self._forward_cpu_core(x, W, b)
def forward(self, inputs):
    self._config_use_ideep = chainer.config.use_ideep
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    self.retain_inputs((0, 1))
    W, gy = inputs

    if (isinstance(gy, numpy.ndarray) and
            not (gy.flags.c_contiguous or gy.flags.f_contiguous) and
            1 in gy.shape):
        gy = numpy.ascontiguousarray(gy)

    gx = gy.dot(W).astype(gy.dtype, copy=False)
    return gx,
def forward_cpu(self, x):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)):
        return self._forward_ideep(x)

    self._in_shape = x[0].shape
    self._in_dtype = x[0].dtype

    col = conv.im2col_cpu(
        x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        pval=-float('inf'), cover_all=self.cover_all)
    n, c, kh, kw, out_h, out_w = col.shape
    col = col.reshape(n, c, kh * kw, out_h, out_w)

    # We select maximum twice, since the implementation using numpy.choose
    # hits its bug when kh * kw >= 32.
    self.indexes = col.argmax(axis=2)
    y = col.max(axis=2)
    return y,
def forward(self, inputs):
    self._config_use_ideep = chainer.config.use_ideep
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    if len(inputs) == 3:
        x, W, b = inputs
    else:
        (x, W), b = inputs, None

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if (isinstance(x, numpy.ndarray) and
            not (x.flags.c_contiguous or x.flags.f_contiguous) and
            1 in x.shape):
        x = numpy.ascontiguousarray(x)

    # In order to be compatible with the "static graph" feature, it is
    # required that all output arrays of this forward
    # function be allocated explicitly:
    xp = cuda.get_array_module(x)
    y = xp.empty((x.shape[0], W.shape[0])).astype(x.dtype)

    # This is required because all of the "static_*()" functions
    # use the convention that any output arrays are supplied
    # as input arguments to the function. That is because it is
    # not allowed for a "static_*()" function to return anything
    # other than `None`. The reason is to prevent dynamic allocation
    # of output arrays during execution of the static schedule
    # because it would break the model.
    self.static_linear_no_bias(xp, x.dtype == W.dtype, inputs=[x, W],
                               outputs=[y])
    if len(inputs) == 3:
        self.static_add_bias(inputs=[b], outputs=[y])

    self.retain_inputs((0, 1))  # b is not retained
    return y,
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4, ))):
        self._use_ideep = True
        return self.forward_ideep(inputs)

    x, = inputs
    self.retain_inputs((0, ))
    self.retain_outputs((0, ))

    half_n = self.n // 2
    x2 = numpy.square(x)
    sum_part = x2.copy()
    for i in six.moves.range(1, half_n + 1):
        sum_part[:, i:] += x2[:, :-i]
        sum_part[:, :-i] += x2[:, i:]
    self.unit_scale = self.k + self.alpha * sum_part
    self.scale = self.unit_scale**-self.beta
    y = x * self.scale
    return y,
def forward(self, xs):
    self.len = len(xs)
    if len(xs) == 1:
        return xs

    y = None
    if intel64.should_use_ideep('>=auto'):
        bxs = numpy.broadcast_arrays(*xs)
        if intel64.inputs_all_ready(bxs):
            y = intel64.ideep.multi_add(bxs)
    if y is None:
        # The output should be a new array. Add the first 2 arrays
        # and get the result y. Then add the rest arrays to y.
        y = xs[0] + xs[1]
        for x in xs[2:]:
            if x.shape == y.shape:
                y += x
            else:
                y = x + y

    return utils.force_array(y),
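# The shape check in the accumulation loop above matters because NumPy's
# in-place ``+=`` cannot grow its output buffer: adding a (3, 4) array into a
# (4,) array in place raises, while ``y = x + y`` re-binds y to a freshly
# broadcast (3, 4) result.  A tiny illustration with made-up shapes:
import numpy as np

y = np.ones(4)
x = np.ones((3, 4))
y = x + y      # OK: broadcasting allocates a new (3, 4) result
# y += x       # would raise: cannot broadcast a (3, 4) operand into (4,)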
def forward_cpu(self, inputs):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4,))):
        self._use_ideep = True
        return self.forward_ideep(inputs)

    x, = inputs
    self.retain_inputs((0,))
    self.retain_outputs((0,))

    half_n = self.n // 2
    x2 = numpy.square(x)
    sum_part = x2.copy()
    for i in six.moves.range(1, half_n + 1):
        sum_part[:, i:] += x2[:, :-i]
        sum_part[:, :-i] += x2[:, i:]
    self.unit_scale = self.k + self.alpha * sum_part
    self.scale = self.unit_scale ** -self.beta
    y = x * self.scale
    return y,
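# Reference sketch (plain NumPy, made-up shapes and hyper-parameters): the
# shifted-add loop in the local response normalization above computes, for
# every channel c, the sum of x**2 over the window of ``n`` channels centred
# on c, then applies  y = x * (k + alpha * sum) ** -beta.
import numpy as np

n, k, alpha, beta = 5, 2.0, 1e-4, 0.75
x = np.random.rand(2, 8, 4, 4).astype(np.float32)
half_n = n // 2
y_ref = np.empty_like(x)
for c in range(x.shape[1]):
    lo, hi = max(0, c - half_n), min(x.shape[1], c + half_n + 1)
    s = np.square(x[:, lo:hi]).sum(axis=1)
    y_ref[:, c] = x[:, c] * (k + alpha * s) ** -beta
# y_ref agrees with the vectorised forward_cpu result up to float error.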
def forward(self, inputs):
    self._config_use_ideep = chainer.config.use_ideep
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    if len(inputs) == 3:
        x, W, b = inputs
    else:
        (x, W), b = inputs, None

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if (isinstance(x, numpy.ndarray) and
            not (x.flags.c_contiguous or x.flags.f_contiguous) and
            1 in x.shape):
        x = numpy.ascontiguousarray(x)

    # In order to be compatible with the "static graph" feature, it is
    # required that all output arrays of this forward
    # function be allocated explicitly:
    xp = cuda.get_array_module(x)
    y = xp.empty((x.shape[0], W.shape[0]), dtype=x.dtype)

    # This is required because all of the "static_*()" functions
    # use the convention that any output arrays are supplied
    # as input arguments to the function. That is because it is
    # not allowed for a "static_*()" function to return anything
    # other than `None`. The reason is to prevent dynamic allocation
    # of output arrays during execution of the static schedule
    # because it would break the model.
    self.static_linear_no_bias(xp, x.dtype == W.dtype, inputs=[x, W],
                               outputs=[y])
    if len(inputs) == 3:
        self.static_add_bias(inputs=[b], outputs=[y])

    self.retain_inputs((0, 1))  # b is not retained
    return y,
def forward_cpu(self, inputs):
    self.retain_inputs((0, 1))  # only retain x and W
    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    self._calc_out_size(x, W)

    if self.groups > 1:
        # Grouped convolution implementation
        return self._forward_grouped_convolution(x, W, b)
    elif (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self._forward_ideep(x, W, b)
    else:
        return self._forward_cpu_core(x, W, b)
def forward_cpu(self, inputs):
    if self.cudnn_fast:
        raise RuntimeError(
            '\'cudnn_fast\' can\'t be used in the CPU backend')
    self._check_input_layouts_all_standard()

    self.retain_inputs((0, 1))  # retain only x and W
    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        self._use_ideep = True

    if self.groups > 1:
        return self._forward_grouped_convolution(x, W, b)
    else:
        return self._forward_cpu_core(x, W, b)
def forward_cpu(self, gy):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(gy)):
        return self._forward_ideep(gy)

    n, c, out_h, out_w = gy[0].shape
    h, w = self._in_shape[2:]
    kh, kw = self.kh, self.kw

    gcol = numpy.zeros(
        (n * c * out_h * out_w * kh * kw), dtype=self._in_dtype)

    indexes = self.indexes.flatten()
    indexes += numpy.arange(0, indexes.size * kh * kw, kh * kw)

    gcol[indexes] = gy[0].ravel()
    gcol = gcol.reshape(n, c, out_h, out_w, kh, kw)
    gcol = numpy.swapaxes(gcol, 2, 4)
    gcol = numpy.swapaxes(gcol, 3, 5)

    gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    return gx,
def forward_cpu(self, gy):
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(gy)):
        return self._forward_ideep(gy)

    n, c, out_h, out_w = gy[0].shape
    h, w = self._in_shape[2:]
    kh, kw = self.kh, self.kw

    gcol = numpy.zeros((n * c * out_h * out_w * kh * kw),
                       dtype=self._in_dtype)

    indexes = self.indexes.ravel() + numpy.arange(
        0, self.indexes.size * kh * kw, kh * kw)
    gcol[indexes] = gy[0].ravel()
    gcol = gcol.reshape(n, c, out_h, out_w, kh, kw)
    gcol = numpy.swapaxes(gcol, 2, 4)
    gcol = numpy.swapaxes(gcol, 3, 5)

    gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    return gx,
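# The max-pooling backward passes above scatter each output gradient back to
# the window position recorded by ``indexes`` (the argmax over the kh*kw axis
# saved during the forward pass).  A tiny NumPy illustration of the
# flat-index trick, with made-up numbers: every output element owns a
# contiguous block of kh*kw slots in the flat ``gcol`` buffer, and adding
# arange(0, size*kh*kw, kh*kw) turns the per-window argmax into a flat index.
import numpy as np

kh = kw = 2
indexes = np.array([3, 0, 2])      # argmax inside each 2x2 window
gy = np.array([1.0, 2.0, 3.0])     # incoming gradients, one per window
gcol = np.zeros(indexes.size * kh * kw)
flat = indexes + np.arange(0, indexes.size * kh * kw, kh * kw)
gcol[flat] = gy                    # -> [0,0,0,1, 2,0,0,0, 0,0,3,0]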
def forward(self, inputs):
    self.retain_inputs((0, 1))
    c_prev, x = inputs
    a, i, f, o = _extract_gates(x)
    batch = len(x)

    if isinstance(x, chainer.get_cpu_array_types()):
        if intel64.should_use_ideep('>=auto'):
            xp = intel64.ideep.get_array_module(x)
        else:
            xp = numpy
        a = xp.tanh(a)
        i = _sigmoid(i, xp)
        f = _sigmoid(f, xp)
        o = _sigmoid(o, xp)

        c_next = numpy.empty_like(c_prev)
        c_next[:batch] = a * i + f * c_prev[:batch]
        h = o * xp.tanh(c_next[:batch])
    else:
        c_next = cuda.cupy.empty_like(c_prev)
        h = cuda.cupy.empty_like(c_next[:batch])
        cuda.elementwise(
            'T c_prev, T a, T i_, T f, T o', 'T c, T h',
            '''
                COMMON_ROUTINE;
                c = aa * ai + af * c_prev;
                h = ao * tanh(c);
            ''',
            'lstm_fwd', preamble=_preamble)(
                c_prev[:batch], a, i, f, o, c_next[:batch], h)

    c_next[batch:] = c_prev[batch:]
    self.retain_outputs((0, ))
    return c_next, h
def can_use_ideep(self):
    return self.ideep_ok and intel64.should_use_ideep('>=auto')
def forward(self, inputs):
    xp = backend.get_array_module(*inputs)
    c_prev1, c_prev2, x1, x2, c_next, gc, gh = inputs
    gx1 = xp.empty_like(x1)
    gx2 = xp.empty_like(x2)
    ga1, gi1, gf1, go1 = _extract_gates(gx1)
    ga2, gi2, gf2, go2 = _extract_gates(gx2)

    if gc is None:
        gc = 0
    if gh is None:
        gh = 0

    a1, i1, f1, o1 = _extract_gates(x1)
    a2, i2, f2, o2 = _extract_gates(x2)
    if xp is numpy:
        if intel64.should_use_ideep('>=auto'):
            xp = intel64.ideep.get_array_module(x1)
        tanh_a1 = xp.tanh(a1)
        sig_i1 = _sigmoid(i1, xp)
        sig_f1 = _sigmoid(f1, xp)

        tanh_a2 = xp.tanh(a2)
        sig_i2 = _sigmoid(i2, xp)
        sig_f2 = _sigmoid(f2, xp)

        sig_o = _sigmoid(o1 + o2, xp)
        co = xp.tanh(c_next)

        # multiply f later
        gc_prev = gh * sig_o * _grad_tanh(co) + gc

        ga1[:] = gc_prev * sig_i1 * _grad_tanh(tanh_a1)
        gi1[:] = gc_prev * tanh_a1 * _grad_sigmoid(sig_i1)
        gf1[:] = gc_prev * c_prev1 * _grad_sigmoid(sig_f1)
        go1[:] = gh * co * _grad_sigmoid(sig_o)

        ga2[:] = gc_prev * sig_i2 * _grad_tanh(tanh_a2)
        gi2[:] = gc_prev * tanh_a2 * _grad_sigmoid(sig_i2)
        gf2[:] = gc_prev * c_prev2 * _grad_sigmoid(sig_f2)
        go2[:] = gh * co * _grad_sigmoid(sig_o)

        # multiply f here
        gc_prev1 = gc_prev * sig_f1
        gc_prev2 = gc_prev * sig_f2
    else:
        a1, i1, f1, o1 = _extract_gates(x1)
        a2, i2, f2, o2 = _extract_gates(x2)
        gc_prev1 = xp.empty_like(c_prev1)
        gc_prev2 = xp.empty_like(c_prev2)
        cuda.elementwise(
            '''T c_prev1, T a1, T i1, T f1, T o1,
               T c_prev2, T a2, T i2, T f2, T o2,
               T c, T gc, T gh''',
            '''T gc_prev1, T ga1, T gi1, T gf1, T go1,
               T gc_prev2, T ga2, T gi2, T gf2, T go2''',
            '''
                COMMON_ROUTINE;
                T co = tanh(c);
                T temp = gh * ao * grad_tanh(co) + gc;
                ga1 = temp * ai1 * grad_tanh(aa1);
                gi1 = temp * aa1 * grad_sigmoid(ai1);
                gf1 = temp * c_prev1 * grad_sigmoid(af1);
                go1 = gh * co * grad_sigmoid(ao);
                gc_prev1 = temp * af1;
                ga2 = temp * ai2 * grad_tanh(aa2);
                gi2 = temp * aa2 * grad_sigmoid(ai2);
                gf2 = temp * c_prev2 * grad_sigmoid(af2);
                go2 = gh * co * grad_sigmoid(ao);
                gc_prev2 = temp * af2;
            ''',
            'lstm_bwd', preamble=_preamble)(
                c_prev1, a1, i1, f1, o1,
                c_prev2, a2, i2, f2, o2,
                c_next, gc, gh,
                gc_prev1, ga1, gi1, gf1, go1,
                gc_prev2, ga2, gi2, gf2, go2)

    return gc_prev1, gc_prev2, gx1, gx2