def forward_cpu(self, inputs):
    """Compute the forward pass on CPU.

    ``inputs`` is ``(x, W)`` or ``(x, W, b)``; only ``x`` and ``W`` are
    retained.  The call is dispatched to the grouped implementation when
    ``self.groups > 1``, otherwise to iDeep when it is enabled and the
    inputs are ready for it, otherwise to the plain CPU core.
    """
    dy_dx_are_one = self.dy == 1 and self.dx == 1
    if (dy_dx_are_one
            and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        self._use_ideep = True

    # The optional bias is not retained.
    self.retain_inputs((0, 1))
    if len(inputs) != 2:
        x, W, b = inputs
    else:
        x, W = inputs
        b = None

    self._calc_out_size(x, W)

    if self.groups > 1:
        # Grouped convolution implementation
        return self._forward_grouped_convolution(x, W, b)

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self._forward_ideep(x, W, b)

    return self._forward_cpu_core(x, W, b)
def forward_cpu(self, inputs):
    """Compute the forward pass on CPU, honouring the input layouts.

    ``inputs`` is ``(x, W)`` or ``(x, W, b)``; only ``x`` and ``W`` are
    retained.  The shapes of ``x`` and ``W`` are passed through
    ``memory_layouts._transpose_shape`` with their entries of
    ``self.input_layouts`` before the output size is calculated.
    """
    dy_dx_are_one = self.dy == 1 and self.dx == 1
    if (dy_dx_are_one
            and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        self._use_ideep = True

    # The optional bias is not retained.
    self.retain_inputs((0, 1))
    if len(inputs) != 2:
        x, W, b = inputs
        x_layout, w_layout, _ = self.input_layouts
    else:
        x, W = inputs
        b = None
        x_layout, w_layout = self.input_layouts

    self._calc_out_size(
        memory_layouts._transpose_shape(x.shape, x_layout, None),
        memory_layouts._transpose_shape(W.shape, w_layout, None))

    if self.groups > 1:
        # Grouped convolution implementation
        return self._forward_grouped_convolution(x, W, b)

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self._forward_ideep(x, W, b)

    return self._forward_cpu_core(x, W, b)
def forward_cpu(self, x):
    """Max-pool ``x[0]`` on CPU and remember the argmax of each window.

    Stores the input shape and dtype on ``self`` and the per-window argmax
    in ``self.indexes``.  Returns a 1-tuple holding the pooled array.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(x))
    if ideep_ready:
        return self._forward_ideep(x)

    src = x[0]
    self._in_shape = src.shape
    self._in_dtype = src.dtype

    windows = conv.im2col_cpu(
        src, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        pval=-float('inf'), cover_all=self.cover_all)
    n, c, kh, kw, out_h, out_w = windows.shape
    # Merge the two kernel axes so a single reduction covers each window.
    windows = windows.reshape(n, c, kh * kw, out_h, out_w)

    # The maximum is selected twice (argmax, then max), since the
    # implementation using numpy.choose hits its bug when kh * kw >= 32.
    self.indexes = windows.argmax(axis=2)
    pooled = windows.max(axis=2)
    return (pooled,)
def forward(self, x):
    """Apply dropout to ``x[0]`` and return the result as a 1-tuple.

    iDeep is used only when it is enabled, the inputs are ready for it and
    no mask has been set.  If ``self.mask`` is already set it is applied
    as is; otherwise a new scaled mask is drawn and stored in
    ``self.mask``.
    """
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)
            and self.mask is None):
        return self._forward_ideep(x)

    data = x[0]
    if self.mask is not None:
        return (data * self.mask,)

    scale = data.dtype.type(1. / (1 - self.dropout_ratio))
    xp = cuda.get_array_module(*x)
    if xp == numpy:
        keep = xp.random.rand(*data.shape) >= self.dropout_ratio
        self.mask = scale * keep
        return (data * self.mask,)

    noise = xp.random.rand(*data.shape, dtype=numpy.float32)
    kernel = cuda.elementwise(
        'T x, R r, T scale, T ratio', 'T mask, T y',
        ''' mask = (r >= ratio) * scale; y = x * mask; ''',
        'dropout_fwd',
    )
    self.mask, y = kernel(data, noise, scale, self.dropout_ratio)
    return (y,)
def forward_cpu(self, x):
    """Max-pool ``x[0]`` over N dimensions on CPU.

    The 2-dimensional case is handed to iDeep when it is enabled and the
    inputs are ready.  Otherwise the pooled array is computed with NumPy,
    and the input shape, input dtype and per-window argmax are stored on
    ``self``.  Returns a 1-tuple holding the pooled array.
    """
    if (self.ndim == 2
            and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)):
        return self._forward_2d_ideep(x)

    src = x[0]
    in_shape = src.shape
    in_dtype = src.dtype

    col = conv_nd.im2col_nd_cpu(
        src, self.ksize, self.stride, self.pad,
        pval=-float('inf'), cover_all=self.cover_all)

    n, c = col.shape[:2]
    # After (n, c) the axes are split evenly: kernel axes, then output axes.
    mid = (len(col.shape) - 2) // 2 + 2
    kernel_dims = col.shape[2:mid]
    out_dims = col.shape[mid:]

    # (n, c, k_1 * k_2 * ... * k_N, out_1, out_2, ..., out_N)
    flat_kernel = functools.reduce(mul, kernel_dims)
    col = col.reshape((n, c) + (flat_kernel,) + out_dims)

    # The maximum is selected twice (max, then argmax), since the
    # implementation using numpy.choose hits its bug when kh * kw >= 32.
    y = col.max(axis=2)

    self._in_shape = in_shape
    self._in_dtype = in_dtype
    self.indexes = col.argmax(axis=2)
    return (y,)
def forward_cpu(self, gy):
    """Scatter ``gy[0]`` back to the argmax positions of the forward pass.

    Reads the pooling configuration, input shape/dtype and ``indexes`` from
    ``self.func``.  The 2-dimensional case is handed to iDeep when it is
    enabled and the inputs are ready.  Returns a 1-tuple holding ``gx``.
    """
    fwd = self.func
    if (fwd.ndim == 2
            and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(gy)):
        return self._forward_2d_ideep(gy)

    ndim = fwd.ndim
    ksize = fwd.ksize
    stride = fwd.stride
    pad = fwd.pad
    in_shape = fwd._in_shape
    in_dtype = fwd._in_dtype
    argmax = fwd.indexes

    grad = gy[0]
    n, c = grad.shape[:2]
    outs = grad.shape[2:]
    dims = in_shape[2:]
    prod_outs = functools.reduce(mul, outs)
    prod_ksize = functools.reduce(mul, ksize)

    gcol = numpy.zeros(n * c * prod_outs * prod_ksize, dtype=in_dtype)

    # Each output element owns a run of ``prod_ksize`` slots in ``gcol``;
    # the stored argmax picks the slot inside that run.
    offsets = numpy.arange(0, argmax.size * prod_ksize, prod_ksize)
    flat_positions = argmax.flatten() + offsets
    gcol[flat_positions] = grad.ravel()

    gcol = gcol.reshape((n, c) + outs + ksize)
    # Exchange every output axis with its matching kernel axis.
    for i in six.moves.range(ndim):
        gcol = numpy.swapaxes(gcol, 2 + i, ndim + 2 + i)

    gx = conv_nd.col2im_nd_cpu(gcol, stride, pad, dims)
    return (gx,)
def forward(self, inputs):
    """Multiply ``inputs[0]`` by ``self.mask`` and return it as a 1-tuple.

    iDeep is used instead when it is enabled and the inputs are ready.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self._forward_ideep(inputs)

    masked = inputs[0] * self.mask
    return (masked,)
def forward(self, x):
    """Apply dropout to ``x[0]`` and return the result as a 1-tuple.

    If ``self.mask`` is already set it is applied as is; otherwise a new
    scaled mask is drawn, stored in ``self.mask`` and applied.
    """
    # The iDeep path is taken only when no mask has been set, so that an
    # existing mask is always applied by the generic code below.
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)
            and self.mask is None):
        return self._forward_ideep(x)

    if self.mask is not None:
        y = x[0] * self.mask
    else:
        scale = x[0].dtype.type(1. / (1 - self.dropout_ratio))
        xp = cuda.get_array_module(*x)
        if xp == numpy:
            flag = xp.random.rand(*x[0].shape) >= self.dropout_ratio
            self.mask = scale * flag
            y = x[0] * self.mask
        else:
            rand = xp.random.rand(*x[0].shape, dtype=numpy.float32)
            self.mask, y = cuda.elementwise(
                'T x, R r, T scale, T ratio', 'T mask, T y',
                ''' mask = (r >= ratio) * scale; y = x * mask; ''',
                'dropout_fwd',
            )(x[0], rand, scale, self.dropout_ratio)
    return y,
def init_state(self, param):
    """Create zero-filled optimizer state arrays shaped like ``param.data``.

    ``'m'`` and ``'v'`` are always created; ``'vhat'`` is added when
    ``self.hyperparam.amsgrad`` is set.  When the parameter data is an
    iDeep array and both ``'m'`` and ``'v'`` are ready for iDeep, those two
    are converted to iDeep weight arrays.
    """
    data = param.data
    xp = backend.get_array_module(data)
    with cuda.get_device_from_array(data):
        for key in ('m', 'v'):
            self.state[key] = xp.zeros_like(data)
        if self.hyperparam.amsgrad:
            self.state['vhat'] = xp.zeros_like(data)

    # For iDeep
    if (isinstance(data, intel64.mdarray)
            and intel64.inputs_all_ready((self.state['m'],))
            and intel64.inputs_all_ready((self.state['v'],))):
        for key in ('m', 'v'):
            self.state[key] = intel64.ideep.array(
                self.state[key], itype=intel64.ideep.wgt_array)
def init_state(self, param):
    """Create zero-filled optimizer state arrays shaped like ``param.data``.

    ``'m'`` and ``'v'`` are always created; ``'vhat'`` is added when
    ``self.hyperparam.amsgrad`` is set.  When the parameter data is an
    iDeep array and both ``'m'`` and ``'v'`` are ready for iDeep, those two
    are converted to iDeep weight arrays.
    """
    data = param.data
    xp = cuda.get_array_module(data)
    with cuda.get_device_from_array(data):
        for key in ('m', 'v'):
            self.state[key] = xp.zeros_like(data)
        if self.hyperparam.amsgrad:
            self.state['vhat'] = xp.zeros_like(data)

    # For iDeep
    if (isinstance(data, intel64.mdarray)
            and intel64.inputs_all_ready((self.state['m'],))
            and intel64.inputs_all_ready((self.state['v'],))):
        for key in ('m', 'v'):
            self.state[key] = intel64.ideep.array(
                self.state[key], itype=intel64.ideep.wgt_array)
def forward(self, inputs):
    """Compute ``x.dot(W.T)`` plus the optional bias.

    ``inputs`` is ``(x, W)`` or ``(x, W, b)``.  The current
    ``chainer.config.use_ideep`` value is recorded on ``self`` first.
    Only ``x`` and ``W`` are retained.  Returns a 1-tuple holding ``y``.
    """
    self._config_use_ideep = chainer.config.use_ideep

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    if len(inputs) != 3:
        x, W = inputs
        b = None
    else:
        x, W, b = inputs

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if (isinstance(x, numpy.ndarray)
            and not x.flags.c_contiguous
            and not x.flags.f_contiguous
            and 1 in x.shape):
        x = numpy.ascontiguousarray(x)

    product = x.dot(W.T)
    y = product.astype(x.dtype, copy=False)
    if b is not None:
        y += b

    self.retain_inputs((0, 1))  # b is not retained
    return (y,)
def forward_cpu(self, inputs):
    """Pass ``gy`` through wherever ``self.b`` is positive.

    iDeep is used instead when it is enabled and the inputs are ready.
    Returns a 1-tuple whose array has the dtype of ``gy``.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (gy,) = inputs
    masked = gy * (self.b > 0)
    return (utils.force_array(masked, dtype=gy.dtype),)
def forward_cpu(self, inputs):
    """Compute the element-wise maximum of the input and zero.

    iDeep is used instead when it is enabled and the inputs are ready.
    The output is retained.  Returns a 1-tuple.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (x,) = inputs
    rectified = numpy.maximum(x, 0, dtype=x.dtype)
    self.retain_outputs((0,))
    return (utils.force_array(rectified),)
def init_state(self, param):
    """Create the zero-filled ``'v'`` state array shaped like ``param.data``.

    When the new array is ready for iDeep it is converted to an iDeep
    weight array.
    """
    data = param.data
    xp = cuda.get_array_module(data)
    with cuda.get_device_from_array(data):
        self.state['v'] = xp.zeros_like(data)

    # For iDeep
    if not intel64.inputs_all_ready((self.state['v'],)):
        return
    self.state['v'] = intel64.ideep.array(
        self.state['v'], itype=intel64.ideep.wgt_array)
def forward_cpu(self, inputs):
    """Scale a copy of ``gy`` by ``self.slope`` where ``self.cond <= 0``.

    iDeep is used instead when it is enabled and the inputs are ready.
    Returns a 1-tuple; the incoming array itself is left untouched.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (gy,) = inputs
    gx = gy.copy()
    gx[self.cond <= 0] *= self.slope
    return (gx,)
def forward(self, xs):
    """Concatenate ``xs`` along ``self.axis`` and return a 1-tuple.

    iDeep is used when it is enabled and ``intel64.inputs_all_ready``
    accepts ``xs`` with ``(4,)`` as its second argument.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(xs, (4,)))
    if ideep_ready:
        # iDeep implementation
        return self._forward_ideep(xs)

    # Generic implementation
    xp = backend.get_array_module(*xs)
    joined = xp.concatenate(xs, self.axis)
    return (joined,)
def forward(self, xs):
    """Concatenate ``xs`` along ``self.axis`` and return a 1-tuple.

    iDeep is used when it is enabled and ``intel64.inputs_all_ready``
    accepts ``xs`` with ``(4,)`` as its second argument.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(xs, (4,)))
    if ideep_ready:
        # iDeep implementation
        return self._forward_ideep(xs)

    # Generic implementation
    xp = cuda.get_array_module(*xs)
    joined = xp.concatenate(xs, self.axis)
    return (joined,)
def forward_cpu(self, inputs):
    """Compute the element-wise maximum of the input and zero.

    iDeep is used instead when it is enabled and the inputs are ready.
    The output is retained.  Returns a 1-tuple.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (x,) = inputs
    rectified = numpy.maximum(x, 0, dtype=x.dtype)
    self.retain_outputs((0,))
    return (utils.force_array(rectified),)
def forward_cpu(self, inputs):
    """Compute the element-wise maximum of the input and zero.

    When iDeep is enabled and the inputs are ready, ``self._use_ideep`` is
    set and the iDeep implementation is used.  Otherwise the output is
    retained and computed with NumPy.  Returns a 1-tuple.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        # iDeep implementation
        self._use_ideep = True
        return self.forward_ideep(inputs)

    (x,) = inputs
    self.retain_outputs((0,))
    rectified = numpy.maximum(x, 0, dtype=x.dtype)
    return (utils.force_array(rectified),)
def forward_cpu(self, gy):
    """Spread ``gy[0]`` evenly over each pooling window on CPU.

    iDeep is used instead when it is enabled and the inputs are ready.
    Returns a 1-tuple holding ``gx``.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(gy))
    if ideep_ready:
        return self._forward_ideep(gy)

    h, w = self._in_shape[2:]
    # Repeat every output gradient once per kernel position.
    repeats = (1, 1, self.kh, self.kw, 1, 1)
    gcol = numpy.tile(gy[0][:, :, None, None], repeats)
    gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    gx /= self.kh * self.kw
    return (gx,)
def forward_cpu(self, inputs):
    """Scale a copy of ``gy`` by ``self.slope`` where the reference is < 0.

    The reference array is ``self.y`` when ``self.slope >= 0`` and
    ``self.x`` otherwise.  iDeep is used instead when it is enabled and
    the inputs are ready.  Returns a 1-tuple.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (gy,) = inputs
    gx = gy.copy()
    reference = self.y if self.slope >= 0 else self.x
    gx[reference < 0] *= self.slope
    return (gx,)
def forward_cpu(self, x):
    """Average-pool ``x[0]`` on CPU and return the result as a 1-tuple.

    iDeep is used instead when it is enabled and the inputs are ready.
    The input shape and dtype are stored on ``self``.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(x))
    if ideep_ready:
        return self._forward_ideep(x)

    src = x[0]
    self._in_shape = src.shape
    self._in_dtype = src.dtype

    windows = conv.im2col_cpu(
        src, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)
    # Axes 2 and 3 are reduced together to average each window.
    averaged = windows.mean(axis=(2, 3))
    return (averaged,)
def forward_cpu(self, inputs):
    """Return a copy of the input with negative entries scaled by the slope.

    iDeep is used instead when it is enabled and the inputs are ready.
    The output is retained when ``self.slope >= 0``; otherwise the input
    is retained.  Returns a 1-tuple.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (x,) = inputs
    y = x.copy()
    y[x < 0] *= self.slope

    retain = self.retain_outputs if self.slope >= 0 else self.retain_inputs
    retain((0,))
    return (y,)
def forward_cpu(self, inputs):
    """Return a copy of the input with negative entries scaled by the slope.

    iDeep is used instead when it is enabled and the inputs are ready.
    The output is retained when ``self.slope >= 0``; otherwise the input
    is retained.  Returns a 1-tuple.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        return self.forward_ideep(inputs)

    (x,) = inputs
    y = x.copy()
    y[x < 0] *= self.slope

    retain = self.retain_outputs if self.slope >= 0 else self.retain_inputs
    retain((0,))
    return (y,)
def forward_cpu(self, x):
    """Apply dropout to ``x[0]`` on CPU and return the result as a 1-tuple.

    iDeep is used only when it is enabled, the inputs are ready for it and
    no mask has been set.  If ``self.mask`` is already set it is applied
    as is; otherwise a new scaled mask is drawn and stored in
    ``self.mask``.
    """
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(x)
            and self.mask is None):
        return self._forward_ideep(x)

    data = x[0]
    if self.mask is None:
        scale = data.dtype.type(1. / (1 - self.dropout_ratio))
        keep = numpy.random.rand(*data.shape) >= self.dropout_ratio
        self.mask = scale * keep

    return (data * self.mask,)
def forward(self, axis, gamma, x, xp, expander, beta, eps, decay,
            running_mean, running_var):
    """Run batch normalization through iDeep, or fall back to the parent.

    The parent implementation is used (and ``self._forward_fallback`` is
    set) unless ``x`` and ``gamma`` share a dtype, ``gamma`` is
    1-dimensional and ``x`` is ready for iDeep.  When running statistics
    are given they are updated in place.

    Returns ``(y, running_mean, running_var, mean, var, inv_std)``.
    """
    ideep_applicable = (
        x.dtype == gamma.dtype
        and gamma.ndim == 1
        and intel64.inputs_all_ready((x,)))
    if not ideep_applicable:
        self._forward_fallback = True
        return super().forward(
            axis, gamma, x, xp, expander, beta, eps, decay,
            running_mean, running_var)

    # 2-dimensional input gets two trailing unit axes for the iDeep call;
    # they are squeezed out of the result again below.
    expand_dim = x.ndim == 2
    if expand_dim:
        x = x[:, :, None, None]

    ideep_array = intel64.ideep.array
    y, mean, var, inv_std = intel64.ideep.batchNormalization.Forward(
        ideep_array(x.astype(gamma.dtype, copy=False)),
        ideep_array(gamma),
        ideep_array(beta),
        None,
        None,
        eps
    )
    y = y.astype(x.dtype, copy=False)

    if expand_dim:
        y = numpy.squeeze(y, axis=(2, 3))

    # Update running statistics if given
    if running_mean is not None:
        m = x.size // gamma.size
        adjust = m / max(m - 1., 1.)
        keep = 1 - decay

        # Update running_mean
        if isinstance(running_mean, intel64.ideep.mdarray):
            running_mean.inplace_axpby(decay, keep, mean)
        else:
            running_mean *= decay
            running_mean += mean * keep

        # Update running_var
        if isinstance(running_var, intel64.ideep.mdarray):
            running_var.inplace_axpby(decay, keep, var * adjust)
        else:
            running_var *= decay
            running_var += var * adjust * keep

    return y, running_mean, running_var, mean, var, inv_std
def __init__(self, x, gamma, key_axis):
    """Record which accelerated back ends can handle ``x`` and ``gamma``.

    Sets ``is_for_conv2d``, ``is_for_linear``, ``cudnn_dim_ok``,
    ``cudnn_dtype_ok`` and ``ideep_ok`` on ``self``.
    """
    gamma_is_1d = gamma.ndim == 1

    # cuDNN only supports these tensor dimensions because they are
    # the most commonly used. If there is a need to support other
    # dimensions with cuDNN, we could consider reshaping the input
    # into a 2-dim array with channels as second dim and m=<product
    # of all dimensions except the 2nd dimension> as the first
    # dimension.
    self.is_for_conv2d = (
        gamma_is_1d and x.ndim == 4 and key_axis[0] == 1)
    self.is_for_linear = gamma_is_1d and key_axis[0] == x.ndim - 1
    self.cudnn_dim_ok = self.is_for_conv2d or self.is_for_linear

    # float16 passes the dtype check only in the conv2d case.
    self.cudnn_dtype_ok = (
        self.is_for_conv2d or (x.dtype != numpy.float16))

    self.ideep_ok = gamma_is_1d and intel64.inputs_all_ready((x,))
def __init__(self, x, gamma):
    """Record which accelerated back ends can handle ``x`` and ``gamma``.

    Sets ``is_for_conv2d``, ``is_for_linear``, ``cudnn_dim_ok``,
    ``cudnn_dtype_ok`` and ``ideep_ok`` on ``self``.
    """
    gamma_is_1d = gamma.ndim == 1

    # cuDNN only supports these tensor dimensions because they are
    # the most commonly used. If there is a need to support other
    # dimensions with cuDNN, we could consider reshaping the input
    # into a 2-dim array with channels as second dim and m=<product
    # of all dimensions except the 2nd dimension> as the first
    # dimension.
    self.is_for_conv2d = x.ndim == 4 and gamma_is_1d
    self.is_for_linear = x.ndim == 2 and gamma_is_1d
    self.cudnn_dim_ok = self.is_for_conv2d or self.is_for_linear

    # float16 passes the dtype check only in the conv2d case.
    self.cudnn_dtype_ok = (
        self.is_for_conv2d or (x.dtype != numpy.float16))

    self.ideep_ok = gamma_is_1d and intel64.inputs_all_ready((x,))
def forward_cpu(self, inputs):
    """Compute the forward pass on CPU.

    ``inputs`` is ``(x, W)`` or ``(x, W, b)``; only ``x`` and ``W`` are
    retained.  ``self._use_ideep`` is set when iDeep is enabled and the
    inputs are ready for it.  The grouped implementation is used when
    ``self.groups > 1``, otherwise the plain CPU core.
    """
    self.retain_inputs((0, 1))  # retain only x and W

    if len(inputs) != 2:
        x, W, b = inputs
    else:
        x, W = inputs
        b = None

    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs))
    if ideep_ready:
        self._use_ideep = True

    if self.groups > 1:
        return self._forward_grouped_convolution(x, W, b)
    return self._forward_cpu_core(x, W, b)
def forward(self, xs):
    """Sum all arrays in ``xs`` and return the total as a 1-tuple.

    The number of inputs is stored in ``self.len``.  A single input is
    returned unchanged.
    """
    self.len = len(xs)
    if len(xs) == 1:
        return xs

    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(xs))
    if ideep_ready:
        total = intel64.ideep.multi_add(xs)
    else:
        # The output should be a new array, so the first two inputs are
        # added out of place and the rest are accumulated into the result.
        total = xs[0] + xs[1]
        for extra in xs[2:]:
            total += extra

    return (utils.force_array(total),)
def forward(self, inputs):
    """Split the single input along ``self.axis`` and return the pieces.

    The array module and the shape of every piece are stored on ``self``.
    Returns a tuple of the split arrays.
    """
    # Currently iDeep only supports 4 dims
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs, (4,))
                   and self._ideep_is_supported(inputs))
    if ideep_ready:
        return self._forward_ideep(inputs)

    (x,) = inputs
    self._xp = backend.get_array_module(x)
    where = self.indices_or_sections
    pieces = self._xp.split(x, where, self.axis)

    if self._xp == numpy and not _numpy_split_ok:
        pieces = _fix_numpy_split(pieces, x, where, self.axis)

    self._shapes = [piece.shape for piece in pieces]
    return tuple(pieces)
def forward(self, inputs):
    """Split the single input along ``self.axis`` and return the pieces.

    The array module and the shape of every piece are stored on ``self``.
    Returns a tuple of the split arrays.
    """
    # Currently iDeep only supports 4 dims
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs, (4,))
                   and self._ideep_is_supported(inputs))
    if ideep_ready:
        return self._forward_ideep(inputs)

    (x,) = inputs
    self._xp = backend.get_array_module(x)
    where = self.indices_or_sections
    pieces = self._xp.split(x, where, self.axis)

    if self._xp == numpy and not _numpy_split_ok:
        pieces = _fix_numpy_split(pieces, x, where, self.axis)

    self._shapes = [piece.shape for piece in pieces]
    return tuple(pieces)
def forward_cpu(self, x):
    """Scale ``x[0]`` by a power of a windowed sum of squares along axis 1.

    When iDeep is enabled and accepts the inputs, ``self._use_ideep`` is
    set and the iDeep implementation is used.  Otherwise
    ``self.unit_scale``, ``self.scale`` and ``self.y`` are stored and a
    1-tuple holding ``self.y`` is returned.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(x, (4,)))
    if ideep_ready:
        self._use_ideep = True
        return self._forward_ideep(x)

    data = x[0]
    reach = self.n // 2
    squared = numpy.square(data)

    # Accumulate the squares of the neighbours up to ``reach`` positions
    # away on either side along axis 1.
    window_sum = squared.copy()
    for shift in six.moves.range(1, reach + 1):
        window_sum[:, shift:] += squared[:, :-shift]
        window_sum[:, :-shift] += squared[:, shift:]

    self.unit_scale = self.k + self.alpha * window_sum
    self.scale = self.unit_scale ** -self.beta
    self.y = data * self.scale
    return (self.y,)
def forward(self, inputs):
    """Split the single input along ``self.axis`` and return the pieces.

    ``self.indices`` is used for the split when it is set, otherwise
    ``self.sections``.  The array module and the shape of every piece are
    stored on ``self``.  Returns a tuple of the split arrays.
    """
    # Currently iDeep only supports 4 dims
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs, (4,))
                   and self._ideep_is_supported(inputs))
    if ideep_ready:
        return self._forward_ideep(inputs)

    (x,) = inputs
    self._xp = cuda.get_array_module(x)

    where = self.indices if self.indices is not None else self.sections
    pieces = tuple(self._xp.split(x, where, self.axis))
    self._shapes = [piece.shape for piece in pieces]
    return pieces
def forward(self, inputs):
    """Split the single input along ``self.axis`` and return the pieces.

    The array module and the shape of every piece are stored on ``self``.
    Returns a tuple of the split arrays.
    """
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4,))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    self._xp = cuda.get_array_module(x)
    ret = tuple(self._xp.split(x, self.indices_or_sections, self.axis))
    self._shapes = [r.shape for r in ret]
    return ret
def forward(self, inputs):
    """Split the single input along ``self.axis`` and return the pieces.

    The array module and the shape of every piece are stored on ``self``.
    Returns a tuple of the split arrays.
    """
    # Currently iDeep only supports 4 dims
    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs, (4, ))
            and self._ideep_is_supported(inputs)):
        return self._forward_ideep(inputs)

    x, = inputs
    self._xp = cuda.get_array_module(x)
    ret = tuple(self._xp.split(x, self.indices_or_sections, self.axis))
    self._shapes = [r.shape for r in ret]
    return ret
def forward_cpu(self, x):
    """Scale ``x[0]`` by a power of a windowed sum of squares along axis 1.

    When iDeep is enabled and accepts the inputs, ``self._use_ideep`` is
    set and the iDeep implementation is used.  Otherwise
    ``self.unit_scale``, ``self.scale`` and ``self.y`` are stored and a
    1-tuple holding ``self.y`` is returned.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(x, (4,)))
    if ideep_ready:
        self._use_ideep = True
        return self._forward_ideep(x)

    data = x[0]
    reach = self.n // 2
    squared = numpy.square(data)

    # Accumulate the squares of the neighbours up to ``reach`` positions
    # away on either side along axis 1.
    window_sum = squared.copy()
    for shift in six.moves.range(1, reach + 1):
        window_sum[:, shift:] += squared[:, :-shift]
        window_sum[:, :-shift] += squared[:, shift:]

    self.unit_scale = self.k + self.alpha * window_sum
    self.scale = self.unit_scale ** -self.beta
    self.y = data * self.scale
    return (self.y,)
def forward(self, inputs):
    """Compute ``gy.dot(W)`` cast to the dtype of ``gy``.

    ``inputs`` is ``(W, gy)``; both are retained on the generic path.  The
    current ``chainer.config.use_ideep`` value is recorded on ``self``
    first.  Returns a 1-tuple holding ``gx``.
    """
    self._config_use_ideep = chainer.config.use_ideep

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    self.retain_inputs((0, 1))
    W, gy = inputs

    # A non-contiguous NumPy array with a unit-length axis is copied to a
    # contiguous one before the product.
    if (isinstance(gy, numpy.ndarray)
            and not gy.flags.c_contiguous
            and not gy.flags.f_contiguous
            and 1 in gy.shape):
        gy = numpy.ascontiguousarray(gy)

    product = gy.dot(W)
    gx = product.astype(gy.dtype, copy=False)
    return (gx,)
def forward_cpu(self, inputs):
    """Compute the forward pass on CPU.

    With a single group, iDeep enabled and the inputs ready for it,
    ``self._use_ideep`` is set and the iDeep implementation is used.
    Otherwise ``x``, ``W`` and the output are retained and the grouped or
    plain CPU implementation is used, depending on ``self.groups``.
    """
    if (self.groups == 1
            and intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        self._use_ideep = True
        return self._forward_ideep(inputs)

    self.retain_inputs((0, 1))  # retain only x and W
    self.retain_outputs((0,))

    if len(inputs) != 2:
        x, W, b = inputs
    else:
        x, W = inputs
        b = None

    if self.groups > 1:
        return self._forward_grouped_convolution(x, W, b)
    return self._forward_cpu_core(x, W, b)
def to_intel64(self):
    """Move parameters and persistent values to iDeep arrays where possible.

    Every parameter has ``to_intel64()`` called on it.  Persistent values
    that are CUDA arrays are first fetched to NumPy; NumPy arrays that
    iDeep accepts are then converted to iDeep weight arrays.  Returns
    ``self``.
    """
    intel64.check_ideep_available()
    attrs = self.__dict__

    for param_name in self._params:
        attrs[param_name].to_intel64()

    for key in self._persistent:
        item = attrs[key]
        if isinstance(item, cuda.ndarray):
            item = item.get()  # to numpy.ndarray
        if (isinstance(item, numpy.ndarray)
                and intel64.inputs_all_ready((item,))):
            item = intel64.ideep.array(
                item, itype=intel64.ideep.wgt_array)
        attrs[key] = item

    self._cpu = True
    self._device_id = None
    return self
def forward_cpu(self, x):
    """Max-pool ``x[0]`` on CPU and remember the argmax of each window.

    Stores the input shape and dtype on ``self`` and the per-window argmax
    in ``self.indexes``.  Returns a 1-tuple holding the pooled array.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(x))
    if ideep_ready:
        return self._forward_ideep(x)

    src = x[0]
    self._in_shape = src.shape
    self._in_dtype = src.dtype

    windows = conv.im2col_cpu(
        src, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        pval=-float('inf'), cover_all=self.cover_all)
    n, c, kh, kw, out_h, out_w = windows.shape
    # Merge the two kernel axes so a single reduction covers each window.
    windows = windows.reshape(n, c, kh * kw, out_h, out_w)

    # The maximum is selected twice (argmax, then max), since the
    # implementation using numpy.choose hits its bug when kh * kw >= 32.
    self.indexes = windows.argmax(axis=2)
    pooled = windows.max(axis=2)
    return (pooled,)
def forward(self, xs):
    """Sum all arrays in ``xs`` and return the total as a 1-tuple.

    The number of inputs is stored in ``self.len``.  A single input is
    returned unchanged.  When iDeep is enabled the inputs are broadcast
    against each other first and summed by iDeep if it accepts them.
    """
    self.len = len(xs)
    if len(xs) == 1:
        return xs

    total = None
    if intel64.should_use_ideep('>=auto'):
        broadcast = numpy.broadcast_arrays(*xs)
        if intel64.inputs_all_ready(broadcast):
            total = intel64.ideep.multi_add(broadcast)

    if total is None:
        # The output should be a new array, so the first two inputs are
        # added out of place and the rest are folded into the result.
        total = xs[0] + xs[1]
        for extra in xs[2:]:
            if extra.shape == total.shape:
                total += extra
            else:
                # Shapes differ, so the sum is rebuilt out of place.
                total = extra + total

    return (utils.force_array(total),)
def forward_cpu(self, inputs):
    """Scale the input by a power of a windowed sum of squares along axis 1.

    When iDeep is enabled and accepts the inputs, ``self._use_ideep`` is
    set and the iDeep implementation is used.  Otherwise the input and the
    output are retained, ``self.unit_scale`` and ``self.scale`` are stored
    and a 1-tuple holding the result is returned.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(inputs, (4,)))
    if ideep_ready:
        self._use_ideep = True
        return self.forward_ideep(inputs)

    (x,) = inputs
    self.retain_inputs((0,))
    self.retain_outputs((0,))

    reach = self.n // 2
    squared = numpy.square(x)

    # Accumulate the squares of the neighbours up to ``reach`` positions
    # away on either side along axis 1.
    window_sum = squared.copy()
    for shift in six.moves.range(1, reach + 1):
        window_sum[:, shift:] += squared[:, :-shift]
        window_sum[:, :-shift] += squared[:, shift:]

    self.unit_scale = self.k + self.alpha * window_sum
    self.scale = self.unit_scale ** -self.beta
    return (x * self.scale,)
def forward(self, inputs):
    """Compute the linear forward pass into an explicitly allocated output.

    ``inputs`` is ``(x, W)`` or ``(x, W, b)``.  The current
    ``chainer.config.use_ideep`` value is recorded on ``self`` first.
    Only ``x`` and ``W`` are retained.  Returns a 1-tuple holding ``y``.
    """
    self._config_use_ideep = chainer.config.use_ideep

    if (intel64.should_use_ideep('>=auto')
            and intel64.inputs_all_ready(inputs)):
        # iDeep implementation
        return self._forward_ideep(inputs)

    # Generic implementation
    has_bias = len(inputs) == 3
    if has_bias:
        x, W, b = inputs
    else:
        x, W = inputs
        b = None

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if (isinstance(x, numpy.ndarray)
            and not x.flags.c_contiguous
            and not x.flags.f_contiguous
            and 1 in x.shape):
        x = numpy.ascontiguousarray(x)

    # In order to be compatible with the "static graph" feature, it is
    # required that all output arrays of this forward
    # function be allocated explicitly:
    xp = cuda.get_array_module(x)
    out_shape = (x.shape[0], W.shape[0])
    y = xp.empty(out_shape, dtype=x.dtype)

    # This is required because all of the "static_*()" functions
    # use the convention that any output arrays are supplied
    # as input arguments to the function. That is because it is
    # not allowed for a "static_*()" function to return anything
    # other than `None`. The reason is to prevent dynamic allocation
    # of output arrays during execution of the static schedule
    # because it would break the model.
    same_dtype = x.dtype == W.dtype
    self.static_linear_no_bias(xp, same_dtype, inputs=[x, W], outputs=[y])
    if has_bias:
        self.static_add_bias(inputs=[b], outputs=[y])

    self.retain_inputs((0, 1))  # b is not retained
    return (y,)
def forward_cpu(self, gy):
    """Scatter ``gy[0]`` back to the argmax positions stored on ``self``.

    iDeep is used instead when it is enabled and the inputs are ready.
    Returns a 1-tuple holding ``gx``.
    """
    ideep_ready = (intel64.should_use_ideep('>=auto')
                   and intel64.inputs_all_ready(gy))
    if ideep_ready:
        return self._forward_ideep(gy)

    grad = gy[0]
    n, c, out_h, out_w = grad.shape
    h, w = self._in_shape[2:]
    kh, kw = self.kh, self.kw
    window = kh * kw

    gcol = numpy.zeros(
        (n * c * out_h * out_w * kh * kw), dtype=self._in_dtype)

    # Each output element owns a run of ``kh * kw`` slots in ``gcol``;
    # the stored argmax picks the slot inside that run.
    positions = self.indexes.flatten()
    positions += numpy.arange(0, positions.size * kh * kw, window)
    gcol[positions] = grad.ravel()

    gcol = gcol.reshape(n, c, out_h, out_w, kh, kw)
    # Exchange the output axes with the kernel axes.
    gcol = numpy.swapaxes(gcol, 2, 4)
    gcol = numpy.swapaxes(gcol, 3, 5)

    gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
    return (gx,)