Example #1
    def forward_cpu(self, inputs):
        if ((self.dy == 1 and self.dx == 1)
                and intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            self._use_ideep = True

        self.retain_inputs((0, 1))  # only retain x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        self._calc_out_size(x, W)

        if self.groups > 1:
            # Grouped convolution implementation
            return self._forward_grouped_convolution(x, W, b)

        elif (intel64.should_use_ideep('>=auto')
              and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            self._use_ideep = True
            return self._forward_ideep(x, W, b)

        else:
            return self._forward_cpu_core(x, W, b)
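
Every snippet on this page opens with the same guard: the forward pass is routed to an iDeep kernel only if intel64.should_use_ideep('>=auto') approves the current configuration and intel64.inputs_all_ready(...) confirms that all inputs are arrays iDeep can consume (the optional second argument, e.g. (4,), restricts the accepted dimensionalities). A minimal self-contained sketch of this dispatch pattern, with hypothetical stand-ins instead of the real intel64 predicates:

import numpy

def should_use_fast_path():
    # Stand-in for intel64.should_use_ideep('>=auto'): a configuration
    # check that does not look at the inputs themselves.
    return True

def inputs_all_ready(inputs, supported_ndims=(2, 4)):
    # Stand-in for intel64.inputs_all_ready(): every input must be a
    # dense array whose dimensionality the fast backend supports.
    return all(isinstance(x, numpy.ndarray) and x.ndim in supported_ndims
               for x in inputs)

def forward(inputs):
    if should_use_fast_path() and inputs_all_ready(inputs):
        return fast_sum(inputs)   # would call the accelerated kernel
    return generic_sum(inputs)    # NumPy fallback

def fast_sum(inputs):
    return sum(inputs),

def generic_sum(inputs):
    y = inputs[0].copy()
    for x in inputs[1:]:
        y += x
    return y,

print(forward([numpy.ones((2, 2)), 2 * numpy.ones((2, 2))])[0])
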
Example #2
    def forward(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self._forward_ideep(inputs)

        y = inputs[0] * self.mask
        return y,
Example #3
    def forward(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x)):
            return self._forward_ideep(x)

        if self.mask is not None:
            y = x[0] * self.mask
        else:
            scale = x[0].dtype.type(1. / (1 - self.dropout_ratio))
            xp = cuda.get_array_module(*x)
            if xp == numpy:
                flag = xp.random.rand(*x[0].shape) >= self.dropout_ratio
                self.mask = scale * flag
                y = x[0] * self.mask
            else:
                rand = xp.random.rand(*x[0].shape, dtype=numpy.float32)
                self.mask, y = cuda.elementwise(
                    'T x, R r, T scale, T ratio', 'T mask, T y',
                    '''
                    mask = (r >= ratio) * scale;
                    y = x * mask;
                    ''',
                    'dropout_fwd',
                )(x[0], rand, scale, self.dropout_ratio)
        return y,
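
The mask takes the value 0 with probability dropout_ratio and scale = 1 / (1 - dropout_ratio) otherwise, so multiplication by the mask leaves the expectation of x unchanged; the GPU branch merely fuses mask creation and multiplication into a single elementwise kernel. A quick NumPy-only sanity check of that scaling (a sketch, not part of Chainer):

import numpy

dropout_ratio = 0.3
x = numpy.ones((1000, 1000), dtype=numpy.float32)
scale = x.dtype.type(1. / (1 - dropout_ratio))
mask = scale * (numpy.random.rand(*x.shape) >= dropout_ratio)
y = x * mask
print(y.mean())  # close to 1.0: inverted dropout preserves the mean of x
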
Example #4
    def forward(self, inputs):
        self._config_use_ideep = chainer.config.use_ideep
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            return self._forward_ideep(inputs)

        # Generic implementation
        if len(inputs) == 3:
            x, W, b = inputs
        else:
            (x, W), b = inputs, None

        # NumPy raises an error when the array is not contiguous.
        # See: https://github.com/chainer/chainer/issues/2744
        # TODO(niboshi): Remove this code when NumPy is fixed.
        if (isinstance(x, numpy.ndarray) and
                not (x.flags.c_contiguous or x.flags.f_contiguous) and
                1 in x.shape):
            x = numpy.ascontiguousarray(x)

        y = x.dot(W.T).astype(x.dtype, copy=False)
        if b is not None:
            y += b
        self.retain_inputs((0, 1))  # b is not retained
        return y,
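
The workaround only fires for arrays that are neither C- nor F-contiguous and contain a dimension of size 1, which is exactly the case the linked NumPy issue covered. A small illustration of how such an array arises (illustrative only; the slice is a hypothetical example):

import numpy

x = numpy.arange(12.0).reshape(3, 4)[:, 1:2]  # strided column, shape (3, 1)
print(x.flags.c_contiguous, x.flags.f_contiguous, 1 in x.shape)
# False False True -> the snippet above would copy it before the dot:
x = numpy.ascontiguousarray(x)
print(x.flags.c_contiguous)  # True
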
Example #5
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        c_prev, x = inputs
        a, i, f, o = _extract_gates(x)
        batch = len(x)

        if isinstance(x, chainer.get_cpu_array_types()):
            if intel64.should_use_ideep('>=auto'):
                xp = intel64.ideep.get_array_module(x)
            else:
                xp = numpy
            a = xp.tanh(a)
            i = _sigmoid(i, xp)
            f = _sigmoid(f, xp)
            o = _sigmoid(o, xp)

            c_next = numpy.empty_like(c_prev)
            c_next[:batch] = a * i + f * c_prev[:batch]
            h = o * xp.tanh(c_next[:batch])
        else:
            c_next = cuda.cupy.empty_like(c_prev)
            h = cuda.cupy.empty_like(c_next[:batch])
            cuda.elementwise(
                'T c_prev, T a, T i_, T f, T o', 'T c, T h',
                '''
                    COMMON_ROUTINE;
                    c = aa * ai + af * c_prev;
                    h = ao * tanh(c);
                ''',
                'lstm_fwd', preamble=_preamble)(
                    c_prev[:batch], a, i, f, o, c_next[:batch], h)

        c_next[batch:] = c_prev[batch:]
        self.retain_outputs((0,))
        return c_next, h
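
This snippet relies on helpers defined elsewhere in chainer.functions.activation.lstm: _extract_gates views an input of shape (batch, 4 * units, ...) as the four gate arrays a, i, f, o, and _sigmoid is the logistic function written via tanh for numerical stability. A NumPy sketch consistent with that usage (reconstructed from context; consult the Chainer source for the authoritative definitions):

import numpy

def _extract_gates(x):
    # (batch, 4 * units, ...) -> four views of shape (batch, units, ...)
    r = x.reshape((len(x), x.shape[1] // 4, 4) + x.shape[2:])
    return [r[:, :, i] for i in range(4)]

def _sigmoid(x, xp=numpy):
    half = x.dtype.type(0.5)
    return xp.tanh(x * half) * half + half

x = numpy.random.randn(2, 8).astype(numpy.float32)
a, i, f, o = _extract_gates(x)
print(a.shape, float(_sigmoid(i).min()) > 0.0)  # (2, 2) True
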
Example #6
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        c_prev, x, c_next, gc, gh = inputs
        batch = len(x)

        gx = xp.empty_like(x)
        ga, gi, gf, go = _extract_gates(gx)

        # Consider the case that either gradient is not given
        if gc is None:
            gc_update = 0
            gc_rest = 0
        else:
            gc_update = gc[:batch]
            gc_rest = gc[batch:]
        if gh is None:
            gh = 0

        a, i, f, o = _extract_gates(x)
        if xp is numpy:
            if intel64.should_use_ideep('>=auto'):
                xp = intel64.ideep.get_array_module(x)
            tanh_a = xp.tanh(a)
            sig_i = _sigmoid(i, xp)
            sig_f = _sigmoid(f, xp)
            sig_o = _sigmoid(o, xp)

            co = xp.tanh(c_next[:batch])
            gc_prev = numpy.empty_like(c_prev)
            # multiply f later
            gc_prev[:batch] = gh * sig_o * _grad_tanh(co) + gc_update
            gc = gc_prev[:batch]
            ga[:] = gc * sig_i * _grad_tanh(tanh_a)
            gi[:] = gc * tanh_a * _grad_sigmoid(sig_i)
            gf[:] = gc * c_prev[:batch] * _grad_sigmoid(sig_f)
            go[:] = gh * co * _grad_sigmoid(sig_o)
            gc_prev[:batch] *= sig_f  # multiply f here
            gc_prev[batch:] = gc_rest
        else:
            gc_prev = xp.empty_like(c_prev)
            cuda.elementwise(
                'T c_prev, T c, T gc, T gh, T a, T i_, T f, T o',
                'T gc_prev, T ga, T gi, T gf, T go',
                '''
                    COMMON_ROUTINE;
                    T co = tanh(c);
                    T temp = gh * ao * grad_tanh(co) + gc;
                    ga = temp * ai * grad_tanh(aa);
                    gi = temp * aa * grad_sigmoid(ai);
                    gf = temp * c_prev * grad_sigmoid(af);
                    go = gh * co * grad_sigmoid(ao);
                    gc_prev = temp * af;
                ''',
                'lstm_bwd', preamble=_preamble)(
                    c_prev[:batch], c_next[:batch], gc_update, gh, a, i, f, o,
                    gc_prev[:batch], ga, gi, gf, go)
            gc_prev[batch:] = gc_rest

        return gc_prev, gx
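
Note that the gradient helpers receive the already-activated values, not the pre-activations: for s = sigmoid(z) the derivative is s * (1 - s), and for t = tanh(z) it is 1 - t**2, which is why the forward activations are recomputed before the gradient lines. Minimal sketches of the assumed helpers:

def _grad_sigmoid(s):
    # s = sigmoid(z); d sigmoid / dz expressed through the output
    return s * (1 - s)

def _grad_tanh(t):
    # t = tanh(z); d tanh / dz expressed through the output
    return 1 - t * t
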
Example #7
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self.forward_ideep(inputs)

        gy, = inputs
        gx = gy * (self.b > 0)
        return utils.force_array(gx, dtype=gy.dtype),
Example #8
    def forward(self, xs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(xs, (4,))):
            # iDeep implementation
            return self._forward_ideep(xs)

        # Generic implementation
        xp = cuda.get_array_module(*xs)
        return xp.concatenate(xs, self.axis),
Example #9
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self.forward_ideep(inputs)

        x, = inputs
        y = numpy.maximum(x, 0, dtype=x.dtype)
        self.retain_outputs((0,))
        return utils.force_array(y),
Example #10
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            self._use_ideep = True
            return self.forward_ideep(inputs)

        x, = inputs
        self.retain_outputs((0,))
        return utils.force_array(numpy.maximum(x, 0, dtype=x.dtype)),
Example #11
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            self._use_ideep = True
            return self.forward_ideep(inputs)

        x, = inputs
        self.retain_outputs((0,))
        return utils.force_array(numpy.maximum(x, 0, dtype=x.dtype)),
Example #12
    def forward_cpu(self, gy):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(gy)):
            return self._forward_ideep(gy)

        h, w = self._in_shape[2:]
        gcol = numpy.tile(gy[0][:, :, None, None],
                          (1, 1, self.kh, self.kw, 1, 1))
        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        gx /= self.kh * self.kw
        return gx,
Example #13
    def forward_cpu(self, gy):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(gy)):
            return self._forward_ideep(gy)

        h, w = self._in_shape[2:]
        gcol = numpy.tile(gy[0][:, :, None, None],
                          (1, 1, self.kh, self.kw, 1, 1))
        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        gx /= self.kh * self.kw
        return gx,
Example #14
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x)):
            return self._forward_ideep(x)

        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        col = conv.im2col_cpu(x[0], self.kh, self.kw, self.sy, self.sx,
                              self.ph, self.pw)
        y = col.mean(axis=(2, 3))
        return y,
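
conv.im2col_cpu lays each kh x kw pooling window out along two extra axes, so col.mean(axis=(2, 3)) averages every window at once. For stride 1 and no padding, the same result can be cross-checked with numpy.lib.stride_tricks.sliding_window_view (NumPy >= 1.20; a sketch, not Chainer's im2col):

import numpy
from numpy.lib.stride_tricks import sliding_window_view

x = numpy.arange(16.0).reshape(1, 1, 4, 4)
kh = kw = 2
windows = sliding_window_view(x, (kh, kw), axis=(2, 3))  # (n, c, oh, ow, kh, kw)
y = windows.mean(axis=(4, 5))
print(y[0, 0])
# [[ 2.5  3.5  4.5]
#  [ 6.5  7.5  8.5]
#  [10.5 11.5 12.5]]
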
Example #15
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self.forward_ideep(inputs)

        gy, = inputs
        gy = gy.copy()
        if self.slope >= 0:
            gy[self.y < 0] *= self.slope
        else:
            gy[self.x < 0] *= self.slope
        return gy,
Example #16
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x)):
            return self._forward_ideep(x)

        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        col = conv.im2col_cpu(x[0], self.kh, self.kw, self.sy, self.sx,
                              self.ph, self.pw)
        y = col.mean(axis=(2, 3))
        return y,
Example #17
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self.forward_ideep(inputs)

        gy, = inputs
        gy = gy.copy()
        if self.slope >= 0:
            gy[self.y < 0] *= self.slope
        else:
            gy[self.x < 0] *= self.slope
        return gy,
Example #18
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self.forward_ideep(inputs)

        x, = inputs
        y = x.copy()
        y[x < 0] *= self.slope
        if self.slope >= 0:
            self.retain_outputs((0,))
        else:
            self.retain_inputs((0,))
        return y,
Example #19
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto') and intel64.inputs_all_ready(x)
                and self.mask is None):
            return self._forward_ideep(x)

        if self.mask is not None:
            y = x[0] * self.mask
        else:
            scale = x[0].dtype.type(1. / (1 - self.dropout_ratio))
            flag = numpy.random.rand(*x[0].shape) >= self.dropout_ratio
            self.mask = scale * flag
            y = x[0] * self.mask
        return y,
Example #20
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            return self.forward_ideep(inputs)

        x, = inputs
        y = x.copy()
        y[x < 0] *= self.slope
        if self.slope >= 0:
            self.retain_outputs((0, ))
        else:
            self.retain_inputs((0, ))
        return y,
Example #21
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x)
                and self.mask is None):
            return self._forward_ideep(x)

        if self.mask is not None:
            y = x[0] * self.mask
        else:
            scale = x[0].dtype.type(1. / (1 - self.dropout_ratio))
            flag = numpy.random.rand(*x[0].shape) >= self.dropout_ratio
            self.mask = scale * flag
            y = x[0] * self.mask
        return y,
Example #22
    def forward(self, inputs):
        # Currently iDeep only supports 4 dims
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4,))
                and self._ideep_is_supported(inputs)):
            return self._forward_ideep(inputs)

        x, = inputs
        self._xp = backend.get_array_module(x)
        indices_or_sections = self.indices_or_sections
        ret = self._xp.split(x, indices_or_sections, self.axis)
        if self._xp == numpy and not _numpy_split_ok:
            ret = _fix_numpy_split(ret, x, indices_or_sections, self.axis)
        self._shapes = [r.shape for r in ret]
        return tuple(ret)
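
indices_or_sections follows numpy.split semantics: an integer asks for that many equal sections along the axis, while a sequence gives explicit cut points. For example:

import numpy

x = numpy.arange(12).reshape(3, 4)
print([r.shape for r in numpy.split(x, 2, axis=1)])       # [(3, 2), (3, 2)]
print([r.shape for r in numpy.split(x, [1, 3], axis=1)])  # [(3, 1), (3, 2), (3, 1)]
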
Example #23
    def forward(self, xs):
        self.len = len(xs)
        if len(xs) == 1:
            return xs
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(xs)):
            y = intel64.ideep.multi_add(xs)
        else:
            # The output must be a new array: add the first two arrays to
            # allocate a fresh y, then accumulate the remaining arrays into it.
            y = xs[0] + xs[1]
            for x in xs[2:]:
                y += x

        return utils.force_array(y),
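
The comment about producing a new array is load-bearing: starting the accumulation from xs[0] itself would overwrite the caller's input through the in-place +=. A toy illustration of the hazard the snippet avoids:

import numpy

xs = [numpy.ones(3), numpy.ones(3), numpy.ones(3)]

y = xs[0] + xs[1]      # fresh array, as in the snippet
for x in xs[2:]:
    y += x
print(xs[0])           # [1. 1. 1.] -- inputs untouched

y_bad = xs[0]          # the shortcut the comment warns against
y_bad += xs[1]
print(xs[0])           # [2. 2. 2.] -- the input was clobbered in place
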
Example #24
    def forward(self, inputs):
        # Currently iDeep only supports 4 dims
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4, ))
                and self._ideep_is_supported(inputs)):
            return self._forward_ideep(inputs)

        x, = inputs
        self._xp = backend.get_array_module(x)
        indices_or_sections = self.indices_or_sections
        ret = self._xp.split(x, indices_or_sections, self.axis)
        if self._xp == numpy and not _numpy_split_ok:
            ret = _fix_numpy_split(ret, x, indices_or_sections, self.axis)
        self._shapes = [r.shape for r in ret]
        return tuple(ret)
Example #25
    def forward_cpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            self._use_ideep = True

        if self.groups > 1:
            return self._forward_grouped_convolution(x, W, b)
        else:
            return self._forward_cpu_core(x, W, b)
Example #26
    def forward_cpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            self._use_ideep = True

        if self.groups > 1:
            return self._forward_grouped_convolution(x, W, b)
        else:
            return self._forward_cpu_core(x, W, b)
Example #27
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x, (4,))):
            self._use_ideep = True
            return self._forward_ideep(x)

        half_n = self.n // 2
        x2 = numpy.square(x[0])
        sum_part = x2.copy()
        for i in six.moves.range(1, half_n + 1):
            sum_part[:, i:] += x2[:, :-i]
            sum_part[:, :-i] += x2[:, i:]
        self.unit_scale = self.k + self.alpha * sum_part
        self.scale = self.unit_scale ** -self.beta
        self.y = x[0] * self.scale
        return self.y,
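
The pair of shifted in-place additions accumulates x[c']**2 over the channel window c - n//2 .. c + n//2, clipped at the array boundaries, giving y = x * (k + alpha * sum) ** (-beta). A naive per-channel loop makes the windowing explicit (a hypothetical cross-check, not Chainer code):

import numpy

def lrn_naive(x, n=5, k=2.0, alpha=1e-4, beta=0.75):
    half_n = n // 2
    y = numpy.empty_like(x)
    for c in range(x.shape[1]):
        lo, hi = max(0, c - half_n), min(x.shape[1], c + half_n + 1)
        s = numpy.square(x[:, lo:hi]).sum(axis=1)   # sum over the channel window
        y[:, c] = x[:, c] * (k + alpha * s) ** -beta
    return y

x = numpy.random.randn(2, 7, 4, 4).astype(numpy.float32)
print(lrn_naive(x).shape)  # (2, 7, 4, 4)
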
Example #28
    def forward(self, inputs):
        # Currently iDeep only supports 4 dims
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4,))
                and self._ideep_is_supported(inputs)):
            return self._forward_ideep(inputs)

        x, = inputs
        # collections.Iterable was removed in Python 3.10
        if isinstance(self.indices_or_sections, collections.abc.Iterable):
            cdimx = x.shape[self.axis]
            ind = list(self.indices_or_sections)
            ind.append(cdimx)
        self._xp = cuda.get_array_module(x)
        ret = tuple(self._xp.split(x, self.indices_or_sections, self.axis))
        self._shapes = [r.shape for r in ret]
        return ret
Example #29
    def forward(self, inputs):
        # Currently iDeep only supports 4 dims
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4, ))
                and self._ideep_is_supported(inputs)):
            return self._forward_ideep(inputs)

        x, = inputs
        self._xp = cuda.get_array_module(x)
        if self.indices is not None:
            indices_or_sections = self.indices
        else:
            indices_or_sections = self.sections
        ret = tuple(self._xp.split(x, indices_or_sections, self.axis))
        self._shapes = [r.shape for r in ret]
        return ret
Example #30
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x, (4, ))):
            self._use_ideep = True
            return self._forward_ideep(x)

        half_n = self.n // 2
        x2 = numpy.square(x[0])
        sum_part = x2.copy()
        for i in six.moves.range(1, half_n + 1):
            sum_part[:, i:] += x2[:, :-i]
            sum_part[:, :-i] += x2[:, i:]
        self.unit_scale = self.k + self.alpha * sum_part
        self.scale = self.unit_scale**-self.beta
        self.y = x[0] * self.scale
        return self.y,
Example #31
    def forward(self, inputs):
        # Currently iDeep only supports 4 dims
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4, ))
                and self._ideep_is_supported(inputs)):
            return self._forward_ideep(inputs)

        x, = inputs
        # collections.Iterable was removed in Python 3.10
        if isinstance(self.indices_or_sections, collections.abc.Iterable):
            cdimx = x.shape[self.axis]
            ind = list(self.indices_or_sections)
            ind.append(cdimx)
        self._xp = cuda.get_array_module(x)
        ret = tuple(self._xp.split(x, self.indices_or_sections, self.axis))
        self._shapes = [r.shape for r in ret]
        return ret
Example #32
    def forward_cpu(self, inputs):
        if (self.groups == 1 and intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            self._use_ideep = True
            return self._forward_ideep(inputs)

        self.retain_inputs((0, 1))  # retain only x and W
        self.retain_outputs((0, ))
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        if self.groups > 1:
            return self._forward_grouped_convolution(x, W, b)
        else:
            return self._forward_cpu_core(x, W, b)
Example #33
    def forward(self, inputs):
        self._config_use_ideep = chainer.config.use_ideep
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            return self._forward_ideep(inputs)

        # Generic implementation
        self.retain_inputs((0, 1))
        W, gy = inputs

        if (isinstance(gy, numpy.ndarray) and
                not (gy.flags.c_contiguous or gy.flags.f_contiguous) and
                1 in gy.shape):
            gy = numpy.ascontiguousarray(gy)

        gx = gy.dot(W).astype(gy.dtype, copy=False)
        return gx,
Example #34
    def forward(self, inputs):
        self._config_use_ideep = chainer.config.use_ideep
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            return self._forward_ideep(inputs)

        # Generic implementation
        self.retain_inputs((0, 1))
        W, gy = inputs

        if (isinstance(gy, numpy.ndarray) and
                not (gy.flags.c_contiguous or gy.flags.f_contiguous) and
                1 in gy.shape):
            gy = numpy.ascontiguousarray(gy)

        gx = gy.dot(W).astype(gy.dtype, copy=False)
        return gx,
Example #35
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x)):
            return self._forward_ideep(x)

        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        col = conv.im2col_cpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            pval=-float('inf'), cover_all=self.cover_all)
        n, c, kh, kw, out_h, out_w = col.shape
        col = col.reshape(n, c, kh * kw, out_h, out_w)

        # We take the maximum in two steps (argmax, then max), because an
        # implementation based on numpy.choose hits a NumPy bug when
        # kh * kw >= 32.
        self.indexes = col.argmax(axis=2)
        y = col.max(axis=2)
        return y,
Example #36
    def forward(self, inputs):
        self._config_use_ideep = chainer.config.use_ideep
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            return self._forward_ideep(inputs)

        # Generic implementation
        if len(inputs) == 3:
            x, W, b = inputs
        else:
            (x, W), b = inputs, None

        # NumPy raises an error when the array is not contiguous.
        # See: https://github.com/chainer/chainer/issues/2744
        # TODO(niboshi): Remove this code when NumPy is fixed.
        if (isinstance(x, numpy.ndarray)
                and not (x.flags.c_contiguous or x.flags.f_contiguous)
                and 1 in x.shape):
            x = numpy.ascontiguousarray(x)

        # In order to be compatible with the "static graph" feature, it is
        # required that all output arrays of this forward
        # function be allocated explicitly:
        xp = cuda.get_array_module(x)
        y = xp.empty((x.shape[0], W.shape[0]), dtype=x.dtype)

        # This is required because all of the "static_*()" functions
        # use the convention that any output arrays are supplied
        # as input arguments to the function. That is because it is
        # not allowed for a "static_*()" function to return anything
        # other than `None`. The reason is to prevent dynamic allocation
        # of output arrays during execution of the static schedule
        # because it would break the model.
        self.static_linear_no_bias(xp,
                                   x.dtype == W.dtype,
                                   inputs=[x, W],
                                   outputs=[y])
        if len(inputs) == 3:
            self.static_add_bias(inputs=[b], outputs=[y])

        self.retain_inputs((0, 1))  # b is not retained
        return y,
Example #37
    def forward_cpu(self, x):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(x)):
            return self._forward_ideep(x)

        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        col = conv.im2col_cpu(
            x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            pval=-float('inf'), cover_all=self.cover_all)
        n, c, kh, kw, out_h, out_w = col.shape
        col = col.reshape(n, c, kh * kw, out_h, out_w)

        # We take the maximum in two steps (argmax, then max), because an
        # implementation based on numpy.choose hits a NumPy bug when
        # kh * kw >= 32.
        self.indexes = col.argmax(axis=2)
        y = col.max(axis=2)
        return y,
Example #38
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4, ))):
            self._use_ideep = True
            return self.forward_ideep(inputs)

        x, = inputs
        self.retain_inputs((0, ))
        self.retain_outputs((0, ))

        half_n = self.n // 2
        x2 = numpy.square(x)
        sum_part = x2.copy()
        for i in six.moves.range(1, half_n + 1):
            sum_part[:, i:] += x2[:, :-i]
            sum_part[:, :-i] += x2[:, i:]
        self.unit_scale = self.k + self.alpha * sum_part
        self.scale = self.unit_scale**-self.beta
        y = x * self.scale
        return y,
Example #39
    def forward(self, xs):
        self.len = len(xs)
        if len(xs) == 1:
            return xs
        y = None
        if intel64.should_use_ideep('>=auto'):
            bxs = numpy.broadcast_arrays(*xs)
            if intel64.inputs_all_ready(bxs):
                y = intel64.ideep.multi_add(bxs)
        if y is None:
            # The output must be a new array: add the first two arrays to
            # allocate a fresh y, then accumulate the remaining arrays into it.
            y = xs[0] + xs[1]
            for x in xs[2:]:
                if x.shape == y.shape:
                    y += x
                else:
                    y = x + y

        return utils.force_array(y),
Example #40
    def forward_cpu(self, inputs):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs, (4,))):
            self._use_ideep = True
            return self.forward_ideep(inputs)

        x, = inputs
        self.retain_inputs((0,))
        self.retain_outputs((0,))

        half_n = self.n // 2
        x2 = numpy.square(x)
        sum_part = x2.copy()
        for i in six.moves.range(1, half_n + 1):
            sum_part[:, i:] += x2[:, :-i]
            sum_part[:, :-i] += x2[:, i:]
        self.unit_scale = self.k + self.alpha * sum_part
        self.scale = self.unit_scale ** -self.beta
        y = x * self.scale
        return y,
Example #41
    def forward(self, inputs):
        self._config_use_ideep = chainer.config.use_ideep
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            return self._forward_ideep(inputs)

        # Generic implementation
        if len(inputs) == 3:
            x, W, b = inputs
        else:
            (x, W), b = inputs, None

        # NumPy raises an error when the array is not contiguous.
        # See: https://github.com/chainer/chainer/issues/2744
        # TODO(niboshi): Remove this code when NumPy is fixed.
        if (isinstance(x, numpy.ndarray) and
                not (x.flags.c_contiguous or x.flags.f_contiguous) and
                1 in x.shape):
            x = numpy.ascontiguousarray(x)

        # In order to be compatible with the "static graph" feature, it is
        # required that all output arrays of this forward
        # function be allocated explicitly:
        xp = cuda.get_array_module(x)
        y = xp.empty((x.shape[0], W.shape[0]), dtype=x.dtype)

        # This is required because all of the "static_*()" functions
        # use the convention that any output arrays are supplied
        # as input arguments to the function. That is because it is
        # not allowed for a "static_*()" function to return anything
        # other than `None`. The reason is to prevent dynamic allocation
        # of output arrays during execution of the static schedule
        # because it would break the model.
        self.static_linear_no_bias(xp, x.dtype == W.dtype, inputs=[x, W],
                                   outputs=[y])
        if len(inputs) == 3:
            self.static_add_bias(inputs=[b], outputs=[y])

        self.retain_inputs((0, 1))  # b is not retained
        return y,
Example #42
    def forward_cpu(self, inputs):
        self.retain_inputs((0, 1))  # only retain x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        self._calc_out_size(x, W)

        if self.groups > 1:
            # Grouped convolution implementation
            return self._forward_grouped_convolution(x, W, b)

        elif (intel64.should_use_ideep('>=auto')
              and intel64.inputs_all_ready(inputs)):
            # iDeep implementation
            self._use_ideep = True
            return self._forward_ideep(x, W, b)

        else:
            return self._forward_cpu_core(x, W, b)
Example #43
    def forward_cpu(self, inputs):

        if self.cudnn_fast:
            raise RuntimeError(
                '\'cudnn_fast\' can\'t be used in the CPU backend')

        self._check_input_layouts_all_standard()
        self.retain_inputs((0, 1))  # retain only x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(inputs)):
            self._use_ideep = True

        if self.groups > 1:
            return self._forward_grouped_convolution(x, W, b)
        else:
            return self._forward_cpu_core(x, W, b)
Example #44
    def forward_cpu(self, gy):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(gy)):
            return self._forward_ideep(gy)

        n, c, out_h, out_w = gy[0].shape
        h, w = self._in_shape[2:]
        kh, kw = self.kh, self.kw

        gcol = numpy.zeros(
            (n * c * out_h * out_w * kh * kw), dtype=self._in_dtype)

        indexes = self.indexes.flatten()
        indexes += numpy.arange(0, indexes.size * kh * kw, kh * kw)

        gcol[indexes] = gy[0].ravel()
        gcol = gcol.reshape(n, c, out_h, out_w, kh, kw)
        gcol = numpy.swapaxes(gcol, 2, 4)
        gcol = numpy.swapaxes(gcol, 3, 5)

        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        return gx,
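
self.indexes stores the argmax offset inside each kh x kw window; adding an arange with step kh * kw converts those per-window offsets into positions in the flattened column buffer, so the whole gradient is scattered with one fancy-indexing assignment. A toy version of the trick:

import numpy

kh_kw = 4                            # window size (kh * kw)
indexes = numpy.array([2, 0, 3])     # argmax offset inside each window
gy = numpy.array([10., 20., 30.])    # incoming gradient per window
gcol = numpy.zeros(indexes.size * kh_kw)
flat = indexes + numpy.arange(0, indexes.size * kh_kw, kh_kw)
gcol[flat] = gy
print(gcol.reshape(-1, kh_kw))
# [[ 0.  0. 10.  0.]
#  [20.  0.  0.  0.]
#  [ 0.  0.  0. 30.]]
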
Example #45
    def forward_cpu(self, gy):
        if (intel64.should_use_ideep('>=auto')
                and intel64.inputs_all_ready(gy)):
            return self._forward_ideep(gy)

        n, c, out_h, out_w = gy[0].shape
        h, w = self._in_shape[2:]
        kh, kw = self.kh, self.kw

        gcol = numpy.zeros((n * c * out_h * out_w * kh * kw),
                           dtype=self._in_dtype)

        indexes = self.indexes.ravel() + numpy.arange(
            0, self.indexes.size * kh * kw, kh * kw)

        gcol[indexes] = gy[0].ravel()
        gcol = gcol.reshape(n, c, out_h, out_w, kh, kw)
        gcol = numpy.swapaxes(gcol, 2, 4)
        gcol = numpy.swapaxes(gcol, 3, 5)

        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        return gx,
Example #46
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        c_prev, x = inputs
        a, i, f, o = _extract_gates(x)
        batch = len(x)

        if isinstance(x, chainer.get_cpu_array_types()):
            if intel64.should_use_ideep('>=auto'):
                xp = intel64.ideep.get_array_module(x)
            else:
                xp = numpy
            a = xp.tanh(a)
            i = _sigmoid(i, xp)
            f = _sigmoid(f, xp)
            o = _sigmoid(o, xp)

            c_next = numpy.empty_like(c_prev)
            c_next[:batch] = a * i + f * c_prev[:batch]
            h = o * xp.tanh(c_next[:batch])
        else:
            c_next = cuda.cupy.empty_like(c_prev)
            h = cuda.cupy.empty_like(c_next[:batch])
            cuda.elementwise(
                'T c_prev, T a, T i_, T f, T o', 'T c, T h',
                '''
                    COMMON_ROUTINE;
                    c = aa * ai + af * c_prev;
                    h = ao * tanh(c);
                ''',
                'lstm_fwd', preamble=_preamble)(
                    c_prev[:batch], a, i, f, o, c_next[:batch], h)

        c_next[batch:] = c_prev[batch:]
        self.retain_outputs((0, ))
        return c_next, h
Example #47
    def can_use_ideep(self):
        return self.ideep_ok and intel64.should_use_ideep('>=auto')
Example #48
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        c_prev1, c_prev2, x1, x2, c_next, gc, gh = inputs

        gx1 = xp.empty_like(x1)
        gx2 = xp.empty_like(x2)
        ga1, gi1, gf1, go1 = _extract_gates(gx1)
        ga2, gi2, gf2, go2 = _extract_gates(gx2)

        if gc is None:
            gc = 0
        if gh is None:
            gh = 0

        a1, i1, f1, o1 = _extract_gates(x1)
        a2, i2, f2, o2 = _extract_gates(x2)
        if xp is numpy:
            if intel64.should_use_ideep('>=auto'):
                xp = intel64.ideep.get_array_module(x1)
            tanh_a1 = xp.tanh(a1)
            sig_i1 = _sigmoid(i1, xp)
            sig_f1 = _sigmoid(f1, xp)
            tanh_a2 = xp.tanh(a2)
            sig_i2 = _sigmoid(i2, xp)
            sig_f2 = _sigmoid(f2, xp)
            sig_o = _sigmoid(o1 + o2, xp)

            co = xp.tanh(c_next)
            # multiply f later
            gc_prev = gh * sig_o * _grad_tanh(co) + gc
            ga1[:] = gc_prev * sig_i1 * _grad_tanh(tanh_a1)
            gi1[:] = gc_prev * tanh_a1 * _grad_sigmoid(sig_i1)
            gf1[:] = gc_prev * c_prev1 * _grad_sigmoid(sig_f1)
            go1[:] = gh * co * _grad_sigmoid(sig_o)
            ga2[:] = gc_prev * sig_i2 * _grad_tanh(tanh_a2)
            gi2[:] = gc_prev * tanh_a2 * _grad_sigmoid(sig_i2)
            gf2[:] = gc_prev * c_prev2 * _grad_sigmoid(sig_f2)
            go2[:] = gh * co * _grad_sigmoid(sig_o)
            # multiply f here
            gc_prev1 = gc_prev * sig_f1
            gc_prev2 = gc_prev * sig_f2
        else:
            gc_prev1 = xp.empty_like(c_prev1)
            gc_prev2 = xp.empty_like(c_prev2)
            cuda.elementwise(
                '''T c_prev1, T a1, T i1, T f1, T o1,
                   T c_prev2, T a2, T i2, T f2, T o2,
                   T c, T gc, T gh''',
                '''T gc_prev1, T ga1, T gi1, T gf1, T go1,
                   T gc_prev2, T ga2, T gi2, T gf2, T go2''',
                '''
                    COMMON_ROUTINE;
                    T co = tanh(c);
                    T temp = gh * ao * grad_tanh(co) + gc;
                    ga1 = temp * ai1 * grad_tanh(aa1);
                    gi1 = temp * aa1 * grad_sigmoid(ai1);
                    gf1 = temp * c_prev1 * grad_sigmoid(af1);
                    go1 = gh * co * grad_sigmoid(ao);
                    gc_prev1 = temp * af1;
                    ga2 = temp * ai2 * grad_tanh(aa2);
                    gi2 = temp * aa2 * grad_sigmoid(ai2);
                    gf2 = temp * c_prev2 * grad_sigmoid(af2);
                    go2 = gh * co * grad_sigmoid(ao);
                    gc_prev2 = temp * af2;
                ''',
                'lstm_bwd', preamble=_preamble)(
                    c_prev1, a1, i1, f1, o1,
                    c_prev2, a2, i2, f2, o2,
                    c_next, gc, gh,
                    gc_prev1, ga1, gi1, gf1, go1,
                    gc_prev2, ga2, gi2, gf2, go2)

        return gc_prev1, gc_prev2, gx1, gx2
Example #49
    def can_use_ideep(self):
        return self.ideep_ok and intel64.should_use_ideep('>=auto')