Example No. 1
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        c_prev, x = inputs
        a, i, f, o = _extract_gates(x)
        batch = len(x)

        if isinstance(x, chainer.get_cpu_array_types()):
            if intel64.should_use_ideep('>=auto'):
                xp = intel64.ideep.get_array_module(x)
            else:
                xp = numpy
            a = xp.tanh(a)
            i = _sigmoid(i, xp)
            f = _sigmoid(f, xp)
            o = _sigmoid(o, xp)

            c_next = numpy.empty_like(c_prev)
            c_next[:batch] = a * i + f * c_prev[:batch]
            h = o * xp.tanh(c_next[:batch])
        else:
            c_next = cuda.cupy.empty_like(c_prev)
            h = cuda.cupy.empty_like(c_next[:batch])
            cuda.elementwise(
                'T c_prev, T a, T i_, T f, T o', 'T c, T h',
                '''
                    COMMON_ROUTINE;
                    c = aa * ai + af * c_prev;
                    h = ao * tanh(c);
                ''',
                'lstm_fwd', preamble=_preamble)(
                    c_prev[:batch], a, i, f, o, c_next[:batch], h)

        c_next[batch:] = c_prev[batch:]
        self.retain_outputs((0,))
        return c_next, h
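All snippets in this listing follow the same pattern: cuda.elementwise compiles and memoizes a CuPy ElementwiseKernel from typed parameter declarations, a per-element CUDA C body, and a kernel name. The minimal sketch below is my own illustration of that pattern in isolation (it assumes CuPy and a CUDA device are available and is not taken from the Chainer sources).

import numpy
from chainer.backends import cuda

# Typed inputs, typed output, per-element CUDA body, kernel name.
squared_scale = cuda.elementwise(
    'T x, T alpha', 'T y',
    'y = alpha * x * x;',
    'squared_scale')

x = cuda.cupy.arange(8, dtype=numpy.float32)
y = squared_scale(x, numpy.float32(2.0))  # the output array is allocated for us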
Example No. 2
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False, dy=1, dx=1,
               out_h=None, out_w=None):
    n, c, h, w = img.shape
    if out_h is None:
        out_h = get_conv_outsize(h, kh, sy, ph, cover_all, dy)
    assert out_h > 0, 'Height in the output should be positive.'
    if out_w is None:
        out_w = get_conv_outsize(w, kw, sx, pw, cover_all, dx)
    assert out_w > 0, 'Width in the output should be positive.'

    col = cuda.cupy.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
        'int32 dy, int32 dx',
        'T col',
        '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;
           int in_y = ky * dy + out_y * sy - ph;
           int in_x = kx * dx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col = img[in_x + w * (in_y + h * c0)];
           } else {
             col = 0;
           }
        ''',
        'im2col')(img.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col)
    return col
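A hedged usage sketch (my own, assuming this function is chainer.utils.conv.im2col_gpu and a GPU is present): unfolding a 1x1x4x4 image with a 3x3 kernel, stride 1 and padding 1 yields a column array of shape (n, c, kh, kw, out_h, out_w).

from chainer.backends import cuda
from chainer.utils import conv

img = cuda.cupy.arange(16, dtype='f').reshape(1, 1, 4, 4)
col = conv.im2col_gpu(img, 3, 3, 1, 1, 1, 1)   # kh, kw, sy, sx, ph, pw
assert col.shape == (1, 1, 3, 3, 4, 4)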
Example No. 3
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        c_prev, x = inputs
        a, i, f, o = _extract_gates(x)
        batch = len(x)

        if isinstance(x, numpy.ndarray):
            a = numpy.tanh(a)
            i = _sigmoid(i)
            f = _sigmoid(f)
            o = _sigmoid(o)

            c_next = numpy.empty_like(c_prev)
            c_next[:batch] = a * i + f * c_prev[:batch]
            h = o * numpy.tanh(c_next[:batch])
        else:
            c_next = cuda.cupy.empty_like(c_prev)
            h = cuda.cupy.empty_like(c_next[:batch])
            cuda.elementwise(
                'T c_prev, T a, T i_, T f, T o', 'T c, T h',
                '''
                    COMMON_ROUTINE;
                    c = aa * ai + af * c_prev;
                    h = ao * tanh(c);
                ''',
                'lstm_fwd', preamble=_preamble)(
                    c_prev[:batch], a, i, f, o, c_next[:batch], h)

        c_next[batch:] = c_prev[batch:]
        self.retain_outputs((0,))
        return c_next, h
Example No. 4
    def forward_gpu(self, x):
        if chainer.should_use_cudnn('>=auto') and 2 <= self.ndim <= 3:
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            return super(MaxPoolingND, self).forward_gpu(x)

        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        n, c = x[0].shape[:2]
        dims = x[0].shape[2:]
        ys = tuple(conv_nd.get_conv_outsize(d, k, s, p, self.cover_all)
                   for (d, k, s, p) in six.moves.zip(
                       dims, self.ksize, self.stride, self.pad))
        # (n, c, y_1, y_2, ..., y_N)
        y_shape = (n, c) + ys
        y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
        self.indexes = cuda.cupy.empty(y_shape, dtype=numpy.int32)

        in_params, out_params, operation, name = \
            max_pooling_nd_kernel.MaxPoolingNDKernelForward.generate(self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            x[0].reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad +
              (y, self.indexes)))

        return y,
Example No. 5
    def forward_gpu(self, gys):
        if self._used_cudnn:
            x, = self.apoolnd.get_retained_inputs()
            return self.apoolnd.backward_gpu((x.data,), gys)

        is_pad_value_none = self.pad_value is None

        gy, = gys
        n, c = self._in_shape[:2]
        idims = self._in_shape[2:]
        odims = gy.shape[2:]
        if is_pad_value_none:
            coeff = self.apoolnd.coeff
            # This conversion from chainerx to cupy exists here for
            # double backward of chainerx on cuda.
            coeff = backend.from_chx(coeff)
            gy *= coeff
        gx = cuda.cupy.empty(self._in_shape, self._in_dtype)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate(
                self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            gy.reduced_view(),
            *(idims + odims + self.ksize + self.stride + self.pad
              + (gx,)))

        if not is_pad_value_none:
            gx /= functools.reduce(operator.mul, self.ksize)
        return gx,
Example No. 6
def col2im_gpu(col, sy, sx, ph, pw, h, w, dy=1, dx=1):
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.cupy.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
        'int32 dx, int32 dy',
        'T img',
        '''
           int c0 = i / (h * w);
           int y  = i / w % h;
           int x  = i % w;
           T val = 0;
           for (int ky = 0; ky < kh; ++ky) {
             int out_y = (y + ph - ky * dy);
             if (0 > out_y || out_y >= out_h * sy) continue;
             if (out_y % sy != 0) continue;
             out_y /= sy;
             for (int kx = 0; kx < kw; ++kx) {
               int out_x = (x + pw - kx * dx);
               if (0 > out_x || out_x >= out_w * sx) continue;
               if (out_x % sx != 0) continue;
               out_x /= sx;
               int k = out_y + out_h * (kx + kw * (ky + kh * c0));
               val = val + col[out_x + out_w * k];
             }
           }
           img = val;
        ''',
        'col2im')(col.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dx, dy, img)
    return img
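For comparison, a minimal round-trip sketch (again my own illustration, under the assumption that these helpers live in chainer.utils.conv): col2im_gpu folds an im2col_gpu result back onto the original spatial grid, summing the contributions of overlapping patches, so the result has the original image shape.

from chainer.backends import cuda
from chainer.utils import conv

img = cuda.cupy.ones((1, 1, 4, 4), dtype='f')
col = conv.im2col_gpu(img, 3, 3, 1, 1, 1, 1)
rec = conv.col2im_gpu(col, 1, 1, 1, 1, 4, 4)   # sy, sx, ph, pw, h, w
assert rec.shape == img.shape                  # overlapping patches are summed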
Example No. 7
    def forward_gpu(self, gy):
        if self._used_cudnn:
            x, = self.apool2d.get_retained_inputs()
            return self.apool2d.backward_gpu((x.data,), gy)
        n, c, h, w = self._in_shape
        y_h, y_w = gy[0].shape[2:]
        gx = cuda.cupy.empty(self._in_shape, self._in_dtype)
        coeff = 1. / (self.kh * self.kw)
        cuda.elementwise(
            'raw T gy, int32 h, int32 w,'
            'int32 out_h, int32 out_w, int32 kh, int32 kw,'
            'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
            'T gx',
            '''
               int c0 = i / (h * w);
               int y  = i / w % h + ph;
               int x  = i % w + pw;
               int out_y_0 = max(0,     (y - kh + sy) / sy);
               int out_y_1 = min(out_h, (y      + sy) / sy);
               int out_x_0 = max(0,     (x - kw + sx) / sx);
               int out_x_1 = min(out_w, (x      + sx) / sx);
               int hc0  = out_h * c0;

               T val = 0;
               for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
                 for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
                   val = val + gy[out_x + out_w * (out_y + hc0)];
                 }
               }
               gx = val * coeff;
            ''', 'avg_pool_bwd')(gy[0].reduced_view(),
                                 h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff,
                                 gx)
        return gx,
Example No. 8
    def forward_gpu(self, inputs):
        utils.nondeterministic('atomicAdd')
        self.retain_inputs((0, 1, 2))
        x, W, gy = inputs

        if self.reduce == 'no':
            gy = gy[:, None]

        samples = self.samples
        wx = self.wx.astype(x.dtype, copy=False)
        g = cuda.elementwise(
            'T wx, T gy, int32 m', 'T g',
            '''
            T y;
            if (i % m == 0) {
              y = 1;
            } else {
              y = -1;
            }

            g = -y * gy / (1.0f + __expf(wx * y));
            ''',
            'negative_sampling_calculate_g'
        )(wx, gy, self.sample_size + 1)

        cupy = cuda.cupy
        gx = cupy.zeros_like(x)
        n_in = x.shape[1]
        cuda.elementwise(
            'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
            '''
            int d = i / c;
            T w = 0;
            if (mask == 1){
                for (int j = 0; j < m; ++j) {
                  w += g[d * m + j] * W[k[d * m + j] * c + i % c];
                }
            }
            gx = w;
            ''',
            'negative_sampling_calculate_gx'
        )(g, W, self.ignore_mask[:, None], samples, n_in,
          self.sample_size + 1, gx)

        gW = cupy.zeros_like(W)
        cuda.elementwise(
            'T g, raw T x, S k, bool mask, int32 c, int32 m',
            'raw T gW',
            '''
            T gi = g;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
                }
            }
            ''',
            'negative_sampling_calculate_gw'
        )(g, x, samples, self.ignore_mask[:, None], n_in,
          self.sample_size + 1, gW)
        return gx, None, gW
Example No. 9
    def forward_gpu(self, inputs):
        if self._used_cudnn:
            x, = self.mpoolnd._cudnn_inputs
            return self._forward_gpu_compute_indexes_again((x, inputs[0]))
        x, = inputs
        self._in_shape = x.shape
        self._in_dtype = x.dtype

        n, c = x.shape[:2]
        dims = x.shape[2:]

        ys = tuple(conv_nd.get_conv_outsize(d, k, s, p, self.cover_all)
                   for (d, k, s, p) in six.moves.zip(
                       dims, self.ksize, self.stride, self.pad))
        # (n, c, y_1, y_2, ..., y_N)
        y_shape = (n, c) + ys
        y = cuda.cupy.empty(y_shape, dtype=x.dtype)

        cls = max_pooling_nd_kernel.MaxPoolingNDKernelForwardWithIndexes
        in_params, out_params, operation, name = cls.generate(self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            x.reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad +
              (self.indexes.reduced_view(), y)))
        return y,
Example No. 10
    def forward_cudnn(self, inputs):
        if self.eps < libcudnn.CUDNN_BN_MIN_EPSILON:
            raise RuntimeError(
                'cuDNN does not allow an eps value '
                'less than {}.'.format(libcudnn.CUDNN_BN_MIN_EPSILON))

        self.retain_inputs((0, 1))
        x, gamma, beta = inputs
        xp = cuda.cupy

        orig_shape = x.shape
        batch_size, channels = orig_shape[:2]
        groups = self.groups
        cudnn_shape = (1, batch_size * groups, -1, 1)
        x = x.reshape(cudnn_shape)

        with cuda.get_device_from_array(x):
            dummy_beta = xp.zeros(batch_size * groups, dtype=x.dtype)
            self.dummy_gamma = xp.ones_like(dummy_beta)
        x_hat, self.mean, self.inv_std = \
            cudnn.batch_normalization_forward_training(
                x, self.dummy_gamma, dummy_beta, dummy_beta, dummy_beta, None,
                None, self.eps, 1.0, True, libcudnn.CUDNN_BATCHNORM_SPATIAL,
                configuration.config.debug)

        y = x_hat.reshape((batch_size, channels, -1))
        cuda.elementwise(
            'T gamma, T beta', 'T y',
            'y = y * gamma + beta',
            'groupnorm_y')(gamma[:, None], beta[:, None], y)

        y = y.reshape(orig_shape)
        return y,
Example No. 11
    def forward_gpu(self, gys):
        if self._used_cudnn:
            x, = self.apoolnd.get_retained_inputs()
            return self.apoolnd.backward_gpu((x.data,), gys)

        gy, = gys
        n, c = self._in_shape[:2]
        idims = self._in_shape[2:]
        odims = gy.shape[2:]
        gx = cuda.cupy.empty(self._in_shape, self._in_dtype)
        if self.pad_value is None:
            coeff = self._get_pooling_width(cuda.cupy, odims, gy.dtype)
            coeff = cuda.cupy.reciprocal(coeff, out=coeff)
        else:
            coeff = 1. / functools.reduce(operator.mul, self.ksize)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate(
                self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            gy.reduced_view(),
            *(idims + odims + self.ksize + self.stride + self.pad
              + (coeff, gx)))

        return gx,
Example No. 12
 def update_core_gpu(self, param):
     grad = param.grad
     if grad is None:
         return
     cuda.elementwise('T grad, T lr', 'T param',
                      'param -= lr * grad',
                      'sgd')(grad, self.hyperparam.lr, param.data)
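The kernel above is plain SGD; for reference, a NumPy sketch of the same per-element update (an illustration of mine, not Chainer code):

import numpy

param = numpy.array([1.0, 2.0], dtype=numpy.float32)
grad = numpy.array([0.5, -0.5], dtype=numpy.float32)
lr = numpy.float32(0.1)
param -= lr * grad   # exactly what the 'sgd' kernel does for each element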
Example No. 13
File: ctc.py Project: ktnyt/chainer
    def label_probability(self, label_size, path, path_length,
                          multiply_seq, xp):
        seq_length = len(multiply_seq)
        n_batch = len(path)
        dtype = multiply_seq.dtype

        ret = xp.zeros((seq_length, n_batch, label_size), dtype)
        if xp == numpy:
            for b in six.moves.range(len(path)):
                target_path = path[b, :path_length[b]]
                chars = {c for c in target_path}
                for c in chars:
                    ret[:, b, c] = xp.sum(
                        multiply_seq[:, b, 0:path_length[b]]
                        [:, target_path == c], axis=1)
        else:
            cuda.elementwise(
                'T prob, I path, I path_length, I max_path_length',
                'raw T cum_prob',
                '''
                I t = i % max_path_length;
                if (t < path_length) {
                  int n_batch = cum_prob.shape()[1];
                  I s = i / (max_path_length * n_batch);
                  I b = (i - s * (max_path_length * n_batch))
                      / max_path_length;
                  int ind[] = {s, b, path};
                  atomicAdd(&cum_prob[ind], prob);
                }
                ''', 'ctc_label_prob_sum'
            )(multiply_seq, path, path_length[:, None], path.shape[1], ret)
        return ret
Example No. 14
    def forward_gpu(self, inputs):
        if self._used_cudnn:
            x, = self.mpool2d.get_retained_inputs()
            return self._forward_gpu_compute_indexes_again((x.data, inputs[0]))
        else:
            x, = inputs
            n, c, h, w = x.shape
            y_h = conv.get_conv_outsize(
                h, self.kh, self.sy, self.ph, self.cover_all)
            assert y_h > 0, 'Height in the output should be positive.'
            y_w = conv.get_conv_outsize(
                w, self.kw, self.sx, self.pw, self.cover_all)
            assert y_w > 0, 'Width in the output should be positive.'
            y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x.dtype)

            cuda.elementwise(
                'raw T in, raw S indexes, int32 h, int32 w, int32 out_h,'
                'int32 out_w, int32 kh, int32 kw, int32 sy, int32 sx,'
                'int32 ph, int32 pw', 'T out',
                '''
                int c0    = i / (out_h * out_w);
                int out_y = i / out_w % out_h;
                int out_x = i % out_w;
                int index = indexes[i];
                int max_y = max(0, out_y * sy - ph + index / kw);
                int max_x = max(0, out_x * sx - pw + index % kw);
                out = in[max_x + w * (max_y + h * c0)];
                ''', 'max_pool_grad_fwd')(
                    x.reduced_view(), self.indexes.reduced_view(), h, w,
                    y_h, y_w, self.kh, self.kw, self.sy, self.sx, self.ph,
                    self.pw, y)
            return y,
Example No. 15
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        c_prev, x, c_next, gc, gh = inputs
        batch = len(x)

        gx = xp.empty_like(x)
        ga, gi, gf, go = _extract_gates(gx)

        # Consider the case that either gradient is not given
        if gc is None:
            gc_update = 0
            gc_rest = 0
        else:
            gc_update = gc[:batch]
            gc_rest = gc[batch:]
        if gh is None:
            gh = 0

        a, i, f, o = _extract_gates(x)
        if xp is numpy:
            if intel64.should_use_ideep('>=auto'):
                xp = intel64.ideep.get_array_module(x)
            tanh_a = xp.tanh(a)
            sig_i = _sigmoid(i, xp)
            sig_f = _sigmoid(f, xp)
            sig_o = _sigmoid(o, xp)

            co = xp.tanh(c_next[:batch])
            gc_prev = numpy.empty_like(c_prev)
            # multiply f later
            gc_prev[:batch] = gh * sig_o * _grad_tanh(co) + gc_update
            gc = gc_prev[:batch]
            ga[:] = gc * sig_i * _grad_tanh(tanh_a)
            gi[:] = gc * tanh_a * _grad_sigmoid(sig_i)
            gf[:] = gc * c_prev[:batch] * _grad_sigmoid(sig_f)
            go[:] = gh * co * _grad_sigmoid(sig_o)
            gc_prev[:batch] *= sig_f  # multiply f here
            gc_prev[batch:] = gc_rest
        else:
            gc_prev = xp.empty_like(c_prev)
            cuda.elementwise(
                'T c_prev, T c, T gc, T gh, T a, T i_, T f, T o',
                'T gc_prev, T ga, T gi, T gf, T go',
                '''
                    COMMON_ROUTINE;
                    T co = tanh(c);
                    T temp = gh * ao * grad_tanh(co) + gc;
                    ga = temp * ai * grad_tanh(aa);
                    gi = temp * aa * grad_sigmoid(ai);
                    gf = temp * c_prev * grad_sigmoid(af);
                    go = gh * co * grad_sigmoid(ao);
                    gc_prev = temp * af;
                ''',
                'lstm_bwd', preamble=_preamble)(
                    c_prev[:batch], c_next[:batch], gc_update, gh, a, i, f, o,
                    gc_prev[:batch], ga, gi, gf, go)
            gc_prev[batch:] = gc_rest

        return gc_prev, gx
Example No. 16
def _transpose(xs, length):
    if length == 0:
        return ()

    xp = backend.get_array_module(*xs)
    lengths = numpy.empty(length, dtype=numpy.int32)
    end = length
    for i, x in enumerate(xs):
        len_x = len(x)
        if len_x == end:
            continue
        lengths[len_x:end] = i
        end = len_x
    lengths[0:end] = len(xs)

    if xp is numpy:
        dtype = xs[0].dtype
        unit = xs[0].shape[1:]

        outs = tuple([xp.empty((l,) + unit, dtype=dtype) for l in lengths])
        for i, x in enumerate(xs):
            for p, xi in enumerate(x):
                outs[p][i] = xi

    else:
        offsets1 = numpy.empty(len(xs) + 1, dtype=numpy.int32)
        offsets1[0] = 0
        numpy.cumsum([len(x) for x in xs], out=offsets1[1:])

        offsets2 = numpy.empty(length + 1, dtype=numpy.int32)
        offsets2[0] = 0
        numpy.cumsum(lengths, dtype=numpy.int32, out=offsets2[1:])

        x = xp.concatenate(xs, axis=0)
        o = xp.empty_like(x)
        unit = xs[0].size // len(xs[0])
        size = length * len(xs) * unit
        cuda.elementwise(
            'int32 len, int32 unit, raw int32 off1, raw int32 off2, raw T vs',
            'raw T hs',
            '''
            int ind = i / unit;
            int off = i - ind * unit;
            int y = ind / len;
            int x = ind - y * len;
            if (off2[x] + y < off2[x + 1]) {
              hs[(off2[x] + y) * unit + off] = vs[(off1[y] + x) * unit + off];
            }
            ''',
            'transpose_sequence'
        )(length, unit, cuda.to_gpu(offsets1), cuda.to_gpu(offsets2), x, o,
          size=size)
        outs = tuple(xp.split(o, offsets2[1:-1]))

    return outs
Example No. 17
File: ctc.py Project: tkng/chainer
    def _computes_transition(
            self, prev_prob, path, path_length, cum_prob, y):
        xp = cuda.get_array_module(prev_prob)

        if xp == numpy:
            n_batch, max_path_length = path.shape
            mat = xp.full(
                (3, n_batch, max_path_length), self.zero_padding, 'f')
            mat[0, :, :] = prev_prob
            mat[1, :, 1:] = prev_prob[:, :-1]
            mat[2, :, 2:] = prev_prob[:, :-2]
            # disable transition between the same symbols
            # (including blank-to-blank)
            same_transition = (path[:, :-2] == path[:, 2:])
            mat[2, :, 2:][same_transition] = self.zero_padding
            prob = _logsumexp(mat, xp, axis=0)
            outside = xp.arange(max_path_length) >= path_length[:, None]
            prob[outside] = self.zero_padding
            cum_prob += prob
            batch_index = xp.arange(n_batch, dtype='i')
            prob += y[batch_index[:, None], path]
        else:
            prob = xp.empty_like(prev_prob)
            cuda.elementwise(
                'raw T prob, raw I path, I path_length, T zero, raw T y',
                'T z, T cum_prob',
                '''
                int length = prob.shape()[1];
                int b = i / length;
                int t = i - b * length;
                if (t >= path_length) {
                  z = zero;
                  cum_prob += zero;
                  return;
                }
                int ind1[] = {b, t};
                int ind2[] = {b, t - 1};
                int ind3[] = {b, t - 2};
                float f1 = prob[ind1];
                float f2 = (0 <= t - 1) ? prob[ind2] : zero;
                float f3 = (0 <= t - 2 && path[ind3] != path[ind1]) ?
                  prob[ind3] : zero;

                // calculates log-sum-exp
                float m = max(f1, max(f2, f3));
                z = m + log(exp(f1 - m) + exp(f2 - m) + exp(f3 - m));

                cum_prob += z;

                int y_ind[] = {b, path[ind1]};
                z += y[y_ind];
                ''', 'ctc_transition'
            )(prev_prob, path, path_length[:, None], self.zero_padding, y,
              prob, cum_prob)
        return prob
Example No. 18
    def forward_gpu(self, x):
        if chainer.should_use_cudnn('>=auto'):
            self.retain_inputs((0,))
            return super(MaxPooling2D, self).forward_gpu(x)

        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(
            h, self.kh, self.sy, self.ph, self.cover_all)
        assert y_h > 0, 'Height in the output should be positive.'
        y_w = conv.get_conv_outsize(
            w, self.kw, self.sx, self.pw, self.cover_all)
        assert y_w > 0, 'Width in the output should be positive.'
        y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x[0].dtype)
        self.indexes = cuda.cupy.empty((n, c, y_h, y_w), dtype=numpy.int32)

        cuda.elementwise(
            'raw T in, int32 h, int32 w, int32 out_h, int32 out_w,'
            'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
            'T out, S indexes',
            '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               T maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
               int argmax_y = in_y_0;
               int argmax_x = in_x_0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   float v = in[x + offset_y];
                   if (maxval < v) {
                     maxval   = v;
                     argmax_y = y;
                     argmax_x = x;
                   }
                 }
               }
               out = maxval;

               int argmax_ky = argmax_y + ph - out_y * sy;
               int argmax_kx = argmax_x + pw - out_x * sx;
               indexes = argmax_kx + kw * argmax_ky;
            ''', 'max_pool_fwd')(x[0].reduced_view(),
                                 h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw,
                                 y, self.indexes)
        return y,
Example No. 19
    def forward_gpu(self, inputs_and_grad_outputs):
        class_weight = cuda.to_gpu(self.class_weight)

        cupy = cuda.cupy
        x, t, gloss = inputs_and_grad_outputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        n_unit = t.size // len(t)
        if self.coeff is not None:
            coeff = self.coeff
        else:
            gloss = gloss[:, None, ...]
            coeff = cupy.array(1, dtype=gloss.dtype)  # dtype does not matter

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T gloss, U coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                        gx = T(0);
                    } else {
                        gx = static_cast<T>(gloss * coeff * (y - (c == t)));
                    }
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T gloss, U coeff, '
                'S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    if (t == ignore_label) {
                        gx = T(0);
                    } else {
                        gx = static_cast<T>(
                            gloss * coeff * (y - (c == t)) * w[t]);
                    }
                ''',
                'softmax_crossent_weight_bwd')(
                    y, class_weight, cupy.expand_dims(t, 1), gloss, coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx,
Example No. 20
 def forward_gpu(self, inputs):
     gy, = inputs
     x1, x2 = self.x1, self.x2
     gx1 = cuda.elementwise(
         'T x1, T x2, T gy', 'T gx1',
         'gx1 = (x1 <= x2) ? gy : (T)0.0',
         'minimum_bwd1')(x1, x2, gy)
     gx2 = cuda.elementwise(
         'T x1, T x2, T gy', 'T gx1',
         'gx1 = (x1 > x2) ? gy : (T)0.0',
         'minimum_bwd2')(x1, x2, gy)
     return utils.sum_to(gx1, x1.shape), utils.sum_to(gx2, x2.shape)
Example No. 21
 def update_core_gpu(self, param):
     grad = param.grad
     if grad is None:
         return
     cuda.elementwise(
         'T grad, T lr, T momentum',
         'T param, T v',
         '''v = momentum * v - lr * grad;
            param += v;''',
         'momentum_sgd')(
             grad, self.hyperparam.lr, self.hyperparam.momentum,
             param.data, self.state['v'])
Example No. 22
 def forward_gpu(self, x):
     self.y = cuda.cupy.square(x[0])  # temporary
     self.scale = cuda.cupy.empty_like(self.y)
     _cu_conv_sum(self.scale, self.y, self.n)
     cuda.elementwise(
         'T x, T k, T alpha, T beta',
         'T y, T scale',
         '''scale = k + alpha * scale;
            y = x * pow(scale, -beta);''',
         'lrn_fwd')(x[0], self.k, self.alpha, self.beta,
                    self.y, self.scale)
     return self.y,
Example No. 23
def _inverse_indices(indices):
    xp = cuda.get_array_module(indices)
    r = xp.empty_like(indices)
    if xp is numpy:
        r[indices] = numpy.arange(len(indices))
    else:
        cuda.elementwise(
            'S ind', 'raw S r',
            'r[ind] = i',
            'inverse_indices'
        )(indices, r)
    return r
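A small CPU-only check of the inverse-permutation identity used by both branches (my own sketch): scattering arange(n) through the permutation produces its inverse.

import numpy

indices = numpy.array([2, 0, 3, 1])
r = numpy.empty_like(indices)
r[indices] = numpy.arange(len(indices))   # same scatter the kernel performs
assert (indices[r] == numpy.arange(4)).all()
assert (r[indices] == numpy.arange(4)).all()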
Example No. 24
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1, 2))
        x, t, W = inputs

        self.ignore_mask = (t != self.ignore_label)
        samples = self._make_samples(t)

        n_in = x.shape[1]
        self.wx = cuda.elementwise(
            'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx',
            '''
            T f = 0;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  int x_ind[] = {(i / m), j};
                  int w_ind[] = {k, j};
                  f += x[x_ind] * W[w_ind];
                }
            }
            wx = f;
            ''',
            'negative_sampling_wx'
        )(W, x, self.ignore_mask[:, None], samples, n_in,
          self.sample_size + 1)

        loss = cuda.elementwise(
            'T wx, int32 c, int32 m, bool mask', 'T y',
            '''
            if (mask) {
              T f = wx;
              if (i % m == 0) {
                f = -f;
              }
              if (f < 0) {
                y = __logf(1 + __expf(f));
              } else {
                y = f + __logf(1 + __expf(-f));
              }
            } else {
              y = 0;
            }
            ''',
            'negative_sampling_forward'
        )(self.wx, n_in, self.sample_size + 1, self.ignore_mask[:, None])

        if self.reduce == 'sum':
            loss = loss.sum()
        else:  # 'no':
            loss = loss.sum(axis=1)

        self.samples = samples
        return loss,
Example No. 25
 def backward_gpu(self, x, gy):
     summand = cuda.elementwise(
         'T scale, T y, T gy', 'T summand',
         'summand = y * gy / scale',
         'lrn_bwd_summand')(self.scale, self.y, gy[0])
     gx = cuda.cupy.empty_like(x[0])
     _cu_conv_sum(gx, summand, self.n)
     cuda.elementwise(
         ' T x, T gy, T scale, T beta, T coeff', 'T gx',
         'gx = pow(scale, -beta) * gy - coeff * x * gx',
         'lrn_bwd')(x[0], gy[0], self.scale,
                    self.beta, 2 * self.alpha * self.beta, gx)
     return gx,
Example No. 26
 def forward_gpu(self, inputs):
     x, y, gy = inputs
     summand = cuda.elementwise(
         'T scale, T y, T gy', 'T summand',
         'summand = y * gy / scale',
         'lrn_bwd_summand')(self.scale, y, gy)
     gx = cuda.cupy.empty_like(x)
     _cu_conv_sum(gx, summand, self.n)
     cuda.elementwise(
         ' T x, T gy, T scale, T beta, T coeff', 'T gx',
         'gx = pow(scale, -beta) * gy - coeff * x * gx',
         'lrn_bwd')(x, gy, self.scale,
                    self.beta, 2 * self.alpha * self.beta, gx)
     return gx,
Example No. 27
 def update_core_gpu(self, param):
     grad = param.grad
     if grad is None:
         return
     cuda.elementwise(
         'T grad, T one_minus_rho, T eps',
         'T param, T msg, T msdx',
         '''msg   = msg + one_minus_rho * (grad * grad - msg);
            T dx  = sqrt((msdx + eps) / (msg + eps)) * grad;
            msdx  += one_minus_rho * (dx * dx - msdx);
            param -= dx;''',
         'adadelta')(grad, 1 - self.hyperparam.rho,
                     self.hyperparam.eps, param.data,
                     self.state['msg'], self.state['msdx'])
Example No. 28
 def update_core_gpu(self, param):
     grad = param.grad
     if grad is None:
         return
     cuda.elementwise(
         'T grad, T lr, T momentum',
         'T param, T v',
         '''
            v = v * momentum - lr * grad;
            param += momentum * momentum * v - (1 + momentum) * lr * grad;
         ''',
         'nesterov_ag')(
             grad, self.hyperparam.lr, self.hyperparam.momentum,
             param.data, self.state['v'])
Example No. 29
    def backward(self, inputs, grad_outputs):
        x, gamma, _ = inputs
        gy = grad_outputs[0]
        head_ndim = gamma.ndim + 1
        expander = (None, Ellipsis) + (None,) * (x.ndim - head_ndim)
        m = gamma.dtype.type(x.size // gamma.size)
        axis = (0,) + tuple(range(head_ndim, x.ndim))
        xp = backend.get_array_module(x)

        # Note: we must be in train mode.
        assert configuration.config.train
        # NOTE(tommi): cuDNN is not used since it does not support
        # batch renormalization
        gbeta = gy.sum(axis=axis)
        ggamma = (gy * self.x_hat_renorm).sum(axis=axis)
        gsigma_batch = (gy * self.x_hat).sum(axis=axis)
        if xp is numpy:
            scale = (self.r * gamma / self.std)[expander]
            gx = scale * (gy - (self.x_hat * gsigma_batch[expander] +
                                gbeta[expander]) / m)
        else:
            inv_m = numpy.float32(1) / m
            gx = cuda.elementwise(
                'T gy, T x_hat, T gamma, T std, T gsigma_batch, T gbeta, \
                T inv_m, T r',
                'T gx',
                'gx = (r * gamma / std) * (gy - (x_hat * gsigma_batch + gbeta) * \
                inv_m)',
                'bn_bwd')(gy, self.x_hat, gamma[expander],
                          self.std[expander], gsigma_batch[expander],
                          gbeta[expander], inv_m, self.r[expander])
        return gx, ggamma, gbeta
Example No. 30
 def forward_gpu(self, x):
     self.retain_inputs((0,))
     return cuda.elementwise(
         'T x', 'T y',
         'y = erfc(x)',
         'elementwise_erfc',
     )(x[0]),
Example No. 31
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1))
     x, gy = inputs
     # TODO(beam2d): Make it not use the input
     value = _preprocess_const(x, self.value)
     gx = cuda.elementwise('T x, T gy, T value', 'T gx',
                           'gx = -value * gy / (x * x)',
                           'div_from_const_bwd')(x, gy, value)
     return gx,
Example No. 32
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1, 2))
     p, x, y = inputs
     return cuda.elementwise(
         'T p, T x, T y',
         'T z',
         'z = p * x + (1 - p) * y',
         'linear_interpolate_fwd',
     )(p, x, y),
Example No. 33
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        x, gamma, beta = inputs

        # Note: we must be in train mode.
        assert configuration.config.train

        head_ndim = gamma.ndim + 1
        expander = (None, Ellipsis) + (None, ) * (x.ndim - head_ndim)

        # NOTE(tommi): cuDNN is not used since it does not support
        # batch renormalization
        axis = (0, ) + tuple(range(head_ndim, x.ndim))
        mean = x.mean(axis=axis, dtype=gamma.dtype)
        var = x.var(axis=axis, dtype=gamma.dtype)
        self.std = xp.sqrt(var + self.eps, dtype=var.dtype)

        running_sigma = xp.sqrt(self._running_var + self.eps,
                                dtype=self._running_mean.dtype)
        self.r = xp.clip(self.std / running_sigma, 1.0 / self.rmax, self.rmax)
        d = xp.clip((mean - self._running_mean) / running_sigma, -self.dmax,
                    self.dmax)

        gamma = gamma[expander]
        beta = beta[expander]

        if xp is numpy:
            self.x_hat = _xhat(x, mean, self.std, expander)
            self.x_hat_renorm = self.x_hat * self.r[expander] + d[expander]
            y = gamma * self.x_hat_renorm
            y += beta
            y = y.astype(dtype=x.dtype)
        else:
            self.x_hat, self.x_hat_renorm, y = cuda.elementwise(
                'T x, U mean, U std, U gamma, U beta, U r, U d',
                'U x_hat, U x_hat_renorm, T y', '''
                x_hat = (x - mean) / std;
                x_hat_renorm = x_hat * r + d;
                y = gamma * x_hat_renorm + beta;
                ''', 'brn_fwd')(x, mean[expander], self.std[expander], gamma,
                                beta, self.r[expander], d[expander])

        if self.update_statistics:
            m = x.size // gamma[expander].size
            self._running_mean *= self.decay
            adjust = m / max(m - 1., 1.)  # unbiased estimation
            temp_ar = xp.array(mean)
            temp_ar *= (1 - self.decay)
            self._running_mean += temp_ar
            del temp_ar
            self._running_var *= self.decay
            temp_ar = xp.array(var)
            temp_ar *= (1 - self.decay) * adjust
            self._running_var += temp_ar
            del temp_ar

        return y,
Example No. 34
 def forward_gpu(self, x):
     self.retain_inputs((0, ))
     self.retain_outputs((0, ))
     return cuda.elementwise(
         'T x',
         'T y',
         'y = erfcx(x)',
         'elementwise_erfcx',
     )(x[0]),
Example No. 35
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1, 2, 3))
     p, x, y, gz = inputs
     return cuda.elementwise(
         'T p, T x, T y, T gz', 'T gp, T gx, T gy', '''
         gp = (x - y) * gz;
         gx = gz * p;
         gy = gz * (1 - p);
         ''', 'linear_interpolate_bwd')(p, x, y, gz)
Example No. 36
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        y, gy = inputs

        value = _preprocess_const(y, self.value)
        gx = cuda.elementwise('T y, T gy, T value', 'T gx',
                              'gx = log(value) * y * gy',
                              'pow_const_var_bwd')(y, gy, value)
        return gx,
Example No. 37
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        x, gy = inputs

        self.val = _preprocess_const(x, self.value)
        gx = cuda.elementwise('T x, T gy, T value', 'T gx',
                              'gx = value * pow(x, value - 1) * gy',
                              'pow_var_const_bwd')(x, gy, self.val)
        return gx,
Example No. 38
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1, 2))
     x1, x2, gy = inputs
     gx1, gx2 = cuda.elementwise('T x1, T x2, T gy', 'T gx1, T gx2',
                                 ('T sqnorm = x1 * x1 + x2 * x2;'
                                  'gx1 = x2 / sqnorm * gy;'
                                  'gx2 = -x1 / sqnorm * gy;'),
                                 'arctan2_bwd')(x1, x2, gy)
     return gx1, gx2
Example No. 39
def entropy_gpu(x):
    vec = cuda.elementwise(
        'T x',
        'T y',
        '''
            y = (x == 0) ? 0 : -x*log(x);
        ''',
        'entropy')(x.data)
    return cuda.cupy.sum(vec, 1)
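A NumPy equivalent for reference (my own sketch), using the convention 0 * log 0 = 0 that the kernel encodes with the ternary:

import numpy

def entropy_cpu(x):
    # -sum_j x_ij * log(x_ij) over axis 1, treating 0 * log(0) as 0.
    with numpy.errstate(divide='ignore', invalid='ignore'):
        v = numpy.where(x == 0, 0.0, -x * numpy.log(x))
    return v.sum(axis=1)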
Example No. 40
 def log_matrix(self, x, xp):
     if xp == numpy:
         res = numpy.ma.log(x).filled(fill_value=self.zero_padding)
     else:
         create_recurrence_relation = cuda.elementwise(
             'T x, T e', 'T y', 'y = x == 0 ? e : log(x)',
             'create_recurrence_relation')
         res = create_recurrence_relation(x, self.zero_padding)
     return res.astype(numpy.float32)
Example No. 41
 def forward_gpu(self, inputs):
     self.retain_inputs((0, ))
     x = inputs[0]
     y = cuda.elementwise(
         'T x, T beta, T beta_inv', 'T y', '''
           T bx = beta * x;
           y = (max(bx, (T)0) + log1p(exp(-fabs(bx)))) * beta_inv;
         ''', 'softplus_fwd')(x, self.beta, self.beta_inv)
     return y,
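This kernel evaluates the numerically stable form of softplus; a NumPy rendering of the same formula (my own sketch):

import numpy

def softplus_cpu(x, beta=1.0):
    # softplus(x) = (1/beta) * log(1 + exp(beta * x)), computed stably as
    # (max(beta*x, 0) + log1p(exp(-|beta*x|))) / beta.
    bx = beta * x
    return (numpy.maximum(bx, 0) + numpy.log1p(numpy.exp(-numpy.abs(bx)))) / beta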
Example No. 42
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1, 2))
     x0, x1, gy = inputs
     gx0, gx1 = cuda.elementwise(
         'T x0, T x1, T gy', 'T gx0, T gx1', '''
            gx0 = gy / x1;
            gx1 = -gx0 * x0 / x1;
         ''', 'div_bwd')(x0, x1, gy)
     return utils.sum_to(gx0, x0.shape), utils.sum_to(gx1, x1.shape)
Example No. 43
 def forward_gpu(self, inputs):
     self.retain_inputs((0, ))
     x, = inputs
     x_hat = cuda.elementwise('T x, T mean, T inv_std', 'T x_hat',
                              'x_hat = (x - mean) * inv_std',
                              'groupnorm_x_hat')(x, self.mean[:, None],
                                                 self.inv_std[:, None])
     self.retain_outputs((0, ))
     return x_hat,
Example No. 44
    def update_core_gpu(self, param):
        grad = param.grad
        if grad is None:
            return

        hp = self.hyperparam
        eps = grad.dtype.type(hp.eps)
        if hp.eps != 0 and eps == 0:
            raise ValueError(
                'eps of Adam optimizer is too small for {} ({})'.format(
                    grad.dtype.name, hp.eps))
        if hp.amsgrad:
            if AdamRule._amsgrad_kernel is None:
                AdamRule._amsgrad_kernel = cuda.elementwise(
                    'T grad, T alpha_t, T one_minus_beta1, T one_minus_beta2, '
                    'T eps, T eta, T weight_decay_rate',
                    'T param, T m, T v, T vhat',
                    '''m += one_minus_beta1 * (grad - m);
                       v += one_minus_beta2 * (grad * grad - v);
                       vhat = max(vhat, v);
                       param -= eta * (alpha_t * m / (sqrt(vhat) + eps) +
                                       weight_decay_rate * param);''',
                    'adam')
            AdamRule._amsgrad_kernel(
                grad, self.alpha_t, 1 - hp.beta1,
                1 - hp.beta2, hp.eps,
                hp.eta, hp.weight_decay_rate,
                param.data, self.state['m'], self.state['v'],
                self.state['vhat'])
        else:
            if AdamRule._kernel is None:
                AdamRule._kernel = cuda.elementwise(
                    'T grad, T alpha_t, T one_minus_beta1, T one_minus_beta2, '
                    'T eps, T eta, T weight_decay_rate',
                    'T param, T m, T v',
                    '''m += one_minus_beta1 * (grad - m);
                       v += one_minus_beta2 * (grad * grad - v);
                       param -= eta * (alpha_t * m / (sqrt(v) + eps) +
                                       weight_decay_rate * param);''',
                    'adam')
            AdamRule._kernel(grad, self.alpha_t, 1 - hp.beta1,
                             1 - hp.beta2, hp.eps,
                             hp.eta, hp.weight_decay_rate,
                             param.data, self.state['m'], self.state['v'])
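For readability, a NumPy sketch of the non-amsgrad update the kernel performs (my own illustration; alpha_t stands for the bias-corrected step size that AdamRule computes elsewhere):

import numpy

def adam_step(param, grad, m, v, alpha_t, beta1, beta2, eps,
              eta=1.0, weight_decay_rate=0.0):
    # In-place moment updates and parameter step, mirroring the 'adam' kernel.
    m += (1 - beta1) * (grad - m)
    v += (1 - beta2) * (grad * grad - v)
    param -= eta * (alpha_t * m / (numpy.sqrt(v) + eps)
                    + weight_decay_rate * param)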
Example No. 45
    def forward(self, xs):
        xp = backend.get_array_module(*xs)

        if self.length is None:
            length = max(len(x) for x in xs)
        else:
            length = self.length

        shape = (len(xs), length) + xs[0].shape[1:]
        y = xp.empty(shape, xs[0].dtype)
        if length == 0:
            return y,  # y is an empty array

        if xp is numpy or any(not x._c_contiguous for x in xs):
            for i, x in enumerate(xs):
                l = len(x)
                if l == length:
                    y[i] = x
                else:
                    y[i, 0:l] = x
                    y[i, l:] = self.padding
        else:
            # This code assumes that all arrays are c_contiguous
            ptr_shape = (Ellipsis,) + (None,) * xs[0].ndim
            ptrs = cuda.cupy.array(
                [x.data for x in xs], numpy.uintp)[ptr_shape]
            lengths = cuda.cupy.array(
                [len(x) for x in xs], numpy.int32)[ptr_shape]
            base = utils.size_of_shape(xs[0].shape[1:])
            cuda.elementwise(
                'P ptr, int32 length, T pad, int32 base, int32 max_length',
                'T y',
                '''
                int d = i / base % max_length;
                if (d < length) {
                  y = reinterpret_cast<const T*>(ptr)[i % (base * max_length)];
                } else {
                  y = pad;
                }
                ''',
                'pad_sequence_fwd'
            )(ptrs, lengths, self.padding, base, length, y)

        return y,
Example No. 46
    def forward_gpu(self, inputs):
        x, t, W = inputs
        self.ignore_mask = (t != self.ignore_label)
        n_in = x.shape[1]
        self._make_samples(t)

        self.wx = cuda.elementwise(
            'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx', '''
            T f = 0;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  int x_ind[] = {(i / m), j};
                  int w_ind[] = {k, j};
                  f += x[x_ind] * W[w_ind];
                }
            }
            wx = f;
            ''',
            'negative_sampling_wx')(W, x, self.ignore_mask[:, None],
                                    self.samples, n_in, self.sample_size + 1)

        loss = cuda.elementwise(
            'T wx, int32 c, int32 m, bool mask', 'T y', '''
            if (mask) {
              T f = wx;
              if (i % m == 0) {
                f = -f;
              }
              if (f < 0) {
                y = __logf(1 + __expf(f));
              } else {
                y = f + __logf(1 + __expf(-f));
              }
            } else {
              y = 0;
            }
            ''',
            'negative_sampling_forward')(self.wx, n_in, self.sample_size + 1,
                                         self.ignore_mask[:, None])
        if self.reduce == 'sum':
            loss = loss.sum()
        else:  # 'no':
            loss = loss.sum(axis=1)
        return loss,
Example No. 47
 def forward_gpu(self, inputs):
     gy, = inputs
     gx1, gx2 = cuda.elementwise(
         'S cond, T gy', 'T gx1, T gx2',
         '''
         gx1 = cond ? gy : (T)0.0;
         gx2 = cond ? (T)0.0 : gy;
         ''',
         'maximum_bwd1')(self.cond, gy)
     return gx1, gx2
Example No. 48
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1, 2))
        x0, x1, gy = inputs

        gx0, gx1 = cuda.elementwise(
            'T x0, T x1, T gy, T y', 'T gx0, T gx1', '''
            gx0 = x1 * pow(x0, x1 - 1) * gy;
            gx1 = log(x0) * y * gy;
            ''', 'pow_var_var_bwd')(x0, x1, gy, self.y)
        return gx0, gx1
Example No. 49
 def update_core_gpu(self, param):
     grad = param.grad
     if grad is None:
         return
     if AdaGradRule._kernel is None:
         AdaGradRule._kernel = cuda.elementwise(
             'T grad, T lr, T eps', 'T param, T h', '''h += grad * grad;
                param -= lr * grad / (sqrt(h) + eps);''', 'adagrad')
     AdaGradRule._kernel(grad, self.hyperparam.lr, self.hyperparam.eps,
                         param.data, self.state['h'])
Example No. 50
    def forward_gpu(self, gy):
        if self._used_cudnn:
            x, = self.apoolnd.get_retained_inputs()
            return self.apoolnd.backward_gpu((x.data, ), gy)

        n, c = self._in_shape[:2]
        dims = self._in_shape[2:]
        ys = gy[0].shape[2:]
        gx = cuda.cupy.empty(self._in_shape, self._in_dtype)
        coeff = 1. / functools.reduce(operator.mul, self.ksize)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate(
                self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            gy[0].reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad + (coeff, gx)))

        return gx,
Example No. 51
 def forward_gpu(self, inputs):
     x, = inputs
     gx = cuda.elementwise(
         'T x, T alpha', 'T gx',
         'gx = x >= 0 ? (T)1 : (T)(alpha * exp(x))',
         'elu_bwd')(
             x, self.alpha)
     self.retain_inputs((0,))
     self.retain_outputs((0,))
     return gx,
Example No. 52
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1, 2))
     x0, x1, gy = inputs
     return cuda.elementwise(
         'T x0, T x1, T gy',
         'T gx0, T gx1',
         '''
            gx0 = gy / x1;
            gx1 = -gx0 * x0 / x1;
         ''', 'div_bwd')(x0, x1, gy)
Example No. 53
    def _forward_gpu_compute_indexes_again(self, inputs):
        x, ggx = inputs
        n, c, h, w = ggx.shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph,
                                    self.cover_all)
        assert y_h > 0, 'Height in the output should be positive.'
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw,
                                    self.cover_all)
        assert y_w > 0, 'Width in the output should be positive.'
        y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x.dtype)

        cuda.elementwise(
            'raw T in, raw T ggx, int32 h, int32 w, int32 out_h,'
            'int32 out_w, int32 kh, int32 kw, int32 sy, int32 sx,'
            'int32 ph, int32 pw', 'T out', '''
            int c0    = i / (out_h * out_w);
            int out_y = i / out_w % out_h;
            int out_x = i % out_w;
            int in_y_0 = max(0, out_y * sy - ph);
            int in_y_1 = min(h, out_y * sy + kh - ph);
            int in_x_0 = max(0, out_x * sx - pw);
            int in_x_1 = min(w, out_x * sx + kw - pw);

            T maxval = in[in_x_0 + w * (in_y_0 + h * c0)];
            int argmax_y = in_y_0;
            int argmax_x = in_x_0;
            for (int y = in_y_0; y < in_y_1; ++y) {
                int offset_y = w * (y + h * c0);
                for (int x = in_x_0; x < in_x_1; ++x) {
                    float v = in[x + offset_y];
                    if (maxval < v) {
                        maxval   = v;
                        argmax_y = y;
                        argmax_x = x;
                    }
                }
            }
            out = ggx[argmax_x + w * (argmax_y + h * c0)];
            ''',
            'max_pool_grad_fwd_calc_indexes')(x.reduced_view(),
                                              ggx.reduced_view(), h, w, y_h,
                                              y_w, self.kh, self.kw, self.sy,
                                              self.sx, self.ph, self.pw, y)
        return y,
Example No. 54
 def forward_gpu(self, inputs):
     x, = inputs
     gx = cuda.elementwise(
         'T x, T r, T ir, T r2', 'T gx',
         'gx = x >= 0 ? (T) r*pow((r2*x+1),(ir-1)) : (T)(r * exp(r*x))',
         'eru_bwd')(
             x, self.r, self.ir, self.r2)
     self.retain_inputs((0,))
     self.retain_outputs((0,))
     return gx,
Example No. 55
 def forward_gpu(self, inputs):
     x, = inputs
     gx = cuda.elementwise(
         'T x, T r, T ir, T r2', 'T gx',
         'gx = r * pow((r2*abs(x) + 1),(ir - 1))',
         'oru_bwd')(
             x, self.r, self.ir, self.r2)
     self.retain_inputs((0,))
     self.retain_outputs((0,))
     return gx,
Example No. 56
    def forward_gpu(self, inputs):
        x = inputs[0]
        b, c, h, w = x.shape

        y = cuda.cupy.empty_like(x)
        cuda.elementwise(
            'raw T x, int32 c, int32 h, int32 w,'
            'int32 kh, int32 kw,'
            'int32 dy, int32 dx', 'T y', '''
               int b0 = i / (c * h * w);
               int rest = i % (c * h * w);
               int c0 = rest / (h * w);
               rest %= h * w;
               int out_row = rest / w;
               int out_col = rest % w;

               int n_groups = kh * kw;
               int group_size = c / n_groups;
               int group_idx = c0 / group_size;
               // Make sure that center group is last
               if (group_idx == (n_groups - 1) / 2) {
                  group_idx = n_groups - 1;
               } else if (group_idx == n_groups - 1) {
                  group_idx = (n_groups - 1) / 2;
               }

               int ky = (group_idx / kw) - kh / 2;
               int kx = (group_idx % kw) - kw / 2;
               if (group_idx >= n_groups) {
                  ky = 0;
                  kx = 0;
               }

               int in_row = -ky * dy + out_row;
               int in_col = -kx * dx + out_col;
               if (in_row >= 0 && in_row < h && in_col >= 0 && in_col < w) {
                 y = x[b0 * c * h * w + c0 * h * w + in_row * w + in_col];
               } else {
                 y = 0;
               }
            ''', 'shift_gpu')(x, c, h, w, self.kh, self.kw, self.dy, self.dx,
                              y)
        return y,
Example No. 57
def col2im_nd_gpu(col, stride, pad, dims):
    n, c = col.shape[:2]  # (n, c, k_1, ..., k_N, out_1, ..., out_N)
    mid = (len(col.shape) - 2) // 2 + 2
    ksize = col.shape[2:mid]
    outs = col.shape[mid:]
    ndim = len(dims)
    assert len(outs) == len(ksize) == len(stride) == len(pad) == ndim

    img_shape = (n, c) + dims  # (n, c, d_1, d_2, ..., d_N)
    img = cuda.cupy.empty(img_shape, dtype=col.dtype)

    in_params, out_params, operation, name = \
        conv_nd_kernel.Col2imNDKernel.generate(ndim)

    cuda.elementwise(in_params, out_params, operation,
                     name)(col.reduced_view(),
                           *(dims + outs + ksize + stride + pad + (img, )))

    return img
Example No. 58
    def forward_gpu(self, inputs):
        if chainer.should_use_cudnn('>=auto') and 2 <= self.ndim <= 3:
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            return self.forward_cudnn(inputs)

        ndim = self.ndim
        ksize = self.ksize
        stride = self.stride
        pad = self.pad
        pad_value = self.pad_value
        cover_all = self.cover_all

        x, = inputs
        in_shape = x.shape
        in_dtype = x.dtype

        n, c = in_shape[:2]
        idims = in_shape[2:]
        odims = tuple(
            conv.get_conv_outsize(d, k, s, p, cover_all=cover_all)
            for (d, k, s, p) in six.moves.zip(idims, ksize, stride, pad))
        # (n, c, y_1, y_2, ..., y_N)
        y_shape = (n, c) + odims
        y = cuda.cupy.empty(y_shape, dtype=x.dtype)
        if pad_value is None:
            coeff = self._get_pooling_width(cuda.cupy, idims, x.dtype)
            coeff = cuda.cupy.reciprocal(coeff, out=coeff)
        else:
            assert pad_value == 0
            coeff = 1. / functools.reduce(operator.mul, ksize)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelForward.generate(
                ndim)
        cuda.elementwise(in_params, out_params, operation,
                         name)(x.reduced_view(), *(idims + odims + ksize +
                                                   stride + pad + (coeff, y)))

        self.coeff = coeff
        self._in_shape = in_shape
        self._in_dtype = in_dtype
        return y,
Example No. 59
 def forward_gpu(self, inputs):
     self.retain_inputs((0, 1))
     y, gy = inputs
     if (chainer.should_use_cudnn('==always') and self.x is not None
             and gy.flags.c_contiguous):
         gx = cudnn.activation_backward(self.x, y, gy, _mode)
     else:
         gx = cuda.elementwise('T y, T gy', 'T gx', 'gx = gy * (1 - y * y)',
                               'tanh_bwd')(y, gy)
     return gx,
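The fallback kernel uses the identity d tanh(x)/dx = 1 - tanh(x)^2 in terms of the retained output y; a one-line NumPy rendering (my own sketch):

import numpy

y = numpy.tanh(numpy.array([0.5, -1.0], dtype=numpy.float32))
gy = numpy.ones_like(y)
gx = gy * (1 - y * y)   # same formula as the 'tanh_bwd' kernel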
Example No. 60
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t = inputs
        if x.size == 0:
            return cupy.zeros(x.shape, dtype=x.dtype), None
        if self.y is not None:
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self._coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
                'S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
                ''',
                'softmax_crossent_weight_bwd')(
                    y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                    x.shape[1], n_unit, self.ignore_label)

        return gx, None