Example #1
    def forward_gpu(self, x):
        n, c, h, w = x[0].shape
        out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        out_c = self.W.shape[0]
        y = cuda.empty((n, out_c, out_h, out_w), dtype=self.dtype)
        if cuda.cudnn_enabled and self.use_cudnn:
            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x[0])
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(self.W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    self.b[None, :, None, None])

            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo)
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=self.dtype)

            one = ctypes.c_float(1)
            zero = ctypes.c_float(0)
            libcudnn.convolutionForward(
                handle, one, x_desc.value, x[0].data.ptr,
                self.filter_desc.value, self.W.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero, y_desc.value,
                y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                libcudnn.addTensor(
                    handle, libcudnn.CUDNN_ADD_SAME_C, one,
                    self.bias_desc.value, self.b.data.ptr, one, y_desc.value,
                    y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            y_mats = y.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                y += self.b.reshape((1, out_c, 1, 1))

        return y,
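Note: the output spatial size used above follows Chainer's conv.get_conv_outsize. A minimal pure-Python sketch of that arithmetic (cover_all and dilation handling included for completeness; the helper name here is illustrative):

    def conv_outsize(size, k, s, p, cover_all=False, d=1):
        # Effective kernel size under dilation d.
        dk = k + (k - 1) * (d - 1)
        if cover_all:
            # cover_all rounds up so that every input pixel is covered.
            return (size + p * 2 - dk + s - 1) // s + 1
        return (size + p * 2 - dk) // s + 1

    assert conv_outsize(7, 3, 2, 1) == 4  # 7-px axis, 3-px kernel, stride 2, pad 1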
Example #2
    def forward(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        kh, kw = W.shape[2:]

        xp = cuda.get_array_module(*x)
        if xp is numpy:
            self.col = conv.im2col_cpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw)
        else:
            self.col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw)

        B, C, KY, KX, IY, IX = self.col.shape
        D = W.shape[0]  # (D, C, KY, KX)
        c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
            .reshape((C, B * IY * IX, KY * KX))
        w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))

        # (C, B*IY*IX, KY*KX), (C, KY*KX, D) -> (C, B*IY*IX, D)
        y = _matmul(c_, w_, xp).astype(x.dtype, copy=False)

        # (C, B*IY*IX, D) -> (B, C*D, IY, IX)
        y = y.reshape((C, B, IY * IX, D)).transpose(1, 0, 3, 2) \
            .reshape((B, C * D, IY, IX))

        if b is not None:
            y += b[None, :, None, None]
        return y,
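For intuition, the shape flow above can be checked with plain NumPy; this sketch assumes _matmul behaves like a batched matmul over the leading (channel) axis, and the toy sizes are arbitrary:

    import numpy as np

    B, C, KY, KX, IY, IX, D = 2, 3, 2, 2, 4, 4, 5
    col = np.random.randn(B, C, KY, KX, IY, IX).astype(np.float32)
    W = np.random.randn(D, C, KY, KX).astype(np.float32)

    c_ = col.transpose(1, 0, 4, 5, 2, 3).reshape(C, B * IY * IX, KY * KX)
    w_ = W.transpose(1, 2, 3, 0).reshape(C, KY * KX, D)
    y = np.matmul(c_, w_)  # batched over C: (C, B*IY*IX, D)
    y = y.reshape(C, B, IY * IX, D).transpose(1, 0, 3, 2) \
        .reshape(B, C * D, IY, IX)
    assert y.shape == (B, C * D, IY, IX)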
Example #4
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        x, gy = inputs

        if (not self.cover_all and chainer.should_use_cudnn('>=auto')
                and x.dtype == self.W_dtype
                and ((self.dy == 1 and self.dx == 1)
                     or _cudnn_version >= 6000)):

            # cuDNN implementation
            return self._forward_cudnn(x, gy)

        else:
            # Implementation using im2col
            _, out_c, out_h, out_w = gy.shape
            n, c, h, w = x.shape

            col = conv.im2col_gpu(x,
                                  self.kh,
                                  self.kw,
                                  self.sy,
                                  self.sx,
                                  self.ph,
                                  self.pw,
                                  cover_all=self.cover_all,
                                  dy=self.dy,
                                  dx=self.dx)
            gW = cuda.cupy.tensordot(gy, col, ((0, 2, 3),
                                               (0, 4, 5))).astype(self.W_dtype,
                                                                  copy=False)
        return gW,
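The tensordot call contracts the batch and spatial axes of gy against the matching axes of the im2col buffer, leaving exactly the filter gradient. A NumPy shape check with toy sizes (names are illustrative):

    import numpy as np

    n, c, kh, kw, out_h, out_w, out_c = 2, 3, 3, 3, 4, 4, 5
    gy = np.random.randn(n, out_c, out_h, out_w).astype(np.float32)
    col = np.random.randn(n, c, kh, kw, out_h, out_w).astype(np.float32)

    # Contract (n, out_h, out_w) on both sides -> (out_c, c, kh, kw).
    gW = np.tensordot(gy, col, ((0, 2, 3), (0, 4, 5)))
    assert gW.shape == (out_c, c, kh, kw)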
Example #5
    def backward_gpu(self, x, gy):
        xp = cuda.cupy
      gcol = conv.im2col_gpu(
          gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
          cover_all=self.cover_all)

        gcol = gcol.transpose(0, 1, 4, 5, 2, 3)
        n, c, oy, ox, ky, kx = gcol.shape
        gcol = gcol.reshape((n, c, oy, ox, ky * kx))
        indexes = xp.asarray(self.indexes, dtype=numpy.int32)
        gx = xp.empty((n, c, oy, ox), dtype=x[0].dtype)
        xp.ElementwiseKernel(
            "int32 indexes, raw float32 gcol, int32 n, int32 c, int32 oy," "int32 ox, int32 ky, int32 kx",
            "raw float32 gx",
            """
            int ind_n = i / c / oy / ox;
            int ind_c = (i / oy / ox) % c;
            int ind_oy = (i / ox) % oy;
            int ind_ox = i % ox;
            int gcol_ky = indexes / kx;
            int gcol_kx = indexes % kx;
            float top_gx = gcol[ind_n * c * oy * ox * ky * kx +
                                ind_c * oy * ox * ky * kx +
                                ind_oy * ox * ky * kx +
                                ind_ox * ky * kx +
                                gcol_ky * kx +
                                gcol_kx];
            gx[ind_n * c * oy * ox +
               ind_c * oy * ox +
               ind_oy * ox +
               ind_ox] = top_gx;
            """,
            "upsampling_2d_bwd",
        )(indexes, gcol, n, c, oy, ox, ky, kx, gx)

        return (gx,)
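After the reshape to (n, c, oy, ox, ky * kx), the kernel above is a gather: for each output position it selects the one im2col entry whose flattened kernel offset matches the recorded pooling index. A NumPy reference for the same operation (a sketch; the helper name is illustrative and indexes are assumed to be integers in [0, ky * kx)):

    import numpy as np

    def upsampling_2d_bwd_reference(gcol, indexes):
        # gcol: (n, c, oy, ox, ky*kx); indexes: (n, c, oy, ox)
        return np.take_along_axis(gcol, indexes[..., None], axis=-1)[..., 0]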
Example #6
 def _forward_gpu_core(self, x, gy):
     col = conv.im2col_gpu(
         x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
         cover_all=self.cover_all, dy=self.dy, dx=self.dx)
     gW = cuda.cupy.tensordot(gy, col, ((0, 2, 3), (0, 4, 5))
                              ).astype(self.W_dtype, copy=False)
     return gW,
Example #7
 def forward_gpu(self, x):
     xp = cuda.cupy
     n, c, h, w = x[0].shape
      if self.outh is None:
          self.outh = conv.get_deconv_outsize(
              h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
      if self.outw is None:
          self.outw = conv.get_deconv_outsize(
              w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
      up_y = xp.zeros((n, c, self.outh, self.outw), dtype=numpy.float32)
      up_y = conv.im2col_gpu(
          up_y, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
          cover_all=self.cover_all)
     up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
     n, c, oy, ox, ky, kx = up_y.shape
     indexes = xp.asarray(self.indexes, dtype=numpy.int32)
     xp.ElementwiseKernel(
         "int32 index, float32 x, int32 n, int32 c, int32 oy, int32 ox," "int32 ky, int32 kx",
         "raw float32 up_y",
         """
         int yn = i / c / oy / ox;
         int yc = (i / oy / ox) % c;
         int yoy = (i / ox) % oy;
         int yox = i % ox;
         up_y[yn * c * oy * ox * ky * kx +
           yc * oy * ox * ky * kx +
           yoy * ox * ky * kx +
           yox * ky * kx +
           index] = x;
         """,
         "upsampling_2d_fwd",
     )(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
     up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
      up_y = conv.col2im_gpu(
          up_y, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
     return (up_y,)
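This forward kernel is the matching scatter: each pooled value is written into the single im2col slot named by its index, and col2im_gpu then folds the buffer back to an image. A NumPy sketch of the scatter step (treating up_y as the (n, c, oy, ox, ky * kx) view used by the kernel; the helper name is illustrative):

    import numpy as np

    def upsampling_2d_fwd_reference(x, indexes, ky, kx):
        # x, indexes: (n, c, oy, ox); result: (n, c, oy, ox, ky*kx)
        up_y = np.zeros(x.shape + (ky * kx,), dtype=x.dtype)
        np.put_along_axis(up_y, indexes[..., None], x[..., None], axis=-1)
        return up_y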
Example #10
 def forward(self, inputs):
     x, = inputs
     xp = cuda.get_array_module(x)
      if xp is numpy:
         y = im2col_cpu(x,
                        self.kh,
                        self.kw,
                        self.sy,
                        self.sx,
                        self.ph,
                        self.pw,
                        cover_all=self.cover_all,
                        dy=self.dy,
                        dx=self.dx)
     else:
         y = im2col_gpu(x,
                        self.kh,
                        self.kw,
                        self.sy,
                        self.sx,
                        self.ph,
                        self.pw,
                        cover_all=self.cover_all,
                        dy=self.dy,
                        dx=self.dx)
     n, c, kh, kw, out_h, out_w = y.shape
     y = y.reshape(n, c * kh * kw, out_h, out_w)
     return y,
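The (n, c, kh, kw, out_h, out_w) layout produced by im2col can be reproduced in plain NumPy for the unpadded, stride-1, undilated case; a sketch using sliding_window_view (NumPy >= 1.20; the helper name is illustrative):

    import numpy as np
    from numpy.lib.stride_tricks import sliding_window_view

    def im2col_simple(x, kh, kw):
        # x: (n, c, h, w) -> (n, c, kh, kw, out_h, out_w)
        windows = sliding_window_view(x, (kh, kw), axis=(2, 3))
        return windows.transpose(0, 1, 4, 5, 2, 3)

    x = np.arange(2 * 3 * 5 * 5, dtype=np.float32).reshape(2, 3, 5, 5)
    assert im2col_simple(x, 3, 3).shape == (2, 3, 3, 3, 3, 3)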
Example #11
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        x, gy = inputs
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape

        if (self.cover_all or not chainer.should_use_cudnn('>=auto')
                or x.dtype != self.W_dtype):
            col = conv.im2col_gpu(x,
                                  self.kh,
                                  self.kw,
                                  self.sy,
                                  self.sx,
                                  self.ph,
                                  self.pw,
                                  cover_all=self.cover_all)
            gW = cuda.cupy.tensordot(gy, col, ((0, 2, 3),
                                               (0, 4, 5))).astype(self.W_dtype,
                                                                  copy=False)
            return gW,

        gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype)
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        conv_desc = cudnn.create_convolution_descriptor(*conv_param)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        elif configuration.config.autotune and _cudnn_version >= 5000:
            algo = _get_algorithm_bwd_filter(x, gy, gW, conv_param, handle,
                                             x_desc, gy_desc, conv_desc,
                                             filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        libcudnn.convolutionBackwardFilter_v3(handle, one.data, x_desc.value,
                                              x.data.ptr, gy_desc.value,
                                              gy.data.ptr, conv_desc.value,
                                              algo, workspace.data.ptr,
                                              workspace_size, zero.data,
                                              filter_desc.value, gW.data.ptr)

        return gW,
Example #12
 def backward(self, x, gy):
     if isinstance(gy[0], cuda.ndarray):
         gcol = conv.im2col_gpu(
             gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all
         )
     else:
         gcol = conv.im2col_cpu(
             gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all
         )
     gx = gcol.sum(axis=(2, 3))
     return (gx,)
Example #13
 def test_im2col_consistency(self):
     col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 2, 2, dy=2, dx=2)
     col_gpu = conv.im2col_gpu(cuda.to_gpu(self.x),
                               3,
                               3,
                               2,
                               2,
                               2,
                               2,
                               dy=2,
                               dx=2)
     testing.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
Example #14
 def _forward_gpu_core(self, x, W, b):
     kh, kw = W.shape[2:]
     # Implementation using im2col
     col = conv.im2col_gpu(
         x, kh, kw, self.sy, self.sx, self.ph, self.pw,
         cover_all=self.cover_all, dy=self.dy, dx=self.dx)
     y = cuda.cupy.tensordot(
         col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
     # TODO(beam2d): Support unshared bias
     if b is not None:
         y += b
     y = cuda.cupy.rollaxis(y, 3, 1)
     return y,
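Here tensordot contracts the (c, kh, kw) axes of the im2col buffer against the filters, and rollaxis moves the channel axis back into NCHW order; a NumPy shape check with toy sizes:

    import numpy as np

    n, c, kh, kw, out_h, out_w, out_c = 2, 3, 3, 3, 4, 4, 5
    col = np.random.randn(n, c, kh, kw, out_h, out_w).astype(np.float32)
    W = np.random.randn(out_c, c, kh, kw).astype(np.float32)

    y = np.tensordot(col, W, ((1, 2, 3), (1, 2, 3)))  # (n, out_h, out_w, out_c)
    y = np.rollaxis(y, 3, 1)                          # (n, out_c, out_h, out_w)
    assert y.shape == (n, out_c, out_h, out_w)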
Example #17
    def forward_gpu(self, x):
        self.retain_inputs(())
        self._in_dtype = x[0].dtype

        xp = cuda.cupy
        n, c, h, w = x[0].shape
        if self.outh is None:
            self.outh = conv.get_deconv_outsize(h,
                                                self.kh,
                                                self.sy,
                                                self.ph,
                                                cover_all=self.cover_all)
        if self.outw is None:
            self.outw = conv.get_deconv_outsize(w,
                                                self.kw,
                                                self.sx,
                                                self.pw,
                                                cover_all=self.cover_all)
        up_y = xp.zeros((n, c, self.outh, self.outw), dtype=numpy.float32)
        up_y = conv.im2col_gpu(up_y,
                               self.kh,
                               self.kw,
                               self.sy,
                               self.sx,
                               self.ph,
                               self.pw,
                               cover_all=self.cover_all)
        up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
        n, c, oy, ox, ky, kx = up_y.shape
        indexes = xp.asarray(self.indexes, dtype=numpy.int32)
        xp.ElementwiseKernel(
            'int32 index, float32 x, int32 n, int32 c, int32 oy, int32 ox,'
            'int32 ky, int32 kx', 'raw float32 up_y', '''
            int yn = i / c / oy / ox;
            int yc = (i / oy / ox) % c;
            int yoy = (i / ox) % oy;
            int yox = i % ox;
            up_y[yn * c * oy * ox * ky * kx +
              yc * oy * ox * ky * kx +
              yoy * ox * ky * kx +
              yox * ky * kx +
              index] = x;
            ''', 'upsampling_2d_fwd')(indexes, x[0], n, c, oy, ox, ky, kx,
                                      up_y)
        up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
        up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                               self.outh, self.outw)
        return up_y,
Example #19
 def check_forward(self, y):
     y = F.upsampling_2d(
         self.pooled_y, self.p.indexes, ksize=(self.p.kh, self.p.kw),
         stride=(self.p.sy, self.p.sx), pad=(self.p.ph, self.p.pw),
         outsize=self.in_shape[2:], cover_all=self.p.cover_all)
     if isinstance(y.data, numpy.ndarray):
         y = conv.im2col_cpu(y.data, self.p.kh, self.p.kw,
                             self.p.sy, self.p.sx, self.p.ph, self.p.pw)
     else:
         y = conv.im2col_gpu(y.data, self.p.kh, self.p.kw,
                             self.p.sy, self.p.sx, self.p.ph, self.p.pw)
     for i in numpy.ndindex(y.shape):
         n, c, ky, kx, oy, ox = i
         up_y = y[n, c, ky, kx, oy, ox]
         if ky * y.shape[3] + kx == self.p.indexes[n, c, oy, ox]:
             in_y = self.pooled_y.data[n, c, oy, ox]
             testing.assert_allclose(in_y, up_y)
         else:
             testing.assert_allclose(up_y, 0)
Example #20
    def forward(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        kh, kw = W.shape[2:]

        xp = cuda.get_array_module(*x)
        W = xp.where(W >= 0, 1, -1).astype(numpy.float32, copy=False)

        olen, ilen, hlen, wlen = W.shape
        if self.coeffs is None:
            self.coeffs = numpy.ones(ilen)
        coeffs = numpy.copy(self.coeffs)
        coeffs = numpy.expand_dims(coeffs, 1)
        coeffs = numpy.expand_dims(coeffs, 1)
        coeffs = numpy.expand_dims(coeffs, 0)
        coeffs = numpy.broadcast_to(coeffs, W.shape)
        M = xp.asarray(coeffs, numpy.float32).reshape(W.shape)
        self.M = M
        W = self.M * W

        if xp is numpy:
            self.col = conv.im2col_cpu(x, kh, kw, self.sy, self.sx, self.ph,
                                       self.pw)
        else:
            self.col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph,
                                       self.pw)

        B, C, KY, KX, IY, IX = self.col.shape
        D = W.shape[0]  # (D, C, KY, KX)
        c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
            .reshape((C, B * IY * IX, KY * KX))
        w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))

        # (C, B*IY*IX, KY*KX), (C, KY*KX, D) -> (C, B*IY*IX, D)
        y = _matmul(c_, w_, xp).astype(x.dtype, copy=False)

        # (C, B*IY*IX, D) -> (B, C*D, IY, IX)
        y = y.reshape((C, B, IY * IX, D)).transpose(1, 0, 3, 2) \
            .reshape((B, C * D, IY, IX))

        if b is not None:
            y += b[None, :, None, None]
        return y,
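The weight preprocessing above binarizes W to {+1, -1} by sign and rescales it with one coefficient per input channel before the usual im2col matmul. The same two steps in isolation (a NumPy sketch with arbitrary sizes):

    import numpy as np

    W = np.random.randn(4, 3, 3, 3).astype(np.float32)   # (D, C, KY, KX)
    coeffs = np.ones(W.shape[1], dtype=np.float32)       # one scale per C
    Wb = np.where(W >= 0, 1, -1).astype(np.float32)      # sign binarization
    Wb *= np.broadcast_to(coeffs[None, :, None, None], W.shape)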
Example #21
 def check_forward(self, y):
     y = F.upsampling_2d(self.pooled_y,
                         self.indices,
                         ksize=self.ksize,
                         stride=self.stride,
                         outsize=self.in_shape[2:])
     if isinstance(y.array, numpy.ndarray):
         y = conv.im2col_cpu(y.array, self.ksize, self.ksize, self.stride,
                             self.stride, 0, 0)
     else:
         y = conv.im2col_gpu(y.array, self.ksize, self.ksize, self.stride,
                             self.stride, 0, 0)
     for i in numpy.ndindex(y.shape):
         n, c, ky, kx, oy, ox = i
         up_y = y[n, c, ky, kx, oy, ox]
         if ky * y.shape[3] + kx == self.indices[n, c, oy, ox]:
             in_y = self.pooled_y.array[n, c, oy, ox]
             testing.assert_allclose(in_y, up_y)
         else:
             testing.assert_allclose(up_y, 0)
Example #23
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        if len(inputs) == 2:
            (x, W), b = inputs, None
        else:
            x, W, b = inputs

        out_c, _, kh, kw = W.shape
        n, _, h, w = x.shape

        out_h, out_w = self._get_out_size(inputs)
        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)

        if (not self.cover_all and chainer.should_use_cudnn('>=auto')
                and x.dtype == W.dtype and
            ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):

            # cuDNN implementation
            return self._forward_cudnn(x, W, b, y)

        else:
            # Implementation using im2col
            col = conv.im2col_gpu(x,
                                  kh,
                                  kw,
                                  self.sy,
                                  self.sx,
                                  self.ph,
                                  self.pw,
                                  cover_all=self.cover_all,
                                  dy=self.dy,
                                  dx=self.dx)
            y = cuda.cupy.tensordot(col, W,
                                    ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                                   copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
Example #26
    def forward_gpu(self, x):
        self.retain_inputs(())
        self._in_dtype = x[0].dtype

        xp = cuda.cupy
        n, c, h, w = x[0].shape
        if self.outh is None:
            self.outh = conv.get_deconv_outsize(
                h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
        if self.outw is None:
            self.outw = conv.get_deconv_outsize(
                w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
        up_y = xp.zeros((n, c, self.outh, self.outw), dtype=self._in_dtype)
        up_y = conv.im2col_gpu(
            up_y, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)
        up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
        n, c, oy, ox, ky, kx = up_y.shape
        indexes = xp.asarray(self.indexes, dtype=numpy.int32)
        xp.ElementwiseKernel(
            'int32 index, T x, int32 n, int32 c, int32 oy, int32 ox,'
            'int32 ky, int32 kx', 'raw T up_y',
            '''
            int yn = i / c / oy / ox;
            int yc = (i / oy / ox) % c;
            int yoy = (i / ox) % oy;
            int yox = i % ox;
            up_y[yn * c * oy * ox * ky * kx +
              yc * oy * ox * ky * kx +
              yoy * ox * ky * kx +
              yox * ky * kx +
              index] = x;
            ''',
            'upsampling_2d_fwd')(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
        up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
        up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                               self.outh, self.outw)
        return up_y,
Example #27
    def forward_grad(self, inputs, rho):
        x, W = inputs[:2]
        xp = cuda.get_array_module(*inputs)
        rho = xp.array(rho, dtype=W.dtype)

        # regenerate input matrix
        _, _, kh, kw = W.shape
        if isinstance(x, cuda.ndarray):
            col = conv.im2col_gpu(x,
                                  kh,
                                  kw,
                                  self.sy,
                                  self.sx,
                                  self.ph,
                                  self.pw,
                                  cover_all=self.cover_all)
        else:
            col = conv.im2col_cpu(x,
                                  kh,
                                  kw,
                                  self.sy,
                                  self.sx,
                                  self.ph,
                                  self.pw,
                                  cover_all=self.cover_all)

        if rho.ndim > 0:  # if rho is vector, prep for broadcasting
            rho = rho[:, xp.newaxis, xp.newaxis, xp.newaxis]

        gW = xp.tensordot(self.mask * rho, -1.0 * col,
                          ((0, 2, 3), (0, 4, 5))).astype(W.dtype, copy=False)

        # calc conscience factors [DeSieno, 1988] as bias of conv layer
        n = self.mask.shape[1]
        v = self.mask.sum(axis=(0, 2, 3), dtype=W.dtype)
        gb = -1.0 * self.conscience_factor * (1.0 / n - v / v.sum())

        return gW, gb
Example #28
    def forward_gpu(self, gy):
        xp = cuda.cupy
        gcol = conv.im2col_gpu(
            gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
            cover_all=self.cover_all)

        gcol = gcol.transpose(0, 1, 4, 5, 2, 3)
        n, c, oy, ox, ky, kx = gcol.shape
        gcol = gcol.reshape((n, c, oy, ox, ky * kx))
        indexes = xp.asarray(self.indexes, dtype=numpy.int32)
        gx = xp.empty((n, c, oy, ox), dtype=self._in_dtype)
        cuda.elementwise(
            'int32 indexes, raw T gcol, int32 n, int32 c, int32 oy,'
            'int32 ox, int32 ky, int32 kx',
            'raw T gx',
            '''
            int ind_n = i / c / oy / ox;
            int ind_c = (i / oy / ox) % c;
            int ind_oy = (i / ox) % oy;
            int ind_ox = i % ox;
            int gcol_ky = indexes / kx;
            int gcol_kx = indexes % kx;
            float top_gx = gcol[ind_n * c * oy * ox * ky * kx +
                                ind_c * oy * ox * ky * kx +
                                ind_oy * ox * ky * kx +
                                ind_ox * ky * kx +
                                gcol_ky * kx +
                                gcol_kx];
            gx[ind_n * c * oy * ox +
               ind_c * oy * ox +
               ind_oy * ox +
               ind_ox] = top_gx;
            ''',
            'upsampling_2d_bwd')(indexes, gcol, n, c, oy, ox, ky, kx, gx)

        return gx,
Example #29
 def test_im2col_consistency(self):
     col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 1, 1)
     col_gpu = conv.im2col_gpu(to_gpu(self.x), 3, 3, 2, 2, 1, 1)
     assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
Example #30
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not type_check.same_types(*inputs):
            if b is not None:
                raise ValueError(
                    'numpy and cupy must not be used together\n'
                    'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                        type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'.format(
                                     type(W), type(x)))

        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

        if (chainer.should_use_cudnn('>=auto')
                and _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            workspace_size = cuda.get_max_workspace_size()
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref, workspace_size)
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            libcudnn.convolutionForward(handle, one.data, gy_desc.value,
                                        gy.data.ptr, self.filter_desc.value,
                                        W.data.ptr, self.conv_desc.value, algo,
                                        workspace.data.ptr, workspace_size,
                                        zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(handle, one.data,
                                                 gy_desc.value, gy.data.ptr,
                                                 zero.data,
                                                 self.bias_desc.value,
                                                 gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            if _cudnn_version >= 3000:
                if configuration.config.cudnn_deterministic:
                    algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA
                else:
                    algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                        handle, gy_desc.value, gx_desc.value,
                        self.conv_desc.value, self.filter_desc.value,
                        _bwd_filter_pref, workspace_size)

                libcudnn.convolutionBackwardFilter_v3(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value, algo,
                    workspace.data.ptr, workspace_size, zero.data,
                    self.filter_desc.value, gW.data.ptr)
            else:
                if configuration.config.cudnn_deterministic:
                    raise ValueError(
                        "`cudnn_deterministic` option must be False "
                        "if the backpropagation of "
                        "chainer.functions.Deconvolution2D "
                        "uses cuDNN and cuDNN versions < v3. "
                        "Turn off cudnn_deterministic option with "
                        "`chainer.using_config('cudnn_deterministic', False)` "
                        "context.")
                libcudnn.convolutionBackwardFilter_v2(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value, zero.data,
                    self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(gy, kh, kw, self.sy, self.sx, self.ph,
                                  self.pw)

            gW = cuda.cupy.tensordot(x, col,
                                     ([0, 2, 3], [0, 4, 5])).astype(W.dtype,
                                                                    copy=False)
            gx = cuda.cupy.tensordot(col, W,
                                     ([1, 2, 3], [1, 2, 3])).astype(x.dtype,
                                                                    copy=False)
            gx = cuda.cupy.rollaxis(gx, 3, 1)

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
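The one/zero scalars built from oz_dtype above are cuDNN's alpha/beta scaling factors (y = alpha * conv(x, W) + beta * y), passed as host pointers; cuDNN expects float scalars for FP16/FP32 data and double for FP64, which is what the 'd'/'f' switch implements. A self-contained sketch of that helper (the function name is illustrative):

    import numpy

    def scaling_factors(x_dtype):
        # Host-side alpha/beta for cuDNN: double for FP64, float otherwise.
        oz_dtype = 'd' if numpy.dtype(x_dtype) == numpy.dtype('d') else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        return one, zero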
Example #31
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

        if (cuda.cudnn_enabled and self.use_cudnn and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            workspace_size = cuda.get_max_workspace_size()
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                workspace_size)
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            libcudnn.convolutionForward(
                handle, one.data, gy_desc.value, gy.data.ptr,
                self.filter_desc.value, W.data.ptr,
                self.conv_desc.value, algo, workspace.data.ptr, workspace_size,
                zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    zero.data, self.bias_desc.value, gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            if _cudnn_version >= 4000:
                if not self.deterministic:
                    algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                        handle, gy_desc.value, gx_desc.value,
                        self.conv_desc.value, self.filter_desc.value,
                        _bwd_filter_pref, workspace_size)
                else:
                    algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA

                libcudnn.convolutionBackwardFilter_v3(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    algo, workspace.data.ptr, workspace_size,
                    zero.data, self.filter_desc.value, gW.data.ptr)
            else:
                if self.deterministic:
                    raise ValueError("'deterministic' option not available "
                                     "for cuDNN versions < v4")
                libcudnn.convolutionBackwardFilter_v2(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

            W_mat = W.reshape(in_c, c * kh * kw)
            col_mats = col.reshape(
                n, c * kh * kw, in_h * in_w)
            gx_mats = gx.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gx_mats[i] = W_mat.dot(col_mats[i])

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

            # filter backward
            gW = cuda.cupy.zeros_like(W)
            gW_mat = gW.reshape(in_c, c * kh * kw)
            x_mats = x.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gW_mat += x_mats[i].dot(col_mats[i].T)

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
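The filter-gradient loop above accumulates a per-sample GEMM over the batch; the same reduction can be written as a single batched contraction. A NumPy sketch with toy shapes showing the equivalence:

    import numpy as np

    n, in_c, ckk, hw = 2, 4, 27, 25
    x_mats = np.random.randn(n, in_c, hw).astype(np.float32)
    col_mats = np.random.randn(n, ckk, hw).astype(np.float32)

    gW_loop = np.zeros((in_c, ckk), dtype=np.float32)
    for i in range(n):
        gW_loop += x_mats[i].dot(col_mats[i].T)

    # Contract the batch and spatial axes in one call.
    gW_batched = np.tensordot(x_mats, col_mats, ((0, 2), (0, 2)))
    assert np.allclose(gW_loop, gW_batched, atol=1e-4)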
Example #32
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h,
                                      kh,
                                      self.sy,
                                      self.ph,
                                      cover_all=self.cover_all)
        out_w = conv.get_conv_outsize(w,
                                      kw,
                                      self.sx,
                                      self.pw,
                                      cover_all=self.cover_all)

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (self.cover_all and cuda.cudnn_enabled and self.use_cudnn
                and _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(b[None, :,
                                                                  None, None])

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(handle, one.data, x_desc.value,
                                        x.data.ptr, self.filter_desc.value,
                                        W.data.ptr, self.conv_desc.value, algo,
                                        workspace.data.ptr, workspace_size,
                                        zero.data, y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(handle, one.data, self.bias_desc.value,
                                 b.data.ptr, one.data, y_desc.value,
                                 y.data.ptr)
        else:
            # Implementation using im2col
            Xb = _kern()(x)

            self.col = conv.im2col_gpu(Xb,
                                       kh,
                                       kw,
                                       self.sy,
                                       self.sx,
                                       self.ph,
                                       self.pw,
                                       cover_all=self.cover_all)

            W_mat = W.reshape(out_c, -1)
            col_mats = self.col.reshape(n, -1, out_h * out_w)
            Wb_mat = _kern()(W_mat)

            y_mats = y.reshape(n, out_c, -1)
            # TODO(beam2d): Use streams or batch gemm
            for i in moves.range(n):
                y_mats[i] = Wb_mat.dot(col_mats[i])
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b[:, None, None]

        return y,
Example #33
 def test_im2col_consistency(self):
     col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 1, 1)
     col_gpu = conv.im2col_gpu(cuda.to_gpu(self.x), 3, 3, 2, 2, 1, 1)
     gradient_check.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
Example #34
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=numpy.float32)

        if cuda.cudnn_enabled and self.use_cudnn:
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            self.max_workspace_size = out_channels * kh * kw * 4
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, algo)
            workspace = cuda.cupy.empty(
                (max(workspace_size // 4, 1),), dtype=numpy.float32)

            one = numpy.array(1, dtype=x.dtype).ctypes
            zero = numpy.array(0, dtype=x.dtype).ctypes

            libcudnn.convolutionForward(
                handle, one.data, gy_desc.value, gy.data.ptr,
                self.filter_desc.value, W.data.ptr,
                self.conv_desc.value, algo, workspace.data.ptr, workspace_size,
                zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    zero.data, self.bias_desc.value, gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

            W_mat = W.reshape(in_c, c * kh * kw)
            col_mats = col.reshape(
                n, c * kh * kw, in_h * in_w)
            gx_mats = gx.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gx_mats[i] = W_mat.dot(col_mats[i])

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

            # filter backward
            gW = cuda.cupy.zeros_like(W)
            gW_mat = gW.reshape(in_c, c * kh * kw)
            x_mats = x.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gW_mat += x_mats[i].dot(col_mats[i].T)

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
Example #36
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw)

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if cuda.cudnn_enabled and self.use_cudnn:
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(b[None, :,
                                                                  None, None])

            self.max_workspace_size = c * kh * kw * 4
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo)
            workspace = cuda.cupy.empty((max(workspace_size // 4, 1), ),
                                        dtype=x.dtype)

            dtype = x.dtype
            one = numpy.array(1, dtype=dtype).ctypes
            zero = numpy.array(0, dtype=dtype).ctypes
            libcudnn.convolutionForward(handle, one.data, x_desc.value,
                                        x.data.ptr, self.filter_desc.value,
                                        W.data.ptr, self.conv_desc.value, algo,
                                        workspace.data.ptr, workspace_size,
                                        zero.data, y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                libcudnn.addTensor_v2(handle, libcudnn.CUDNN_ADD_SAME_C,
                                      one.data, self.bias_desc.value,
                                      b.data.ptr, one.data, y_desc.value,
                                      y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph,
                                       self.pw)
            W_mat = W.reshape(out_c, -1)
            col_mats = self.col.reshape(n, -1, out_h * out_w)
            y_mats = y.reshape(n, out_c, -1)
            # TODO(beam2d): Use streams or batch gemm
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b[:, None, None]

        return y,
Example #37
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not all([isinstance(i, cuda.ndarray) for i in inputs]):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all, d=self.dy)
        assert out_h > 0, 'Height in the output should be positive.'
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all, d=self.dx)
        assert out_w > 0, 'Width in the output should be positive.'

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                x.dtype == W.dtype and
                ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            filter_desc = cudnn.create_filter_descriptor(W)
            conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype)
            conv_desc = cudnn.create_convolution_descriptor(
                *conv_param, dilation=(self.dy, self.dx),
                use_tensor_core=use_tensor_core)
            if b is not None:
                bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            if configuration.config.autotune and _cudnn_version >= 5000:
                algo = get_algorithm_fwd(
                    x, W, y, conv_param, handle, x_desc, filter_desc,
                    conv_desc, y_desc, workspace)
            else:
                algo = libcudnn.getConvolutionForwardAlgorithm(
                    handle, x_desc.value, filter_desc.value,
                    conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

            if use_tensor_core:
                # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                # supports Tensor-Core in cuDNN7.
                algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM  # NOQA

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(
                handle, one.data, x_desc.value, x.data.ptr,
                filter_desc.value, W.data.ptr, conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero.data,
                y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(
                    handle, one.data, bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all, dy=self.dy, dx=self.dx)
            y = cuda.cupy.tensordot(
                col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
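
Note: a sketch of the output-size arithmetic behind conv.get_conv_outsize as used above, reconstructed here from the standard convolution formula rather than copied from the library; cover_all rounds the division up so that every input pixel is covered by some window, and d is the dilation factor:

def conv_outsize(size, k, s, p, cover_all=False, d=1):
    dk = k + (k - 1) * (d - 1)  # effective extent of a dilated kernel
    if cover_all:
        # round up so that every input pixel falls in some window
        return (size + p * 2 - dk + s - 1) // s + 1
    return (size + p * 2 - dk) // s + 1

assert conv_outsize(6, 3, 2, 0) == 2
assert conv_outsize(6, 3, 2, 0, cover_all=True) == 3
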
Example #38
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not type_check.same_types(*inputs):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape
        dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all, d=self.dy)
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all, d=self.dx)

        y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):

            pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw),
                                    dtype=x.dtype)
            pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

            out_h_s1 = h + 2 * self.ph - dkh + 1
            out_w_s1 = w + 2 * self.pw - dkw + 1

            for j in moves.range(kh):
                for i in moves.range(kw):
                    xji = cuda.cupy.ascontiguousarray(
                        pad_x[:, :,
                              j * self.dy:j * self.dy + out_h_s1,
                              i * self.dx:i * self.dx + out_w_s1])
                    Wji = cuda.cupy.ascontiguousarray(
                        W[:, :, j:j + 1, i:i + 1])

                    if i == 0 and j == 0:
                        handle = cudnn.get_handle()
                        xji_desc = cudnn.create_tensor_descriptor(xji)
                        y_desc = cudnn.create_tensor_descriptor(y)
                        self.filter_desc = cudnn.create_filter_descriptor(Wji)
                        self.conv_desc = cudnn.create_convolution_descriptor(
                            (0, 0), (self.sy, self.sx), xji.dtype)

                        workspace_size = cuda.get_max_workspace_size()
                        workspace = cuda.cupy.empty(
                            (workspace_size,), dtype='b')
                        algo = libcudnn.getConvolutionForwardAlgorithm(
                            handle, xji_desc.value, self.filter_desc.value,
                            self.conv_desc.value, y_desc.value, _fwd_pref,
                            workspace_size)

                        oz_dtype = 'd' if x.dtype == 'd' else 'f'
                        one = numpy.array(1, dtype=oz_dtype).ctypes

                    libcudnn.convolutionForward(
                        handle, one.data, xji_desc.value, xji.data.ptr,
                        self.filter_desc.value, Wji.data.ptr,
                        self.conv_desc.value, algo, workspace.data.ptr,
                        workspace_size, one.data, y_desc.value, y.data.ptr)

            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)
                self.bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])
                cudnn.add_tensor(
                    handle, one.data, self.bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all, dy=self.dy, dx=self.dx)
            y = cuda.cupy.tensordot(
                self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                            copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
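
Note: the cuDNN branch above predates native dilation support, so it accumulates one stride-(sy, sx) convolution with a single 1x1 slice of W per kernel tap (j, i). A small NumPy sketch of that decomposition, restricted to stride 1 and no padding for brevity (dilated_conv is an illustrative name, not Chainer API):

import numpy as np

def dilated_conv(x, W, d):
    # x: (c, h, w), W: (out_c, c, kh, kw), dilation d, stride 1, no pad
    c, h, w = x.shape
    out_c, _, kh, kw = W.shape
    dkh = kh + (kh - 1) * (d - 1)  # effective kernel extent
    dkw = kw + (kw - 1) * (d - 1)
    oh, ow = h - dkh + 1, w - dkw + 1
    y = np.zeros((out_c, oh, ow), dtype=x.dtype)
    for j in range(kh):
        for i in range(kw):
            # one shifted window per kernel tap, weighted by W[:, :, j, i]
            xji = x[:, j * d:j * d + oh, i * d:i * d + ow]
            y += np.tensordot(W[:, :, j, i], xji, axes=(1, 0))
    return y

x = np.random.randn(3, 8, 8)
W = np.random.randn(4, 3, 3, 3)
assert dilated_conv(x, W, d=2).shape == (4, 4, 4)
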
Example #39
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=numpy.float32)

        if cuda.cudnn_enabled and self.use_cudnn:
            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            self.max_workspace_size = out_channels * kh * kw * 4
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, algo)
            workspace = cuda.cupy.empty((max(workspace_size // 4, 1), ),
                                        dtype=numpy.float32)

            one = numpy.array(1, dtype=x.dtype).ctypes
            zero = numpy.array(0, dtype=x.dtype).ctypes

            libcudnn.convolutionForward(handle, one.data, gy_desc.value,
                                        gy.data.ptr, self.filter_desc.value,
                                        W.data.ptr, self.conv_desc.value, algo,
                                        workspace.data.ptr, workspace_size,
                                        zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if len(inputs) == 3:
                b = inputs[2]
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(handle, one.data,
                                                 gy_desc.value, gy.data.ptr,
                                                 zero.data,
                                                 self.bias_desc.value,
                                                 gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            libcudnn.convolutionBackwardFilter(handle, one.data, gy_desc.value,
                                               gy.data.ptr, gx_desc.value,
                                               x.data.ptr,
                                               self.conv_desc.value, zero.data,
                                               self.filter_desc.value,
                                               gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(gy, kh, kw, self.sy, self.sx, self.ph,
                                  self.pw)

            W_mat = W.reshape(in_c, c * kh * kw)
            col_mats = col.reshape(n, c * kh * kw, in_h * in_w)
            gx_mats = gx.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gx_mats[i] = W_mat.dot(col_mats[i])

            # bias backward
            if len(inputs) == 3:
                gb = gy.sum(axis=(0, 2, 3))

            # filter backward
            gW = cuda.cupy.zeros_like(W)
            gW_mat = gW.reshape(in_c, c * kh * kw)
            x_mats = x.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gW_mat += x_mats[i].dot(col_mats[i].T)
        if len(inputs) == 3:
            return gx, gW, gb
        else:
            return gx, gW
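
Note: the bias gradient in the fallback branch above, gb = gy.sum(axis=(0, 2, 3)), is a plain sum over the batch and spatial axes, because the forward pass broadcasts b across exactly those axes. A minimal NumPy check with illustrative shapes:

import numpy as np

n, out_c, out_h, out_w = 2, 4, 5, 5
gy = np.random.randn(n, out_c, out_h, out_w)
gb = gy.sum(axis=(0, 2, 3))  # one scalar per output channel
assert gb.shape == (out_c,)
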
Example #40
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = None

        if chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype:
            gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            workspace_size = cuda.get_max_workspace_size()
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                workspace_size)
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            libcudnn.convolutionForward(
                handle, one.data, gy_desc.value, gy.data.ptr,
                self.filter_desc.value, W.data.ptr,
                self.conv_desc.value, algo, workspace.data.ptr, workspace_size,
                zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    zero.data, self.bias_desc.value, gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            if configuration.config.cudnn_deterministic:
                algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
            else:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, gy_desc.value, gx_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)

            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, gy_desc.value, gy.data.ptr,
                gx_desc.value, x.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

            gW = cuda.cupy.tensordot(
                x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
            if self.requires_x_grad:
                gx = cuda.cupy.tensordot(
                    col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False)
                gx = cuda.cupy.rollaxis(gx, 3, 1)

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
Example #41
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not type_check.same_types(*inputs):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

        if (cuda.cudnn_enabled and self.use_cudnn and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            workspace_size = cuda.get_max_workspace_size()
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                workspace_size)
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            libcudnn.convolutionForward(
                handle, one.data, gy_desc.value, gy.data.ptr,
                self.filter_desc.value, W.data.ptr,
                self.conv_desc.value, algo, workspace.data.ptr, workspace_size,
                zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    zero.data, self.bias_desc.value, gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            if _cudnn_version >= 3000:
                if not self.deterministic:
                    algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                        handle, gy_desc.value, gx_desc.value,
                        self.conv_desc.value, self.filter_desc.value,
                        _bwd_filter_pref, workspace_size)
                else:
                    algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA

                libcudnn.convolutionBackwardFilter_v3(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    algo, workspace.data.ptr, workspace_size,
                    zero.data, self.filter_desc.value, gW.data.ptr)
            else:
                if self.deterministic:
                    raise ValueError("'deterministic' option not available "
                                     "for cuDNN versions < v3")
                libcudnn.convolutionBackwardFilter_v2(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

            gW = cuda.cupy.tensordot(
                x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
            gx = cuda.cupy.tensordot(
                col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False)
            gx = cuda.cupy.rollaxis(gx, 3, 1)

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
Example #42
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        x, gy = inputs
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape

        if (self.cover_all or not chainer.should_use_cudnn('>=auto') or
                x.dtype != self.W_dtype or
                ((self.dy > 1 or self.dx > 1) and _cudnn_version < 6000)):
            col = conv.im2col_gpu(
                x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all, dy=self.dy, dx=self.dx)
            gW = cuda.cupy.tensordot(
                gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype,
                                                        copy=False)
            return gW,

        gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype)
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=(self.dy, self.dx),
            use_tensor_core=use_tensor_core)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        elif configuration.config.autotune and _cudnn_version >= 5000:
            algo = get_algorithm_bwd_filter(
                x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
                filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports
            # Tensor-Core in cuDNN7.
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1

        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, x_desc.value, x.data.ptr, gy_desc.value,
            gy.data.ptr, conv_desc.value, algo, workspace.data.ptr,
            workspace_size, zero.data, filter_desc.value, gW.data.ptr)

        return gW,
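
Note: in the non-cuDNN branch above, tensordot contracts gy of shape (n, out_c, out_h, out_w) with col of shape (n, c, kh, kw, out_h, out_w) over the batch and spatial axes, leaving the filter gradient in (out_c, c, kh, kw) order. A shape-only NumPy sketch:

import numpy as np

n, c, out_c, kh, kw, oh, ow = 2, 3, 4, 3, 3, 5, 5
gy = np.zeros((n, out_c, oh, ow))
col = np.zeros((n, c, kh, kw, oh, ow))
gW = np.tensordot(gy, col, ((0, 2, 3), (0, 4, 5)))  # contract n, oh, ow
assert gW.shape == (out_c, c, kh, kw)
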
Example #43
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

        if (cuda.cudnn_enabled and self.use_cudnn and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            workspace_size = cuda.get_max_workspace_size()
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                workspace_size)
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            libcudnn.convolutionForward(
                handle, one.data, gy_desc.value, gy.data.ptr,
                self.filter_desc.value, W.data.ptr,
                self.conv_desc.value, algo, workspace.data.ptr, workspace_size,
                zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    zero.data, self.bias_desc.value, gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            if _cudnn_version >= 4000:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, gy_desc.value, gx_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)

                libcudnn.convolutionBackwardFilter_v3(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    algo, workspace.data.ptr, workspace_size,
                    zero.data, self.filter_desc.value, gW.data.ptr)
            else:
                libcudnn.convolutionBackwardFilter_v2(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

            W_mat = W.reshape(in_c, c * kh * kw)
            col_mats = col.reshape(
                n, c * kh * kw, in_h * in_w)
            gx_mats = gx.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gx_mats[i] = W_mat.dot(col_mats[i])

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

            # filter backward
            gW = cuda.cupy.zeros_like(W)
            gW_mat = gW.reshape(in_c, c * kh * kw)
            x_mats = x.reshape(n, in_c, in_h * in_w)
            for i in moves.range(n):
                gW_mat += x_mats[i].dot(col_mats[i].T)

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
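
Note: the fallback filter gradient above accumulates one GEMM per sample, gW_mat += x_mats[i].dot(col_mats[i].T). Summed over the batch this is equivalent to a single einsum contraction, as the sketch below checks (shapes illustrative):

import numpy as np

n, in_c, k, hw = 2, 3, 27, 25  # k = c * kh * kw, hw = in_h * in_w
x_mats = np.random.randn(n, in_c, hw)
col_mats = np.random.randn(n, k, hw)
gW_loop = sum(x_mats[i].dot(col_mats[i].T) for i in range(n))
gW_once = np.einsum('nch,nkh->ck', x_mats, col_mats)  # same contraction
assert np.allclose(gW_loop, gW_once)
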
Example #44
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape
        dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

        out_h = conv.get_conv_outsize(h,
                                      kh,
                                      self.sy,
                                      self.ph,
                                      cover_all=self.cover_all,
                                      d=self.dy)
        out_w = conv.get_conv_outsize(w,
                                      kw,
                                      self.sx,
                                      self.pw,
                                      cover_all=self.cover_all,
                                      d=self.dx)

        y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn
                and _check_cudnn_acceptable_type(x.dtype, W.dtype)):

            pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw),
                                    dtype=x.dtype)
            pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

            out_h_s1 = h + 2 * self.ph - dkh + 1
            out_w_s1 = w + 2 * self.pw - dkw + 1

            for j in moves.range(kh):
                for i in moves.range(kw):
                    xji = cuda.cupy.ascontiguousarray(
                        pad_x[:, :, j * self.dy:j * self.dy + out_h_s1,
                              i * self.dx:i * self.dx + out_w_s1])
                    Wji = cuda.cupy.ascontiguousarray(W[:, :, j:j + 1,
                                                        i:i + 1])

                    if i == 0 and j == 0:
                        handle = cudnn.get_handle()
                        x_desc = cudnn.create_tensor_descriptor(xji)
                        y_desc = cudnn.create_tensor_descriptor(y)
                        self.filter_desc = cudnn.create_filter_descriptor(Wji)
                        self.conv_desc = cudnn.create_convolution_descriptor(
                            (0, 0), (self.sy, self.sx), xji.dtype)

                        workspace_size = cuda.get_max_workspace_size()
                        workspace = cuda.cupy.empty((workspace_size, ),
                                                    dtype='b')
                        algo = libcudnn.getConvolutionForwardAlgorithm(
                            handle, x_desc.value, self.filter_desc.value,
                            self.conv_desc.value, y_desc.value, _fwd_pref,
                            workspace_size)

                        oz_dtype = 'd' if x.dtype == 'd' else 'f'
                        one = numpy.array(1, dtype=oz_dtype).ctypes

                    libcudnn.convolutionForward(
                        handle, one.data, x_desc.value, xji.data.ptr,
                        self.filter_desc.value, Wji.data.ptr,
                        self.conv_desc.value, algo, workspace.data.ptr,
                        workspace_size, one.data, y_desc.value, y.data.ptr)

            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)
                self.bias_desc = cudnn.create_tensor_descriptor(b[None, :,
                                                                  None, None])
                cudnn.add_tensor(handle, one.data, self.bias_desc.value,
                                 b.data.ptr, one.data, y_desc.value,
                                 y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(x,
                                       kh,
                                       kw,
                                       self.sy,
                                       self.sx,
                                       self.ph,
                                       self.pw,
                                       cover_all=self.cover_all,
                                       dy=self.dy,
                                       dx=self.dx)
            W_mat = W.reshape(out_c, -1)
            col_mats = self.col.reshape(n, -1, out_h * out_w)
            y_mats = y.reshape(n, out_c, -1)
            # TODO(beam2d): Use streams or batch gemm
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b[:, None, None]

        return y,
Example #45
    def forward_gpu(self, x):
        n, c, h, w = x[0].shape
        out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        out_c = self.W.shape[0]
        y = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            x_desc = cudnn.get_tensor_desc(x[0], h, w)
            y_desc = cudnn.get_tensor_desc(y, out_h, out_w)

            self.filter_desc = cudnn.get_filter4d_desc(self.W)
            self.conv_desc = cudnn.get_conv2d_desc(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.get_conv_bias_desc(self.b)

            algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo).value
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=numpy.float32)

            libcudnn.cudnnConvolutionForward(
                handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
                self.filter_desc.value, cudnn.get_ptr(self.W),
                self.conv_desc.value, algo, cudnn.get_ptr(workspace),
                workspace_size, 0, y_desc.value, cudnn.get_ptr(y))

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                libcudnn.cudnnAddTensor(
                    handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                    1, self.bias_desc.value, cudnn.get_ptr(self.b),
                    1, y_desc.value, cudnn.get_ptr(y))
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            handle = cuda.get_cublas_handle()
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            y_mats = y.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                                  out=y_mats[i])

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                cuda.elementwise(
                    'float* y, const float* b, int c, int hw',
                    'y[i] += b[i / hw % c]',
                    'conv_bias_fwd')(y, self.b, out_c, out_h * out_w)

        return y,
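
Note: the elementwise bias kernel above recovers the channel of a flat NCHW offset as (i / hw) % c. A pure-NumPy check of that index arithmetic (shapes illustrative):

import numpy as np

n, c, h, w = 2, 3, 2, 2
hw = h * w
y = np.zeros(n * c * h * w)
b = np.array([10., 20., 30.])
for i in range(y.size):
    y[i] += b[(i // hw) % c]  # same index arithmetic as the CUDA kernel
expected = np.broadcast_to(b[None, :, None, None], (n, c, h, w))
assert np.array_equal(y.reshape(n, c, h, w), expected)
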
Example #46
def _im2col(x, *args, **kwargs):
    if isinstance(x, numpy.ndarray):
        return im2col_cpu(x, *args, **kwargs)
    return im2col_gpu(x, *args, **kwargs)
Example #47
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all)
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all)

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                workspace_size)

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(
                handle, one.data, x_desc.value, x.data.ptr,
                self.filter_desc.value, W.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero.data,
                y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(
                    handle, one.data, self.bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all)
            W_mat = W.reshape(out_c, -1)
            col_mats = self.col.reshape(n, -1, out_h * out_w)
            y_mats = y.reshape(n, out_c, -1)
            # TODO(beam2d): Use streams or batch gemm
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b[:, None, None]

        return y,
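
Note: the per-sample loop marked "TODO: Use streams or batch gemm" above computes n independent GEMMs. With NumPy/CuPy broadcasting matmul, the same result comes from one batched call; a sketch of the equivalence (not part of the code above):

import numpy as np

n, out_c, k, m = 2, 4, 27, 25  # k = c * kh * kw, m = out_h * out_w
W_mat = np.random.randn(out_c, k)
col_mats = np.random.randn(n, k, m)
y_batched = W_mat @ col_mats  # broadcasts to one (n, out_c, m) batched GEMM
y_loop = np.stack([W_mat.dot(col_mats[i]) for i in range(n)])
assert np.allclose(y_batched, y_loop)
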
Example #48
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        out_c, _, kh, kw = W.shape

        n, c, h, w = x.shape
        b = inputs[2] if len(inputs) == 3 else None

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw)

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if cuda.cudnn_enabled and self.use_cudnn:
            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx))
            if b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            self.max_workspace_size = c * kh * kw * 4
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.getConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo)
            workspace = cuda.cupy.empty(
                (max(workspace_size // 4, 1),), dtype=x.dtype)

            dtype = x.dtype
            one = numpy.array(1, dtype=dtype).ctypes
            zero = numpy.array(0, dtype=dtype).ctypes
            libcudnn.convolutionForward(
                handle, one.data, x_desc.value, x.data.ptr,
                self.filter_desc.value, W.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero.data,
                y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                libcudnn.addTensor(
                    handle, libcudnn.CUDNN_ADD_SAME_C, one.data,
                    self.bias_desc.value, b.data.ptr, one.data,
                    y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw)
            W_mat = W.reshape(out_c, -1)
            col_mats = self.col.reshape(n, -1, out_h * out_w)
            y_mats = y.reshape(n, out_c, -1)
            # TODO(beam2d): Use streams or batch gemm
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b[:, None, None]

        return y,
Example #49
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape
        dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy)
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx)

        y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
        if (
            not self.cover_all
            and cuda.cudnn_enabled
            and self.use_cudnn
            and _check_cudnn_acceptable_type(x.dtype, W.dtype)
        ):

            pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype)
            pad_x[:, :, self.ph : self.ph + h, self.pw : self.pw + w] = x

            out_h_s1 = h + 2 * self.ph - dkh + 1
            out_w_s1 = w + 2 * self.pw - dkw + 1

            for j in moves.range(kh):
                for i in moves.range(kw):
                    xji = cuda.cupy.ascontiguousarray(
                        pad_x[:, :, j * self.dy : j * self.dy + out_h_s1, i * self.dx : i * self.dx + out_w_s1]
                    )
                    Wji = cuda.cupy.ascontiguousarray(W[:, :, j : j + 1, i : i + 1])

                    if i == 0 and j == 0:
                        handle = cudnn.get_handle()
                        x_desc = cudnn.create_tensor_descriptor(xji)
                        y_desc = cudnn.create_tensor_descriptor(y)
                        self.filter_desc = cudnn.create_filter_descriptor(Wji)
                        self.conv_desc = cudnn.create_convolution_descriptor((0, 0), (self.sy, self.sx), xji.dtype)

                        workspace_size = cuda.get_max_workspace_size()
                        workspace = cuda.cupy.empty((workspace_size,), dtype="b")
                        algo = libcudnn.getConvolutionForwardAlgorithm(
                            handle,
                            x_desc.value,
                            self.filter_desc.value,
                            self.conv_desc.value,
                            y_desc.value,
                            _fwd_pref,
                            workspace_size,
                        )

                        oz_dtype = "d" if x.dtype == "d" else "f"
                        one = numpy.array(1, dtype=oz_dtype).ctypes

                    libcudnn.convolutionForward(
                        handle,
                        one.data,
                        x_desc.value,
                        xji.data.ptr,
                        self.filter_desc.value,
                        Wji.data.ptr,
                        self.conv_desc.value,
                        algo,
                        workspace.data.ptr,
                        workspace_size,
                        one.data,
                        y_desc.value,
                        y.data.ptr,
                    )

            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)
                self.bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None])
                cudnn.add_tensor(handle, one.data, self.bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx
            )
            W_mat = W.reshape(out_c, -1)
            col_mats = self.col.reshape(n, -1, out_h * out_w)
            y_mats = y.reshape(n, out_c, -1)
            # TODO(beam2d): Use streams or batch gemm
            for i in moves.range(n):
                y_mats[i] = W_mat.dot(col_mats[i])
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b[:, None, None]

        return (y,)
Example #50
File: im2col.py  Project: fukatani/chainer
def _im2col(x, *args, **kwargs):
    if isinstance(x, numpy.ndarray):
        return im2col_cpu(x, *args, **kwargs)
    return im2col_gpu(x, *args, **kwargs)
Example #51
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not all([isinstance(i, cuda.ndarray) for i in inputs]):
            if b is not None:
                raise ValueError(
                    'numpy and cupy must not be used together\n'
                    'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                        type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'.format(
                                     type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h,
                                      kh,
                                      self.sy,
                                      self.ph,
                                      cover_all=self.cover_all,
                                      d=self.dy)
        assert out_h > 0, 'Height in the output should be positive.'
        out_w = conv.get_conv_outsize(w,
                                      kw,
                                      self.sx,
                                      self.pw,
                                      cover_all=self.cover_all,
                                      d=self.dx)
        assert out_w > 0, 'Width in the output should be positive.'

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                x.dtype == W.dtype and
                ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            filter_desc = cudnn.create_filter_descriptor(W)
            conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx),
                x.dtype,
                dilation=(self.dy, self.dx),
                use_tensor_core=use_tensor_core)
            if b is not None:
                bias_desc = cudnn.create_tensor_descriptor(b[None, :, None,
                                                             None])

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, filter_desc.value, conv_desc.value,
                y_desc.value, _fwd_pref, workspace_size)

            if use_tensor_core:
                # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                # supports Tensor-Core in cuDNN7.
                algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM  # NOQA

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(handle, one.data, x_desc.value,
                                        x.data.ptr, filter_desc.value,
                                        W.data.ptr, conv_desc.value, algo,
                                        workspace.data.ptr, workspace_size,
                                        zero.data, y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr,
                                 one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(x,
                                  kh,
                                  kw,
                                  self.sy,
                                  self.sx,
                                  self.ph,
                                  self.pw,
                                  cover_all=self.cover_all,
                                  dy=self.dy,
                                  dx=self.dx)
            y = cuda.cupy.tensordot(col, W,
                                    ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                                   copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
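
Note: after the tensordot in the fallback branch above, y is laid out as (n, out_h, out_w, out_c), which is also why the bias adds with a bare y += b; rollaxis(y, 3, 1) then restores NCHW. A NumPy shape sketch of that final step:

import numpy as np

y = np.zeros((2, 5, 5, 4))     # (n, out_h, out_w, out_c) after tensordot
y_nchw = np.rollaxis(y, 3, 1)  # channel axis back to position 1
assert y_nchw.shape == (2, 4, 5, 5)
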
Example #52
    def test_im2col_consistency(self):
        col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 1, 1)
        col_gpu = conv.im2col_gpu(cuda.to_gpu(self.x), 3, 3, 2, 2, 1, 1)
        gradient_check.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
Example #53
    def backward_gpu(self, x, gy):
        n, out_c, out_h, out_w = x[0].shape
        c, h, w = gy[0].shape[1:]
        gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
            gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

            algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, algo).value
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=numpy.float32)

            libcudnn.cudnnConvolutionForward(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                self.filter_desc.value, cudnn.get_ptr(self.W),
                self.conv_desc.value, algo, cudnn.get_ptr(workspace),
                workspace_size, 0, gx_desc.value, cudnn.get_ptr(gx))
            # bias backward
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))
            # filter backward
            libcudnn.cudnnConvolutionBackwardFilter(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            handle = cuda.get_cublas_handle()
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            gx_mats = gx.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                                  out=gx_mats[i])
            # bias backward
            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(handle):
                    tmp = cuda.cumisc.sum(
                        gy[0].reshape(n * c, h * w), axis=1)
                    tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                    self.gb += tmp
            # filter backward
            # TODO(beam2d): Use streams
            gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
            x_mats = x[0].reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.add_dot(
                    x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)
        return gx,
Example #54
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not type_check.same_types(*inputs):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all)
        assert out_h > 0, 'Height in the output should be positive.'
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all)
        assert out_w > 0, 'Width in the output should be positive.'

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx), x.dtype)
            if b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                workspace_size)

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(
                handle, one.data, x_desc.value, x.data.ptr,
                self.filter_desc.value, W.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero.data,
                y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(
                    handle, one.data, self.bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all)
            y = cuda.cupy.tensordot(
                self.col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                            copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
Example #55
    def test_im2col_consistency(self):
        col_cpu = conv.im2col_cpu(self.x, 3, 3, 2, 2, 2, 2, dy=2, dx=2)
        col_gpu = conv.im2col_gpu(
            cuda.to_gpu(self.x), 3, 3, 2, 2, 2, 2, dy=2, dx=2)
        testing.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
Example #56
    def backward_gpu(self, inputs, grad_outputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None
        gy = grad_outputs[0]
        n, in_c, in_h, in_w = x.shape
        _, out_channels, kh, kw = W.shape
        c, h, w = gy.shape[1:]
        gx = cuda.cupy.empty((n, in_c, in_h, in_w), dtype=x.dtype)

        if (cuda.cudnn_enabled and self.use_cudnn and
                _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            gy = cuda.cupy.ascontiguousarray(gy)

            handle = cudnn.get_handle()
            gy_desc = cudnn.create_tensor_descriptor(gy)
            gx_desc = cudnn.create_tensor_descriptor(gx)

            # chance to choose implicit-precomp-gemm algorithm
            workspace_size = cuda.get_max_workspace_size()
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                workspace_size)
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            libcudnn.convolutionForward(
                handle, one.data, gy_desc.value, gy.data.ptr,
                self.filter_desc.value, W.data.ptr,
                self.conv_desc.value, algo, workspace.data.ptr, workspace_size,
                zero.data, gx_desc.value, gx.data.ptr)
            # bias backward
            if b is not None:
                gb = cuda.cupy.empty_like(b)
                libcudnn.convolutionBackwardBias(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    zero.data, self.bias_desc.value, gb.data.ptr)
            gW = cuda.cupy.empty_like(W)
            # filter backward
            if _cudnn_version >= 4000:
                if not self.deterministic:
                    algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                        handle, gy_desc.value, gx_desc.value,
                        self.conv_desc.value, self.filter_desc.value,
                        _bwd_filter_pref, workspace_size)
                else:
                    algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA

                libcudnn.convolutionBackwardFilter_v3(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    algo, workspace.data.ptr, workspace_size,
                    zero.data, self.filter_desc.value, gW.data.ptr)
            else:
                if self.deterministic:
                    raise ValueError("'deterministic' option not available "
                                     "for cuDNN versions < v4")
                libcudnn.convolutionBackwardFilter_v2(
                    handle, one.data, gy_desc.value, gy.data.ptr,
                    gx_desc.value, x.data.ptr, self.conv_desc.value,
                    zero.data, self.filter_desc.value, gW.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy, kh, kw, self.sy, self.sx, self.ph, self.pw)

            gW = cuda.cupy.tensordot(
                x, col, ([0, 2, 3], [0, 4, 5])).astype(W.dtype, copy=False)
            gx = cuda.cupy.tensordot(
                col, W, ([1, 2, 3], [1, 2, 3])).astype(x.dtype, copy=False)
            gx = cuda.cupy.rollaxis(gx, 3, 1)

            # bias backward
            if b is not None:
                gb = gy.sum(axis=(0, 2, 3))

        if b is None:
            return gx, gW
        else:
            return gx, gW, gb
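The im2col fallback at the end of this example leans on tensordot axis bookkeeping: col has the (N, C, KH, KW, OH, OW) layout, where C here is gy's channel count and (OH, OW) match x's spatial size. A minimal NumPy shape check, with every shape hypothetical and col replaced by a random stand-in for im2col_gpu(gy, ...):

import numpy as np

n, in_c, out_c = 2, 3, 5                 # batch, channels of x, channels of gy
in_h, in_w, kh, kw = 4, 4, 3, 3          # spatial size of x; kernel size
x = np.random.rand(n, in_c, in_h, in_w)
W = np.random.rand(in_c, out_c, kh, kw)  # deconvolution weight layout
col = np.random.rand(n, out_c, kh, kw, in_h, in_w)

# Contract batch and spatial axes of x against col -> (in_c, out_c, kh, kw).
gW = np.tensordot(x, col, ([0, 2, 3], [0, 4, 5]))
# Contract channel and kernel axes of col against W, then restore NCHW.
gx = np.rollaxis(np.tensordot(col, W, ([1, 2, 3], [1, 2, 3])), 3, 1)
assert gW.shape == W.shape
assert gx.shape == x.shape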
Example #57
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not type_check.same_types(*inputs):
            if b is not None:
                raise ValueError(
                    'numpy and cupy must not be used together\n'
                    'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                        type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'.format(
                                     type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape
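        # Effective kernel extent once dilation spreads the taps apart.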
        dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

        out_h = conv.get_conv_outsize(h,
                                      kh,
                                      self.sy,
                                      self.ph,
                                      cover_all=self.cover_all,
                                      d=self.dy)
        out_w = conv.get_conv_outsize(w,
                                      kw,
                                      self.sx,
                                      self.pw,
                                      cover_all=self.cover_all,
                                      d=self.dx)

        y = cuda.cupy.zeros((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and chainer.should_use_cudnn('>=auto')
                and _check_cudnn_acceptable_type(x.dtype, W.dtype)):

            pad_x = cuda.cupy.zeros((n, c, h + 2 * self.ph, w + 2 * self.pw),
                                    dtype=x.dtype)
            pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

            out_h_s1 = h + 2 * self.ph - dkh + 1
            out_w_s1 = w + 2 * self.pw - dkw + 1

            for j in moves.range(kh):
                for i in moves.range(kw):
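                    # Tap (j, i): take a shifted, contiguous slice of the
                    # padded input and convolve it with the 1x1 kernel Wji,
                    # accumulating into y (beta = one below).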
                    xji = cuda.cupy.ascontiguousarray(
                        pad_x[:, :, j * self.dy:j * self.dy + out_h_s1,
                              i * self.dx:i * self.dx + out_w_s1])
                    Wji = cuda.cupy.ascontiguousarray(W[:, :, j:j + 1,
                                                        i:i + 1])

                    if i == 0 and j == 0:
                        handle = cudnn.get_handle()
                        xji_desc = cudnn.create_tensor_descriptor(xji)
                        y_desc = cudnn.create_tensor_descriptor(y)
                        self.filter_desc = cudnn.create_filter_descriptor(Wji)
                        self.conv_desc = cudnn.create_convolution_descriptor(
                            (0, 0), (self.sy, self.sx), xji.dtype)

                        workspace_size = cuda.get_max_workspace_size()
                        workspace = cuda.cupy.empty((workspace_size, ),
                                                    dtype='b')
                        algo = libcudnn.getConvolutionForwardAlgorithm(
                            handle, xji_desc.value, self.filter_desc.value,
                            self.conv_desc.value, y_desc.value, _fwd_pref,
                            workspace_size)

                        oz_dtype = 'd' if x.dtype == 'd' else 'f'
                        one = numpy.array(1, dtype=oz_dtype).ctypes

                    libcudnn.convolutionForward(
                        handle, one.data, xji_desc.value, xji.data.ptr,
                        self.filter_desc.value, Wji.data.ptr,
                        self.conv_desc.value, algo, workspace.data.ptr,
                        workspace_size, one.data, y_desc.value, y.data.ptr)

            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)
                self.bias_desc = cudnn.create_tensor_descriptor(b[None, :,
                                                                  None, None])
                cudnn.add_tensor(handle, one.data, self.bias_desc.value,
                                 b.data.ptr, one.data, y_desc.value,
                                 y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(x,
                                       kh,
                                       kw,
                                       self.sy,
                                       self.sx,
                                       self.ph,
                                       self.pw,
                                       cover_all=self.cover_all,
                                       dy=self.dy,
                                       dx=self.dx)
            y = cuda.cupy.tensordot(self.col, W,
                                    ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                                   copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
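The cuDNN branch in this example builds the dilated convolution out of kh * kw per-tap calls: each kernel tap (j, i) becomes a convolution with the 1x1 kernel Wji over a shifted slice of the padded input, accumulated into y with beta = one. A pure-NumPy sketch of the same decomposition (function name and all shapes hypothetical):

import numpy as np

def dilated_conv_by_taps(x, W, sy=1, sx=1, ph=0, pw=0, dy=1, dx=1):
    n, c, h, w = x.shape
    out_c, _, kh, kw = W.shape
    dkh, dkw = kh + (kh - 1) * (dy - 1), kw + (kw - 1) * (dx - 1)
    oh = (h + 2 * ph - dkh) // sy + 1
    ow = (w + 2 * pw - dkw) // sx + 1
    px = np.zeros((n, c, h + 2 * ph, w + 2 * pw), dtype=x.dtype)
    px[:, :, ph:ph + h, pw:pw + w] = x
    y = np.zeros((n, out_c, oh, ow), dtype=x.dtype)
    for j in range(kh):
        for i in range(kw):
            # Strided slice aligned with kernel tap (j, i); the stride here
            # plays the role of the (sy, sx) stride in the cuDNN descriptor.
            xji = px[:, :, j * dy:j * dy + sy * oh:sy,
                     i * dx:i * dx + sx * ow:sx]
            # 1x1 "convolution": contract over the channel axis only,
            # accumulating like the beta = one convolutionForward calls.
            y += np.einsum('nchw,oc->nohw', xji, W[:, :, j, i])
    return y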
Example #58
    def forward_gpu(self, inputs):
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not type_check.same_types(*inputs):
            if b is not None:
                raise ValueError(
                    'numpy and cupy must not be used together\n'
                    'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                        type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'.format(
                                     type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h,
                                      kh,
                                      self.sy,
                                      self.ph,
                                      cover_all=self.cover_all)
        assert out_h > 0, 'Height in the output should be positive.'
        out_w = conv.get_conv_outsize(w,
                                      kw,
                                      self.sx,
                                      self.pw,
                                      cover_all=self.cover_all)
        assert out_w > 0, 'Width in the output should be positive.'

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn
                and _check_cudnn_acceptable_type(x.dtype, W.dtype)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            self.filter_desc = cudnn.create_filter_descriptor(W)
            self.conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx), x.dtype)
            if b is not None:
                self.bias_desc = cudnn.create_tensor_descriptor(b[None, :,
                                                                  None, None])

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(handle, one.data, x_desc.value,
                                        x.data.ptr, self.filter_desc.value,
                                        W.data.ptr, self.conv_desc.value, algo,
                                        workspace.data.ptr, workspace_size,
                                        zero.data, y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(handle, one.data, self.bias_desc.value,
                                 b.data.ptr, one.data, y_desc.value,
                                 y.data.ptr)
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(x,
                                       kh,
                                       kw,
                                       self.sy,
                                       self.sx,
                                       self.ph,
                                       self.pw,
                                       cover_all=self.cover_all)
            y = cuda.cupy.tensordot(self.col, W,
                                    ((1, 2, 3), (1, 2, 3))).astype(x.dtype,
                                                                   copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
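One detail of the im2col fallback worth spelling out: tensordot contracts the (C, KH, KW) axes of col against W, so y comes out in (N, OH, OW, out_C) order. That is why the plain y += b broadcasts correctly over the last axis, and why rollaxis is needed to restore NCHW afterwards. A NumPy shape sketch (all sizes hypothetical):

import numpy as np

n, c, kh, kw, oh, ow, out_c = 2, 3, 3, 3, 5, 5, 4
col = np.random.rand(n, c, kh, kw, oh, ow)  # stands in for im2col_gpu(x, ...)
W = np.random.rand(out_c, c, kh, kw)
b = np.random.rand(out_c)

y = np.tensordot(col, W, ((1, 2, 3), (1, 2, 3)))  # -> (n, oh, ow, out_c)
y += b                                            # broadcasts over out_c
y = np.rollaxis(y, 3, 1)                          # -> (n, out_c, oh, ow)
assert y.shape == (n, out_c, oh, ow)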