Example 1
    def _forward_cudnn(self, x, gy):
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape

        iC = c
        iCg = iC // self.group
        gW = cuda.cupy.empty((out_c, iCg, self.kh, self.kw),
                             dtype=self.W_dtype)
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        dilation = (self.dy, self.dx)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param,
            dilation=dilation,
            use_tensor_core=use_tensor_core,
            group=self.group)

        # cuDNN scaling factors (alpha/beta) are host scalars whose type must
        # match the computation type: double for double data, float otherwise.
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        elif configuration.config.autotune and _cudnn_version >= 5000:
            algo = _get_algorithm_bwd_filter(x, gy, gW,
                                             conv_param + (dilation, ), handle,
                                             x_desc, gy_desc, conv_desc,
                                             filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports
            # Tensor-Core in cuDNN7.
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1

        libcudnn.convolutionBackwardFilter_v3(handle, one.data, x_desc.value,
                                              x.data.ptr, gy_desc.value,
                                              gy.data.ptr, conv_desc.value,
                                              algo, workspace.data.ptr,
                                              workspace_size, zero.data,
                                              filter_desc.value, gW.data.ptr)

        return gW,
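
The three-way branch above (cudnn_deterministic / autotune / heuristic) is driven by Chainer's thread-local configuration rather than by function arguments, and this method computes the filter gradient (it returns gW), so the flags take effect when the convolution's backward pass runs. A minimal usage sketch, assuming a cuDNN-enabled CuPy; the shapes are illustrative:

    import chainer
    import chainer.functions as F
    import cupy

    x = cupy.random.randn(8, 3, 32, 32).astype('f')
    W = cupy.random.randn(16, 3, 3, 3).astype('f')

    # Reproducible filter gradients: the deterministic branch pins the
    # algorithm to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1.
    with chainer.using_config('cudnn_deterministic', True):
        y = F.convolution_2d(x, W, stride=1, pad=1)

    # Benchmark-and-cache selection (cuDNN >= 5000) instead of the heuristic
    # getConvolutionBackwardFilterAlgorithm call.
    with chainer.using_config('autotune', True):
        y = F.convolution_2d(x, W, stride=1, pad=1)
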
Example 2
    def _forward_cudnn(self, x, gy):
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape

        iC = c
        iCg = iC // self.group
        gW = cuda.cupy.empty((out_c, iCg, self.kh, self.kw),
                             dtype=self.W_dtype)
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        # cuDNN 7 supports dilation only in *_BWD_FILTER_ALGO_0, but
        # it supports Tensor Cores only in *_BWD_FILTER_ALGO_1.
        if use_tensor_core and (self.dx > 1 or self.dy > 1):
            use_tensor_core = False

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        dilation = (self.dy, self.dx)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=dilation,
            use_tensor_core=use_tensor_core,
            group=self.group)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        elif configuration.config.autotune and _cudnn_version >= 5000:
            algo = _get_algorithm_bwd_filter(
                x, gy, gW, conv_param + (dilation,), handle, x_desc, gy_desc,
                conv_desc, filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        if use_tensor_core:
            algo = self._tensor_core_adjust_algo()

        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, x_desc.value, x.data.ptr, gy_desc.value,
            gy.data.ptr, conv_desc.value, algo, workspace.data.ptr,
            workspace_size, zero.data, filter_desc.value, gW.data.ptr)

        return gW,
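
This variant factors the Tensor-Core override into a helper instead of inlining the constant. A plausible reconstruction of that helper, hypothetical but inferred from Example 1 and the later backward-filter examples (6 and 13), which hard-code the same value with the same rationale:

    def _tensor_core_adjust_algo(self):
        # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports Tensor Cores
        # in cuDNN 7.
        return libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
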
Example 3
    def _forward_cudnn(self, x, W, b):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        n = x.shape[0]
        out_c = W.shape[1]

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, out_c, self.outh, self.outw), dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        dilation = (self.dy, self.dx)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=dilation, use_tensor_core=use_tensor_core)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
        elif configuration.config.autotune and _cudnn_version_ >= 5000:
            algo = get_algorithm(W, x, y, conv_param + (dilation, ), handle,
                                 filter_desc, x_desc, conv_desc, y_desc,
                                 workspace)
        else:
            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, filter_desc.value, x_desc.value, conv_desc.value,
                y_desc.value, _bwd_data_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports
            # Tensor-Core in cuDNN7
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1

        libcudnn.convolutionBackwardData_v3(
            handle, one.data, filter_desc.value, W.data.ptr, x_desc.value,
            x.data.ptr, conv_desc.value, algo, workspace.data.ptr,
            workspace_size, zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr,
                             one.data, y_desc.value, y.data.ptr)

        return y,
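
cuDNN has no dedicated deconvolution kernel, so this method, which appears to implement the forward pass of a deconvolution (transposed convolution), is expressed as the data-gradient of an ordinary convolution: x is passed to convolutionBackwardData_v3 in the slot a gradient would normally occupy, and W.shape[1], not W.shape[0], is the output channel count. A shape sketch with assumed sizes:

    import chainer.functions as F
    import cupy

    x = cupy.random.randn(1, 16, 8, 8).astype('f')  # n, in_c, h, w
    W = cupy.random.randn(16, 4, 3, 3).astype('f')  # in_c, out_c, kh, kw

    # out = s * (in - 1) + k - 2 * p = 2 * 7 + 3 - 2 = 15
    y = F.deconvolution_2d(x, W, stride=2, pad=1)
    assert y.shape == (1, 4, 15, 15)
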
Example 4
    def _forward_cudnn(self, x, W, b, y):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but
        # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM.
        if use_tensor_core and (self.dx > 1 or self.dy > 1):
            use_tensor_core = False

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype)
        dilation = (self.dy, self.dx)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=dilation,
            use_tensor_core=use_tensor_core,
            group=self.group)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])
        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')
        if configuration.config.autotune and _cudnn_version >= 5000:
            algo = _get_algorithm_fwd(
                x, W, y, conv_param + (dilation,), handle, x_desc,
                filter_desc, conv_desc, y_desc, workspace)
        else:
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, filter_desc.value,
                conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

        if use_tensor_core:
            algo = self._tensor_core_adjust_algo()

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(
            handle, one.data, x_desc.value, x.data.ptr,
            filter_desc.value, W.data.ptr, conv_desc.value,
            algo, workspace.data.ptr, workspace_size, zero.data,
            y_desc.value, y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(
                handle, one.data, bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)

        return y,
Example 5
    def _forward_cudnn(self, x, W, b, y):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but
        # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM.
        if use_tensor_core and (self.dx > 1 or self.dy > 1):
            use_tensor_core = False

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype)
        dilation = (self.dy, self.dx)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param,
            dilation=dilation,
            use_tensor_core=use_tensor_core,
            group=self.group)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None])
        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size, ), dtype='b')
        if configuration.config.autotune and _cudnn_version >= 5000:
            algo = _get_algorithm_fwd(x, W, y, conv_param + (dilation, ),
                                      handle, x_desc, filter_desc, conv_desc,
                                      y_desc, workspace)
        else:
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, filter_desc.value, conv_desc.value,
                y_desc.value, _fwd_pref, workspace_size)

        if use_tensor_core:
            algo = self._tensor_core_adjust_algo()

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr,
                                    filter_desc.value, W.data.ptr,
                                    conv_desc.value, algo, workspace.data.ptr,
                                    workspace_size, zero.data, y_desc.value,
                                    y.data.ptr)

        # TODO(beam2d): Support unshared bias
        if b is not None:
            cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr,
                             one.data, y_desc.value, y.data.ptr)

        return y,
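
These forward variants, like the other examples, size their scratch buffer from cuda.get_max_workspace_size(), which also caps the algorithms cuDNN may report as affordable. The limit is a process-wide setting; a sketch of raising it (the 512 MiB figure is arbitrary, and the import path assumes a Chainer version exposing chainer.backends.cuda; older releases offer the same functions at chainer.cuda):

    from chainer.backends import cuda

    cuda.set_max_workspace_size(512 * 1024 * 1024)  # 512 MiB scratch budget
    print(cuda.get_max_workspace_size())
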
Example 6
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        x, gy = inputs
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape

        if (self.cover_all or not chainer.should_use_cudnn('>=auto') or
                x.dtype != self.W_dtype):
            col = conv.im2col_gpu(
                x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all)
            gW = cuda.cupy.tensordot(
                gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype,
                                                        copy=False)
            return gW,

        gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype)
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype,
            use_tensor_core=use_tensor_core)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports
            # Tensor-Core in cuDNN7.
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1

        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, x_desc.value, x.data.ptr, gy_desc.value,
            gy.data.ptr, conv_desc.value, algo, workspace.data.ptr,
            workspace_size, zero.data, filter_desc.value, gW.data.ptr)

        return gW,
Example 7
    def _forward_cudnn(self, x, gy):
        # Convert to C-contiguous arrays.
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        # Make empty arrays for result.
        out_c = gy.shape[1]
        in_c = x.shape[1]
        gW = cuda.cupy.empty((out_c, in_c) + self.ksize, dtype=self.W_dtype)

        # Get cuDNN handler and descriptors.
        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.pad, self.stride, self.W_dtype)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, use_tensor_core=use_tensor_core)

        # Compute gradients.
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

        # Compute filter weight gradient.
        if configuration.config.autotune and _cudnn_version_ >= 5000:
            algo = convolution_2d._get_algorithm_bwd_filter(
                x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
                filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        libcudnn.convolutionBackwardFilter_v3(handle, one.data, x_desc.value,
                                              x.data.ptr, gy_desc.value,
                                              gy.data.ptr, conv_desc.value,
                                              algo, workspace.data.ptr,
                                              workspace_size, zero.data,
                                              filter_desc.value, gW.data.ptr)

        return gW,
Example 8
    def _forward_cudnn(self, x, gy):
        # Convert to C-contiguous arrays.
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        # Make empty arrays for result.
        out_c = gy.shape[1]
        in_c = x.shape[1]
        gW = cuda.cupy.empty(
            (out_c, in_c) + self.ksize, dtype=self.W_dtype)

        # Get cuDNN handler and descriptors.
        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.pad, self.stride, self.W_dtype)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, use_tensor_core=use_tensor_core)

        # Compute gradients.
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        # Compute filter weight gradient.
        if configuration.config.autotune and _cudnn_version_ >= 5000:
            algo = convolution_2d._get_algorithm_bwd_filter(
                x, gy, gW, conv_param, handle, x_desc, gy_desc,
                conv_desc, filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, x_desc.value, x.data.ptr,
            gy_desc.value, gy.data.ptr, conv_desc.value,
            algo, workspace.data.ptr, workspace_size,
            zero.data, filter_desc.value, gW.data.ptr)

        return gW,
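
Unlike the 2-D examples, Examples 7 and 8 build conv_param from tuple-valued self.pad, self.stride, and self.ksize, so the same descriptor code appears to serve N-dimensional convolutions. A 3-D shape sketch with assumed sizes:

    import chainer.functions as F
    import cupy

    x = cupy.random.randn(2, 3, 8, 8, 8).astype('f')
    W = cupy.random.randn(5, 3, 3, 3, 3).astype('f')
    y = F.convolution_nd(x, W, stride=(1, 1, 1), pad=(1, 1, 1))
    assert y.shape == (2, 5, 8, 8, 8)
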
Example 9
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # only retain x and W
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not all([isinstance(i, cuda.ndarray) for i in inputs]):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        kh, kw = W.shape[2:]
        n, in_c, in_h, in_w = x.shape
        c = W.shape[1]  # out_c
        if self.outh is None:
            self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph,
                                                d=self.dy)
            assert self.outh > 0, 'Height in the output should be positive.'
        if self.outw is None:
            self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw,
                                                d=self.dx)
            assert self.outw > 0, 'Width in the output should be positive.'

        self._set_cover_all(x, W)

        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                x.dtype == W.dtype and
                ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y = cuda.cupy.empty((n, c, self.outh, self.outw),
                                dtype=x.dtype)
            y_desc = cudnn.create_tensor_descriptor(y)

            filter_desc = cudnn.create_filter_descriptor(W)
            conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx), x.dtype,
                dilation=(self.dy, self.dx),
                use_tensor_core=use_tensor_core)
            if b is not None:
                bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            if configuration.config.cudnn_deterministic:
                algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
            else:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, filter_desc.value, x_desc.value,
                    conv_desc.value, y_desc.value, _bwd_data_pref,
                    workspace_size)

            if use_tensor_core:
                # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports
                # Tensor-Core in cuDNN7
                algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1

            libcudnn.convolutionBackwardData_v3(
                handle, one.data, filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, y_desc.value, y.data.ptr)

            if b is not None:
                cudnn.add_tensor(
                    handle, one.data, bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                            copy=False)
            # - k: number of output channels
            # - m, n: height and width of kernels
            # - b: number of inputs
            # - h, w: height and width of inputs
            # k, m, n, b, h, w -> b, k, m, n, h, w
            gcol = cuda.cupy.rollaxis(gcol, 3)
            y = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw,
                dy=self.dy, dx=self.dx)
            if b is not None:
                y += b.reshape(1, b.size, 1, 1)
        return y,
Example 10
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not all([isinstance(i, cuda.ndarray) for i in inputs]):
            if b is not None:
                raise ValueError(
                    'numpy and cupy must not be used together\n'
                    'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                        type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'.format(
                                     type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all, d=self.dy)
        assert out_h > 0, 'Height in the output should be positive.'
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all, d=self.dx)
        assert out_w > 0, 'Width in the output should be positive.'

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                x.dtype == W.dtype and
                ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            filter_desc = cudnn.create_filter_descriptor(W)
            conv_desc = cudnn.create_convolution_descriptor(
                (self.ph, self.pw), (self.sy, self.sx),
                x.dtype,
                dilation=(self.dy, self.dx),
                use_tensor_core=use_tensor_core)
            if b is not None:
                bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')
            algo = libcudnn.getConvolutionForwardAlgorithm(
                handle, x_desc.value, filter_desc.value, conv_desc.value,
                y_desc.value, _fwd_pref, workspace_size)

            if use_tensor_core:
                # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                # supports Tensor-Core in cuDNN7.
                algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM  # NOQA

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(
                handle, one.data, x_desc.value, x.data.ptr,
                filter_desc.value, W.data.ptr, conv_desc.value, algo,
                workspace.data.ptr, workspace_size, zero.data,
                y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr,
                                 one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all, dy=self.dy, dx=self.dx)
            y = cuda.cupy.tensordot(
                col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
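
The fallback branch computes the same convolution by unfolding every receptive field into a column (im2col_gpu) and contracting the channel and kernel axes with tensordot. A minimal NumPy sketch of that identity for stride 1 and no padding; the im2col helper below is a simplified stand-in for the CUDA kernel:

    import numpy as np

    def im2col(x, kh, kw):
        # Gather every kh-by-kw patch (stride 1, no padding).
        n, c, h, w = x.shape
        out_h, out_w = h - kh + 1, w - kw + 1
        col = np.empty((n, c, kh, kw, out_h, out_w), dtype=x.dtype)
        for i in range(kh):
            for j in range(kw):
                col[:, :, i, j] = x[:, :, i:i + out_h, j:j + out_w]
        return col

    x = np.random.randn(2, 3, 6, 6).astype('f')
    W = np.random.randn(4, 3, 3, 3).astype('f')
    col = im2col(x, 3, 3)                             # n, c, kh, kw, oh, ow
    y = np.tensordot(col, W, ((1, 2, 3), (1, 2, 3)))  # n, oh, ow, out_c
    y = np.rollaxis(y, 3, 1)                          # n, out_c, oh, ow
    assert y.shape == (2, 4, 4, 4)
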
Example 11
    def _forward_cudnn(self, x, W, b):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        n = x.shape[0]
        yCg = W.shape[1]  # out channels per group
        yC = yCg * self.group

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        # cuDNN 7 supports dilation only in *_BWD_DATA_ALGO_0, but
        # it supports Tensor Cores only in *_BWD_DATA_ALGO_1.
        if use_tensor_core and (self.dx > 1 or self.dy > 1):
            use_tensor_core = False

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, yC, self.outh, self.outw),
                            dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        dilation = (self.dy, self.dx)
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=dilation,
            use_tensor_core=use_tensor_core,
            group=self.group)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
        elif configuration.config.autotune and _cudnn_version_ >= 5000:
            algo = get_algorithm(
                W, x, y, conv_param + (dilation,), handle, filter_desc,
                x_desc, conv_desc, y_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, filter_desc.value, x_desc.value, conv_desc.value,
                y_desc.value, _bwd_data_pref, workspace_size)

        if use_tensor_core:
            algo = self._tensor_core_adjust_algo()

        libcudnn.convolutionBackwardData_v3(
            handle, one.data, filter_desc.value, W.data.ptr,
            x_desc.value, x.data.ptr, conv_desc.value,
            algo, workspace.data.ptr, workspace_size,
            zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)

        return y,
Example 12
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # only retain x and W
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not all([isinstance(i, cuda.ndarray) for i in inputs]):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        kh, kw = W.shape[2:]
        n, in_c, in_h, in_w = x.shape
        c = W.shape[1]  # out_c
        if self.outh is None:
            self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph,
                                                d=self.dy)
            assert self.outh > 0, 'Height in the output should be positive.'
        if self.outw is None:
            self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw,
                                                d=self.dx)
            assert self.outw > 0, 'Width in the output should be positive.'

        self._set_cover_all(x, W)

        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                x.dtype == W.dtype and
                ((self.dy == 1 and self.dx == 1) or _cudnn_version_ >= 6000)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y = cuda.cupy.empty((n, c, self.outh, self.outw),
                                dtype=x.dtype)
            y_desc = cudnn.create_tensor_descriptor(y)

            filter_desc = cudnn.create_filter_descriptor(W)
            conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
            conv_desc = cudnn.create_convolution_descriptor(
                *conv_param, dilation=(self.dy, self.dx),
                use_tensor_core=use_tensor_core)
            if b is not None:
                bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes

            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            if configuration.config.cudnn_deterministic:
                algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
            elif configuration.config.autotune and _cudnn_version_ >= 5000:
                algo = get_algorithm(W, x, y, conv_param, handle, filter_desc,
                                     x_desc, conv_desc, y_desc, workspace)
            else:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, filter_desc.value, x_desc.value, conv_desc.value,
                    y_desc.value, _bwd_data_pref, workspace_size)

            if use_tensor_core:
                # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports
                # Tensor-Core in cuDNN7
                algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1

            libcudnn.convolutionBackwardData_v3(
                handle, one.data, filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, y_desc.value, y.data.ptr)

            if b is not None:
                cudnn.add_tensor(
                    handle, one.data, bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                            copy=False)
            # - k: number of output channels
            # - m, n: height and width of kernels
            # - b: number of inputs
            # - h, w: height and width of inputs
            # k, m, n, b, h, w -> b, k, m, n, h, w
            gcol = cuda.cupy.rollaxis(gcol, 3)
            y = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw,
                dy=self.dy, dx=self.dx)
            if b is not None:
                y += b.reshape(1, b.size, 1, 1)
        return y,
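
The fallback here runs that unfolding in reverse: contracting W's input-channel axis against x's channel axis yields a stack of weighted kernel patches per input position, which col2im_gpu then scatters, with overlap-add, onto the output image. A NumPy shape sketch of the first two steps with assumed sizes:

    import numpy as np

    W = np.random.randn(16, 4, 3, 3).astype('f')  # in_c, out_c, kh, kw
    x = np.random.randn(2, 16, 8, 8).astype('f')  # n, in_c, h, w

    gcol = np.tensordot(W, x, (0, 1))  # out_c, kh, kw, n, h, w
    gcol = np.rollaxis(gcol, 3)        # n, out_c, kh, kw, h, w
    assert gcol.shape == (2, 4, 3, 3, 8, 8)
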
Example 13
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))
        x, gy = inputs
        _, out_c, out_h, out_w = gy.shape
        n, c, h, w = x.shape

        if (self.cover_all or not chainer.should_use_cudnn('>=auto') or
                x.dtype != self.W_dtype or
                ((self.dy > 1 or self.dx > 1) and _cudnn_version < 6000)):
            col = conv.im2col_gpu(
                x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all, dy=self.dy, dx=self.dx)
            gW = cuda.cupy.tensordot(
                gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype,
                                                        copy=False)
            return gW,

        gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype)
        x = cuda.cupy.ascontiguousarray(x)
        gy = cuda.cupy.ascontiguousarray(gy)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        filter_desc = cudnn.create_filter_descriptor(gW)
        conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype
        conv_desc = cudnn.create_convolution_descriptor(
            *conv_param, dilation=(self.dy, self.dx),
            use_tensor_core=use_tensor_core)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        elif configuration.config.autotune and _cudnn_version >= 5000:
            algo = get_algorithm_bwd_filter(
                x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc,
                filter_desc, workspace)
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, conv_desc.value,
                filter_desc.value, _bwd_filter_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports
            # Tensor-Core in cuDNN7.
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1

        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, x_desc.value, x.data.ptr, gy_desc.value,
            gy.data.ptr, conv_desc.value, algo, workspace.data.ptr,
            workspace_size, zero.data, filter_desc.value, gW.data.ptr)

        return gW,
Example 14
    def forward_gpu(self, inputs):
        self.retain_inputs((0, 1))  # retain only x and W
        x, W = inputs[:2]
        b = inputs[2] if len(inputs) == 3 else None

        if not all([isinstance(i, cuda.ndarray) for i in inputs]):
            if b is not None:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}, type(b): {2}'
                                 .format(type(W), type(x), type(b)))
            else:
                raise ValueError('numpy and cupy must not be used together\n'
                                 'type(W): {0}, type(x): {1}'
                                 .format(type(W), type(x)))

        out_c, _, kh, kw = W.shape
        n, c, h, w = x.shape

        out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph,
                                      cover_all=self.cover_all, d=self.dy)
        assert out_h > 0, 'Height in the output should be positive.'
        out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw,
                                      cover_all=self.cover_all, d=self.dx)
        assert out_w > 0, 'Width in the output should be positive.'

        y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype)
        if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
                x.dtype == W.dtype and
                ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
            x = cuda.cupy.ascontiguousarray(x)
            W = cuda.cupy.ascontiguousarray(W)
            if b is not None:
                b = cuda.cupy.ascontiguousarray(b)

            use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(x)
            y_desc = cudnn.create_tensor_descriptor(y)

            filter_desc = cudnn.create_filter_descriptor(W)
            conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype)
            conv_desc = cudnn.create_convolution_descriptor(
                *conv_param, dilation=(self.dy, self.dx),
                use_tensor_core=use_tensor_core)
            if b is not None:
                bias_desc = cudnn.create_tensor_descriptor(
                    b[None, :, None, None])
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            if configuration.config.autotune and _cudnn_version >= 5000:
                algo = get_algorithm_fwd(
                    x, W, y, conv_param, handle, x_desc, filter_desc,
                    conv_desc, y_desc, workspace)
            else:
                algo = libcudnn.getConvolutionForwardAlgorithm(
                    handle, x_desc.value, filter_desc.value,
                    conv_desc.value, y_desc.value, _fwd_pref, workspace_size)

            if use_tensor_core:
                # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                # supports Tensor-Core in cuDNN7.
                algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM  # NOQA

            oz_dtype = 'd' if x.dtype == 'd' else 'f'
            one = numpy.array(1, dtype=oz_dtype).ctypes
            zero = numpy.array(0, dtype=oz_dtype).ctypes
            libcudnn.convolutionForward(
                handle, one.data, x_desc.value, x.data.ptr,
                filter_desc.value, W.data.ptr, conv_desc.value,
                algo, workspace.data.ptr, workspace_size, zero.data,
                y_desc.value, y.data.ptr)

            # TODO(beam2d): Support unshared bias
            if b is not None:
                cudnn.add_tensor(
                    handle, one.data, bias_desc.value, b.data.ptr,
                    one.data, y_desc.value, y.data.ptr)
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                x, kh, kw, self.sy, self.sx, self.ph, self.pw,
                cover_all=self.cover_all, dy=self.dy, dx=self.dx)
            y = cuda.cupy.tensordot(
                col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False)
            # TODO(beam2d): Support unshared bias
            if b is not None:
                y += b
            y = cuda.cupy.rollaxis(y, 3, 1)

        return y,
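
This variant only takes the cuDNN path for dilated kernels when _cudnn_version >= 6000; otherwise the im2col branch, which forwards dy/dx, handles dilation. A dilated-call sketch with assumed shapes; the effective kernel size is d * (k - 1) + 1:

    import chainer.functions as F
    import cupy

    x = cupy.random.randn(1, 3, 16, 16).astype('f')
    W = cupy.random.randn(8, 3, 3, 3).astype('f')

    # Effective kernel 2 * (3 - 1) + 1 = 5; out = 16 + 2*2 - 5 + 1 = 16.
    y = F.dilated_convolution_2d(x, W, stride=1, pad=2, dilate=2)
    assert y.shape == (1, 8, 16, 16)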