Example #1
0
 def forward_gpu(self, inputs):
     x, targets = inputs
     N = x.shape[0]
     
     
     #Linear function
     z = cuda.empty((N,self.no_labels), dtype=np.float32)
     cuk.dot(x, self.W, out=z, transb='t')
     if not self.nobias:
         cuk.addVec2Mat(z, self.b)
     
     self.probs = z
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(z, 1, 1)
         libcudnn.cudnnSoftmaxForward(
             handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(z),
             0, desc.value, cudnn.get_ptr(self.probs))
     else:
         cuk.softmax(z, self.probs)
     
     if self.return_probs:
         return self.probs,
         
     if self.compute_loss:
         correct_probs = cuda.empty((N,),dtype=np.float32)
         cuk.getByIndex_LogAndClip(
                                     self.probs, targets,
                                      out=correct_probs)
         loss = -cuda.cumisc.sum(correct_probs, keepdims=True)/N
     else:
         loss = np.atleast_2d(np.array(np.nan,dtype=np.float32))
     
     return loss,
Example #2
0
    def backward_gpu(self, x, gy):
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gx = cuda.empty_like(x[0])
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxBackward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(
                    self.y),
                desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
                cudnn.get_ptr(gx))
        else:
            gx = self.y * gy[0]
            c = gx.shape[1]
            sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
            cuda.elementwise(
                'float* sum_ydy, const float* ydy, int c',
                '''
                   const float* row = ydy + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   sum_ydy[i] = sum;
                ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
            cuda.elementwise(
                'float* gx, const float* y, const float* sum_ydy, int c',
                'gx[i] -= y[i] * sum_ydy[i / c]',
                'softmax_bwd_diff')(gx, self.y, sum_ydy, c)

        return gx,
Example #3
0
    def backward_gpu(self, x, gy):
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gx = cuda.empty_like(x[0])
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxBackward(handle, _algorithm,
                                          _mode, 1, desc.value,
                                          cudnn.get_ptr(self.y), desc.value,
                                          cudnn.get_ptr(gy[0]), 0, desc.value,
                                          cudnn.get_ptr(gx))
        else:
            gx = self.y * gy[0]
            c = gx.shape[1]
            sum_ydy = cuda.empty((gx.shape[0], ), dtype=numpy.float32)
            cuda.elementwise(
                'float* sum_ydy, const float* ydy, int c', '''
                   const float* row = ydy + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   sum_ydy[i] = sum;
                ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
            cuda.elementwise(
                'float* gx, const float* y, const float* sum_ydy, int c',
                'gx[i] -= y[i] * sum_ydy[i / c]',
                'softmax_bwd_diff')(gx, self.y, sum_ydy, c)

        return gx,
Example #4
0
 def forward_gpu(self, x):
     self.y = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(x[0], 1, 1)
         libcudnn.cudnnActivationForward(
             handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]), 0, desc.value, cudnn.get_ptr(self.y)
         )
     else:
         cuda.elementwise("float* y, const float* x", "y[i] = tanhf(x[i])", "tanh_fwd")(self.y, x[0])
     return (self.y,)
Example #5
0
 def forward_gpu(self, x):
     self.y = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(x[0], 1, 1)
         libcudnn.cudnnActivationForward(handle, _mode, 1, desc.value,
                                         cudnn.get_ptr(x[0]), 0, desc.value,
                                         cudnn.get_ptr(self.y))
     else:
         cuda.elementwise('float* y, const float* x', 'y[i] = tanhf(x[i])',
                          'tanh_fwd')(self.y, x[0])
     return self.y,
Example #6
0
 def forward_gpu(self, x):
     self.y = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(x[0], 1, 1)
         libcudnn.cudnnActivationForward(
             handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
             0, desc.value, cudnn.get_ptr(self.y))
     else:
         cuda.elementwise(
             'float* y, const float* x', 'y[i] = 1 / (1 + __expf(-x[i]))',
             'sigmoid_fwd')(self.y, x[0])
     return self.y,
Example #7
0
 def forward_gpu(self, x):
     y = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(x[0], 1, 1)
         libcudnn.cudnnActivationForward(
             handle, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
             0, desc.value, cudnn.get_ptr(y))
         self.y = y
     else:
         cuda.elementwise('float* y, const float* x', 'y[i] = max(0.f, x[i])',
                          'relu_fwd')(y, x[0])
     return y,
Example #8
0
    def backward_gpu(self, x, gy):
        # Implementation using cudnn
        handle = cudnn.get_default_handle()
        pool_desc = self.create_pool_desc()

        x_desc = cudnn.get_tensor_desc( x[0],  x[0].shape[2],  x[0].shape[3])
        y_desc = cudnn.get_tensor_desc(gy[0], gy[0].shape[2], gy[0].shape[3])

        gx = cuda.empty_like(x[0])
        libcudnn.cudnnPoolingBackward(
            handle, pool_desc.value, 1, y_desc.value, cudnn.get_ptr(self.y),
            y_desc.value, cudnn.get_ptr(gy[0]), x_desc.value, cudnn.get_ptr(x[0]),
            0, x_desc.value, cudnn.get_ptr(gx))
        return gx,
Example #9
0
 def backward_gpu(self, x, gy):
     gx = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(self.y, 1, 1)
         libcudnn.cudnnActivationBackward(handle, _mode, 1, desc.value,
                                          cudnn.get_ptr(self.y), desc.value,
                                          cudnn.get_ptr(gy[0]), desc.value,
                                          cudnn.get_ptr(x[0]), 0,
                                          desc.value, cudnn.get_ptr(gx))
     else:
         cuda.elementwise('float* gx, const float* y, const float* gy',
                          'gx[i] = gy[i] * y[i] * (1 - y[i])',
                          'sigmoid_bwd')(gx, self.y, gy[0])
     return gx,
Example #10
0
 def backward_gpu(self, x, gy):
     gx = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(self.y, 1, 1)
         libcudnn.cudnnActivationBackward(
             handle, _mode, 1, desc.value, cudnn.get_ptr(self.y),
             desc.value, cudnn.get_ptr(gy[0]), desc.value, cudnn.get_ptr(x[0]),
             0, desc.value, cudnn.get_ptr(gx))
     else:
         cuda.elementwise(
             'float* gx, const float* y, const float* gy',
             'gx[i] = gy[i] * y[i] * (1 - y[i])',
             'sigmoid_bwd')(gx, self.y, gy[0])
     return gx,
    def forward_gpu(self, x):
        n, out_c, out_h, out_w = x[0].shape
        c = self.W.shape[1]
        h = get_deconv_outsize(out_h, self.kh, self.sy, self.ph)
        w = get_deconv_outsize(out_w, self.kw, self.sx, self.pw)
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            x_desc = cudnn.get_tensor_desc(x[0], out_h, out_w)
            y = cuda.empty((n, c, h, w), dtype=numpy.float32)
            y_desc = cudnn.get_tensor_desc(y, h, w)

            self.filter_desc = cudnn.get_filter4d_desc(self.W)
            self.conv_desc = cudnn.get_conv2d_desc(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.get_conv_bias_desc(self.b)

            libcudnn.cudnnConvolutionBackwardData(
                handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
                x_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
                0, y_desc.value, cudnn.get_ptr(y))
            if self.b is not None:
                libcudnn.cudnnAddTensor(
                    handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                    1, self.bias_desc.value, cudnn.get_ptr(self.b),
                    1, y_desc.value, cudnn.get_ptr(y))
        else:
            handle = cuda.get_cublas_handle()
            # TODO(beam2d): Use streams
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            x_mats = x[0].reshape(n, out_c, out_h * out_w)
            gcol = cuda.empty((n, c, self.kh, self.kw, out_h, out_w), dtype=numpy.float32)
            gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, x_mats[i], transa='T', handle=handle,
                                  out=gcol_mats[i])
            y = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)
            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                cuda.elementwise(
                    'float* y, const float* b, int c, int hw',
                    'y[i] += b[i / hw % c]',
                    'conv_bias_fwd')(y, self.b, c, h * w)
        return y,
Example #12
0
    def forward_gpu(self, x):
        y = cuda.empty_like(x[0])
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxForward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
                0, desc.value, cudnn.get_ptr(y))
            self.y = y
        else:
            maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
            c = x[0].shape[1]
            cuda.elementwise(
                'float* maxes, const float* x, int c',
                '''
                   const float* row = x + i * c;
                   float maxval = row[0];
                   for (int j = 1; j < c; ++j) {
                     if (maxval < row[j]) {
                       maxval = row[j];
                     }
                   }
                   maxes[i] = maxval;
                ''', 'softmax_rowmax')(maxes, x[0], c)
            cuda.elementwise(
                'float* y, const float* x, const float* maxes, int c',
                'y[i] = __expf(x[i] - maxes[i / c])',
                'softmax_exp')(y, x[0], maxes, c)
            coeff = maxes  # reuse memory
            cuda.elementwise(
                'float* coeff, const float* y, int c',
                '''
                   const float* row = y + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   coeff[i] = 1 / sum;
                ''', 'softmax_invrowsum')(coeff, y, c)
            cuda.elementwise(
                'float* y, const float* coeff, int c', 'y[i] *= coeff[i / c]',
                'softmax_rowmul')(y, coeff, c)
            self.y = y

        return y,
Example #13
0
    def forward_gpu(self, x):
        # Implementation using cudnn
        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph, self.cover_all)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw, self.cover_all)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)

        handle = cudnn.get_default_handle()
        pool_desc = self.create_pool_desc()
        x_desc = cudnn.get_tensor_desc(x[0], x[0].shape[2], x[0].shape[3])
        y_desc = cudnn.get_tensor_desc(y, y_h, y_w)

        libcudnn.cudnnPoolingForward(
            handle, pool_desc.value, 1, x_desc.value, cudnn.get_ptr(x[0]),
            0, y_desc.value, cudnn.get_ptr(y))
        self.y = y

        return y,
    def backward_gpu(self, x, gy):
        out_c, out_h, out_w = gy[0].shape[1:]
        n, c, h, w = x[0].shape

        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            x_desc = cudnn.get_tensor_desc(x[0], h, w)
            gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))

            libcudnn.cudnnConvolutionBackwardFilter(
                handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
                gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))

            gx = cuda.empty_like(x[0])
            libcudnn.cudnnConvolutionBackwardData(
                handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
                gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
                0, x_desc.value, cudnn.get_ptr(gx))
        else:
            handle = cuda.get_cublas_handle()
            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(handle):
                    tmp = cuda.cumisc.sum(
                        gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                    tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                    self.gb += tmp

            # TODO(beam2d): Use streams
            gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.add_dot(
                    gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            gcol = cuda.empty_like(self.col)
            gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, gy_mats[i], transa='T', handle=handle,
                                  out=gcol_mats[i])

            gx = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        return gx,
Example #15
0
    def backward_gpu(self, x, gy):
        n_unit = int(numpy.prod(x[0].shape[2:]))
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gx = cuda.empty_like(x[0])
            desc = cudnn.get_tensor_desc(x[0], n_unit, 1)
            libcudnn.cudnnSoftmaxBackward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(
                    self.y),
                desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
                cudnn.get_ptr(gx))
        else:
            gx = self.y * gy[0]
            c = gx.shape[1]
            sum_ydy_shape = (gx.shape[0],) + gx.shape[2:]
            sum_ydy = cuda.empty(sum_ydy_shape, dtype=numpy.float32)
            cuda.elementwise(
                'float* sum_ydy, const float* ydy, int n_channel, int n_unit',
                '''
                   const int n = i / n_unit;
                   const int m = i % n_unit;
                   const float* row = ydy + n * n_channel * n_unit + m;
                   float sum = 0;
                   for (int c = 0; c < n_channel; ++c) {
                     sum += row[c * n_unit];
                   }
                   sum_ydy[i] = sum;
                ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c, n_unit)
            cuda.elementwise(
                '''
                   float* gx, const float* y, const float* sum_ydy,
                   int n_channel, int n_unit
                ''',
                '''
                   const int n = i / (n_channel * n_unit);
                   const int m = i % n_unit;
                   gx[i] -= y[i] * sum_ydy[n * n_unit + m];
                ''',
                'softmax_bwd_diff')(gx, self.y, sum_ydy, c, n_unit)

        return gx,
Example #16
0
 def backward_gpu(self, x, gy):
     gx = cuda.empty_like(x[0])
     if cudnn.enabled and self.use_cudnn:
         handle = cudnn.get_default_handle()
         desc = cudnn.get_tensor_desc(self.y, 1, 1)
         libcudnn.cudnnActivationBackward(
             handle,
             _mode,
             1,
             desc.value,
             cudnn.get_ptr(self.y),
             desc.value,
             cudnn.get_ptr(gy[0]),
             desc.value,
             cudnn.get_ptr(x[0]),
             0,
             desc.value,
             cudnn.get_ptr(gx),
         )
     else:
         cuda.elementwise("float* gx, const float* x, const float* gy", "gx[i] = x[i] > 0 ? gy[i] : 0", "relu_bwd")(
             gx, x[0], gy[0]
         )
     return (gx,)
    def backward_gpu(self, x, gy):
        n, out_c, out_h, out_w = x[0].shape
        c, h, w = gy[0].shape[1:]
        gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
            gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

            algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, algo).value
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=numpy.float32)

            libcudnn.cudnnConvolutionForward(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                self.filter_desc.value, cudnn.get_ptr(self.W),
                self.conv_desc.value, algo, cudnn.get_ptr(
                    workspace), workspace_size,
                0, gx_desc.value, cudnn.get_ptr(gx))
            # bias backward
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))
            # filter backward
            libcudnn.cudnnConvolutionBackwardFilter(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            handle = cuda.get_cublas_handle()
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            gx_mats = gx.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                                  out=gx_mats[i])
            # bias backward
            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(handle):
                    tmp = cuda.cumisc.sum(
                        gy[0].reshape(n * c, h * w), axis=1)
                    tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                    self.gb += tmp
            # filter backward
            # TODO(beam2d): Use streams
            gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
            x_mats = x[0].reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.add_dot(
                    x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)
        return gx,
Example #18
0
    def forward_gpu(self, x):
        y = cuda.empty_like(x[0])
        n_unit = int(numpy.prod(x[0].shape[2:]))
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            desc = cudnn.get_tensor_desc(x[0], n_unit, 1)
            libcudnn.cudnnSoftmaxForward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
                0, desc.value, cudnn.get_ptr(y))
            self.y = y
        else:
            maxes_shape = (x[0].shape[0],) + x[0].shape[2:]
            maxes = cuda.empty(maxes_shape, dtype=numpy.float32)
            c = x[0].shape[1]
            cuda.elementwise(
                'float* maxes, const float* x, int n_channel, int n_unit',
                '''
                   const int n = i / n_unit;
                   const int m = i % n_unit;
                   const float* row = x + n * n_channel * n_unit + m;
                   float maxval = row[0];
                   for (int c = 1; c < n_channel; ++c) {
                     const int v = c * n_unit;
                     if (maxval < row[v]) {
                       maxval = row[v];
                     }
                   }
                   maxes[i] = maxval;
                ''', 'softmax_rowmax')(maxes, x[0], c, n_unit)
            cuda.elementwise(
                '''
                   float* y, const float* x, const float* maxes,
                   int n_channel, int n_unit
                ''',
                '''
                   const int n = i / (n_channel * n_unit);
                   const int m = i % n_unit;
                   y[i] = __expf(x[i] - maxes[n * n_unit + m]);
                ''',
                'softmax_exp')(y, x[0], maxes, c, n_unit)
            coeff = maxes  # reuse memory
            cuda.elementwise(
                'float* coeff, const float* y, int n_channel, int n_unit',
                '''
                   const int n = i / n_unit;
                   const int m = i % n_unit;
                   const float* row = y + n * n_channel * n_unit + m;
                   float sum = 0;
                   for (int c = 0; c < n_channel; ++c) {
                     sum += row[c * n_unit];
                   }
                   coeff[i] = 1 / sum;
                ''', 'softmax_invrowsum')(coeff, y, c, n_unit)
            cuda.elementwise(
                'float* y, const float* coeff, int n_channel, int n_unit',
                '''
                   const int n = i / (n_channel * n_unit);
                   const int m = i % n_unit;
                   y[i] *= coeff[n * n_unit + m];
                ''',
                'softmax_rowmul')(y, coeff, c, n_unit)
            self.y = y

        return y,
    def forward_gpu(self, x):
        n, c, h, w = x[0].shape
        out_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        out_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        out_c = self.W.shape[0]
        y = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)

        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            x_desc = cudnn.get_tensor_desc(x[0], h, w)
            y_desc = cudnn.get_tensor_desc(y, out_h, out_w)

            self.filter_desc = cudnn.get_filter4d_desc(self.W)
            self.conv_desc = cudnn.get_conv2d_desc(
                (self.ph, self.pw), (self.sy, self.sx))
            if self.b is not None:
                self.bias_desc = cudnn.get_conv_bias_desc(self.b)

            algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
                handle, x_desc.value, self.filter_desc.value,
                self.conv_desc.value, y_desc.value, algo).value
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=numpy.float32)

            libcudnn.cudnnConvolutionForward(
                handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
                self.filter_desc.value, cudnn.get_ptr(self.W),
                self.conv_desc.value, algo, cudnn.get_ptr(
                    workspace), workspace_size,
                0, y_desc.value, cudnn.get_ptr(y))

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                libcudnn.cudnnAddTensor(
                    handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                    1, self.bias_desc.value, cudnn.get_ptr(self.b),
                    1, y_desc.value, cudnn.get_ptr(y))
        else:
            # Implementation using im2col
            self.col = conv.im2col_gpu(
                x[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            handle = cuda.get_cublas_handle()
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            y_mats = y.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                                  out=y_mats[i])

            # TODO(beam2d): Support unshared bias
            if self.b is not None:
                cuda.elementwise(
                    'float* y, const float* b, int c, int hw',
                    'y[i] += b[i / hw % c]',
                    'conv_bias_fwd')(y, self.b, out_c, out_h * out_w)

        return y,