Example 1
def verify_conv2d(data_dtype, conv_dtype, tensor_format=0):
    in_channel = 4
    out_channel = 16
    filter_h = 3
    filter_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    height = 32
    width = 32

    if not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled...")
        return
    if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        print("skip because cudnn is not enabled...")
        return
    if tensor_format == 0:
        xshape = [batch, in_channel, height, width]
        wshape = [out_channel, in_channel, filter_h, filter_w]
    else:
        xshape = [batch, height, width, in_channel]
        wshape = [out_channel, filter_h, filter_w, in_channel]

    X = tvm.placeholder(xshape, name='X', dtype=data_dtype)
    W = tvm.placeholder(wshape, name='W', dtype=data_dtype)
    Y = cudnn.conv_forward(X,
                           W, [pad_h, pad_w], [stride_h, stride_w],
                           [dilation_h, dilation_w],
                           conv_mode=1,
                           tensor_format=tensor_format,
                           conv_dtype=conv_dtype,
                           algo=-1)
    yshape = [x.value for x in Y.shape]
    s = tvm.create_schedule(Y.op)

    def verify():
        ctx = tvm.gpu(0)
        f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
        x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
        w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
        y_np = np.zeros(yshape).astype(data_dtype)
        x = tvm.nd.array(x_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        y = tvm.nd.array(y_np, ctx)
        if tensor_format == 0:
            c_np = topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1)
        elif tensor_format == 1:
            wt = w_np.transpose((1, 2, 3, 0))  #OHWI => HWIO
            c_np = topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1)

        f(x, w, y)
        tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-3)

    verify()
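The snippet above targets an older TVM API (tvm.placeholder, tvm.create_schedule, top-level topi). As a rough sketch, the imports and driver call it presumably relies on would look like the following; the names below are assumptions, not part of the original test:

# Sketch only: assumed imports and driver for the old-API test above.
import numpy as np
import tvm
import tvm.testing
import topi
import topi.testing
from tvm.contrib import cudnn

if __name__ == "__main__":
    verify_conv2d("float32", "float32", tensor_format=0)  # NCHW
    verify_conv2d("float32", "float32", tensor_format=1)  # NHWC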
Example 2
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1):
    in_channel = 4
    out_channel = 16
    filter_d = 3
    filter_h = 3
    filter_w = 3
    pad_d = 1
    pad_h = 1
    pad_w = 1
    stride_d = 1
    stride_h = 1
    stride_w = 1
    dilation_d = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    depth = 32
    height = 32
    width = 32

    if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        print("skip because cudnn is not enabled...")
        return

    # schedule
    xshape = [batch, in_channel, depth, height, width]
    wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w]

    X = te.placeholder(xshape, name='X', dtype=data_dtype)
    W = te.placeholder(wshape, name='W', dtype=data_dtype)
    Y = cudnn.conv_forward(X,
                           W, [pad_d, pad_h, pad_w],
                           [stride_d, stride_h, stride_w],
                           [dilation_d, dilation_h, dilation_w],
                           conv_mode=1,
                           tensor_format=tensor_format,
                           algo=-1,
                           conv_dtype=conv_dtype,
                           groups=groups)
    yshape = [x.value for x in Y.shape]
    s = te.create_schedule(Y.op)

    # validation
    ctx = tvm.gpu(0)
    f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d")
    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
    y_np = np.zeros(yshape).astype(data_dtype)
    x = tvm.nd.array(x_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    y = tvm.nd.array(y_np, ctx)
    if tensor_format == 0:
        c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups)
    else:
        raise AssertionError(
            "For now, conv3d tensor_format only supports 0 (NCDHW)")

    f(x, w, y)
    tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-4)
Example 3
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1):
    in_channel = 4
    out_channel = 16
    filter_d = 3
    filter_h = 3
    filter_w = 3
    pad_d = 1
    pad_h = 1
    pad_w = 1
    stride_d = 1
    stride_h = 1
    stride_w = 1
    dilation_d = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    depth = 32
    height = 32
    width = 32

    # schedule
    xshape = [batch, in_channel, depth, height, width]
    wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w]

    X = te.placeholder(xshape, name="X", dtype=data_dtype)
    W = te.placeholder(wshape, name="W", dtype=data_dtype)
    Y = cudnn.conv_forward(
        X,
        W,
        [pad_d, pad_h, pad_w],
        [stride_d, stride_h, stride_w],
        [dilation_d, dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=-1,
        conv_dtype=conv_dtype,
        groups=groups,
    )
    yshape = [x.value for x in Y.shape]
    s = te.create_schedule(Y.op)

    # validation
    dev = tvm.cuda(0)
    f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d")
    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
    y_np = np.zeros(yshape).astype(data_dtype)
    x = tvm.nd.array(x_np, dev)
    w = tvm.nd.array(w_np, dev)
    y = tvm.nd.array(y_np, dev)
    if tensor_format == 0:
        c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups)
    else:
        raise AssertionError(
            "For now, conv3d tensor_format only supports 0 (NCDHW)")

    f(x, w, y)
    tvm.testing.assert_allclose(y.numpy(), c_np, atol=3e-5, rtol=1e-4)
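Example 3 is the same test ported to the newer TVM API (tvm.te, tvm.cuda, ndarray.numpy()). A minimal sketch of the imports and invocations it presumably assumes; the driver calls are illustrative, not taken from the original file:

# Sketch only: assumed imports and driver for the newer-API variant above.
import numpy as np
import tvm
import tvm.testing
import tvm.topi.testing
from tvm import te
from tvm.contrib import cudnn

verify_conv3d("float32", "float32", tensor_format=0)            # NCDHW
verify_conv3d("float32", "float32", tensor_format=0, groups=2)  # grouped convolution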
Example 4
def conv2d_cudnn(cfg,
                 data,
                 kernel,
                 strides,
                 padding,
                 dilation,
                 groups=1,
                 layout="NCHW",
                 out_dtype="float32"):
    """Compute conv2d using CuDNN library"""
    if layout == "NCHW":
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, H, W = get_const_tuple(data.shape)
    elif layout == "NHWC":
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KH, KW = get_const_tuple(kernel.shape)

    # handle dilation
    stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
    dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation

    if (isinstance(padding, (list, tuple)) and len(padding) == 4
            and (padding[0] != padding[2] or padding[1] != padding[3])):
        raise ValueError("Cudnn doesn't support asymmetric padding.")
    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
    OH = (H + pt + pb - KH) // stride_h + 1
    OW = (W + pl + pr - KW) // stride_w + 1
    cfg.add_flop(groups * 2 * N * OH * OW * CO * CI *
                 ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1))

    if data.dtype == "int8" or kernel.dtype == "int8":
        if layout == "NCHW":
            raise ValueError("NCHW layout do not support int8 in cudnn")
        dtype = "int32"
    else:
        dtype = data.dtype

    cfg.define_knob("algo", range(8))
    if cfg.is_fallback:  # Let CUDNN choose the best algo
        cfg["algo"] = OtherOptionEntity(-1)

    return cudnn.conv_forward(
        data,
        kernel,
        [pt, pl],  # cudnn padding pt, pl on both sides of input
        [stride_h, stride_w],
        [dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=cfg["algo"].val,
        conv_dtype=dtype,
        groups=groups,
    )
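The compute above resolves padding through get_pad_tuple and then applies the usual convolution output-shape formula before reporting FLOPs. A self-contained arithmetic sketch of those lines, using illustrative numbers (3x3 kernel, padding 1, stride 1 on a 32x32 input):

# Worked example of the OH/OW arithmetic used above; plain Python, no TVM required.
H, W = 32, 32                  # input height/width
KH, KW = 3, 3                  # kernel height/width
stride_h, stride_w = 1, 1
pt, pl, pb, pr = 1, 1, 1, 1    # get_pad_tuple(1, (3, 3)) resolves to symmetric padding

OH = (H + pt + pb - KH) // stride_h + 1   # (32 + 1 + 1 - 3) // 1 + 1 = 32
OW = (W + pl + pr - KW) // stride_w + 1   # 32
print(OH, OW)  # 32 32
# Only [pt, pl] is handed to cudnn.conv_forward: cuDNN pads both sides of each
# dimension with the same value, which is why asymmetric padding is rejected.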
Example 5
def conv2d_cudnn(cfg,
                 data,
                 kernel,
                 strides,
                 padding,
                 dilation,
                 layout='NCHW',
                 out_dtype='float32'):
    """Compute conv2d using CuDNN library"""
    if layout == 'NCHW':
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, H, W = get_const_tuple(data.shape)
    elif layout == 'NHWC':
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KH, KW = get_const_tuple(kernel.shape)

    # handle dilation
    stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
    dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation

    if isinstance(padding, (list, tuple)) and len(padding) == 4 and \
            (padding[0] != padding[2] or padding[1] != padding[3]):
        raise ValueError("Cudnn doesn't support asymmetric padding.")
    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
    OH = (H + pt + pb - KH) // stride_h + 1
    OW = (W + pl + pr - KW) // stride_w + 1
    cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * \
                 ((KW - 1) * dilation_w + 1))

    if data.dtype == "int8" or kernel.dtype == "int8":
        if layout == 'NCHW':
            raise ValueError("NCHW layout do not support int8 in cudnn")
        dtype = "int32"
    else:
        dtype = data.dtype

    return cudnn.conv_forward(
        data,
        kernel,
        [pt, pl],  # cudnn padding pt, pl on both sides of input
        [stride_h, stride_w],
        [dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=-1,  # let CUDNN choose the best algo
        conv_dtype=dtype)
Example 6
def _lower_conv2d(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor:
    """Lower a conv2d using cuDNN."""
    return cudnn.conv_forward(
        inputs[0],
        inputs[1],
        pad=op.attrs["padding"],
        stride=op.attrs["strides"],
        dilation=op.attrs["dilation"],
        conv_mode=1,
        tensor_format=0,
        algo=1,
        conv_dtype=op.checked_type.dtype,
        groups=op.attrs["groups"],
    )
Example 7
def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1):
    in_channel = 4
    out_channel = 16
    filter_h = 3
    filter_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    dilation_h = 1
    dilation_w = 1
    batch = 3
    height = 32
    width = 32

    if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        print("skip because cudnn is not enabled...")
        return
    if data_dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    # schedule
    if tensor_format == 0:
        xshape = [batch, in_channel, height, width]
        wshape = [out_channel, in_channel // groups, filter_h, filter_w]
    else:
        xshape = [batch, height, width, in_channel]
        wshape = [out_channel, filter_h, filter_w, in_channel // groups]

    X = te.placeholder(xshape, name='X', dtype=data_dtype)
    W = te.placeholder(wshape, name='W', dtype=data_dtype)
    Y = cudnn.conv_forward(X,
                           W, [pad_h, pad_w], [stride_h, stride_w],
                           [dilation_h, dilation_w],
                           conv_mode=1,
                           tensor_format=tensor_format,
                           conv_dtype=conv_dtype,
                           algo=-1,
                           groups=groups)
    yshape = [x.value for x in Y.shape]
    s = te.create_schedule(Y.op)

    # validation
    ctx = tvm.gpu(0)
    f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d")
    x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype)
    w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype)
    y_np = np.zeros(yshape).astype(data_dtype)
    x = tvm.nd.array(x_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    y = tvm.nd.array(y_np, ctx)
    if tensor_format == 0:
        c_np = tvm.topi.testing.conv2d_nchw_python(x_np,
                                                   w_np,
                                                   1,
                                                   1,
                                                   groups=groups)
    elif tensor_format == 1:
        wt = w_np.transpose((1, 2, 3, 0))  #OHWI => HWIO
        c_np = tvm.topi.testing.conv2d_nhwc_python(x_np,
                                                   wt,
                                                   1,
                                                   1,
                                                   groups=groups)

    f(x, w, y)
    tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=1e-2, rtol=1e-2)
Example 8
def conv3d_cudnn(cfg,
                 data,
                 kernel,
                 strides,
                 padding,
                 dilation,
                 layout="NCDHW",
                 out_dtype="float32"):
    """Conv3D operator for cuda backend.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.te.Tensor
        5-D with shape [batch, in_channel, in_depth, in_height, in_width]

    kernel : tvm.te.Tensor
        5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width]

    strides : int or a list/tuple of three ints
        stride size, or [stride_depth, stride_height, stride_width]

    padding : int or a list/tuple of three ints
        padding size, or [pad_depth, pad_height, pad_width]

    dilation: int or a list/tuple of three ints
        dilation size, or [dilation_depth, dilation_height, dilation_width]

    layout : str
        layout of data

    out_dtype: str
        The output type. This is used for mixed precision.

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
    """
    if layout == "NCDHW":
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, D, H, W = get_const_tuple(data.shape)
    elif layout == "NDHWC":
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, D, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)

    # handle dilation
    stride_d, stride_h, stride_w = (
        (strides, strides, strides) if isinstance(strides, int) else strides
    )
    if isinstance(padding, int):
        pad_d, pad_h, pad_w = (padding, padding, padding)
    elif isinstance(padding, (list, tuple)) and len(padding) == 6:
        pad_d, pad_h, pad_w, _, _, _ = padding
    else:
        raise ValueError("Cudnn doesn't support asymmetric padding.")
    dilation_d, dilation_h, dilation_w = (
        (dilation, dilation, dilation) if isinstance(dilation, int) else dilation
    )
    dtype = data.dtype

    OD = (D + 2 * pad_d - KD) // stride_d + 1
    OH = (H + 2 * pad_h - KH) // stride_h + 1
    OW = (W + 2 * pad_w - KW) // stride_w + 1
    cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) *
                 ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1))

    return cudnn.conv_forward(
        data,
        kernel,
        [pad_d, pad_h, pad_w],
        [stride_d, stride_h, stride_w],
        [dilation_d, dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=-1,  # let CUDNN choose the best algo
        conv_dtype=dtype,
    )
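For the 3D case the same shape and FLOP bookkeeping applies, one dimension wider. A plain-Python sketch using the values from the verify_conv3d tests above (batch 3, 4 to 16 channels, 32^3 input, 3^3 kernel, padding and stride 1):

# Worked example of the OD/OH/OW and FLOP arithmetic used above.
N, CI, CO = 3, 4, 16
D = H = W = 32
KD = KH = KW = 3
pad_d = pad_h = pad_w = 1
stride_d = stride_h = stride_w = 1
dilation_d = dilation_h = dilation_w = 1

OD = (D + 2 * pad_d - KD) // stride_d + 1   # 32
OH = (H + 2 * pad_h - KH) // stride_h + 1   # 32
OW = (W + 2 * pad_w - KW) // stride_w + 1   # 32
flops = (2 * N * OD * OH * OW * CO * CI
         * ((KD - 1) * dilation_d + 1)
         * ((KH - 1) * dilation_h + 1)
         * ((KW - 1) * dilation_w + 1))
print(OD, OH, OW, flops)  # 32 32 32 339738624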
Example 9
def conv2d_cuda(cfg,
                data,
                kernel,
                strides,
                padding,
                dilation,
                layout='NCHW',
                out_dtype='float32'):
    """Conv2D operator for cuda backend.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width] or
        5-D with shape [batch, ic_chunk, in_height, in_width, ic_block]

    kernel : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
        filter_width, num_filter_block, in_channel_block]

    strides : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of 2 or 4 ints
        padding size, or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        layout of data

    out_dtype: str
        The output type. This is used for mixed precision.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    target = tvm.target.current_target()

    if "cudnn" in target.libs:
        if layout == 'NCHW':
            tensor_format = 0  # CUDNN_TENSOR_NCHW
            N, _, H, W = get_const_tuple(data.shape)
        elif layout == 'NHWC':
            tensor_format = 1  # CUDNN_TENSOR_NHWC
            N, H, W, _ = get_const_tuple(data.shape)
        else:
            raise ValueError("Unsupported layout %s in cudnn" % layout)
        CO, CI, KH, KW = get_const_tuple(kernel.shape)

        # handle dilation
        stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
        dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation

        if isinstance(padding, (list, tuple)) and len(padding) > 2:
            raise ValueError("Cudnn doesn't support asymmetric padding.")
        pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))
        OH = (H + pt + pb - KH) // stride_h + 1
        OW = (W + pl + pr - KW) // stride_w + 1
        cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\
                    ((KW - 1) * dilation_w + 1))

        if data.dtype == "int8" or kernel.dtype == "int8":
            if layout == 'NCHW':
                raise ValueError("NCHW layout do not support int8 in cudnn")
            dtype = "int32"
        else:
            dtype = data.dtype

        return cudnn.conv_forward(
            data,
            kernel,
            [pt, pl],  # cudnn padding pt, pl on both sides of input
            [stride_h, stride_w],
            [dilation_h, dilation_w],
            conv_mode=1,
            tensor_format=tensor_format,
            algo=-1,  # let CUDNN choose the best algo
            conv_dtype=dtype)

    if cfg.template_key == 'winograd':
        return winograd_cuda(cfg,
                             data,
                             kernel,
                             strides,
                             padding,
                             dilation,
                             layout,
                             out_dtype,
                             pre_computed=False)
    if cfg.template_key == 'int8':
        if data.dtype in ('int8', 'uint8'):
            return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding,
                                     dilation, layout, out_dtype)

    if layout == 'NCHW':
        return nn.conv2d_nchw(data, kernel, strides, padding, dilation,
                              out_dtype)
    if layout == 'HWCN':
        return nn.conv2d_hwcn(data, kernel, strides, padding, dilation,
                              out_dtype)
    if layout == 'NHWC':
        return nn.conv2d_nhwc(data, kernel, strides, padding, dilation,
                              out_dtype)
    raise ValueError("not support this layout {} yet".format(layout))
Example 10
File: conv3d.py Project: wenxcs/tvm
def conv3d_cudnn(cfg,
                 data,
                 kernel,
                 strides,
                 padding,
                 dilation,
                 groups,
                 layout="NCDHW",
                 out_dtype="float32"):
    """Conv3D operator for cuda backend.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.te.Tensor
        5-D with shape [batch, in_channel, in_depth, in_height, in_width]

    kernel : tvm.te.Tensor
        5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width]

    strides : int or a list/tuple of three ints
        stride size, or [stride_depth, stride_height, stride_width]

    padding : int or a list/tuple of three ints
        padding size, or [pad_depth, pad_height, pad_width]

    dilation: int or a list/tuple of three ints
        dilation size, or [dilation_depth, dilation_height, dilation_width]

    layout : str
        layout of data

    out_dtype: str
        The output type. This is used for mixed precision.

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
    """
    if layout == "NCDHW":
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, D, H, W = get_const_tuple(data.shape)
    elif layout == "NDHWC":
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, D, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)

    assert groups == 1, "conv3d_cudnn does not support groups"

    # handle dilation
    stride_d, stride_h, stride_w = (
        (strides, strides, strides) if isinstance(strides, int) else strides
    )
    pad_d, pad_h, pad_w = (padding, padding, padding) if isinstance(padding, int) else padding
    dilation_d, dilation_h, dilation_w = (
        (dilation, dilation, dilation) if isinstance(dilation, int) else dilation
    )
    dtype = data.dtype  # conv_dtype below otherwise references an undefined name

    OD = (D + 2 * pad_d - KD) // stride_d + 1
    OH = (H + 2 * pad_h - KH) // stride_h + 1
    OW = (W + 2 * pad_w - KW) // stride_w + 1

    if isinstance(N, int):
        cfg.add_flop(2 * N * OD * OH * OW * CO * CI *
                     ((KD - 1) * dilation_d + 1) *
                     ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1))

    cfg.define_knob(
        "algo",
        range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT")))
    if cfg.is_fallback:
        if cudnn.exists():
            # Let CUDNN choose the best algo, based on benchmarks run
            # on the local machine.  In the future, this should be
            # based on parameters stored in the Target.
            cfg["algo"] = OtherOptionEntity(-1)
        else:
            cfg["algo"] = OtherOptionEntity(0)

    return cudnn.conv_forward(
        data,
        kernel,
        [pad_d, pad_h, pad_w],
        [stride_d, stride_h, stride_w],
        [dilation_d, dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=cfg["algo"].val,
        conv_dtype=dtype,
    )
Example 11
def conv2d_cudnn(
    cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"
):
    """Compute conv2d using CuDNN library"""
    if layout == "NCHW":
        tensor_format = 0  # CUDNN_TENSOR_NCHW
        N, _, H, W = get_const_tuple(data.shape)
    elif layout == "NHWC":
        tensor_format = 1  # CUDNN_TENSOR_NHWC
        N, H, W, _ = get_const_tuple(data.shape)
    else:
        raise ValueError("Unsupported layout %s in cudnn" % layout)
    CO, CI, KH, KW = get_const_tuple(kernel.shape)

    # handle dilation
    stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
    dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
    KH_dilated = (KH - 1) * dilation_h + 1
    KW_dilated = (KW - 1) * dilation_w + 1

    pt, pl, pb, pr = get_pad_tuple(padding, (KH_dilated, KW_dilated))
    if (pt != pb) or (pl != pr):
        raise ValueError("Cudnn doesn't support asymmetric padding.")

    OH = (H + pt + pb - KH) // stride_h + 1
    OW = (W + pl + pr - KW) // stride_w + 1

    if isinstance(N, int):
        cfg.add_flop(
            groups
            * 2
            * N
            * OH
            * OW
            * CO
            * CI
            * ((KH - 1) * dilation_h + 1)
            * ((KW - 1) * dilation_w + 1)
        )

    if data.dtype == "int8" or kernel.dtype == "int8":
        if layout == "NCHW":
            raise ValueError("NCHW layout do not support int8 in cudnn")
        dtype = "int32"
    else:
        dtype = data.dtype

    cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT")))
    if cfg.is_fallback:
        if cudnn.exists():
            # Let CUDNN choose the best algo, based on benchmarks run
            # on the local machine.  In the future, this should be
            # based on parameters stored in the Target.
            cfg["algo"] = OtherOptionEntity(-1)
        else:
            cfg["algo"] = OtherOptionEntity(0)

    return cudnn.conv_forward(
        data,
        kernel,
        [pt, pl],  # cudnn padding pt, pl on both sides of input
        [stride_h, stride_w],
        [dilation_h, dilation_w],
        conv_mode=1,
        tensor_format=tensor_format,
        algo=cfg["algo"].val,
        conv_dtype=dtype,
        groups=groups,
    )
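In this version, get_pad_tuple is fed the dilated kernel extents, presumably so that string paddings such as "SAME" are resolved against the effective receptive field rather than the raw kernel size. A short arithmetic illustration (plain Python, illustrative values):

# Effective (dilated) kernel extent used for padding resolution above.
KH, KW = 3, 3
dilation_h, dilation_w = 2, 2
KH_dilated = (KH - 1) * dilation_h + 1   # 5
KW_dilated = (KW - 1) * dilation_w + 1   # 5
# get_pad_tuple("SAME", (5, 5)) then yields the symmetric padding (2, 2, 2, 2),
# which passes the pt == pb and pl == pr check.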
Example 12
def conv3d_cuda(cfg,
                data,
                kernel,
                strides,
                padding,
                dilation,
                layout='NCDHW',
                out_dtype='float32'):
    """Conv3D operator for cuda backend.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.Tensor
        5-D with shape [batch, in_channel, in_depth, in_height, in_width]

    kernel : tvm.Tensor
        5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width]

    strides : int or a list/tuple of three ints
        stride size, or [stride_depth, stride_height, stride_width]

    padding : int or a list/tuple of 3 or 6 ints
        padding size, or
        [pad_depth, pad_height, pad_width] for 3 ints, or
        [pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right] for 6 ints

    dilation: int or a list/tuple of three ints
        dilation size, or [dilation_depth, dilation_height, dilation_width]

    layout : str
        layout of data

    out_dtype: str
        The output type. This is used for mixed precision.

    Returns
    -------
    output : tvm.Tensor
        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
    """
    target = tvm.target.current_target()

    if "cudnn" in target.libs:
        if layout == 'NCDHW':
            tensor_format = 0  # CUDNN_TENSOR_NCHW
            N, _, D, H, W = get_const_tuple(data.shape)
        elif layout == 'NDHWC':
            tensor_format = 1  # CUDNN_TENSOR_NHWC
            N, D, H, W, _ = get_const_tuple(data.shape)
        else:
            raise ValueError("Unsupported layout %s in cudnn" % layout)
        CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)

        # handle dilation
        stride_d, stride_h, stride_w = (
            (strides, strides, strides) if isinstance(strides, int) else strides
        )
        if isinstance(padding, (list, tuple)) and len(padding) > 3:
            raise ValueError("Cudnn doesn't support asymmetric padding.")
        pf, pt, pl, pk, pb, pr = get_pad_tuple3d(padding, (KD, KH, KW))
        dilation_d, dilation_h, dilation_w = (
            (dilation, dilation, dilation) if isinstance(dilation, int) else dilation
        )

        OD = (D + pf + pk - KD) // stride_d + 1
        OH = (H + pt + pb - KH) // stride_h + 1
        OW = (W + pl + pr - KW) // stride_w + 1
        cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) *\
                    ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1))

        return cudnn.conv_forward(
            data,
            kernel,
            [pf, pt, pl],  # cudnn padding pf, pt, pl on both sides of input
            [stride_d, stride_h, stride_w],
            [dilation_d, dilation_h, dilation_w],
            conv_mode=1,
            tensor_format=tensor_format,
            algo=-1,  # let CUDNN choose the best algo
            conv_dtype=data.dtype)

    if layout == 'NCDHW':
        return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation,
                               out_dtype)
    raise ValueError("not support this layout {} yet".format(layout))