def verify_conv2d(data_dtype, conv_dtype, tensor_format=0): in_channel = 4 out_channel = 16 filter_h = 3 filter_w = 3 pad_h = 1 pad_w = 1 stride_h = 1 stride_w = 1 dilation_h = 1 dilation_w = 1 batch = 3 height = 32 weight = 32 if not tvm.runtime.enabled("cuda"): print("skip because cuda is not enabled...") return if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): print("skip because cudnn is not enabled...") return if tensor_format == 0: xshape = [batch, in_channel, height, weight] wshape = [out_channel, in_channel, filter_h, filter_w] else: xshape = [batch, height, weight, in_channel] wshape = [out_channel, filter_h, filter_w, in_channel] X = tvm.placeholder(xshape, name='X', dtype=data_dtype) W = tvm.placeholder(wshape, name='W', dtype=data_dtype) Y = cudnn.conv_forward(X, W, [pad_h, pad_w], [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, conv_dtype=conv_dtype, algo=-1) yshape = [x.value for x in Y.shape] s = tvm.create_schedule(Y.op) def verify(): ctx = tvm.gpu(0) f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) x = tvm.nd.array(x_np, ctx) w = tvm.nd.array(w_np, ctx) y = tvm.nd.array(y_np, ctx) if tensor_format == 0: c_np = topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1) elif tensor_format == 1: wt = w_np.transpose((1, 2, 3, 0)) #OHWI => HWIO c_np = topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1) f(x, w, y) tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-3) verify()
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): in_channel = 4 out_channel = 16 filter_d = 3 filter_h = 3 filter_w = 3 pad_d = 1 pad_h = 1 pad_w = 1 stride_d = 1 stride_h = 1 stride_w = 1 dilation_d = 1 dilation_h = 1 dilation_w = 1 batch = 3 depth = 32 height = 32 width = 32 if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): print("skip because cudnn is not enabled...") return # schedule xshape = [batch, in_channel, depth, height, width] wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w] X = te.placeholder(xshape, name='X', dtype=data_dtype) W = te.placeholder(wshape, name='W', dtype=data_dtype) Y = cudnn.conv_forward(X, W, [pad_d, pad_h, pad_w], [stride_d, stride_h, stride_w], [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, conv_dtype=conv_dtype, groups=groups) yshape = [x.value for x in Y.shape] s = te.create_schedule(Y.op) # validation ctx = tvm.gpu(0) f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) x = tvm.nd.array(x_np, ctx) w = tvm.nd.array(w_np, ctx) y = tvm.nd.array(y_np, ctx) if tensor_format == 0: c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups) else: raise AssertionError( "For now, conv3d tensor format only support: 0(NCHW)") f(x, w, y) tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=3e-5, rtol=1e-4)
def verify_conv3d(data_dtype, conv_dtype, tensor_format=0, groups=1): in_channel = 4 out_channel = 16 filter_d = 3 filter_h = 3 filter_w = 3 pad_d = 1 pad_h = 1 pad_w = 1 stride_d = 1 stride_h = 1 stride_w = 1 dilation_d = 1 dilation_h = 1 dilation_w = 1 batch = 3 depth = 32 height = 32 width = 32 # schedule xshape = [batch, in_channel, depth, height, width] wshape = [out_channel, in_channel // groups, filter_d, filter_h, filter_w] X = te.placeholder(xshape, name="X", dtype=data_dtype) W = te.placeholder(wshape, name="W", dtype=data_dtype) Y = cudnn.conv_forward( X, W, [pad_d, pad_h, pad_w], [stride_d, stride_h, stride_w], [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, conv_dtype=conv_dtype, groups=groups, ) yshape = [x.value for x in Y.shape] s = te.create_schedule(Y.op) # validation dev = tvm.cuda(0) f = tvm.build(s, [X, W, Y], target="cuda --host=llvm", name="conv3d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) x = tvm.nd.array(x_np, dev) w = tvm.nd.array(w_np, dev) y = tvm.nd.array(y_np, dev) if tensor_format == 0: c_np = tvm.topi.testing.conv3d_ncdhw_python(x_np, w_np, 1, 1, groups) else: raise AssertionError( "For now, conv3d tensor format only support: 0(NCHW)") f(x, w, y) tvm.testing.assert_allclose(y.numpy(), c_np, atol=3e-5, rtol=1e-4)
def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32"): """Compute conv2d using CuDNN library""" if layout == "NCHW": tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) elif layout == "NHWC": tensor_format = 1 # CUDNN_TENSOR_NHWC N, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides dilation_h, dilation_w = (dilation, dilation) if isinstance( dilation, int) else dilation if (isinstance(padding, (list, tuple)) and len(padding) == 4 and (padding[0] != padding[2] or padding[1] != padding[3])): raise ValueError("Cudnn doesn't support asymmetric padding.") pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 cfg.add_flop(groups * 2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) if data.dtype == "int8" or kernel.dtype == "int8": if layout == "NCHW": raise ValueError("NCHW layout do not support int8 in cudnn") dtype = "int32" else: dtype = data.dtype cfg.define_knob("algo", range(8)) if cfg.is_fallback: # Let CUDNN choose the best algo cfg["algo"] = OtherOptionEntity(-1) return cudnn.conv_forward( data, kernel, [pt, pl], # cudnn padding pt, pl on both sides of input [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=cfg["algo"].val, conv_dtype=dtype, groups=groups, )
def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): """Compute conv2d using CuDNN library""" if layout == 'NCHW': tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) elif layout == 'NHWC': tensor_format = 1 # CUDNN_TENSOR_NHWC N, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides dilation_h, dilation_w = (dilation, dilation) if isinstance( dilation, int) else dilation if isinstance(padding, (list, tuple)) and len(padding) == 4 and \ (padding[0] != padding[2] or padding[1] != padding[3]): raise ValueError("Cudnn doesn't support asymmetric padding.") pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * \ ((KW - 1) * dilation_w + 1)) if data.dtype == "int8" or kernel.dtype == "int8": if layout == 'NCHW': raise ValueError("NCHW layout do not support int8 in cudnn") dtype = "int32" else: dtype = data.dtype return cudnn.conv_forward( data, kernel, [pt, pl], # cudnn padding pt, pl on both sides of input [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, # let CUDNN choose the best algo conv_dtype=dtype)
def _lower_conv2d(op: relay.Call, inputs: List[te.Tensor]) -> te.Tensor: """Lower a conv2d using cuDNN.""" return cudnn.conv_forward( inputs[0], inputs[1], pad=op.attrs["padding"], stride=op.attrs["strides"], dilation=op.attrs["dilation"], conv_mode=1, tensor_format=0, algo=1, conv_dtype=op.checked_type.dtype, groups=op.attrs["groups"], )
def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): in_channel = 4 out_channel = 16 filter_h = 3 filter_w = 3 pad_h = 1 pad_w = 1 stride_h = 1 stride_w = 1 dilation_h = 1 dilation_w = 1 batch = 3 height = 32 width = 32 if not tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): print("skip because cudnn is not enabled...") return if data_dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): print("Skip because gpu does not have fp16 support") return # schedule if tensor_format == 0: xshape = [batch, in_channel, height, width] wshape = [out_channel, in_channel // groups, filter_h, filter_w] else: xshape = [batch, height, width, in_channel] wshape = [out_channel, filter_h, filter_w, in_channel // groups] X = te.placeholder(xshape, name='X', dtype=data_dtype) W = te.placeholder(wshape, name='W', dtype=data_dtype) Y = cudnn.conv_forward(X, W, [pad_h, pad_w], [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, conv_dtype=conv_dtype, algo=-1, groups=groups) yshape = [x.value for x in Y.shape] s = te.create_schedule(Y.op) # validation ctx = tvm.gpu(0) f = tvm.build(s, [X, W, Y], "cuda", target_host="llvm", name="conv2d") x_np = np.random.uniform(-1, 1, xshape).astype(data_dtype) w_np = np.random.uniform(-1, 1, wshape).astype(data_dtype) y_np = np.zeros(yshape).astype(data_dtype) x = tvm.nd.array(x_np, ctx) w = tvm.nd.array(w_np, ctx) y = tvm.nd.array(y_np, ctx) if tensor_format == 0: c_np = tvm.topi.testing.conv2d_nchw_python(x_np, w_np, 1, 1, groups=groups) elif tensor_format == 1: wt = w_np.transpose((1, 2, 3, 0)) #OHWI => HWIO c_np = tvm.topi.testing.conv2d_nhwc_python(x_np, wt, 1, 1, groups=groups) f(x, w, y) tvm.testing.assert_allclose(y.asnumpy(), c_np, atol=1e-2, rtol=1e-2)
def conv3d_cudnn(cfg, data, kernel, strides, padding, dilation, layout="NCDHW", out_dtype="float32"): """Conv3D operator for cuda backend. Parameters ---------- cfg: ConfigEntity The config for this template data : tvm.te.Tensor 5-D with shape [batch, in_channel, in_depth, in_height, in_width] kernel : tvm.te.Tensor 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] strides : int or a list/tuple of three ints stride size, or [stride_depth, stride_height, stride_width] padding : int or a list/tuple of three ints padding size, or [pad_depth, pad_height, pad_width] dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] layout : str layout of data out_dtype: str The output type. This is used for mixed precision. Returns ------- output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ if layout == "NCDHW": tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, D, H, W = get_const_tuple(data.shape) elif layout == "NDHWC": tensor_format = 1 # CUDNN_TENSOR_NHWC N, D, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_d, stride_h, stride_w = ((strides, strides, strides) if isinstance( strides, int) else strides) if isinstance(padding, int): pad_d, pad_h, pad_w = (padding, padding, padding) elif isinstance(padding, (list, tuple)) and len(padding) == 6: pad_d, pad_h, pad_w, _, _, _ = padding else: raise ValueError("Cudnn doesn't support asymmetric padding.") dilation_d, dilation_h, dilation_w = ((dilation, dilation, dilation) if isinstance( dilation, int) else dilation) dtype = data.dtype OD = (D + 2 * pad_d - KD) // stride_d + 1 OH = (H + 2 * pad_h - KH) // stride_h + 1 OW = (W + 2 * pad_w - KW) // stride_w + 1 cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) return cudnn.conv_forward( data, kernel, [pad_d, pad_h, pad_w], [stride_d, stride_h, stride_w], [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, # let CUDNN choose the best algo conv_dtype=dtype, )
def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): """Conv2D operator for cuda backend. Parameters ---------- cfg: ConfigEntity The config for this template data : tvm.Tensor 4-D with shape [batch, in_channel, in_height, in_width] or 5-D with shape [batch, ic_chunk, in_height, in_width, ic_block] kernel : tvm.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] or 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, num_filter_block, in_channel_block] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding : int or a list/tuple of 2 or 4 ints padding size, or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 4 ints dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] layout : str layout of data out_dtype: str The output type. This is used for mixed precision. Returns ------- output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ target = tvm.target.current_target() if "cudnn" in target.libs: if layout == 'NCHW': tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) elif layout == 'NHWC': tensor_format = 1 # CUDNN_TENSOR_NHWC N, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides dilation_h, dilation_w = (dilation, dilation) if isinstance( dilation, int) else dilation if isinstance(padding, (list, tuple)) and len(padding) > 2: raise ValueError("Cudnn doesn't support asymmetric padding.") pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ ((KW - 1) * dilation_w + 1)) if data.dtype == "int8" or kernel.dtype == "int8": if layout == 'NCHW': raise ValueError("NCHW layout do not support int8 in cudnn") dtype = "int32" else: dtype = data.dtype return cudnn.conv_forward( data, kernel, [pt, pl], # cudnn padding pt, pl on both sides of input [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, # let CUDNN choose the best algo conv_dtype=dtype) if cfg.template_key == 'winograd': return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed=False) if cfg.template_key == 'int8': if (data.dtype == 'int8' or data.dtype == 'uint8'): return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) if layout == 'HWCN': return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) if layout == 'NHWC': return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) raise ValueError("not support this layout {} yet".format(layout))
def conv3d_cudnn(cfg, data, kernel, strides, padding, dilation, groups, layout="NCDHW", out_dtype="float32"): """Conv3D operator for cuda backend. Parameters ---------- cfg: ConfigEntity The config for this template data : tvm.te.Tensor 5-D with shape [batch, in_channel, in_depth, in_height, in_width] kernel : tvm.te.Tensor 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] strides : int or a list/tuple of three ints stride size, or [stride_depth, stride_height, stride_width] padding : int or a list/tuple of three ints padding size, or [pad_depth, pad_height, pad_width] dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] layout : str layout of data out_dtype: str The output type. This is used for mixed precision. Returns ------- output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ if layout == "NCDHW": tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, D, H, W = get_const_tuple(data.shape) elif layout == "NDHWC": tensor_format = 1 # CUDNN_TENSOR_NHWC N, D, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) assert groups == 1, "conv3d_cudnn does not support groups" # handle dilation stride_d, stride_h, stride_w = ((strides, strides, strides) if isinstance( strides, int) else strides) pad_d, pad_h, pad_w = (padding, padding, padding) if isinstance(padding, int) else padding dilation_d, dilation_h, dilation_w = ((dilation, dilation, dilation) if isinstance( dilation, int) else dilation) OD = (D + 2 * pad_d - KD) // stride_d + 1 OH = (H + 2 * pad_h - KH) // stride_h + 1 OW = (W + 2 * pad_w - KW) // stride_w + 1 if isinstance(N, int): cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) cfg.define_knob( "algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT"))) if cfg.is_fallback: if cudnn.exists(): # Let CUDNN choose the best algo, based on benchmarks run # on the local machine. In the future, this should be # based on parameters stored in the Target. cfg["algo"] = OtherOptionEntity(-1) else: cfg["algo"] = OtherOptionEntity(0) return cudnn.conv_forward( data, kernel, [pad_d, pad_h, pad_w], [stride_d, stride_h, stride_w], [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=cfg["algo"].val, conv_dtype=dtype, )
def conv2d_cudnn( cfg, data, kernel, strides, padding, dilation, groups=1, layout="NCHW", out_dtype="float32" ): """Compute conv2d using CuDNN library""" if layout == "NCHW": tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, H, W = get_const_tuple(data.shape) elif layout == "NHWC": tensor_format = 1 # CUDNN_TENSOR_NHWC N, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation KH_dilated = (KH - 1) * dilation_h + 1 KW_dilated = (KW - 1) * dilation_h + 1 pt, pl, pb, pr = get_pad_tuple(padding, (KH_dilated, KW_dilated)) if (pt != pb) or (pl != pr): raise ValueError("Cudnn doesn't support asymmetric padding.") OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 if isinstance(N, int): cfg.add_flop( groups * 2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1) ) if data.dtype == "int8" or kernel.dtype == "int8": if layout == "NCHW": raise ValueError("NCHW layout do not support int8 in cudnn") dtype = "int32" else: dtype = data.dtype cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT"))) if cfg.is_fallback: if cudnn.exists(): # Let CUDNN choose the best algo, based on benchmarks run # on the local machine. In the future, this should be # based on parameters stored in the Target. cfg["algo"] = OtherOptionEntity(-1) else: cfg["algo"] = OtherOptionEntity(0) return cudnn.conv_forward( data, kernel, [pt, pl], # cudnn padding pt, pl on both sides of input [stride_h, stride_w], [dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=cfg["algo"].val, conv_dtype=dtype, groups=groups, )
def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', out_dtype='float32'): """Conv3D operator for cuda backend. Parameters ---------- cfg: ConfigEntity The config for this template data : tvm.Tensor 5-D with shape [batch, in_channel, in_depth, in_height, in_width] kernel : tvm.Tensor 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] strides : int or a list/tuple of three ints stride size, or [stride_depth, stride_height, stride_width] padding : int or a list/tuple of 3 or 6 ints padding size, or [pad_depth, pad_height, pad_width] for 3 ints, or [pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right] for 6 ints dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] layout : str layout of data out_dtype: str The output type. This is used for mixed precision. Returns ------- output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ target = tvm.target.current_target() if "cudnn" in target.libs: if layout == 'NCDHW': tensor_format = 0 # CUDNN_TENSOR_NCHW N, _, D, H, W = get_const_tuple(data.shape) elif layout == 'NDHWC': tensor_format = 1 # CUDNN_TENSOR_NHWC N, D, H, W, _ = get_const_tuple(data.shape) else: raise ValueError("Unsupported layout %s in cudnn" % layout) CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) # handle dilation stride_d, stride_h, stride_w = (strides, strides, strides) if isinstance(strides, int) \ else strides if isinstance(padding, (list, tuple)) and len(padding) > 3: raise ValueError("Cudnn doesn't support asymmetric padding.") pf, pt, pl, pk, pb, pr = get_pad_tuple3d(padding, (KD, KH, KW)) dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation) if \ isinstance(dilation, int) else dilation OD = (D + pf + pk - KD) // stride_d + 1 OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) *\ ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) return cudnn.conv_forward( data, kernel, [pf, pt, pl], # cudnn padding pt, pl on both sides of input [stride_d, stride_h, stride_w], [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, algo=-1, # let CUDNN choose the best algo conv_dtype=data.dtype) if layout == 'NCDHW': return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) raise ValueError("not support this layout {} yet".format(layout))