def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): """batch_matmul x86 strategy""" strategy = _op.OpStrategy() if is_dynamic(out_type) or is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_batch_matmul( topi.nn.batch_matmul, need_auto_scheduler_layout=True, need_out_dtype=True ), wrap_topi_schedule(topi.generic.nn.schedule_batch_matmul), name="batch_matmul.generic", plevel=10, ) else: strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul, need_out_dtype=True), wrap_topi_schedule(topi.x86.schedule_batch_matmul), name="batch_matmul.x86", plevel=10, ) if "cblas" in target.libs: strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), name="batch_matmul_cblas.x86", plevel=15, ) if "mkl" in target.libs: strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl), wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl), name="batch_matmul_mkl.x86", plevel=15, ) return strategy
def matmul_strategy_cuda(attrs, inputs, out_type, target): """Matmul cuda strategy.""" strategy = _op.OpStrategy() if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.cuda", ) else: logger.warning( "Matmul is not optimized for cuda. Recommend to use cublas for better performance." ) # Temporary use this as a basic schedule strategy.add_implementation( wrap_compute_matmul(topi.gpu.matmul_default), wrap_topi_schedule(topi.gpu.schedule_matmul_default), name="matmul_default.gpu", ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_matmul(topi.cuda.matmul_cublas), wrap_topi_schedule(topi.cuda.schedule_matmul_cublas), name="matmul_cublas.cuda", plevel=25, ) return strategy
def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() m, _ = inputs[0].shape same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[ 1].dtype == "int8" and out_type.dtype == "int32" strategy.add_implementation( wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), name="dense_nopack.x86", plevel=10, ) if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True), naive_schedule, name="dense.generic", plevel=11, ) if "cblas" in target.libs: with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_dense(topi.x86.dense_cblas), wrap_topi_schedule(topi.x86.schedule_dense_cblas), name="dense_cblas.x86", plevel=13, ) if "mkl" in target.libs: with SpecializedCondition( same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_dense(topi.x86.dense_mkl), wrap_topi_schedule(topi.x86.schedule_dense_mkl), name="dense_mkl.x86", plevel=14, ) if "mkldnn" in target.libs: with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_dense(topi.x86.dense_mkldnn), wrap_topi_schedule(topi.x86.schedule_dense_mkldnn), name="dense_mkldnn.x86", plevel=15, ) with SpecializedCondition(m >= 16): # this implementation may not be well-optimized, so use plevel=5 for now. strategy.add_implementation( wrap_compute_dense(topi.x86.dense_pack), wrap_topi_schedule(topi.x86.schedule_dense_pack), name="dense_pack.x86", plevel=5, ) return strategy
def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): """batch_matmul x86 strategy""" strategy = _op.OpStrategy() mcpu = Target.current().mcpu need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() if (not attrs.transpose_a and attrs.transpose_b and target_has_vnni(mcpu) and inputs[0].dtype == "uint8" and inputs[1].dtype == "int8" and inputs[1].shape[-2] % 16 == 0 and inputs[1].shape[-1] % 4 == 0): strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul_vnni_compute, need_out_dtype=True), wrap_topi_schedule(topi.x86.schedule_batch_matmul_vnni), name="batch_matmul_vnni.x86", plevel=10, ) elif is_dynamic( out_type ) or need_auto_scheduler_layout or need_meta_schedule_layout: strategy.add_implementation( wrap_compute_batch_matmul( topi.nn.batch_matmul, need_out_dtype=True, need_auto_scheduler_layout=need_auto_scheduler_layout, need_meta_schedule_layout=need_meta_schedule_layout, ), wrap_topi_schedule(topi.generic.nn.schedule_batch_matmul), name="batch_matmul.generic", plevel=10, ) else: strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul, need_out_dtype=True), wrap_topi_schedule(topi.x86.schedule_batch_matmul), name="batch_matmul.x86", plevel=10, ) if "cblas" in target.libs: strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), name="batch_matmul_cblas.x86", plevel=15, ) if "mkl" in target.libs: strategy.add_implementation( wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl), wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl), name="batch_matmul_mkl.x86", plevel=15, ) return strategy
def dense_strategy_mali(attrs, inputs, out_type, target): """dense mali strategy""" strategy = _op.OpStrategy() if not is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_dense(topi.mali.dense), wrap_topi_schedule(topi.mali.schedule_dense), name="dense.mali", ) else: strategy.add_implementation( wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True), naive_schedule, name="dense.mali", ) return strategy
def conv3d_strategy_cpu(attrs, inputs, out_type, target): """conv3d generic strategy""" strategy = _op.OpStrategy() layout = attrs.data_layout need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() if need_auto_scheduler_layout or need_meta_schedule_layout: # Use auto-scheduler. We should provide clear compute definition without autotvm templates # or packed layouts. if layout == "NCDHW": strategy.add_implementation( wrap_compute_conv3d(topi.nn.conv3d_ncdhw), naive_schedule, name="conv3d_ncdhw.x86", ) elif layout == "NDHWC": strategy.add_implementation( wrap_compute_conv3d( topi.nn.conv3d_ndhwc, need_auto_scheduler_layout=need_auto_scheduler_layout, need_meta_schedule_layout=need_meta_schedule_layout, ), naive_schedule, name="conv3d_ndhwc.x86", ) else: raise ValueError("Not support this layout {} yet".format(layout)) else: # Use autotvm templates if layout == "NCDHW": strategy.add_implementation( wrap_compute_conv3d(topi.x86.conv3d_ncdhw), wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw), name="conv3d_ncdhw.x86", ) elif layout == "NDHWC": strategy.add_implementation( wrap_compute_conv3d(topi.x86.conv3d_ndhwc), wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), name="conv3d_ndhwc.x86", ) else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy
def conv2d_winograd_without_weight_transfrom_strategy_cpu( attrs, inputs, out_type, target): """conv2d_winograd_without_weight_transfrom cpu strategy""" dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") layout = attrs.data_layout strides = attrs.get_int_tuple("strides") assert dilation == (1, 1), "Do not support dilate now" assert strides == (1, 1), "Do not support strides now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() if layout == "NHWC": if need_meta_schedule_layout: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_winograd_nhwc_without_weight_transform, need_auto_scheduler_layout=False, need_meta_schedule_layout=True, ), naive_schedule, name="ansor.winograd", ) elif need_auto_scheduler_layout: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_winograd_nhwc_without_weight_transform, need_auto_scheduler_layout=True, need_meta_schedule_layout=False, ), naive_schedule, name="ansor.winograd", ) else: raise RuntimeError( "Both AutoScheduler and MetaSchedule are not enabled") else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}". format(layout)) return strategy
def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_type, target): """conv2d_winograd_without_weight_transfrom mali strategy""" dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") layout = attrs.data_layout strides = attrs.get_int_tuple("strides") kernel = inputs[1] assert dilation == (1, 1), "Do not support dilate now" assert strides == (1, 1), "Do not support strides now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim" strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.mali", ) elif layout == "NHWC": need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() if need_auto_scheduler_layout or need_meta_schedule_layout: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_winograd_nhwc_without_weight_transform, need_auto_scheduler_layout=need_auto_scheduler_layout, need_meta_schedule_layout=need_meta_schedule_layout, ), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc_winograd_without_weight_transform", plevel=15, ) else: raise RuntimeError( "Winograd conv2d NHWC is not enabled for mali without auto_scheduler." ) else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) ) return strategy
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target): """conv2d_winograd_without_weight_transfrom cuda strategy""" dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") layout = attrs.data_layout data, kernel = inputs stride_h, stride_w = attrs.get_int_tuple("strides") padding = attrs.get_int_tuple("padding") assert dilation == (1, 1), "Do not support dilate now" assert groups == 1, "Do not supoort arbitrary group number" strategy = _op.OpStrategy() if layout == "NCHW": strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform), name="conv2d_nchw_winograd_without_weight_transform.cuda", ) elif layout == "NHWC": N, H, W, _ = get_const_tuple(data.shape) alpha, _, CI, CO = get_const_tuple(kernel.shape) dilation_h, dilation_w = dilation judge_winograd_tensorcore, _, _ = judge_winograd( N, H, W, alpha, alpha, CI, CO, padding, stride_h, stride_w, dilation_h, dilation_w, data.dtype, kernel.dtype, pre_flag=True, ) if ( target.kind.name == "cuda" and nvcc.have_tensorcore(target=target) and judge_winograd_tensorcore ): strategy.add_implementation( wrap_compute_conv2d( topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform ), wrap_topi_schedule( topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform ), name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda", ) else: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform), wrap_topi_schedule( topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform ), name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda", ) if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc_winograd_without_weight_transform", plevel=15, ) else: raise RuntimeError( "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout) ) return strategy
def conv2d_strategy_cuda(attrs, inputs, out_type, target): """conv2d cuda strategy""" strategy = _op.OpStrategy() data, kernel = inputs stride_h, stride_w = attrs.get_int_tuple("strides") dilation_h, dilation_w = attrs.get_int_tuple("dilation") padding = attrs.get_int_tuple("padding") groups = attrs.groups layout = attrs.data_layout kernel_layout = attrs.kernel_layout if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" if ( target.kind.name == "cuda" and data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8") ): assert data.dtype == kernel.dtype strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8), name="conv2d_nchw_int8.cuda", ) else: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), name="conv2d_nchw.cuda", ) _, _, kh, kw = get_const_tuple(kernel.shape) if ( (2 < kh < 8 and 2 < kw < 8 and kh == kw) and (stride_h == 1 and stride_w == 1) and (dilation_h == 1 and dilation_w == 1) ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.cuda", plevel=5, ) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), name="conv2d_hwcn.cuda", ) elif layout == "NHWC" and kernel_layout == "HWIO": strategy.add_implementation( wrap_compute_conv2d(topi.gpu.conv2d_nhwc), wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc), name="conv2d_nhwc.gpu", ) N, H, W, _ = get_const_tuple(data.shape) KH, KW, CI, CO = get_const_tuple(kernel.shape) # Winograd shape related judgment ( judge_winograd_tensorcore, judge_winograd_autotvm, judge_winograd_auto_scheduler, ) = judge_winograd( N, H, W, KH, KW, CI, CO, padding, stride_h, stride_w, dilation_h, dilation_w, data.dtype, kernel.dtype, pre_flag=False, ) if judge_winograd_autotvm: if ( target.kind.name == "cuda" and nvcc.have_tensorcore(target=target) and judge_winograd_tensorcore ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore), wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore), name="conv2d_nhwc_winograd_tensorcore.cuda", plevel=5, ) else: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct), wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct), name="conv2d_nhwc_winograd_direct.cuda", plevel=5, ) if ( target.kind.name == "cuda" and nvcc.have_tensorcore(target=target) and ( (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) ) ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore), wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore), name="conv2d_nhwc_tensorcore.cuda", plevel=20, ) # register auto-scheduler implementations if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler: strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc.winograd", plevel=15, ) elif layout == "HWNC": assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"] _, _, N, in_channels = get_const_tuple(data.shape) pre_computed = len(kernel.shape) == 6 if pre_computed: _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape) out_channels = oc_chunk * oc_block_factor else: _, _, out_channels, _ = get_const_tuple(kernel.shape) tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"] if ( target.kind.name == "cuda" and nvcc.have_tensorcore(target=target) and kernel.dtype in tensorcore_dtypes and ( ( data.dtype in ["int4", "uint4"] and N % 8 == 0 and in_channels % 32 == 0 and out_channels % 8 == 0 ) or ( data.dtype in ["int8", "uint8"] and N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0 ) ) ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore), wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore), name="conv2d_hwnc_tensorcore_direct.cuda", plevel=20, ) else: raise RuntimeError( "Unsupported shape for conv2d HWNC.\ Need to satisfy tensor core schedule." ) elif target.kind.name == "cuda" and layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.cuda", ) elif target.kind.name == "cuda" and "cudnn" not in target.libs: # No TVM native kernel applicable raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) if ( target.kind.name == "cuda" and "cudnn" in target.libs and layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and padding[1] == padding[3] ): # add cudnn implementation if layout == "NHWC": assert kernel_layout == "OHWI" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", plevel=25, ) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups) and ( layout == "NCHW" or "cudnn" not in target.libs ): # cuDNN requires a different kernel layout for NHWC inputs. if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda", ) elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda", ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d # add cudnn implementation, if any cudnn_impl = False if target.kind.name == "cuda" and "cudnn" in target.libs: if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and padding[1] == padding[3]: strategy.add_implementation( wrap_compute_conv2d( topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True ), wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn), name="conv2d_cudnn.cuda", plevel=25, ) cudnn_impl = True if layout == "NCHW": assert kernel_layout == "OIHW" _, channels, _, _ = get_const_tuple(data.shape) out_channels, in_channels, _, _ = get_const_tuple(kernel.shape) oc_chunk = out_channels // 4 ic_chunk = in_channels // 4 if ( target.kind.name == "cuda" and data.dtype in ["int8", "uint8"] and kernel.dtype in ["int8", "uint8"] and channels % groups == 0 and out_channels % groups == 0 and channels % 4 == 0 and out_channels % 4 == 0 and groups <= oc_chunk and groups <= ic_chunk ): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw_int8), name="group_conv2d_nchw_int8.cuda", ) else: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), name="group_conv2d_nchw.cuda", ) elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), name="group_conv2d_NCHWc_int8.cuda", ) elif not cudnn_impl: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy
def conv2d_strategy_rocm(attrs, inputs, out_type, target): """conv2d rocm strategy""" strategy = _op.OpStrategy() data, kernel = inputs dilation_h, dilation_w = attrs.get_int_tuple("dilation") groups = attrs.groups layout = attrs.data_layout stride_h, stride_w = attrs.get_int_tuple("strides") kernel_layout = attrs.kernel_layout padding = attrs.get_int_tuple("padding") if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") if groups == 1: if layout == "NCHW": # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), name="conv2d_nchw.cuda", ) _, _, kh, kw = get_const_tuple(kernel.shape) if (2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and dilation_h == 1 and dilation_w == 1): strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), wrap_topi_schedule( topi.cuda.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.cuda", plevel=5, ) elif layout == "NHWC": assert kernel_layout == "HWIO" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), name="conv2d_nhwc.cuda", ) N, H, W, _ = get_const_tuple(data.shape) KH, KW, CI, CO = get_const_tuple(kernel.shape) ( _, judge_winograd_autotvm, judge_winograd_auto_scheduler, ) = judge_winograd( N, H, W, KH, KW, CI, CO, padding, stride_h, stride_w, dilation_h, dilation_w, data.dtype, kernel.dtype, pre_flag=False, ) if judge_winograd_autotvm: strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct), wrap_topi_schedule( topi.cuda.schedule_conv2d_nhwc_winograd_direct), name="conv2d_nhwc_winograd_direct.cuda", plevel=5, ) if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler: strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc.winograd", plevel=15, ) elif layout == "HWCN": assert kernel_layout == "HWIO" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_hwcn), wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), name="conv2d_hwcn.cuda", ) elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), name="conv2d_NCHWc_int8.cuda", ) else: raise RuntimeError( "Unsupported conv2d layout {} for CUDA".format(layout)) # add miopen implementation if ("miopen" in target.libs and layout == "NCHW" and padding[0] == padding[2] and padding[1] == padding[3]): strategy.add_implementation( wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), name="conv2d_nchw_miopen.rocm", plevel=15, ) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.cuda", ) elif layout == "NHWC": assert kernel_layout == "HWOI" strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.cuda", ) else: raise RuntimeError( "Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d if layout == "NCHW": # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw), name="group_conv2d_nchw.cuda", ) elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: assert kernel_layout == "OIHW4o4i" strategy.add_implementation( wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True), wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8), name="group_conv2d_NCHWc_int8.cuda", ) else: raise RuntimeError( "Unsupported group_conv2d layout {}".format(layout)) return strategy
def matmul_strategy_cpu(attrs, inputs, out_type, target): """matmul x86 strategy""" strategy = _op.OpStrategy() same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" if "cblas" in target.libs: length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_cblas), wrap_topi_schedule(topi.x86.schedule_matmul_cblas), name="matmul_cblas.x86", plevel=13, ) length_after = len(strategy.specializations) if strategy.specializations else 0 if length_before == length_after: logger.warning( "Currently cblas only support the data type to be float32 or float64. Skip." ) if "mkl" in target.libs: length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkl), wrap_topi_schedule(topi.x86.schedule_matmul_mkl), name="matmul_mkl.x86", plevel=14, ) length_after = len(strategy.specializations) if strategy.specializations else 0 if length_before == length_after: logger.warning( "Currently mkl only support the data type to be float32, float64 or input with " "uint8 and int8 while output wiht int32. Skip." ) if "mkldnn" in target.libs: length_before = len(strategy.specializations) if strategy.specializations else 0 with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_matmul(topi.x86.matmul_mkldnn), wrap_topi_schedule(topi.x86.schedule_matmul_mkldnn), name="matmul_mkldnn.x86", plevel=15, ) length_after = len(strategy.specializations) if strategy.specializations else 0 if length_before == length_after: logger.warning("Currently mkldnn only support the data type to be float32. Skip.") if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True), naive_schedule, name="matmul.generic", plevel=11, ) else: # If no cblas/mkl/mkldnn strategy choosed if not strategy.specializations: logger.warning( "Matmul is not optimized for x86. " "Recommend to use cblas/mkl/mkldnn for better performance." ) strategy.add_implementation( wrap_compute_matmul(topi.nn.matmul), naive_schedule, name="matmul.generic", ) return strategy
def conv2d_strategy_cpu(attrs, inputs, out_type, target): """conv2d x86 strategy""" strategy = _op.OpStrategy() data, kernel = inputs stride_h, stride_w = get_const_tuple(attrs.strides) dilation_h, dilation_w = get_const_tuple(attrs.dilation) groups = attrs.groups layout = attrs.data_layout kernel_layout = attrs.kernel_layout if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw_int8), wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8), name="conv2d_nchw_int8.x86", ) else: strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_conv2d_nchw), name="conv2d_nchw.x86", ) elif _NCHWc_matcher.match(layout): # check if layout is NCHWxc assert _OIHWio_matcher.match(kernel_layout) # check if kernel is OIHWio return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("conv2d NHWC layout is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), name="conv2d_nhwc.x86", ) judge_winograd_auto_scheduler = False if len(kernel.shape) == 4: kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape) judge_winograd_auto_scheduler = ( "float" in data.dtype and "float" in kernel.dtype and kernel_h == 3 and kernel_w == 3 and stride_h == 1 and stride_w == 1 and dilation_h == 1 and dilation_w == 1 and 64 < co < 512 # The last condition of co is based on our profiling of resnet workloads # on skylake avx512 machines. We found winograd is faster than direct # only when co is within this range ) # register auto-scheduler implementations if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=True ), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc.winograd", plevel=15, ) elif layout == "HWCN": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("conv2d HWCN layout is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic", ) else: raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" channel_multiplier = get_const_tuple(inputs[1].shape)[1] if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1: strategy.add_implementation( wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.x86", ) else: logger.warning( "For x86 target, depthwise_conv2d with channel " "multiplier greater than 1 is not optimized" ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.generic", ) elif _NCHWc_matcher.match(layout): # check if layout is NCHWxc assert _OIHWio_matcher.match(kernel_layout) # check if kernel is OIHWio return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWOI" if not is_auto_scheduler_enabled(): logger.warning( "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm." ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic", ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw), name="group_conv2d_nchw.x86", ) elif layout == "NHWC": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), name="group_conv2d_nhwc.generic", ) else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy
def conv2d_strategy_mali(attrs, inputs, out_type, target): """conv2d mali strategy""" strategy = _op.OpStrategy() data, kernel = inputs dilation_h, dilation_w = attrs.get_int_tuple("dilation") stride_h, stride_w = attrs.get_int_tuple("strides") groups = attrs.groups layout = attrs.data_layout kernel_layout = attrs.kernel_layout if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") if groups == 1: if layout == "NCHW": if kernel_layout == "OIHW": strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), name="conv2d_nchw_spatial_pack.mali", ) # check if winograd algorithm is applicable _, _, kh, kw = get_const_tuple(kernel.shape) if ( kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and dilation_h == 1 and dilation_w == 1 ): strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd), name="conv2d_nchw_winograd.mali", plevel=5, ) elif re.match(r"OIHW\d*o", kernel_layout): strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack), wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack), name="conv2d_nchw_spatial_pack.mali", ) else: raise RuntimeError( "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout) ) elif layout == "NHWC": assert kernel_layout == "HWIO" need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() if need_auto_scheduler_layout or need_meta_schedule_layout: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_nhwc, need_auto_scheduler_layout=need_auto_scheduler_layout, need_meta_schedule_layout=need_meta_schedule_layout, ), naive_schedule, name="conv2d_nhwc.mali", ) is_winograd_applicable = False if len(kernel.shape) == 4: kernel_h, kernel_w, _, _ = get_const_tuple(kernel.shape) is_winograd_applicable = ( "float" in data.dtype and "float" in kernel.dtype and kernel_h == 3 and kernel_w == 3 and stride_h == 1 and stride_w == 1 and dilation_h == 1 and dilation_w == 1 ) if is_winograd_applicable: if need_meta_schedule_layout: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=False, need_meta_schedule_layout=True, ), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc.winograd", plevel=15, ) elif need_auto_scheduler_layout: strategy.add_implementation( wrap_compute_conv2d( topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=True, need_meta_schedule_layout=False, ), naive_schedule, # this implementation should never be picked by autotvm name="conv2d_nhwc.winograd", plevel=15, ) else: raise RuntimeError("Both AutoScheduler and MetaSchedule are not enabled") else: strategy.add_implementation( wrap_compute_conv2d(topi.mali.conv2d_nhwc_spatial_pack), wrap_topi_schedule(topi.mali.schedule_conv2d_nhwc_spatial_pack), name="conv2d_nhwc_spatial_pack.mali", ) else: raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" strategy.add_implementation( wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw), wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.mali", ) elif layout == "NHWC": assert kernel_layout == "HWOI" if is_auto_scheduler_enabled(): strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), naive_schedule, name="depthwise_conv2d_nhwc.mali", ) elif is_meta_schedule_enabled(): strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), naive_schedule, name="depthwise_conv2d_nhwc.mali", ) else: strategy.add_implementation( wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc), wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.mali", ) else: raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout)) else: # group_conv2d raise RuntimeError("group_conv2d is not supported for mali") return strategy
def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[ 1].dtype == "int8" and out_type.dtype == "int32" strategy.add_implementation( wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), name="dense_nopack.x86", plevel=5, ) strategy.add_implementation( wrap_compute_dense(topi.x86.dense_pack), wrap_topi_schedule(topi.x86.schedule_dense_pack), name="dense_pack.x86", plevel=10, ) need_auto_scheduler_layout = is_auto_scheduler_enabled() need_meta_schedule_layout = is_meta_schedule_enabled() if need_auto_scheduler_layout or need_meta_schedule_layout: strategy.add_implementation( wrap_compute_dense( topi.nn.dense, need_auto_scheduler_layout=need_auto_scheduler_layout, need_meta_schedule_layout=need_meta_schedule_layout, ), naive_schedule, name="dense.generic", plevel=11, ) if "cblas" in target.libs: with SpecializedCondition(same_type and dtype in ["float32", "float64"]): strategy.add_implementation( wrap_compute_dense(topi.x86.dense_cblas), wrap_topi_schedule(topi.x86.schedule_dense_cblas), name="dense_cblas.x86", plevel=13, ) if "mkl" in target.libs: with SpecializedCondition( same_type and dtype in ["float32", "float64"] or u8s8s32): strategy.add_implementation( wrap_compute_dense(topi.x86.dense_mkl), wrap_topi_schedule(topi.x86.schedule_dense_mkl), name="dense_mkl.x86", plevel=14, ) if "dnnl" in target.libs: with SpecializedCondition(same_type and dtype == "float32"): strategy.add_implementation( wrap_compute_dense(topi.x86.dense_dnnl), wrap_topi_schedule(topi.x86.schedule_dense_dnnl), name="dense_dnnl.x86", plevel=15, ) return strategy
def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): """Select the best implementation from the op strategy. If use_autotvm is True, it'll first try to find the best implementation based on AutoTVM profile results. If no AutoTVM profile result is found, it'll choose the implementation with highest plevel. If use_autotvm is False, it'll directly choose the implementation with highest plevel. Note that this function doesn't support op with symbolic input shapes. Parameters ---------- op : tvm.ir.Op Relay operator. attrs : object The op attribute. inputs : List[tvm.te.Tensor] Input tensors to the op. out_type : relay.Type The output type. target : tvm.target.Target The target to compile the op. use_autotvm : bool Whether query AutoTVM to pick the best. Returns ------- ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) The best op implementation and the corresponding output tensors. """ all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) if len(all_impls) == 0: raise RuntimeError(f"No valid {op} implementations for {target}") best_plevel_impl = max(all_impls, key=lambda x: x.plevel) # Disable autotvm if auto_scheduler is enabled. # (i.e., always return the implementation with the highest priority for auto-scheduler). if is_auto_scheduler_enabled() or is_meta_schedule_dispatch_enabled(): use_autotvm = False # If not use autotvm, always return the implementation with the highest priority if not use_autotvm: logger.info( "Using %s for %s based on highest priority (%d)", best_plevel_impl.name, op.name, best_plevel_impl.plevel, ) outs = best_plevel_impl.compute(attrs, inputs, out_type) return best_plevel_impl, outs # Otherwise, try autotvm templates outputs = {} workloads = {} best_autotvm_impl = None best_cfg = None dispatch_ctx = autotvm.task.DispatchContext.current old_silent = autotvm.GLOBAL_SCOPE.silent autotvm.GLOBAL_SCOPE.silent = True for impl in all_impls: outs = impl.compute(attrs, inputs, out_type) outputs[impl] = outs workload = autotvm.task.get_workload(outs) workloads[impl] = workload if workload is None: # Not an AutoTVM tunable implementation continue cfg = dispatch_ctx.query(target, workload) if cfg.is_fallback: # Skip fallback config continue logger.info("Implementation %s for %s has cost %.2e", impl.name, op.name, cfg.cost) if best_cfg is None or best_cfg.cost > cfg.cost: best_autotvm_impl = impl best_cfg = cfg autotvm.GLOBAL_SCOPE.silent = old_silent if best_autotvm_impl: # The best autotvm implementation definitely doesn't use fallback config logger.info( "Using %s for %s based on lowest cost (%.2e)", best_autotvm_impl.name, op.name, best_cfg.cost, ) return best_autotvm_impl, outputs[best_autotvm_impl] # Use the implementation with highest plevel if workloads[best_plevel_impl] is not None: msg = ( "Cannot find tuning records for:\n target=%s\n key=%s\n" "TVM will apply a default schedule which may negatively impact performance." % (target, workloads[best_plevel_impl])) if (not autotvm.env.GLOBAL_SCOPE.silent and msg not in autotvm.task.DispatchContext.warning_messages): autotvm.task.DispatchContext.warning_messages.add(msg) global _first_warning if _first_warning: _first_warning = False info_msg = ( "One or more operators have not been tuned. Please tune your model " "for better performance. Use DEBUG logging level to see more details." ) autotvm_logger.warning(info_msg) autotvm_logger.debug(msg) logger.info( "Using %s for %s based on highest priority (%s)", best_plevel_impl.name, op.name, best_plevel_impl.plevel, ) return best_plevel_impl, outputs[best_plevel_impl]
def conv2d_strategy_cpu(attrs, inputs, out_type, target): """conv2d x86 strategy""" strategy = _op.OpStrategy() data, kernel = inputs dilation_h, dilation_w = get_const_tuple(attrs.dilation) groups = attrs.groups layout = attrs.data_layout kernel_layout = attrs.kernel_layout if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") if groups == 1: if layout == "NCHW": assert kernel_layout == "OIHW" if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw_int8), wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8), name="conv2d_nchw_int8.x86", ) else: strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw), wrap_topi_schedule(topi.x86.schedule_conv2d_nchw), name="conv2d_nchw.x86", ) elif _NCHWc_matcher.match(layout): # check if layout is NCHWxc assert _OIHWio_matcher.match( kernel_layout) # check if kernel is OIHWio return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning( "conv2d NHWC layout is not optimized for x86 with autotvm." ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), name="conv2d_nhwc.x86", ) elif layout == "HWCN": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning( "conv2d HWCN layout is not optimized for x86 with autotvm." ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), name="conv2d_hwcn.generic", ) else: raise RuntimeError( "Unsupported conv2d layout {} for x86".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): if layout == "NCHW": assert kernel_layout == "OIHW" channel_multiplier = get_const_tuple(inputs[1].shape)[1] if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1: strategy.add_implementation( wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), wrap_topi_schedule( topi.x86.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.x86", ) else: logger.warning("For x86 target, depthwise_conv2d with channel " "multiplier greater than 1 is not optimized") strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), wrap_topi_schedule( topi.generic.schedule_depthwise_conv2d_nchw), name="depthwise_conv2d_nchw.generic", ) elif _NCHWc_matcher.match(layout): # check if layout is NCHWxc assert _OIHWio_matcher.match( kernel_layout) # check if kernel is OIHWio return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWOI" if not is_auto_scheduler_enabled(): logger.warning( "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm." ) strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), wrap_topi_schedule( topi.generic.schedule_depthwise_conv2d_nhwc), name="depthwise_conv2d_nhwc.generic", ) else: raise RuntimeError( "Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d if layout == "NCHW": assert kernel_layout == "OIHW" if not is_auto_scheduler_enabled(): logger.warning( "group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), name="group_conv2d_nchw.generic", ) elif layout == "NHWC": assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning( "group_conv2d is not optimized for x86 with autotvm.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True), wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc), name="group_conv2d_nhwc.generic", ) else: raise RuntimeError( "Unsupported group_conv2d layout {}".format(layout)) return strategy