Example #1
File: x86.py Project: saudet/tvm
def batch_matmul_strategy_cpu(attrs, inputs, out_type, target):
    """batch_matmul x86 strategy"""
    strategy = _op.OpStrategy()
    if is_dynamic(out_type) or is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_batch_matmul(
                topi.nn.batch_matmul, need_auto_scheduler_layout=True, need_out_dtype=True
            ),
            wrap_topi_schedule(topi.generic.nn.schedule_batch_matmul),
            name="batch_matmul.generic",
            plevel=10,
        )
    else:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul, need_out_dtype=True),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul),
            name="batch_matmul.x86",
            plevel=10,
        )
    if "cblas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas),
            name="batch_matmul_cblas.x86",
            plevel=15,
        )
    if "mkl" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl),
            name="batch_matmul_mkl.x86",
            plevel=15,
        )
    return strategy
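
A note on how a strategy function like the one above gets used: it is not called directly, but registered against the generic batch_matmul strategy so Relay picks it when compiling for an x86/LLVM target. The sketch below assumes TVM's usual relay strategy registration decorator; on a live TVM install the "cpu" key is already registered by this very function, so treat it as an illustration of the pattern rather than something to execute on top of an existing build.

from tvm.relay.op import strategy as _strategy

# Sketch (assumed registration pattern): attach the x86-specific strategy to
# the generic batch_matmul strategy for targets keyed as "cpu".
@_strategy.batch_matmul_strategy.register("cpu")
def batch_matmul_strategy_cpu(attrs, inputs, out_type, target):
    ...  # body as in the example above
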
Example #2
def matmul_strategy_cuda(attrs, inputs, out_type, target):
    """Matmul cuda strategy."""
    strategy = _op.OpStrategy()

    if is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_matmul(topi.nn.matmul),
            naive_schedule,
            name="matmul.cuda",
        )
    else:
        logger.warning(
            "Matmul is not optimized for cuda. Recommend to use cublas for better performance."
        )
        # Temporarily use this as a basic schedule
        strategy.add_implementation(
            wrap_compute_matmul(topi.gpu.matmul_default),
            wrap_topi_schedule(topi.gpu.schedule_matmul_default),
            name="matmul_default.gpu",
        )

    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_matmul(topi.cuda.matmul_cublas),
            wrap_topi_schedule(topi.cuda.schedule_matmul_cublas),
            name="matmul_cublas.cuda",
            plevel=25,
        )
    return strategy
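
For the cublas branch above to be reachable, "cublas" has to appear in target.libs. A minimal sketch of a target where that holds, assuming the standard -libs target option:

import tvm

# Sketch: a CUDA target whose libs include cublas, so matmul_cublas.cuda
# (plevel=25) is added and outranks the default matmul implementation.
target = tvm.target.Target("cuda -libs=cublas")
assert "cublas" in target.libs
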
Example #3
File: x86.py Project: jiajuns/tvm
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""
    strategy = _op.OpStrategy()
    m, _ = inputs[0].shape
    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[
        1].dtype == "int8" and out_type.dtype == "int32"
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_nopack),
        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
        name="dense_nopack.x86",
        plevel=10,
    )

    if is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True),
            naive_schedule,
            name="dense.generic",
            plevel=11,
        )

    if "cblas" in target.libs:
        with SpecializedCondition(same_type
                                  and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_cblas),
                wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                name="dense_cblas.x86",
                plevel=13,
            )
    if "mkl" in target.libs:
        with SpecializedCondition(
                same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkl),
                wrap_topi_schedule(topi.x86.schedule_dense_mkl),
                name="dense_mkl.x86",
                plevel=14,
            )
    if "mkldnn" in target.libs:
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkldnn),
                wrap_topi_schedule(topi.x86.schedule_dense_mkldnn),
                name="dense_mkldnn.x86",
                plevel=15,
            )
    with SpecializedCondition(m >= 16):
        # this implementation may not be well-optimized, so use plevel=5 for now.
        strategy.add_implementation(
            wrap_compute_dense(topi.x86.dense_pack),
            wrap_topi_schedule(topi.x86.schedule_dense_pack),
            name="dense_pack.x86",
            plevel=5,
        )
    return strategy
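
SpecializedCondition in the snippet above is a context manager: implementations added inside the with block are recorded as specializations that only apply when the condition holds, which is how one strategy carries dtype- and shape-dependent alternatives side by side. A minimal sketch of the scoping pattern, assuming tvm.te.SpecializedCondition as used in this file:

from tvm import te
from tvm.te import SpecializedCondition

# Sketch: gate an implementation on a symbolic shape condition, as the
# dense_pack.x86 branch above does with m >= 16.
m = te.var("m")
with SpecializedCondition(m >= 16):
    # strategy.add_implementation(...) issued here would only be valid
    # for workloads where m >= 16
    pass
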
Example #4
def batch_matmul_strategy_cpu(attrs, inputs, out_type, target):
    """batch_matmul x86 strategy"""
    strategy = _op.OpStrategy()
    mcpu = Target.current().mcpu

    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()

    if (not attrs.transpose_a and attrs.transpose_b and target_has_vnni(mcpu)
            and inputs[0].dtype == "uint8" and inputs[1].dtype == "int8"
            and inputs[1].shape[-2] % 16 == 0
            and inputs[1].shape[-1] % 4 == 0):
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_vnni_compute,
                                      need_out_dtype=True),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_vnni),
            name="batch_matmul_vnni.x86",
            plevel=10,
        )
    elif (is_dynamic(out_type) or need_auto_scheduler_layout
          or need_meta_schedule_layout):
        strategy.add_implementation(
            wrap_compute_batch_matmul(
                topi.nn.batch_matmul,
                need_out_dtype=True,
                need_auto_scheduler_layout=need_auto_scheduler_layout,
                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
            wrap_topi_schedule(topi.generic.nn.schedule_batch_matmul),
            name="batch_matmul.generic",
            plevel=10,
        )
    else:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul,
                                      need_out_dtype=True),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul),
            name="batch_matmul.x86",
            plevel=10,
        )
    if "cblas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas),
            name="batch_matmul_cblas.x86",
            plevel=15,
        )
    if "mkl" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.x86.batch_matmul_mkl),
            wrap_topi_schedule(topi.x86.schedule_batch_matmul_mkl),
            name="batch_matmul_mkl.x86",
            plevel=15,
        )
    return strategy
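
The VNNI branch above only fires when the current target's mcpu reports VNNI support. A hedged sketch of a target where target_has_vnni(mcpu) can succeed; cascadelake is just one example of a VNNI-capable CPU:

import tvm

# Sketch: an LLVM target whose mcpu supports VNNI, enabling the
# batch_matmul_vnni.x86 path for uint8 x int8 inputs.
target = tvm.target.Target("llvm -mcpu=cascadelake")
print(target.mcpu)  # "cascadelake"
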
Example #5
def dense_strategy_mali(attrs, inputs, out_type, target):
    """dense mali strategy"""
    strategy = _op.OpStrategy()
    if not is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_dense(topi.mali.dense),
            wrap_topi_schedule(topi.mali.schedule_dense),
            name="dense.mali",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True),
            naive_schedule,
            name="dense.mali",
        )
    return strategy
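
is_auto_scheduler_enabled() in these strategies reflects a compile-time flag rather than anything on the target itself. A minimal sketch of turning that flag on when building a model; the config key is the one documented for TVM's auto-scheduler, and without tuning records the build simply falls back to the naive schedules registered above:

import tvm
from tvm import relay

# Sketch: a tiny dense workload compiled with the auto-scheduler flag set,
# which is what is_auto_scheduler_enabled() checks during lowering.
x = relay.var("x", shape=(1, 16), dtype="float32")
w = relay.var("w", shape=(8, 16), dtype="float32")
mod = tvm.IRModule.from_expr(relay.nn.dense(x, w))

with tvm.transform.PassContext(
    opt_level=3, config={"relay.backend.use_auto_scheduler": True}
):
    lib = relay.build(mod, target="llvm")
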
Example #6
def conv3d_strategy_cpu(attrs, inputs, out_type, target):
    """conv3d generic strategy"""
    strategy = _op.OpStrategy()
    layout = attrs.data_layout
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if need_auto_scheduler_layout or need_meta_schedule_layout:
        # Use auto-scheduler. We should provide clear compute definition without autotvm templates
        # or packed layouts.
        if layout == "NCDHW":
            strategy.add_implementation(
                wrap_compute_conv3d(topi.nn.conv3d_ncdhw),
                naive_schedule,
                name="conv3d_ncdhw.x86",
            )
        elif layout == "NDHWC":
            strategy.add_implementation(
                wrap_compute_conv3d(
                    topi.nn.conv3d_ndhwc,
                    need_auto_scheduler_layout=need_auto_scheduler_layout,
                    need_meta_schedule_layout=need_meta_schedule_layout,
                ),
                naive_schedule,
                name="conv3d_ndhwc.x86",
            )
        else:
            raise ValueError("Not support this layout {} yet".format(layout))
    else:
        # Use autotvm templates
        if layout == "NCDHW":
            strategy.add_implementation(
                wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
                wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
                name="conv3d_ncdhw.x86",
            )
        elif layout == "NDHWC":
            strategy.add_implementation(
                wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
                wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
                name="conv3d_ndhwc.x86",
            )
        else:
            raise ValueError("Not support this layout {} yet".format(layout))
    return strategy
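
wrap_topi_schedule, used throughout these examples, adapts a plain TOPI schedule function (which only takes the output tensors) to the (attrs, outs, target) callback signature that add_implementation expects. A minimal sketch of what such a wrapper looks like, written as an assumption about its shape rather than a copy of TVM's helper:

# Sketch (assumed helper shape): adapt a TOPI schedule to the fschedule
# callback signature used by OpStrategy.add_implementation.
def wrap_topi_schedule(topi_schedule):
    def wrapper(attrs, outs, target):
        # enter the target scope so the schedule is built for that target
        with target:
            return topi_schedule(outs)
    return wrapper
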
Example #7
def conv2d_winograd_without_weight_transfrom_strategy_cpu(
        attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cpu strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    strides = attrs.get_int_tuple("strides")
    assert dilation == (1, 1), "Do not support dilation now"
    assert strides == (1, 1), "Do not support strides now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()
    if layout == "NHWC":
        if need_meta_schedule_layout:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
                    need_auto_scheduler_layout=False,
                    need_meta_schedule_layout=True,
                ),
                naive_schedule,
                name="ansor.winograd",
            )
        elif need_auto_scheduler_layout:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
                    need_auto_scheduler_layout=True,
                    need_meta_schedule_layout=False,
                ),
                naive_schedule,
                name="ansor.winograd",
            )
        else:
            raise RuntimeError(
                "Both AutoScheduler and MetaSchedule are not enabled")
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".
            format(layout))
    return strategy
Example #8
def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom mali strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    strides = attrs.get_int_tuple("strides")
    kernel = inputs[1]
    assert dilation == (1, 1), "Do not support dilation now"
    assert strides == (1, 1), "Do not support strides now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim"
        strategy.add_implementation(
            wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
            wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
            name="conv2d_nchw_winograd.mali",
        )
    elif layout == "NHWC":
        need_auto_scheduler_layout = is_auto_scheduler_enabled()
        need_meta_schedule_layout = is_meta_schedule_enabled()
        if need_auto_scheduler_layout or need_meta_schedule_layout:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc_without_weight_transform,
                    need_auto_scheduler_layout=need_auto_scheduler_layout,
                    need_meta_schedule_layout=need_meta_schedule_layout,
                ),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
        else:
            raise RuntimeError(
                "Winograd conv2d NHWC is not enabled for mali without auto_scheduler."
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
Example #9
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilation now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (
            target.kind.name == "cuda"
            and nvcc.have_tensorcore(target=target)
            and judge_winograd_tensorcore
        ):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )

        if is_auto_scheduler_enabled():
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
Example #10
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if (
                target.kind.name == "cuda"
                and data.dtype in ("int8", "uint8")
                and kernel.dtype in ("int8", "uint8")
            ):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if (
                (2 < kh < 8 and 2 < kw < 8 and kh == kw)
                and (stride_h == 1 and stride_w == 1)
                and (dilation_h == 1 and dilation_w == 1)
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC" and kernel_layout == "HWIO":
            strategy.add_implementation(
                wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
                wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
                name="conv2d_nhwc.gpu",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (
                    target.kind.name == "cuda"
                    and nvcc.have_tensorcore(target=target)
                    and judge_winograd_tensorcore
                ):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (
                target.kind.name == "cuda"
                and nvcc.have_tensorcore(target=target)
                and (
                    (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
                    or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
                    or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
                )
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )

        elif layout == "HWNC":
            assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if (
                target.kind.name == "cuda"
                and nvcc.have_tensorcore(target=target)
                and kernel.dtype in tensorcore_dtypes
                and (
                    (
                        data.dtype in ["int4", "uint4"]
                        and N % 8 == 0
                        and in_channels % 32 == 0
                        and out_channels % 8 == 0
                    )
                    or (
                        data.dtype in ["int8", "uint8"]
                        and N % 8 == 0
                        and in_channels % 16 == 0
                        and out_channels % 32 == 0
                    )
                )
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError(
                    "Unsupported shape for conv2d HWNC. "
                    "Need to satisfy tensor core schedule."
                )
        elif target.kind.name == "cuda" and layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        elif target.kind.name == "cuda" and "cudnn" not in target.libs:
            # No TVM native kernel applicable
            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))

        if (
            target.kind.name == "cuda"
            and "cudnn" in target.libs
            and layout in ["NCHW", "NHWC"]
            and padding[0] == padding[2]
            and padding[1] == padding[3]
        ):
            # add cudnn implementation
            if layout == "NHWC":
                assert kernel_layout == "OHWI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                name="conv2d_cudnn.cuda",
                plevel=25,
            )

    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups) and (
        layout == "NCHW" or "cudnn" not in target.libs
    ):  # cuDNN requires a different kernel layout for NHWC inputs.
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.cuda.conv2d_cudnn, need_data_layout=True, has_groups=True
                    ),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            _, channels, _, _ = get_const_tuple(data.shape)
            out_channels, in_channels, _, _ = get_const_tuple(kernel.shape)
            oc_chunk = out_channels // 4
            ic_chunk = in_channels // 4

            if (
                target.kind.name == "cuda"
                and data.dtype in ["int8", "uint8"]
                and kernel.dtype in ["int8", "uint8"]
                and channels % groups == 0
                and out_channels % groups == 0
                and channels % 4 == 0
                and out_channels % 4 == 0
                and groups <= oc_chunk
                and groups <= ic_chunk
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw_int8),
                    name="group_conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                    name="group_conv2d_nchw.cuda",
                )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
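
The NHWC tensor core branch in the example above is gated on dtype plus divisibility of N, CI and CO. A small helper that mirrors that divisibility check, useful for reasoning about whether conv2d_nhwc_tensorcore.cuda is even a candidate for a given shape; the helper name is ours, not TVM's:

# Sketch: mirrors the N/CI/CO divisibility condition used above for
# conv2d_nhwc_tensorcore.cuda (dtype and tensor core availability aside).
def nhwc_tensorcore_shape_ok(N, CI, CO):
    return (
        (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
        or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
        or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)
    )

assert nhwc_tensorcore_shape_ok(16, 64, 64)
assert not nhwc_tensorcore_shape_ok(1, 64, 64)
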
Example #11
def conv2d_strategy_rocm(attrs, inputs, out_type, target):
    """conv2d rocm strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    groups = attrs.groups
    layout = attrs.data_layout
    stride_h, stride_w = attrs.get_int_tuple("strides")
    kernel_layout = attrs.kernel_layout
    padding = attrs.get_int_tuple("padding")
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                name="conv2d_nchw.cuda",
            )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if (2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1
                    and stride_w == 1 and dilation_h == 1 and dilation_w == 1):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)

            (
                _,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )

            if judge_winograd_autotvm:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                    name="conv2d_nhwc_winograd_direct.cuda",
                    plevel=5,
                )

            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add miopen implementation
        if ("miopen" in target.libs and layout == "NCHW"
                and padding[0] == padding[2] and padding[1] == padding[3]):
            strategy.add_implementation(
                wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True),
                wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen),
                name="conv2d_nchw_miopen.rocm",
                plevel=15,
            )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #12
File: x86.py Project: saudet/tvm
def matmul_strategy_cpu(attrs, inputs, out_type, target):
    """matmul x86 strategy"""
    strategy = _op.OpStrategy()

    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32"
    if "cblas" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_cblas),
                wrap_topi_schedule(topi.x86.schedule_matmul_cblas),
                name="matmul_cblas.x86",
                plevel=13,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning(
                "Currently cblas only support the data type to be float32 or float64. Skip."
            )
    if "mkl" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_mkl),
                wrap_topi_schedule(topi.x86.schedule_matmul_mkl),
                name="matmul_mkl.x86",
                plevel=14,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning(
                "Currently mkl only support the data type to be float32, float64 or input with "
                "uint8 and int8 while output wiht int32. Skip."
            )
    if "mkldnn" in target.libs:
        length_before = len(strategy.specializations) if strategy.specializations else 0
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_matmul(topi.x86.matmul_mkldnn),
                wrap_topi_schedule(topi.x86.schedule_matmul_mkldnn),
                name="matmul_mkldnn.x86",
                plevel=15,
            )
        length_after = len(strategy.specializations) if strategy.specializations else 0
        if length_before == length_after:
            logger.warning("Currently mkldnn only support the data type to be float32. Skip.")

    if is_auto_scheduler_enabled():
        strategy.add_implementation(
            wrap_compute_matmul(topi.nn.matmul, need_auto_scheduler_layout=True),
            naive_schedule,
            name="matmul.generic",
            plevel=11,
        )
    else:
        # If no cblas/mkl/mkldnn strategy was chosen
        if not strategy.specializations:
            logger.warning(
                "Matmul is not optimized for x86. "
                "Recommend to use cblas/mkl/mkldnn for better performance."
            )
        strategy.add_implementation(
            wrap_compute_matmul(topi.nn.matmul),
            naive_schedule,
            name="matmul.generic",
        )
    return strategy
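
The length_before/length_after bookkeeping above is how this strategy detects whether a SpecializedCondition block actually recorded anything: when the dtype condition is false the with block adds no specialization, and the warning explains why the library path was skipped. A compact sketch of that pattern factored into a helper, purely illustrative and not TVM API:

from tvm.te import SpecializedCondition

# Sketch (illustrative helper, not TVM API): run add_fn under a specialized
# condition and report whether it actually added a specialization.
def added_specialization(strategy, cond, add_fn):
    before = len(strategy.specializations) if strategy.specializations else 0
    with SpecializedCondition(cond):
        add_fn()
    after = len(strategy.specializations) if strategy.specializations else 0
    return after > before
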
Example #13
File: x86.py Project: saudet/tvm
def conv2d_strategy_cpu(attrs, inputs, out_type, target):
    """conv2d x86 strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = get_const_tuple(attrs.strides)
    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.x86",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                    name="conv2d_nchw.x86",
                )
        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
            return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            if not is_auto_scheduler_enabled():
                logger.warning("conv2d NHWC layout is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True),
                wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
                name="conv2d_nhwc.x86",
            )

            judge_winograd_auto_scheduler = False
            if len(kernel.shape) == 4:
                kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
                judge_winograd_auto_scheduler = (
                    "float" in data.dtype
                    and "float" in kernel.dtype
                    and kernel_h == 3
                    and kernel_w == 3
                    and stride_h == 1
                    and stride_w == 1
                    and dilation_h == 1
                    and dilation_w == 1
                    and 64 < co < 512
                    # The last condition of co is based on our profiling of resnet workloads
                    # on skylake avx512 machines. We found winograd is faster than direct
                    # only when co is within this range
                )

            # register auto-scheduler implementations
            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=True
                    ),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            if not is_auto_scheduler_enabled():
                logger.warning("conv2d HWCN layout is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
                name="conv2d_hwcn.generic",
            )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout))
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.x86",
                )
            else:
                logger.warning(
                    "For x86 target, depthwise_conv2d with channel "
                    "multiplier greater than 1 is not optimized"
                )
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
                    wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.generic",
                )
        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
            assert _OIHWio_matcher.match(kernel_layout)  # check if kernel is OIHWio
            return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            if not is_auto_scheduler_enabled():
                logger.warning(
                    "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm."
                )
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.x86.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.x86.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.x86",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            if not is_auto_scheduler_enabled():
                logger.warning("group_conv2d is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
                name="group_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #14
def conv2d_strategy_mali(attrs, inputs, out_type, target):
    """conv2d mali strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    stride_h, stride_w = attrs.get_int_tuple("strides")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            if kernel_layout == "OIHW":
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
                    name="conv2d_nchw_spatial_pack.mali",
                )
                # check if winograd algorithm is applicable
                _, _, kh, kw = get_const_tuple(kernel.shape)
                if (
                    kh == 3
                    and kw == 3
                    and stride_h == 1
                    and stride_w == 1
                    and dilation_h == 1
                    and dilation_w == 1
                ):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
                        wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
                        name="conv2d_nchw_winograd.mali",
                        plevel=5,
                    )
            elif re.match(r"OIHW\d*o", kernel_layout):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
                    name="conv2d_nchw_spatial_pack.mali",
                )
            else:
                raise RuntimeError(
                    "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout)
                )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            need_auto_scheduler_layout = is_auto_scheduler_enabled()
            need_meta_schedule_layout = is_meta_schedule_enabled()
            if need_auto_scheduler_layout or need_meta_schedule_layout:
                strategy.add_implementation(
                    wrap_compute_conv2d(
                        topi.nn.conv2d_nhwc,
                        need_auto_scheduler_layout=need_auto_scheduler_layout,
                        need_meta_schedule_layout=need_meta_schedule_layout,
                    ),
                    naive_schedule,
                    name="conv2d_nhwc.mali",
                )
                is_winograd_applicable = False
                if len(kernel.shape) == 4:
                    kernel_h, kernel_w, _, _ = get_const_tuple(kernel.shape)
                    is_winograd_applicable = (
                        "float" in data.dtype
                        and "float" in kernel.dtype
                        and kernel_h == 3
                        and kernel_w == 3
                        and stride_h == 1
                        and stride_w == 1
                        and dilation_h == 1
                        and dilation_w == 1
                    )
                if is_winograd_applicable:
                    if need_meta_schedule_layout:
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.nn.conv2d_winograd_nhwc,
                                need_auto_scheduler_layout=False,
                                need_meta_schedule_layout=True,
                            ),
                            naive_schedule,  # this implementation should never be picked by autotvm
                            name="conv2d_nhwc.winograd",
                            plevel=15,
                        )
                    elif need_auto_scheduler_layout:
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.nn.conv2d_winograd_nhwc,
                                need_auto_scheduler_layout=True,
                                need_meta_schedule_layout=False,
                            ),
                            naive_schedule,  # this implementation should never be picked by autotvm
                            name="conv2d_nhwc.winograd",
                            plevel=15,
                        )
                    else:
                        raise RuntimeError("Both AutoScheduler and MetaSchedule are not enabled")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.conv2d_nhwc_spatial_pack),
                    wrap_topi_schedule(topi.mali.schedule_conv2d_nhwc_spatial_pack),
                    name="conv2d_nhwc_spatial_pack.mali",
                )

        else:
            raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout))
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.mali",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            if is_auto_scheduler_enabled():
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                    naive_schedule,
                    name="depthwise_conv2d_nhwc.mali",
                )
            elif is_meta_schedule_enabled():
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                    naive_schedule,
                    name="depthwise_conv2d_nhwc.mali",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.mali.depthwise_conv2d_nhwc),
                    wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nhwc),
                    name="depthwise_conv2d_nhwc.mali",
                )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout))
    else:  # group_conv2d
        raise RuntimeError("group_conv2d is not supported for mali")
    return strategy
Example #15
def dense_strategy_cpu(attrs, inputs, out_type, target):
    """dense x86 strategy"""
    strategy = _op.OpStrategy()
    same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype
    dtype = inputs[0].dtype
    u8s8s32 = dtype == "uint8" and inputs[
        1].dtype == "int8" and out_type.dtype == "int32"
    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_nopack),
        wrap_topi_schedule(topi.x86.schedule_dense_nopack),
        name="dense_nopack.x86",
        plevel=5,
    )

    strategy.add_implementation(
        wrap_compute_dense(topi.x86.dense_pack),
        wrap_topi_schedule(topi.x86.schedule_dense_pack),
        name="dense_pack.x86",
        plevel=10,
    )

    need_auto_scheduler_layout = is_auto_scheduler_enabled()
    need_meta_schedule_layout = is_meta_schedule_enabled()

    if need_auto_scheduler_layout or need_meta_schedule_layout:
        strategy.add_implementation(
            wrap_compute_dense(
                topi.nn.dense,
                need_auto_scheduler_layout=need_auto_scheduler_layout,
                need_meta_schedule_layout=need_meta_schedule_layout,
            ),
            naive_schedule,
            name="dense.generic",
            plevel=11,
        )

    if "cblas" in target.libs:
        with SpecializedCondition(same_type
                                  and dtype in ["float32", "float64"]):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_cblas),
                wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                name="dense_cblas.x86",
                plevel=13,
            )
    if "mkl" in target.libs:
        with SpecializedCondition(
                same_type and dtype in ["float32", "float64"] or u8s8s32):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_mkl),
                wrap_topi_schedule(topi.x86.schedule_dense_mkl),
                name="dense_mkl.x86",
                plevel=14,
            )
    if "dnnl" in target.libs:
        with SpecializedCondition(same_type and dtype == "float32"):
            strategy.add_implementation(
                wrap_compute_dense(topi.x86.dense_dnnl),
                wrap_topi_schedule(topi.x86.schedule_dense_dnnl),
                name="dense_dnnl.x86",
                plevel=15,
            )
    return strategy
Example #16
def select_implementation(op,
                          attrs,
                          inputs,
                          out_type,
                          target,
                          use_autotvm=True):
    """Select the best implementation from the op strategy.

    If use_autotvm is True, it'll first try to find the best implementation
    based on AutoTVM profile results. If no AutoTVM profile result is found,
    it'll choose the implementation with highest plevel.

    If use_autotvm is False, it'll directly choose the implementation with
    highest plevel.

    Note that this function doesn't support ops with symbolic input shapes.

    Parameters
    ----------
    op : tvm.ir.Op
        Relay operator.

    attrs : object
        The op attribute.

    inputs : List[tvm.te.Tensor]
        Input tensors to the op.

    out_type : relay.Type
        The output type.

    target : tvm.target.Target
        The target to compile the op.

    use_autotvm : bool
        Whether to query AutoTVM to pick the best implementation.

    Returns
    -------
    ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor])
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
    if len(all_impls) == 0:
        raise RuntimeError(f"No valid {op} implementations for {target}")
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
    # (i.e., always return the implementation with the highest priority for auto-scheduler).
    if is_auto_scheduler_enabled() or is_meta_schedule_dispatch_enabled():
        use_autotvm = False

    # If not use autotvm, always return the implementation with the highest priority
    if not use_autotvm:
        logger.info(
            "Using %s for %s based on highest priority (%d)",
            best_plevel_impl.name,
            op.name,
            best_plevel_impl.plevel,
        )
        outs = best_plevel_impl.compute(attrs, inputs, out_type)
        return best_plevel_impl, outs

    # Otherwise, try autotvm templates
    outputs = {}
    workloads = {}
    best_autotvm_impl = None
    best_cfg = None
    dispatch_ctx = autotvm.task.DispatchContext.current
    old_silent = autotvm.GLOBAL_SCOPE.silent
    autotvm.GLOBAL_SCOPE.silent = True
    for impl in all_impls:
        outs = impl.compute(attrs, inputs, out_type)
        outputs[impl] = outs
        workload = autotvm.task.get_workload(outs)
        workloads[impl] = workload
        if workload is None:
            # Not an AutoTVM tunable implementation
            continue
        cfg = dispatch_ctx.query(target, workload)
        if cfg.is_fallback:
            # Skip fallback config
            continue
        logger.info("Implementation %s for %s has cost %.2e", impl.name,
                    op.name, cfg.cost)
        if best_cfg is None or best_cfg.cost > cfg.cost:
            best_autotvm_impl = impl
            best_cfg = cfg
    autotvm.GLOBAL_SCOPE.silent = old_silent

    if best_autotvm_impl:
        # The best autotvm implementation definitely doesn't use fallback config
        logger.info(
            "Using %s for %s based on lowest cost (%.2e)",
            best_autotvm_impl.name,
            op.name,
            best_cfg.cost,
        )
        return best_autotvm_impl, outputs[best_autotvm_impl]

    # Use the implementation with highest plevel
    if workloads[best_plevel_impl] is not None:
        msg = (
            "Cannot find tuning records for:\n    target=%s\n    key=%s\n"
            "TVM will apply a default schedule which may negatively impact performance."
            % (target, workloads[best_plevel_impl]))
        if (not autotvm.env.GLOBAL_SCOPE.silent
                and msg not in autotvm.task.DispatchContext.warning_messages):
            autotvm.task.DispatchContext.warning_messages.add(msg)
            global _first_warning
            if _first_warning:
                _first_warning = False
                info_msg = (
                    "One or more operators have not been tuned. Please tune your model "
                    "for better performance. Use DEBUG logging level to see more details."
                )
                autotvm_logger.warning(info_msg)
            autotvm_logger.debug(msg)

    logger.info(
        "Using %s for %s based on highest priority (%s)",
        best_plevel_impl.name,
        op.name,
        best_plevel_impl.plevel,
    )
    return best_plevel_impl, outputs[best_plevel_impl]
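
When no applicable AutoTVM record is found, select_implementation falls back to the implementation with the highest plevel, which is why the plevel values sprinkled through the strategy examples above effectively rank the candidates (library-backed paths get higher plevels than the generic ones). A tiny sketch of that ranking step in isolation, plain Python rather than TVM API:

# Sketch: plevel acts as a priority; the highest value wins whenever autotvm
# has no tuned record to offer.
candidates = [
    ("batch_matmul.x86", 10),
    ("batch_matmul_cblas.x86", 15),
    ("batch_matmul_mkl.x86", 15),
]
best_name, best_plevel = max(candidates, key=lambda impl: impl[1])
print(best_name)  # first of the plevel-15 library implementations
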
Example #17
File: x86.py Project: ybai62868/tvm
def conv2d_strategy_cpu(attrs, inputs, out_type, target):
    """conv2d x86 strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    dilation_h, dilation_w = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.x86",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                    name="conv2d_nchw.x86",
                )
        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
            assert _OIHWio_matcher.match(
                kernel_layout)  # check if kernel is OIHWio
            return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            if not is_auto_scheduler_enabled():
                logger.warning(
                    "conv2d NHWC layout is not optimized for x86 with autotvm."
                )
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_nhwc,
                                    need_auto_scheduler_layout=True),
                wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
                name="conv2d_nhwc.x86",
            )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            if not is_auto_scheduler_enabled():
                logger.warning(
                    "conv2d HWCN layout is not optimized for x86 with autotvm."
                )
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
                name="conv2d_hwcn.generic",
            )
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for x86".format(layout))
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
                    wrap_topi_schedule(
                        topi.x86.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.x86",
                )
            else:
                logger.warning("For x86 target, depthwise_conv2d with channel "
                               "multiplier greater than 1 is not optimized")
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
                    wrap_topi_schedule(
                        topi.generic.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.generic",
                )
        elif _NCHWc_matcher.match(layout):  # check if layout is NCHWxc
            assert _OIHWio_matcher.match(
                kernel_layout)  # check if kernel is OIHWio
            return depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type,
                                                       target)
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            if not is_auto_scheduler_enabled():
                logger.warning(
                    "depthwise_conv2d NHWC layout is not optimized for x86 with autotvm."
                )
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(
                    topi.generic.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if not is_auto_scheduler_enabled():
                logger.warning(
                    "group_conv2d is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.generic",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            if not is_auto_scheduler_enabled():
                logger.warning(
                    "group_conv2d is not optimized for x86 with autotvm.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc,
                                    has_groups=True),
                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
                name="group_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy