def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    # Remove attached compilation target because conv2d_NCHWc needs to create
    # a conv2d_nchwc op and target is not one of conv2d's parameters.
    if "target" in new_attrs:
        del new_attrs["target"]

    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
        "when alter_op_layout is enabled"

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if groups == 1:
        # query config of this workload
        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                         layout, out_dtype)
        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
            return None

        if cfg.template_key == 'direct':  # packing weight tensor
            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
            return sym.conv2d(*copy_inputs, **new_attrs)
        else:  # pre-compute weight transformation in winograd
            if "-device=arm_cpu" in tvm.target.current_target().options:
                tile_size = 4
                VC = cfg['tile_k'].size[-1]
            else:
                from ..mali.conv2d import _pick_tile_size
                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
                VC = cfg['tile_bna'].val

            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                                  tile_size=tile_size)
            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
            weight = sym.reshape(weight,
                                 shape=(KH + tile_size - 1, KW + tile_size - 1,
                                        CO // VC, VC, CI))
            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

            copy_inputs[1] = weight
            new_attrs['tile_size'] = tile_size
            return sym.contrib.conv2d_winograd_without_weight_transform(
                *copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
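# A NumPy sketch (not part of the original source) of the weight repack done in the
# winograd branch above. Assuming the transformed weight has shape (alpha, alpha, CO, CI)
# with alpha = KH + tile_size - 1, as the reshape in the function implies, splitting CO
# into (CO // VC, VC) and moving the VC block innermost yields (alpha, alpha, CO // VC, CI, VC).
# All sizes below are made up for illustration.
import numpy as np

CO, CI, KH, tile_size, VC = 8, 4, 3, 4, 4
alpha = KH + tile_size - 1
u = np.zeros((alpha, alpha, CO, CI), dtype="float32")

u = u.reshape(alpha, alpha, CO // VC, VC, CI).transpose(0, 1, 2, 4, 3)
assert u.shape == (alpha, alpha, CO // VC, CI, VC)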
def nnvm_dot(c, a, b):
    """Implementation of dot."""
    na = c.ref(a)
    nb = c.ref(b)
    return sym.dense(na, sym.transpose(nb, axes=(1, 0)),
                     units=b.shape[1], use_bias=False)
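# nnvm.symbol.dense computes Y = X * W^T with the weight stored as (units, input_dim),
# which is why b is transposed before the call above. A minimal NumPy sketch of that
# equivalence (illustrative only, not part of the original source):
import numpy as np

def dense_ref(x, w):
    # mirrors sym.dense with use_bias=False: Y = X * W^T
    return x.dot(w.T)

a = np.random.rand(2, 3).astype("float32")
b = np.random.rand(3, 4).astype("float32")
np.testing.assert_allclose(dense_ref(a, b.T), a.dot(b), rtol=1e-5)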
def verify_transpose(dshape, axes):
    x = sym.Variable("x")
    if axes:
        y = sym.transpose(x, axes=axes)
    else:
        y = sym.transpose(x)
    y = y + 1
    dtype = "float32"
    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
        m = graph_runtime.create(graph, lib, ctx)
        # set input
        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        m.run(x=data)
        out_np = np.transpose(data.asnumpy(), axes=axes) + 1
        out = m.get_output(0, tvm.nd.empty(out_np.shape))
        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
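# Typical invocations of the helper above; passing axes=None exercises the default
# behaviour of sym.transpose (reverse all axes), matching np.transpose with axes=None.
# Shown as an illustrative sketch; the shapes are arbitrary.
def test_transpose_op():
    verify_transpose((2, 3, 4), (0, 2, 1))
    verify_transpose((2, 3, 4), None)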
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    if 'cudnn' in tvm.target.current_target().libs or \
            'miopen' in tvm.target.current_target().libs:
        return None

    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
        "when alter_op_layout is enabled"

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if groups == 1:
        # query config of this workload
        workload = ('conv2d',) + autotvm.task.args_to_workload(
            [tinfos[0], tinfos[1], strides, padding, layout, out_dtype])
        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
            return None

        if cfg.template_key == 'direct':
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in tvm.target.current_target().keys
            new_attrs['layout'] = 'NCHW4c'
            new_attrs['out_layout'] = 'NCHW4c'
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            return sym.conv2d(*copy_inputs, **new_attrs)

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                              tile_size=tile_size)
        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size
        return sym.contrib.conv2d_winograd_without_weight_transform(
            *copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
def test_transpose():
    x = sym.Variable("x", shape=(1, 32, 512, 512))
    y = sym.transpose(x, name="y", axes=(0, 2, 3, 1))
    g, ldict = correct_layout(y, "NCHW")
    assert ldict["x"][0] == "NCHW"
    assert ldict["y"][0] == "NHWC"
    # second pass will insert layout transform
    g, ldict = correct_layout(g, "NCHW16c")
    assert ldict["x"][0] == "NCHW16c"
    assert ldict["x_NCHW"][0] == "NCHW"
    assert ldict["y"][0] == "NHWC"
def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])
    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs.pop('layout', None)

    kernel_sym = inputs[1]
    oc, ic, h, w = get_const_tuple(tinfos[1].shape)
    OC = oc // oc_bn
    IC = ic // ic_bn

    trans_kernel = sym.transpose(kernel_sym, axes=(1, 2, 3, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(ic, h, w, OC, oc_bn))
    trans_kernel = sym.transpose(trans_kernel, axes=(1, 2, 3, 4, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(h, w, OC, oc_bn, IC, ic_bn))
    if is_kernel_1x1:
        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 5, 3, 0, 1))
    else:
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 0, 1, 5, 3))

    if attrs.get_bool('use_bias'):
        bias = inputs[2]
        bias = sym.reshape(bias, shape=(OC, oc_bn))
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel, bias,
                                                     **new_attrs)
    else:
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel, **new_attrs)
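# A NumPy sketch (not from the original source) of what the reshape/transpose chain
# above does in the non-1x1 case: it repacks the kernel from (oc, ic, h, w) into the
# blocked layout (oc // oc_bn, ic // ic_bn, h, w, ic_bn, oc_bn). The blocking factors
# below are arbitrary illustration values.
import numpy as np

oc, ic, h, w = 8, 4, 3, 3
oc_bn, ic_bn = 4, 2
k = np.arange(oc * ic * h * w).reshape(oc, ic, h, w)

# the symbolic chain, replayed with NumPy
t = k.transpose(1, 2, 3, 0).reshape(ic, h, w, oc // oc_bn, oc_bn)
t = t.transpose(1, 2, 3, 4, 0).reshape(h, w, oc // oc_bn, oc_bn, ic // ic_bn, ic_bn)
packed = t.transpose(2, 4, 0, 1, 5, 3)

# a direct one-step equivalent, for reference
direct = k.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn, h, w).transpose(0, 2, 4, 5, 3, 1)
assert (packed == direct).all()
assert packed.shape == (oc // oc_bn, ic // ic_bn, h, w, ic_bn, oc_bn)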
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
        "when alter_op_layout is enabled"

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if groups == 1:
        # query config of this workload
        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                         layout, out_dtype)
        cfg = autotvm.task.DispatchContext.current.query(tvm.target.current_target(), workload)

        if cfg.template_key == 'direct':  # packing weight tensor
            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
            return sym.conv2d(*copy_inputs, **new_attrs)
        else:  # pre-compute weight transformation in winograd
            tile_size = 4

            weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                                  tile_size=tile_size)
            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
            VC = cfg['tile_k'].size[-1]
            weight = sym.reshape(weight,
                                 shape=(KH + tile_size - 1, KW + tile_size - 1,
                                        CO // VC, VC, CI))
            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

            copy_inputs[1] = weight
            new_attrs['tile_size'] = tile_size
            return sym.contrib.conv2d_winograd_without_weight_transform(
                *copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
def test_argmax():
    dshape = (204800, 2)
    oshape = (1, 320, 640)
    dtype = "float32"
    x = sym.Variable("x", shape=dshape, dtype=dtype)
    x = sym.reshape(x, shape=(1, 320, 640, 2))
    x = sym.transpose(x, axes=(0, 3, 1, 2))
    y = sym.argmax(x, axis=1)
    target_str = "llvm"
    target = tvm.target.create(target_str)
    ctx = tvm.context(target_str, 0)
    with nnvm.compiler.build_config(opt_level=2):
        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
    m = graph_runtime.create(graph, lib, ctx)
    data = np.random.uniform(size=dshape).astype(dtype)
    m.run(x=data)
    np_reshape = np.reshape(data, (1, 320, 640, 2))
    np_transpose = np.transpose(np_reshape, axes=(0, 3, 1, 2))
    np_argmax = np.argmax(np_transpose, axis=1)
    out = m.get_output(0)
    np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5)
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    if 'cudnn' in tvm.target.current_target().libs or \
            'miopen' in tvm.target.current_target().libs:
        return None

    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    data, kernel = tinfos[0:2]
    N, CI, H, W = get_const_tuple(data.shape)
    CO, _, KH, KW = get_const_tuple(kernel.shape)

    dispatch_ctx = autotvm.DispatchContext.current
    target = tvm.target.current_target()

    if groups == 1:
        # query config of this workload
        workload = autotvm.task.args_to_workload(
            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d)
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'direct':
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
                                       dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,
                                          oc_block_factor, ic_block_factor),
                                         dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
                conv2d)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

        if attrs.get_int_tuple("dilation") != (1, 1):
            warnings.warn("Does not support weight pre-transform for dilated convolution.")
            return None

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                              tile_size=tile_size)
        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
                                     dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size],
            conv2d_winograd_without_weight_transform)
        dispatch_ctx.update(target, new_workload, cfg)
        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)

    elif groups != CI:
        workload = autotvm.task.args_to_workload(
            [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype],
            group_conv2d_nchw)
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
                                       dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,
                                          KH, KW, oc_block_factor, ic_block_factor),
                                         dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
                group_conv2d_nchw)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
def nnvm_transpose(c, a, ax):
    """Implementation of transpose."""
    na = c.ref(a)
    assert ax.is_constant(tuple)
    return sym.transpose(na, axes=ax.value)
def check(in_shape, out_shape, **kwargs):
    x = sym.Variable("x", shape=in_shape)
    y = sym.transpose(x, name="y", **kwargs)
    sdict = infer_shape(y)
    assert tuple(sdict["y"][0]) == tuple(out_shape)
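# Example checks for the shape-inference helper above. These shapes are illustrative;
# the second call exercises the default behaviour of sym.transpose, which reverses
# all dimensions when no axes are given.
check((2, 3, 4), (2, 4, 3), axes=(0, 2, 1))
check((2, 3, 4), (4, 3, 2))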
def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    dilation = attrs.get_int_tuple("dilation")
    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if layout != 'NCHW' or groups != 1:
        return None
    if dilation != (1, 1):
        warnings.warn("Does not support weight pre-transform for dilated convolution.")
        return None

    data, kernel = tinfos[0:2]
    N, CI, H, W = get_const_tuple(data.shape)
    CO, _, KH, KW = get_const_tuple(kernel.shape)

    # query config of this workload
    workload = autotvm.task.args_to_workload(
        [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
    target = tvm.target.current_target()
    dispatch_ctx = autotvm.DispatchContext.current
    cfg = dispatch_ctx.query(target, workload)

    if cfg.is_fallback:  # if is fallback, clear query cache and return None
        autotvm.task.clear_fallback_cache(target, workload)
        return None

    if cfg.template_key == 'direct':  # pack weight tensor
        VC = cfg['tile_co'].size[-1]
        new_attrs['kernel_layout'] = 'OIHW%do' % VC

        # Store the same config for the altered operator (workload)
        new_data = data
        new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d)
        dispatch_ctx.update(target, new_workload, cfg)

        return sym.conv2d(*copy_inputs, **new_attrs)
    else:  # pre-compute weight transformation in winograd
        if "-device=arm_cpu" in target.options:
            tile_size = 4
            VC = cfg['tile_k'].size[-1]
        else:
            from ..mali.conv2d import _pick_tile_size
            tile_size = _pick_tile_size(tinfos[0], tinfos[1])
            VC = cfg['tile_bna'].val

        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                              tile_size=tile_size)
        weight = sym.reshape(weight,
                             shape=(KH + tile_size - 1, KW + tile_size - 1,
                                    CO // VC, VC, CI))
        weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size

        # Store the same config for the altered operator (workload).
        # Use KW for the second transformed dimension for consistency with the
        # reshape above (KH == KW for the supported 3x3 winograd kernels).
        new_data = data
        new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1,
                                      CO // VC, CI, VC), kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation,
             new_attrs['layout'], out_dtype, tile_size],
            conv2d_winograd_without_weight_transform)
        dispatch_ctx.update(target, new_workload, cfg)

        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
def _alter_conv2d_layout(attrs, inputs, tinfo):
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    data, kernel = tinfo[0], tinfo[1]
    batch_size, in_channel, height, width = get_const_tuple(data.shape)

    groups = attrs.get_int("groups")
    out_channel = attrs.get_int("channels")
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    layout = attrs['layout']
    kh, kw = attrs.get_int_tuple("kernel_size")

    dtype = data.dtype
    out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
    is_depthwise = groups == in_channel and groups == out_channel

    # only optimize for NCHW
    if layout != 'NCHW':
        return None
    if groups != 1 and not is_depthwise:
        return None

    dispatch_ctx = autotvm.task.DispatchContext.current
    target = tvm.target.current_target()

    # query schedule and fallback if necessary
    workload = autotvm.task.args_to_workload(
        [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \
        if is_depthwise else \
        autotvm.task.args_to_workload(
            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:
        _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)

    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    new_attrs['layout'] = 'NCHW%dc' % ic_bn
    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

    new_data = tvm.placeholder((batch_size, in_channel // ic_bn, height, width, ic_bn),
                               dtype=data.dtype)

    if is_depthwise:
        # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
        # in which out_channel = merge(channel, channel_multiplier)
        kernel_sym = copy_inputs[1]
        kernel_sym = sym.reshape(kernel_sym, shape=(out_channel // oc_bn, oc_bn, kh, kw))
        kernel_sym = sym.transpose(kernel_sym, axes=(0, 2, 3, 1))
        copy_inputs[1] = kernel_sym

        # Store altered operator's config
        new_kernel = tvm.placeholder((out_channel // oc_bn, kh, kw, oc_bn), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
             new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
    else:
        out_channel, _, kh, kw = get_const_tuple(kernel.shape)
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)

        # Store altered operator's config
        new_kernel = tvm.placeholder((out_channel // oc_bn, in_channel // ic_bn, kh, kw,
                                      ic_bn, oc_bn), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
             new_attrs['out_layout'], out_dtype], conv2d_NCHWc)

    dispatch_ctx.update(target, new_workload, cfg)
    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
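# A NumPy sketch (not from the original source) of the depthwise kernel repack in the
# is_depthwise branch above: a (channel, channel_multiplier, kh, kw) kernel is flattened
# along its first two axes into out_channel = channel * channel_multiplier and then
# blocked by oc_bn, giving (out_channel // oc_bn, kh, kw, oc_bn). Sizes are illustrative.
import numpy as np

channel, multiplier, kh, kw, oc_bn = 8, 1, 3, 3, 4
out_channel = channel * multiplier
k = np.arange(out_channel * kh * kw).reshape(channel, multiplier, kh, kw)

packed = k.reshape(out_channel // oc_bn, oc_bn, kh, kw).transpose(0, 2, 3, 1)
assert packed.shape == (out_channel // oc_bn, kh, kw, oc_bn)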
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    if 'cudnn' in tvm.target.current_target().libs or \
            'miopen' in tvm.target.current_target().libs:
        return None

    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    data, kernel = tinfos[0:2]
    N, CI, H, W = get_const_tuple(data.shape)
    CO, _, KH, KW = get_const_tuple(kernel.shape)

    dispatch_ctx = autotvm.DispatchContext.current

    if groups == 1:
        # query config of this workload
        workload = ('conv2d',) + autotvm.task.args_to_workload(
            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype])
        target = tvm.target.current_target()
        cfg = autotvm.DispatchContext.current.query(target, workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(target, workload)
            return None

        if cfg.template_key == 'direct':
            return None

        if cfg.template_key == 'int8':
            assert 'cuda' in target.keys
            new_layout = 'NCHW4c'
            new_attrs['layout'] = new_layout
            new_attrs['out_layout'] = new_layout
            new_attrs['kernel_layout'] = 'OIHW4o4i'
            ic_block_factor = oc_block_factor = 4

            # Store the same config for the altered operator (workload)
            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
                                       dtype=data.dtype)
            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,
                                          oc_block_factor, ic_block_factor),
                                         dtype=kernel.dtype)
            new_workload = autotvm.task.args_to_workload(
                [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
                conv2d)
            dispatch_ctx.update(target, new_workload, cfg)
            return sym.conv2d(*copy_inputs, **new_attrs)

        if attrs.get_int_tuple("dilation") != (1, 1):
            warnings.warn("Does not support weight pre-transform for dilated convolution.")
            return None

        # pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                              tile_size=tile_size)
        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
                                     dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size],
            conv2d_winograd_without_weight_transform)
        dispatch_ctx.update(target, new_workload, cfg)
        return sym.contrib.conv2d_winograd_without_weight_transform(
            *copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None
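# The transpose with axes=[0, 1, 3, 2] above swaps the last two axes of the transformed
# weight, (alpha, alpha, CO, CI) -> (alpha, alpha, CI, CO), which is the layout recorded
# in the new_weight placeholder for the workload. A tiny NumPy illustration with made-up
# shapes (not part of the original source):
import numpy as np

alpha, CO, CI = 6, 8, 4
w = np.zeros((alpha, alpha, CO, CI), dtype="float32")
assert np.transpose(w, axes=[0, 1, 3, 2]).shape == (alpha, alpha, CI, CO)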