def test_injective_conv2d():
    channels = 16
    data = sym.Variable(name="data")
    pool = sym.global_avg_pool2d(data=data)
    weight = sym.reshape(pool, shape=[1, channels, 1, 1])
    residual = sym.conv2d(data=data, kernel_size=(3, 3), channels=channels,
                          padding=(1, 1), layout="NCHW", kernel_layout="OIHW",
                          use_bias=False, name="conv")
    net = weight * data + residual
    size = 56
    dtype = "float32"
    dshape = (1, channels, size, size)
    kshape = (channels, channels, 3, 3)
    oshape = dshape
    shape_dict = {"data": dshape}

    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
        # data, global_avg_pool, conv weight, conv op, fused elemwise add
        assert graph.index.num_nodes == 5

        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
        m = graph_runtime.create(graph, lib, ctx)
        m.run(data=data, conv_weight=kernel)
        # get output
        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
        residual = topi.testing.conv2d_nchw_python(
            data.asnumpy(), kernel.asnumpy(), (1, 1), 'SAME')
        weight = np.mean(data.asnumpy(), axis=(2, 3))
        c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + residual
        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)

def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    # Remove attached compilation target because conv2d_NCHWc needs to create
    # a conv2d_nchwc op and target is not one of conv2d's parameters.
    if "target" in new_attrs:
        del new_attrs["target"]

    assert attrs.get_int_tuple("dilation") == (1, 1), \
        "Does not support dilation when alter_op_layout is enabled"

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if groups == 1:
        # query config of this workload
        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                         layout, out_dtype)
        cfg = autotvm.DispatchContext.current.query(
            tvm.target.current_target(), workload)

        if cfg.is_fallback:  # if is fallback, clear query cache and return None
            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
            return None

        if cfg.template_key == 'direct':  # packing weight tensor
            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
            return sym.conv2d(*copy_inputs, **new_attrs)
        else:  # pre-compute weight transformation in winograd
            if "-device=arm_cpu" in tvm.target.current_target().options:
                tile_size = 4
                VC = cfg['tile_k'].size[-1]
            else:
                from ..mali.conv2d import _pick_tile_size
                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
                VC = cfg['tile_bna'].val

            weight = sym.contrib.conv2d_winograd_weight_transform(
                copy_inputs[1], tile_size=tile_size)
            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
            weight = sym.reshape(weight,
                                 shape=(KH + tile_size - 1, KW + tile_size - 1,
                                        CO // VC, VC, CI))
            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

            copy_inputs[1] = weight
            new_attrs['tile_size'] = tile_size
            return sym.contrib.conv2d_winograd_without_weight_transform(
                *copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None

def nnvm_reshape(c, v, shp):
    """Implementation of reshape."""
    nv = c.ref(v)
    assert shp.is_constant(tuple)
    if shp.value == ():
        shp = (1,)
    else:
        shp = shp.value
    return sym.reshape(nv, shape=shp)

def test_reshape():
    x = sym.Variable("x", shape=(4,))
    y = sym.reshape(x, shape=(2, 2), name="y")
    g, ldict = correct_layout(y, "C")
    assert(ldict["x"][0] == "C")
    assert(ldict["y"][0] == "__undef__")
    # second pass will insert layout transform
    g, ldict = correct_layout(g, "C16c")
    assert(ldict["x"][0] == "C16c")
    assert(ldict["x_C"][0] == "C")
    assert(ldict["y"][0] == "__undef__")

def weight_prepack_conv2d(attrs, inputs, tinfos):
    import ast
    data = tinfos[0]
    kernel = tinfos[1]
    padding = ast.literal_eval(attrs['padding'])
    stride = ast.literal_eval(attrs['strides'])
    wkl = _get_workload(data, kernel, stride, padding, 'float32')
    sch = _get_schedule_conv(wkl)
    is_kernel_1x1 = isinstance(sch, AVX512Conv1x1Fwd)
    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn

    new_attrs = {k: attrs[k] for k in attrs.keys()}
    new_attrs.pop('layout', None)

    kernel_sym = inputs[1]
    oc, ic, h, w = get_const_tuple(tinfos[1].shape)
    OC = oc // oc_bn
    IC = ic // ic_bn
    trans_kernel = sym.transpose(kernel_sym, axes=(1, 2, 3, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(ic, h, w, OC, oc_bn))
    trans_kernel = sym.transpose(trans_kernel, axes=(1, 2, 3, 4, 0))
    trans_kernel = sym.reshape(trans_kernel, shape=(h, w, OC, oc_bn, IC, ic_bn))
    if is_kernel_1x1:
        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 5, 3, 0, 1))
    else:
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        trans_kernel = sym.transpose(trans_kernel, axes=(2, 4, 0, 1, 5, 3))

    if attrs.get_bool('use_bias'):
        bias = inputs[2]
        bias = sym.reshape(bias, shape=(OC, oc_bn))
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel,
                                                     bias, **new_attrs)
    else:
        return sym.contrib.conv2d_nchw_kernel_packed(inputs[0], trans_kernel,
                                                     **new_attrs)

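# A NumPy sketch (not from the original source; shapes and block sizes are made up)
# of the non-1x1 repacking chain above, checking that the transpose/reshape
# sequence really lands in the (OC, IC, h, w, ic_bn, oc_bn) layout named in the
# comment:
import numpy as np

oc, ic, h, w = 64, 32, 3, 3
oc_bn, ic_bn = 16, 8
OC, IC = oc // oc_bn, ic // ic_bn

kernel = np.random.rand(oc, ic, h, w).astype("float32")
packed = kernel.transpose(1, 2, 3, 0)                # (ic, h, w, oc)
packed = packed.reshape(ic, h, w, OC, oc_bn)         # split oc into (OC, oc_bn)
packed = packed.transpose(1, 2, 3, 4, 0)             # (h, w, OC, oc_bn, ic)
packed = packed.reshape(h, w, OC, oc_bn, IC, ic_bn)  # split ic into (IC, ic_bn)
packed = packed.transpose(2, 4, 0, 1, 5, 3)          # (OC, IC, h, w, ic_bn, oc_bn)
assert packed.shape == (OC, IC, h, w, ic_bn, oc_bn)
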
def test_infer_shape():
    x = sym.Variable('x', shape=(4, 2))
    y = sym.add(x, x, name='add1')
    y = sym.reshape(y, target=(2, 4), name="reshape1")
    g = graph.create(y)
    g._set_json_attr("shape_attr_key", "shape")
    g = g.apply('InferShape')
    jgraph = json.loads(g.apply('SaveJSON').json_attr('json'))
    jnodes = jgraph['nodes']
    jnode_row_ptr = jgraph['node_row_ptr']
    nindex = {n['name']: i for i, n in enumerate(jnodes)}
    assert g.json_attr('shape')[jnode_row_ptr[nindex["reshape1"]]] == [2, 4]
    assert g.json_attr('shape')[jnode_row_ptr[nindex["add1"]]] == [4, 2]

def verify_reshape(dshape, oshape):
    x = sym.Variable("x")
    y = sym.reshape(x, shape=oshape)
    y = y + 1
    dtype = "float32"
    for target, ctx in ctx_list():
        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
        m = graph_runtime.create(graph, lib, ctx)
        # set input
        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
        m.run(x=data)
        out_np = data.asnumpy().reshape(oshape) + 1
        out = m.get_output(0, tvm.nd.empty(out_np.shape))
        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)

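# A minimal sketch of how verify_reshape might be driven from a test entry point.
# The function name and the concrete shape pairs below are assumptions (chosen so
# the element counts match and -1 can be inferred), not taken from the original
# source.
def test_reshape_shapes():
    verify_reshape((2, 3, 4), (-1, 2, 1))
    verify_reshape((2, 3, 4), (8, 3))
    verify_reshape((4, 7), (2, 7, 2))
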
def _alter_conv2d_layout(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    assert attrs.get_int_tuple("dilation") == (1, 1), \
        "Does not support dilation when alter_op_layout is enabled"

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if groups == 1:
        # query config of this workload
        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                         layout, out_dtype)
        cfg = autotvm.task.DispatchContext.current.query(
            tvm.target.current_target(), workload)

        if cfg.template_key == 'direct':  # packing weight tensor
            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
            return sym.conv2d(*copy_inputs, **new_attrs)
        else:  # pre-compute weight transformation in winograd
            tile_size = 4

            weight = sym.contrib.conv2d_winograd_weight_transform(
                copy_inputs[1], tile_size=tile_size)
            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
            VC = cfg['tile_k'].size[-1]
            weight = sym.reshape(weight,
                                 shape=(KH + tile_size - 1, KW + tile_size - 1,
                                        CO // VC, VC, CI))
            weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

            copy_inputs[1] = weight
            new_attrs['tile_size'] = tile_size
            return sym.contrib.conv2d_winograd_without_weight_transform(
                *copy_inputs, **new_attrs)

    # do nothing for depthwise convolution
    return None

def test_plan_memory():
    x = sym.Variable('x', shape=(4, 2))
    x2 = sym.add(x, x, name='addk')
    y = sym.reshape(x2, target=(2, 4), name="reshapek")
    y = sym.add(y, x2, name="add2")
    y = sym.add(y, y)
    g = graph.create(y)
    g._set_json_attr("shape_attr_key", "shape")
    g = g.apply(["InferShape", "InferType", "PlanMemory"])
    jgraph = json.loads(g.apply('SaveJSON').json_attr('json'))
    jnodes = jgraph['nodes']
    jnode_row_ptr = jgraph['node_row_ptr']
    storage_id = g.json_attr('storage_id')
    nindex = {n['name']: i for i, n in enumerate(jnodes)}
    assert (storage_id[jnode_row_ptr[nindex["addk"]]] !=
            storage_id[jnode_row_ptr[nindex["reshapek"]]])
    assert (storage_id[jnode_row_ptr[nindex["add2"]]] ==
            storage_id[jnode_row_ptr[nindex["reshapek"]]])

def test_argmax():
    dshape = (204800, 2)
    oshape = (1, 320, 640)

    dtype = "float32"
    x = sym.Variable("x", shape=dshape, dtype=dtype)
    x = sym.reshape(x, shape=(1, 320, 640, 2))
    x = sym.transpose(x, axes=(0, 3, 1, 2))
    y = sym.argmax(x, axis=1)

    target_str = "llvm"
    target = tvm.target.create(target_str)
    ctx = tvm.context(target_str, 0)
    with nnvm.compiler.build_config(opt_level=2):
        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
    m = graph_runtime.create(graph, lib, ctx)

    data = np.random.uniform(size=dshape).astype(dtype)
    m.run(x=data)
    np_reshape = np.reshape(data, (1, 320, 640, 2))
    np_transpose = np.transpose(np_reshape, axes=(0, 3, 1, 2))
    np_argmax = np.argmax(np_transpose, axis=1)
    out = m.get_output(0)
    np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5)

def nnvm_array_reduce(c, fn, array, shape):
    """Implementation of array_reduce."""
    assert fn.is_constant(Primitive)
    assert shape.is_constant(tuple)
    fn = fn.value
    tshp = shape.value
    ary = c.ref(array)
    if fn == P.scalar_add:
        ashp = ashape(array)
        if len(tshp) < len(ashp):
            ts = (1,) * (len(ashp) - len(tshp)) + tshp
        else:
            ts = tshp
        axis = list(i for i, t in enumerate(ts) if t == 1)
        if len(axis) == 1:
            axis = axis[0]
        res = sym.sum(ary, axis=axis, keepdims=1)
        if len(tshp) < len(ashp):
            res = sym.reshape(res, shape=tshp)
        return res
    else:
        raise NotImplementedError(f"reduce with {fn}")

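# Illustration only (not from the original source): the axis selection above
# mirrors NumPy broadcasting. For an input of shape (2, 3, 4) reduced to a target
# shape of (3, 1), the padded target is (1, 3, 1), so axes 0 and 2 are summed and
# the result is reshaped to (3, 1). A NumPy sketch of the same logic; the only
# nnvm-specific quirk it omits is collapsing a single-element axis list to a scalar.
import numpy as np

def numpy_add_reduce(arr, tshp):
    ashp = arr.shape
    if len(tshp) < len(ashp):
        ts = (1,) * (len(ashp) - len(tshp)) + tuple(tshp)
    else:
        ts = tuple(tshp)
    axis = tuple(i for i, t in enumerate(ts) if t == 1)
    res = arr.sum(axis=axis, keepdims=True)
    if len(tshp) < len(ashp):
        res = res.reshape(tshp)
    return res

assert numpy_add_reduce(np.ones((2, 3, 4)), (3, 1)).shape == (3, 1)
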
def nn(m: Model):
    v_images = sym.Variable("images", shape=(BATCH_SIZE, 1, 28, 28), dtype=0)
    v_true_labels = sym.Variable("true_labels", shape=(BATCH_SIZE, 10), dtype=0)

    x = v_images
    x = sym.reshape(data=x, shape=(BATCH_SIZE, 28 * 28))
    x = sym.dense(data=x, units=10)
    logits = x

    x = -sym.elemwise_mul(v_true_labels, sym.log_softmax(x))
    loss = sym.sum(x) / BATCH_SIZE

    # This is not really accuracy, because we use softmax instead of hardmax
    accuracy = sym.sum(v_true_labels * sym.softmax(logits)) / BATCH_SIZE

    # We have to list all weights explicitly (the corresponding variables are
    # generated automatically)
    weight_vars = [v for v in loss.list_input_variables()
                   if v.attr('name') not in ['images', 'true_labels']]

    optimizer = SGD(learning_rate=1e-4)
    update_step = optimizer.minimize(loss, var=weight_vars)

    tgraph = nnvm.graph.create(sym.Group([loss, update_step])) \
                       .apply("InferShape").apply("InferType")
    fgraph = nnvm.graph.create(sym.Group([loss, accuracy])) \
                       .apply("InferShape").apply("InferType")

    m.tgraph = tgraph
    m.fgraph = fgraph
    m.optimizer = optimizer
    m.loss = loss
    return m

def _alter_conv2d_layout(attrs, inputs, tinfo):
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    data, kernel = tinfo[0], tinfo[1]
    batch_size, in_channel, height, width = get_const_tuple(data.shape)

    groups = attrs.get_int("groups")
    out_channel = attrs.get_int("channels")
    padding = attrs.get_int_tuple("padding")
    strides = attrs.get_int_tuple("strides")
    dilation = attrs.get_int_tuple("dilation")
    layout = attrs['layout']
    kh, kw = attrs.get_int_tuple("kernel_size")

    dtype = data.dtype
    out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
    is_depthwise = groups == in_channel and groups == out_channel

    # only optimize for NCHW
    if layout != 'NCHW':
        return None
    if groups != 1 and not is_depthwise:
        return None

    dispatch_ctx = autotvm.task.DispatchContext.current
    target = tvm.target.current_target()

    # query schedule and fallback if necessary
    workload = autotvm.task.args_to_workload(
        [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \
        if is_depthwise else \
        autotvm.task.args_to_workload(
            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:
        _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)

    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    new_attrs['layout'] = 'NCHW%dc' % ic_bn
    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

    new_data = tvm.placeholder((batch_size, in_channel // ic_bn, height, width, ic_bn),
                               dtype=data.dtype)

    if is_depthwise:
        # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
        # in which out_channel = merge(channel, channel_multiplier)
        kernel_sym = copy_inputs[1]
        kernel_sym = sym.reshape(kernel_sym, shape=(out_channel // oc_bn, oc_bn, kh, kw))
        kernel_sym = sym.transpose(kernel_sym, axes=(0, 2, 3, 1))
        copy_inputs[1] = kernel_sym

        # Store altered operator's config
        new_kernel = tvm.placeholder((out_channel // oc_bn, kh, kw, oc_bn),
                                     dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
             new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
    else:
        out_channel, _, kh, kw = get_const_tuple(kernel.shape)
        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)

        # Store altered operator's config
        new_kernel = tvm.placeholder((out_channel // oc_bn, in_channel // ic_bn,
                                      kh, kw, ic_bn, oc_bn), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
             new_attrs['out_layout'], out_dtype], conv2d_NCHWc)

    dispatch_ctx.update(target, new_workload, cfg)
    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)

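# A NumPy sketch (with assumed shapes, not from the original source) of the
# depthwise kernel repack above: (channel, channel_multiplier, kh, kw) is
# reinterpreted as (out_channel // oc_bn, oc_bn, kh, kw) and then reordered to
# (out_channel // oc_bn, kh, kw, oc_bn), matching the new_kernel placeholder.
import numpy as np

channel, multiplier, kh, kw, oc_bn = 32, 1, 3, 3, 8
out_channel = channel * multiplier

kernel = np.random.rand(channel, multiplier, kh, kw).astype("float32")
packed = kernel.reshape(out_channel // oc_bn, oc_bn, kh, kw).transpose(0, 2, 3, 1)
assert packed.shape == (out_channel // oc_bn, kh, kw, oc_bn)
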
def check(in_shape, tshape, out_shape):
    x = sym.Variable("x", shape=in_shape)
    y = sym.reshape(x, shape=tshape, name="y")
    sdict = infer_shape(y)
    assert(tuple(sdict["y"][0]) == tuple(out_shape))

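# A hypothetical driver for the check() helper above (name and shape triples are
# assumptions, not from the original source). The expected outputs assume NNVM's
# MXNet-style reshape codes, where 0 copies the corresponding input dimension and
# -1 infers the remaining extent.
def test_reshape_infer_shape():
    check((4,), (2, 2), (2, 2))
    check((2, 3, 4), (4, 0, 2), (4, 3, 2))
    check((2, 3, 4), (2, 0, 0), (2, 3, 4))
    check((2, 3, 4), (0, -1), (2, 12))
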
def test_reshape():
    x = sym.Variable("x")
    y = sym.reshape(x, shape=(10, 20), name="y")
    assert(y.list_input_names() == ["x"])

def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
    """Alter op layout for pre-computing kernel transformation"""
    import nnvm.symbol as sym

    copy_inputs = [s for s in inputs]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    dilation = attrs.get_int_tuple("dilation")
    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.get_int('groups')
    layout = attrs["layout"]
    out_dtype = attrs["out_dtype"]
    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype

    if layout != 'NCHW' or groups != 1:
        return None
    if dilation != (1, 1):
        warnings.warn("Does not support weight pre-transform for dilated convolution.")
        return None

    data, kernel = tinfos[0:2]
    N, CI, H, W = get_const_tuple(data.shape)
    CO, _, KH, KW = get_const_tuple(kernel.shape)

    # query config of this workload
    workload = autotvm.task.args_to_workload(
        [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
    target = tvm.target.current_target()
    dispatch_ctx = autotvm.DispatchContext.current
    cfg = dispatch_ctx.query(target, workload)

    if cfg.is_fallback:  # if is fallback, clear query cache and return None
        autotvm.task.clear_fallback_cache(target, workload)
        return None

    if cfg.template_key == 'direct':  # pack weight tensor
        VC = cfg['tile_co'].size[-1]
        new_attrs['kernel_layout'] = 'OIHW%do' % VC

        # Store the same config for the altered operator (workload)
        new_data = data
        new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d)
        dispatch_ctx.update(target, new_workload, cfg)

        return sym.conv2d(*copy_inputs, **new_attrs)
    else:  # pre-compute weight transformation in winograd
        if "-device=arm_cpu" in target.options:
            tile_size = 4
            VC = cfg['tile_k'].size[-1]
        else:
            from ..mali.conv2d import _pick_tile_size
            tile_size = _pick_tile_size(tinfos[0], tinfos[1])
            VC = cfg['tile_bna'].val

        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                              tile_size=tile_size)
        weight = sym.reshape(weight,
                             shape=(KH + tile_size - 1, KW + tile_size - 1,
                                    CO // VC, VC, CI))
        weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])

        copy_inputs[1] = weight
        new_attrs['tile_size'] = tile_size

        # Store the same config for the altered operator (workload)
        new_data = data
        new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size - 1,
                                      CO // VC, CI, VC), kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation,
             new_attrs['layout'], out_dtype, tile_size],
            conv2d_winograd_without_weight_transform)
        dispatch_ctx.update(target, new_workload, cfg)

        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs,
                                                                    **new_attrs)

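# A NumPy sketch of the winograd weight packing in the else-branch above (shapes
# and block size are assumptions, not from the original source). It assumes the
# transformed weight has shape (alpha, alpha, CO, CI) with alpha = KH + tile_size - 1,
# which the reshape then splits along CO, moving the VC block to the position used
# by the new_weight placeholder.
import numpy as np

CO, CI, KH, KW = 64, 32, 3, 3
tile_size, VC = 4, 8
alpha = KH + tile_size - 1

transformed = np.random.rand(alpha, alpha, CO, CI).astype("float32")
packed = transformed.reshape(alpha, alpha, CO // VC, VC, CI).transpose(0, 1, 2, 4, 3)
assert packed.shape == (alpha, alpha, CO // VC, CI, VC)
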