def globalavgpool(n, c, h, w, pool_type, attrs, kernel_name="global_pool"):
    r"""
    Build a global pooling kernel over a float16 (n, c, h, w) placeholder.

    For average pooling, each feature map is reduced as:
    \f[
    res = \frac{1}{W * H} \sum X_{i,j}
    \f]

    Note:
        The real input is created by akg.tvm.placeholder.

    Args:
        n (int): input batchsize.
        c (int): input channel.
        h (int): input height.
        w (int): input width.
        pool_type (str): pooling mode, forwarded to akg.topi.nn.global_pool.
        attrs: attributes forwarded to akg.build.
        kernel_name (str): name given to the built kernel.

    Returns:
        The module returned by akg.build for the tensors [input, output];
        the pooled output is described upstream as having shape n * c * 1 * 1.
    """
    # The placeholder keeps its runtime name 'input'; only the Python local is
    # renamed so the builtin `input` is not shadowed.
    data = akg.tvm.placeholder((n, c, h, w), name='input', dtype="float16")
    output = akg.topi.nn.global_pool(data, pool_type=pool_type)
    s = akg.tvm.create_schedule(output.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, output], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def Gather(params_shape, indices_shape, params_dtype, indices_dtype, axis, kernel_name,
           cce_path="./", target=utils.CCE):
    """
    Build the gather kernel and dump its generated source.

    Args:
        params_shape: shape of the params tensor; checked to have length 2.
        indices_shape: shape of the indices tensor; checked to have length 1.
        params_dtype (str): dtype of params, checked against DtypeForDavinci.ALL_TYPES.
        indices_dtype (str): dtype of indices, checked against DtypeForDavinci.INT32.
        axis (int): must equal 0.
        kernel_name (str): name given to the built kernel.
        cce_path (str): directory handed to create_code for the generated source.
        target: not used in this function; the build target is fixed to "cce".

    Returns:
        The module returned by akg.build for the tensors [X, Y, res].
    """
    # Argument validation.
    utils.check_shape(params_shape, length=2)
    utils.check_shape(indices_shape, length=1)
    utils.ops_dtype_check(params_dtype, utils.DtypeForDavinci.ALL_TYPES)
    utils.ops_dtype_check(indices_dtype, utils.DtypeForDavinci.INT32)
    utils.check_equal("axis", "zero", axis, 0)

    # One output row per index, each as wide as a params row.
    out_shape = (indices_shape[0], params_shape[1])
    params = akg.tvm.placeholder(params_shape, dtype=params_dtype, name="X")
    indices = akg.tvm.placeholder(indices_shape, dtype=indices_dtype, name="Y")

    def _emit_ir(ins, outs):
        # Delegate IR generation to kernel_ir(dst, params, indices).
        return kernel_ir(outs[0], ins[0], ins[1])

    gathered = akg.tvm.extern(out_shape, [params, indices], _emit_ir, name="res", dtype=params_dtype)
    schedule = akg.tvm.create_schedule(gathered.op)

    # Build with multicore disabled, then write the generated source out.
    build_attrs = {"enable_multicore": False}
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(schedule, [params, indices, gathered], "cce", name=kernel_name, attrs=build_attrs)

    create_code(kernel_name, cce_path, mod.imported_modules[0].get_source())
    return mod
def matmul_ad(data_shape, weight_shape, dtype, attrs=None):
    """
    Build the autodiff kernel of a 2-D matmul with respect to its first input.

    Args:
        data_shape: (m, k) shape of the first operand.
        weight_shape: (k, n) shape of the second operand.
        dtype (str): only "float16" (case-insensitive) is accepted.
        attrs: attributes forwarded to akg.build.

    Returns:
        The module returned by akg.build for [head, b, d(c)/d(a)].

    Raises:
        RuntimeError: if dtype is not supported.
        AssertionError: if the shapes are not 2-D or their inner sizes differ.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        raise RuntimeError("matmul test only support %s while dtype is %s" % (",".join(supported), dtype))

    assert len(data_shape) == 2
    assert len(weight_shape) == 2
    assert data_shape[1] == weight_shape[0]

    m, k = data_shape
    n = weight_shape[1]

    lhs = akg.tvm.placeholder((m, k), name='a', dtype=dtype)
    rhs = akg.tvm.placeholder((k, n), name='b', dtype=dtype)
    kk = akg.tvm.reduce_axis((0, k), name='kk')
    product = akg.tvm.compute(
        (m, n),
        lambda i, j: akg.lang.ascend.mmad(lhs[i, kk] * rhs[kk, j], axis=kk),
        name="c")

    # Differentiate the forward result w.r.t. `a`, seeded by a float16 head tensor.
    head = akg.tvm.placeholder(product.shape, name="Head", dtype='float16')
    grad_a = list(akg.differentiate(product, [lhs], head))[0]

    schedule = akg.tvm.create_schedule([grad_a.op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        return akg.build(schedule, [head, rhs, grad_a], "cce", name="test2", attrs=attrs, polyhedral=True)
def topk(shape, k, dtype, kernel_name, attrs, target="cce"):
    """
    Build the top-k kernel on fixed 16x16 float16 tensors.

    Args:
        shape: requested input shape; only used to validate `k`.
        k (int): number of elements requested; must not exceed shape[-1].
        dtype (str): must be "float16" or "int32" (case-insensitive).
        kernel_name (str): name given to the built kernel.
        attrs: attributes forwarded to akg.build.
        target (str): not used in this function; the build target is fixed to "cce".

    Returns:
        The module returned by akg.build for [input, values1].

    Raises:
        RuntimeError: if dtype is unsupported or k > shape[-1].
    """
    check_list = ["float16", "int32"]
    if dtype.lower() not in check_list:
        # The message previously named "tile_cce", a leftover from another operator.
        raise RuntimeError("topk only support %s while dtype is %s" % (",".join(check_list), dtype))
    if k > shape[-1]:
        raise RuntimeError("k should not be greater than shape[-1]")

    # NOTE(review): after validation the requested shape and dtype are ignored;
    # every tensor below is a hard-coded float16 placeholder. Confirm this is intended.
    shape = (16, 16)
    out_shape = (16, 16)
    temp_shape = (16, 16 * 18)
    inputs = akg.tvm.placeholder(shape, name="input", dtype="float16")
    output = akg.tvm.placeholder(out_shape, name="output", dtype="float16")
    temp = akg.tvm.placeholder(temp_shape, name="temp", dtype="float16")

    values = compute_topk(output, inputs, temp)
    values1 = compute_get_last(values, temp)

    s = akg.tvm.create_schedule([values1.op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [inputs, values1], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def case_1(data_shape, dtype, kernel_name, attrs):
    """
    elemwise chain case 1: build E[i, j] = A[i, j] * (B[j] + C[i, j]).

    Args:
        data_shape: (m, k) shape shared by A, C and E; B has shape (k,).
        dtype (str): checked against DtypeForDavinci.FLOAT16.
        kernel_name (str): base name passed to gen_name_kernel.
        attrs: attributes forwarded to akg.build.

    Returns:
        The module returned by akg.build for [A, B, C, E].
    """
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT16)
    utils.check_shape_length_equal("data", data_shape, 2)

    rows, cols = data_shape
    full_shape = (rows, cols)
    lhs = akg.tvm.placeholder(full_shape, name='A', dtype=dtype)
    row_vec = akg.tvm.placeholder((cols, ), name='B', dtype=dtype)
    addend = akg.tvm.placeholder(full_shape, name='C', dtype=dtype)
    chained = akg.tvm.compute(
        full_shape,
        lambda i, j: lhs[i, j] * (row_vec[j] + addend[i, j]),
        name="E")

    schedule = akg.tvm.create_schedule(chained.op)
    tensors = [lhs, row_vec, addend, chained]
    akg.lower(schedule, tensors, simple_mode=True, polyhedral=True)

    # The generated name is computed but the kernel is still built as "test".
    kernel_name = gen_name_kernel(kernel_name, dtype, data_shape)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        return akg.build(schedule, tensors, "cce", name="test", attrs=attrs, polyhedral=True)
def invert_permutation_run(shape, dtype, attrs):
    """
    Build, launch and verify the invert_permutation kernel.

    Args:
        shape: shape of the 1-D permutation tensor; shape[0] is its length.
        dtype (str): must be "int32" (case-insensitive).
        attrs: attributes forwarded to akg.build.

    Returns:
        tuple: ((input_data,), output, expect, compare_result), where
        compare_result is the value of compare_tensor(output, expect, ...).

    Raises:
        RuntimeError: if dtype is not int32.
    """
    # check shapes
    vc_util.check_shape(shape)

    # Compare for equality: the former `dtype.lower() in "int32"` was a substring
    # test, so values such as "int", "32" or "" were accepted.
    if dtype.lower() != "int32":
        raise RuntimeError(
            "indices_dtype only support int32 while dtype is %s" % dtype)

    A = akg.tvm.placeholder(shape, dtype, name="A")
    op = invert_permutation.invert_permutation(A)
    s = akg.tvm.create_schedule(op.op)
    kernel_name = utils.gen_name_kernel("invert_permutation", dtype, shape)
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [A, op], "cce", name=kernel_name, attrs=attrs, polyhedral=True)

    # Reference result: expect[perm[i]] = i for a random permutation.
    length = shape[0]
    input_data = np.random.permutation(np.arange(length)).astype(np.int32)
    expect = np.full([length], 0, np.int32)
    expect[input_data] = np.arange(length, dtype=np.int32)

    output = np.full([length], 0, np.int32)
    output = utils.mod_launch(mod, (input_data, output), expect=expect)

    return (input_data, ), output, expect, compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
def roipool(shape, roibox, pooled_shape, dtype, kernel_name="roipool_forward_output", attrs=None, target="cce"):
    """
    Build a max ROI-pooling kernel over a single fixed region of a 4-D input.

    The region [roi_t, roi_b) x [roi_l, roi_r) is cropped from the last two
    axes, split into p_h x p_w windows of size win_h x win_w (rounded up), and
    each window is reduced with max. Window positions that fall outside the
    region contribute 0.

    Args:
        shape: 4-element input shape (a_n, a_c, a_h, a_w).
        roibox: (roi_t, roi_b, roi_l, roi_r) bounds of the region.
        pooled_shape: (p_h, p_w) output spatial size.
        dtype (str): only "float16" (case-insensitive) is accepted.
        kernel_name (str): base name passed to utils.gen_name_kernel.
        attrs: attributes forwarded to akg.build.
        target (str): not used in this function; the build target is fixed to "cce".

    Returns:
        tuple: (module returned by akg.build, [a_n, a_c, p_h, p_w]).

    Raises:
        RuntimeError: if dtype is not supported.
        AssertionError: if the shape, region or pooled size is inconsistent.
    """
    check_list = ["float16"]
    if dtype.lower() not in check_list:
        # The message previously named "tile_cce", a leftover from another operator.
        raise RuntimeError("roipool only support %s while dtype is %s" % (",".join(check_list), dtype))

    utils.check_shape(shape)
    assert len(shape) == 4
    assert len(roibox) == 4
    assert len(pooled_shape) == 2

    a_n, a_c, a_h, a_w = shape
    roi_t, roi_b, roi_l, roi_r = roibox
    assert 0 <= roi_t < roi_b < a_h
    assert 0 <= roi_l < roi_r < a_w
    roi_h = roi_b - roi_t
    roi_w = roi_r - roi_l

    a = akg.tvm.placeholder(shape, name="a", dtype=dtype)
    Crop = akg.tvm.compute([a_n, a_c, roi_h, roi_w],
                           lambda n, c, h, w: a[n, c, roi_t + h, roi_l + w])

    # Window size is the region size divided by the pooled size, rounded up.
    p_h, p_w = pooled_shape
    win_h = roi_h // p_h + (1 if roi_h % p_h > 0 else 0)
    win_w = roi_w // p_w + (1 if roi_w % p_w > 0 else 0)
    assert p_h <= roi_h and p_w <= roi_w

    # Expand to (n, c, p_h, p_w, win_h, win_w); positions past the region edge read 0.
    Unpooled = akg.tvm.compute(
        [a_n, a_c, p_h, p_w, win_h, win_w],
        lambda n, c, h, w, wh, ww: akg.tvm.expr.Select(
            akg.tvm.all(h * win_h + wh < roi_h, w * win_w + ww < roi_w),
            Crop[n, c, h * win_h + wh, w * win_w + ww],
            akg.tvm.const(0, a.dtype)))

    rh = akg.tvm.reduce_axis((0, win_h))
    rw = akg.tvm.reduce_axis((0, win_w))
    output_shape = [a_n, a_c, p_h, p_w]
    res = akg.tvm.compute(
        output_shape,
        lambda n, c, h, w: akg.tvm.max(Unpooled[n, c, h, w, rh, rw], axis=[rh, rw]))

    s = akg.tvm.create_schedule(res.op)
    s[Crop].compute_inline()
    s[Unpooled].compute_inline()

    kernel_name = utils.gen_name_kernel(kernel_name, dtype, shape)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [a, res], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod, output_shape
def div_mod_issue(data_shape, weight_shape, case_number):
    """
    Build one of two kernels whose channel index is divided by a constant.

    Case 0 builds "test1": stage1[n, c, h, w] = input0[n, c / 2, h, w] + 1.
    Any other case builds "test2": stage1 uses divisor 3 and is followed by
    stage2[n, c, h, w] = stage1[0, c, 0, 0] + input1[n, c, h, w].

    Args:
        data_shape: 4-D shape of input0 and stage1.
        weight_shape: 4-D shape of input1 and stage2 (unused in case 0).
        case_number (int): 0 selects the single-stage kernel.

    Returns:
        The module returned by akg.build for the selected case.
    """
    source = akg.tvm.placeholder(data_shape, dtype='float16', name='input0')

    def _stage1(divisor):
        # Shared first stage; only the divisor differs between the two cases.
        return akg.tvm.compute(
            data_shape,
            lambda n, c, h, w: source[n, c / divisor, h, w] + 1,
            name="stage1")

    def _lower_then_build(result, tensors, name):
        # Lower once for inspection, then build the kernel under `name`.
        sch = akg.tvm.create_schedule([result.op])
        akg.lower(sch, tensors, simple_mode=True, polyhedral=True)
        with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
            built = akg.build(sch, tensors, "cce", name=name, polyhedral=True)
        return built

    if case_number == 0:
        first = _stage1(2)
        return _lower_then_build(first, [source, first], "test1")

    addend = akg.tvm.placeholder(weight_shape, dtype='float16', name='input1')
    first = _stage1(3)
    second = akg.tvm.compute(
        weight_shape,
        lambda n, c, h, w: first[0, c, 0, 0] + addend[n, c, h, w],
        name="stage2")
    return _lower_then_build(second, [source, addend, second], "test2")
def fc(fMapBatch, weight, fc_dtype, block_size, attrs, kernel_name="Fully_Connected"):
    """
    Build a fully-connected kernel expressed as an mmad reduction.

    Only the `.shape` of the two inputs is read; the kernel operates on new
    placeholders laid out as (n, c // block, h, w, block) for the feature map
    and ((c // block) * h * w, n // block, block, block) for the weight.

    Args:
        fMapBatch: object with a 4-element `.shape` (n, c, h, w).
        weight: object with a 4-element `.shape`; its c, h, w must equal the
            feature map's and its n must be at least 32.
        fc_dtype (str): dtype of both placeholders.
        block_size (int): block size used to split the channel axes.
        attrs: attributes forwarded to akg.build.
        kernel_name (str): name given to the built kernel.

    Returns:
        The module returned by akg.build for [fmap, weight, res].

    Raises:
        RuntimeError: if the two shapes are incompatible.
    """
    # NCHW
    f_n, f_c, f_h, f_w = fMapBatch.shape
    w_n, w_c, w_h, w_w = weight.shape

    if f_c != w_c or f_h != w_h or f_w != w_w or w_n < 32:
        raise RuntimeError("invalid input shape")

    fmap_5d = (f_n, f_c // block_size, f_h, f_w, block_size)
    weight_fractal = (w_c // block_size * w_h * w_w, w_n // block_size, block_size, block_size)
    out_5d = (f_n, w_n // block_size, 1, 1, block_size)

    A = akg.tvm.placeholder(fmap_5d, dtype=fc_dtype, name='fmap')
    B = akg.tvm.placeholder(weight_fractal, dtype=fc_dtype, name='weight')

    # Reduction extents mirror the weight's (c1, h, w, c0) layout.
    k_h = w_h
    k_w = w_w
    kc1 = akg.tvm.reduce_axis((0, w_c // block_size), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, block_size), name='kc0')

    res = akg.tvm.compute(
        out_5d,
        lambda n, c1, h, w, c0: akg.lang.ascend.mmad(
            A[n, kc1, (h + kh), (w + kw), kc0]
            * B[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0],
            axis=[kc1, kh, kw, kc0]),
        name="res")

    schedule = akg.tvm.create_schedule(res.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        return akg.build(schedule, [A, B, res], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
def concat_ad_run(shapes, dtype, axis, attrs):
    """
    Build the concat autodiff kernel and either return it for tuning or run it.

    Args:
        shapes: list of input shapes, one per concatenated tensor.
        dtype (str): input dtype; lower-cased for the placeholders.
        axis (int): concatenation axis, forwarded to concat_ad.concat_ad.
        attrs (dict): build attributes; a 'tuning' key selects the tuning path
            and 'kernel_name' then supplies the kernel name.

    Returns:
        With 'tuning' present and truthy: (mod, expect, args_tuple).
        With 'tuning' present and falsy: mod.
        Otherwise: (inputs + (head_data,), output, expect, compare_result).
    """
    # prepare inputs placeholder
    inp_dtype = dtype.lower()
    data = [
        akg.tvm.placeholder(shape, name="data_%d" % idx, dtype=inp_dtype)
        for idx, shape in enumerate(shapes)
    ]

    kernel_name = utils.genKernelName("concat", inp_dtype, shapes)
    res, head = concat_ad.concat_ad(data, axis)
    opvars = [head] + data + [res]
    schedule = akg.tvm.create_schedule(res.op)
    op_attrs = [axis]

    if 'tuning' in attrs:
        tuning = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(concat_ad.concat_ad, [shapes], [inp_dtype], op_attrs,
                                  kernel_name=kernel_name, attrs=attrs, tuning=tuning)
        if not tuning:
            return mod
        args, expect, head_data, inputs = gen_data(dtype, head, shapes)
        return mod, expect, tuple(args)

    # build the cce kernel
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(schedule, opvars, "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        print(mod.imported_modules[0].get_source())

    args, expect, head_data, inputs = gen_data(dtype, head, shapes)
    output = utils.mod_launch(mod, tuple(args), expect=expect)
    verdict = compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
    return tuple(inputs) + (head_data, ), output, expect, verdict
def elemwise_sum_manual_schedule(input_shape, polyhedral=False, attrs=None):
    """
    Build an element-wise float16 sum with a manually written schedule.

    Both inputs are cache-read into "local.UB", the result is cache-written to
    "local.UB", and the result stage's scope is set to "local.UB".

    Args:
        input_shape: shape shared by both inputs and the output.
        polyhedral (bool): forwarded to akg.build.
        attrs: attributes forwarded to akg.build.

    Returns:
        The module returned by akg.build for [b, c, b + c].
    """
    operands = [
        akg.tvm.placeholder(input_shape, dtype='float16', name=tag)
        for tag in ("b", "c")
    ]
    lhs, rhs = operands
    total = akg.tvm.compute(input_shape, lambda *indices: lhs(*indices) + rhs(*indices))

    sched = akg.tvm.create_schedule([total.op])
    for operand in operands:
        sched.cache_read(operand, "local.UB", [total])
    sched.cache_write(total, "local.UB")
    sched[total].set_scope("local.UB")

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        return akg.build(sched, [lhs, rhs, total], "cce", name="test_manual_schedule",
                         attrs=attrs, polyhedral=polyhedral)
def floormod(shape, dtype, kernel_name, attrs, target="cce"):
    r"""
    Compute element-wise remainder of division.

    \f$res=a - floor(a/b) * b\f$

    The quotient a/b is formed as a * (1/b). The reciprocal starts from
    akg.lang.ascend.vrec(b) and is refined by three Newton iterations of
    para = para * (2 - b * para).

    Args:
        shape (list): shape shared by both operands.
        dtype (str): parameters' type; checked against ALL_FLOAT and INT32.
        kernel_name (str): name given to the built kernel.
        attrs: attributes forwarded to akg.build.
        target (str): not used in this function; the build target is fixed to "cce".

    Returns:
        The module returned by akg.build for the tensors [a, b, res].
    """
    utils.ops_dtype_check(
        dtype, [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32])
    utils.check_shape(shape)

    a = akg.tvm.placeholder(shape=shape, name="a", dtype=dtype)
    b = akg.tvm.placeholder(shape=shape, name="b", dtype=dtype)

    # res = a - floor(a/b) * b
    # Newton's Method for VREC: para <- para * (2 - b * para)
    para = akg.lang.ascend.vrec(b)
    for _ in range(3):
        tmp1 = akg.lang.ascend.vmul(b, para)
        tmp2 = akg.lang.ascend.vmuls(tmp1, -1)
        tmp3 = akg.lang.ascend.vadds(tmp2, 2)
        para = akg.lang.ascend.vmul(tmp3, para)

    c = akg.lang.ascend.vmul(a, para)   # a / b
    d = akg.lang.ascend.floor(c)        # floor(a / b)
    e = akg.lang.ascend.vmul(d, b)      # floor(a / b) * b
    res = akg.lang.ascend.vsub(a, e)

    s = akg.tvm.create_schedule(res.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [a, b, res], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def test_quant(fmap_shape):
    """
    Build a float16 -> int8 quantize kernel and print its generated source.

    The input is laid out as (n, c // 16, h, w, 16) and the output as
    (n, c // 32, h, w, 32); each element is scaled by ScaleQ[0], offset by
    OffsetQ[0] and cast to int8.

    Args:
        fmap_shape: (n, c, h, w) shape; c must be a multiple of 32.
    """
    # input shape(NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    assert in_c % 32 == 0
    fmap_5d_shape = (in_n, in_c // 16, in_h, in_w, 16)

    # placeholder (NC1HWC0)
    FMap = akg.tvm.placeholder(fmap_5d_shape, dtype='float16', name='FMap')
    ScaleQ = akg.tvm.placeholder((16, ), dtype='float16', name='ScaleQ')
    OffsetQ = akg.tvm.placeholder((16, ), dtype='float16', name='OffsetQ')

    out_shape_nc1hwc0 = (in_n, in_c // 32, in_h, in_w, 32)
    print(out_shape_nc1hwc0)

    # quantize
    # NOTE(review): the source channel block is indexed as c1 + c0 // 16;
    # confirm whether 2 * c1 + c0 // 16 was intended.
    Quant = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0:
            (FMap[n, c1 + c0 // 16, h, w, c0 % 16] * ScaleQ[0] + OffsetQ[0]).astype('int8'),
        name='output')

    # Four tiling entries, all registered with index=0 and axis=0.
    info = dim.Dim()
    for l1_tile in (2, 32, 32, 16):
        info.setdim(index=0, axis=0, tilel1=l1_tile, tilel0=0)

    # schedule
    schedule = akg.tvm.create_schedule(Quant.op)
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(schedule, [FMap, ScaleQ, OffsetQ, Quant], 'cce', name='cce_quant',
                        attrs={'dim': str(info)}, polyhedral=True)

    print(mod.imported_modules[0].get_source())
def my_dsl(dtype, kernel_name, attrs):
    """
    Build a 1-D element-wise kernel for the instruction named by `insn`.

    `insn` and `insnType` are read from the enclosing scope. Binary
    instructions are built with [A, B, C]; everything else with [A, C].

    Args:
        dtype (str): dtype of the placeholders and of the scalar constants.
        kernel_name (str): name given to the built kernel.
        attrs: attributes forwarded to akg.build.

    Returns:
        The module returned by akg.build.

    Raises:
        ValueError: if `insn` is not one of the supported instruction names.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m,), name="A", dtype=dtype)
    B = tvm.placeholder((m,), name="B", dtype=dtype)

    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        # Previously this branch overwrote the sqrt result with topi.log(A).
        C = topi.sqrt(A)
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    else:
        # Fail with a clear message instead of an unbound-local error on C.
        raise ValueError("unsupported insn: %s" % insn)

    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def add_a_conv(fmap_shape, filter_shape, pad_, stride_, dilation_,
               tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0, bypass_l1=False,
               use_bias=False, block_size=16, conv_dtype='float16'):
    """
    Build the kernel produced by add_a_conv_compute and dump its source.

    All arguments are forwarded unchanged to add_a_conv_compute, which supplies
    the output tensor, the input placeholders, the kernel name and the tiling
    string. `use_bias` additionally decides whether the bias placeholder is
    part of the kernel's argument list.

    Returns:
        The module returned by akg.build.
    """
    conv, a_value, b_value, bias_value, kernel_name, dim_info = add_a_conv_compute(
        fmap_shape, filter_shape, pad_, stride_, dilation_,
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn,
        bypass_l1, use_bias, block_size, conv_dtype)

    # schedule
    schedule = akg.tvm.create_schedule(conv.op)
    print(conv, a_value, b_value, bias_value)

    build_attrs = {"pragma_rmselfdep": False, 'dim': dim_info}

    # The bias placeholder is only an argument when a bias is used.
    tensors = [a_value, b_value]
    if use_bias:
        tensors.append(bias_value)
    tensors.append(conv)

    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(schedule, tensors, "cce", name=kernel_name, attrs=build_attrs, polyhedral=True)

    utils.create_code(kernel_name, '.', mod.imported_modules[0].get_source())
    return mod
def conv_relu(fmap_shape, filter_shape, pad_, stride_, dilation_,
              tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0, bypass_l1=False,
              use_bias=False, block_size=16, conv_dtype='float16'):
    """
    Build the convolution kernel produced by add_a_conv_compute.

    All arguments are forwarded unchanged to add_a_conv_compute. A leaky-relu
    expression with slope 0.0 is constructed on top of the convolution, but
    the schedule and the build use the convolution tensor itself.

    Returns:
        The module returned by akg.build.
    """
    conv, a_value, b_value, bias_value, kernel_name, dim_info = add_a_conv_compute(
        fmap_shape, filter_shape, pad_, stride_, dilation_,
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn,
        bypass_l1, use_bias, block_size, conv_dtype)

    # leaky relu: max(x, negative_slope * x)
    negative_slope = 0.0
    slope_tmp = akg.tvm.const(negative_slope, dtype=conv_dtype)
    scaled = akg.lang.ascend.vmuls(conv, slope_tmp)
    out = akg.lang.ascend.vmax(scaled, conv)

    # NOTE(review): `out` is never scheduled or passed to akg.build below, so the
    # built kernel contains the convolution only. Confirm whether `out` was
    # meant to be the kernel output.
    schedule = akg.tvm.create_schedule(conv.op)

    tensors = [a_value, b_value]
    if use_bias:
        tensors.append(bias_value)
    tensors.append(conv)

    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(schedule, tensors, "cce", name=kernel_name,
                        attrs={"dim": dim_info}, polyhedral=True)
    return mod
def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_name, attrs):
    """
    Schedule the given compute tensors and build them with akg.build_to_func.

    Args:
        opnames: operator names; only used in the debug log message.
        computes: output tensors; their `.op` values seed the schedule.
        args: argument list forwarded to akg.build_to_func.
        custom_schedule: optional callable applied to the schedule; when given,
            polyhedral scheduling is disabled.
        device (str): "aicore" or "aicpu".
        kernel_name (str): name given to the built function.
        attrs (dict): optional build attributes. The BINDS entry is popped from
            this dict (the caller's object is modified) and passed as `binds`.

    Returns:
        The result of akg.build_to_func, or None if the device is unsupported
        or any exception is raised while scheduling or building.
    """
    if device not in ("aicore", "aicpu"):
        logging.error("Device %s is not in [aicore, aicpu].", device)
        return None

    # The format string needs a placeholder for its argument; without one the
    # logging module fails to format the record whenever DEBUG is enabled.
    logging.debug("op_build_to_func for %s", opnames)

    polyhedral = True
    dump_ir = os.getenv(get_dump_ir_flag()) == "on"

    try:
        tmp_outputs = [x.op for x in computes]
        s = akg.tvm.create_schedule(tmp_outputs)
        if custom_schedule:
            # A hand-written schedule replaces polyhedral scheduling.
            polyhedral = False
            custom_schedule(s)

        with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=dump_ir):
            if attrs:
                binds = attrs.pop(BINDS, None)
                rst = akg.build_to_func(s, args, name=kernel_name, attrs=attrs, polyhedral=polyhedral,
                                        binds=binds, target=_get_target(device))
            else:
                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral,
                                        target=_get_target(device))
    except Exception:
        # Build failures are reported through the log and signalled by None.
        logging.error(traceback.format_exc())
        return None
    return rst
def focalloss_ad_run2(shape, dtype, attrs):
    """
    Build the focal-loss autodiff kernel and launch it on generated data.

    Args:
        shape: shape of the logits and labels placeholders; shape[0] and
            shape[1] are also used to generate the kernel name.
        dtype (str): dtype of the logits placeholder and of the output buffer.
        attrs: attributes forwarded to akg.build.

    Returns:
        tuple: ((input_np, head_np), output, expect, compare_result).

    NOTE(review): as written, `expect` is read (expect.shape) before it is
    assigned in this function, and `batchsize` / `input_np` are not defined in
    this function - presumably module-level names; verify before relying on
    this helper.
    """
    logits_pld = akg.tvm.placeholder(shape, dtype=dtype, name='logits')
    labels_pld = akg.tvm.placeholder(shape, dtype='int32', name='labels')
    d_labels, d_logits, head = focalloss_ad.focalloss_ad(labels_pld, logits_pld)
    print("autodiff d_logits:\n", akg.tvm.PrintTensorRecursively(d_logits))
    print("autodiff d_labels:\n", akg.tvm.PrintTensorRecursively(d_labels))

    # build autodiff kernels
    io = [labels_pld, logits_pld, head, d_labels, d_logits]
    s = akg.tvm.create_schedule([e.op for e in io])
    kernel_name = utils.gen_name_kernel("focalloss_ad", dtype, (shape[0], shape[1],))
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, io, "cce", name=kernel_name, attrs=attrs, polyhedral=True)

    # Generated host data; `batchsize` is not defined in this function.
    labels_np = RANGEFILL((batchsize,))
    logits_np = RANGEFILL((batchsize,), 2)
    head_np = RANGEFILL((batchsize,), 2)
    # NOTE(review): `expect` is a local assigned only further down, so this
    # read happens before assignment.
    output = np.full(expect.shape, np.nan, dtype)
    # NOTE(review): the kernel is built with five tensors (io) but launched
    # with four arguments - confirm against utils.mod_launch.
    output = utils.mod_launch(mod, (labels_np, logits_np, head_np, output), expect=output)
    # The launch result doubles as the expectation, so the comparison below
    # checks output against itself.
    expect = output  # hack
    return (input_np, head_np), output, expect, compare_tensor(output, expect, atol=0.1)
def logsoftmax_ad(shape, dtype, axis, kernel_name, attrs):
    """
    Compute the gradient of logsoftmax by autodiff.

    The input shape is collapsed to 2-D by folding every leading axis into the
    second-to-last one; only the last axis is supported.

    Args:
        shape: input shape with at least two axes.
        dtype (str): only "float16" (case-insensitive) is accepted.
        axis (int): axis to normalise over; negative values count from the end.
        kernel_name (str): not used here; the kernel is built as "test2".
        attrs: attributes forwarded to akg.build.

    Returns:
        The module returned by akg.build for [head, input, d(out)/d(input)].

    Raises:
        RuntimeError: if dtype is unsupported or axis is not the last axis.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        raise RuntimeError(
            "logsoftmax test only support %s while dtype is %s" % (",".join(supported), dtype))

    rank = len(shape)
    if axis < 0:
        axis += rank
    if axis >= rank:
        raise RuntimeError("axis should be less than dimension")
    if axis != rank - 1:
        raise RuntimeError("Only support the last axis currently")

    # Fold all leading axes into the row count.
    rows = shape[-2]
    for leading in shape[:-2]:
        rows = rows * leading
    shape = [rows, shape[-1]]

    # NOTE(review): `axis` still refers to the original rank after the shape has
    # been collapsed to 2-D - confirm that logsoftmax_op expects this.
    a_up = akg.tvm.placeholder(shape, dtype=dtype, name="input")
    b_up = logsoftmax.logsoftmax_op(a_up, shape, axis)

    head = akg.tvm.placeholder(b_up.shape, name="head", dtype=dtype)
    grad = list(akg.differentiate(b_up, [a_up], head))[0]

    sjac = akg.tvm.create_schedule([grad.op])
    sjac[grad.op.input_tensors[1]].compute_inline()

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        return akg.build(sjac, [head, a_up, grad], "cce", name="test2", attrs=attrs, polyhedral=True)
def range_run(start, limit, delta, dtype, attrs):
    """
    Build, launch and verify the range kernel.

    Args:
        start (int): first value of the sequence.
        limit (int): exclusive end of the sequence.
        delta (int): step between consecutive values.
        dtype (str): dtype of the kernel output buffer.
        attrs: attributes forwarded to akg.build.

    Returns:
        tuple: ((), output, expect, compare_result), where compare_result is
        the value of compare_tensor(output, expect, ...).
    """
    t_range = tvm_range.range_value(start, limit, delta, dtype)

    # Create module
    sch = akg.tvm.create_schedule(t_range.op)
    kernel_name = "range"
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [t_range], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        print(mod.imported_modules[0].get_source())

    # Generate data for testing the op
    expect = np.asarray(list(range(start, limit, delta)))
    # Size the output from the reference sequence. The former
    # max(0, (limit - start) / delta) produced a float, which is not a valid
    # shape, and was one short whenever delta did not divide the span.
    output = np.full(expect.shape, np.nan, dtype)
    output = utils.mod_launch(mod, (output, ), expect=expect)

    return tuple(), output, expect, compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
def vector_matmul(data_m, data_n, data_k, trans_a, trans_b, dtype, kernel_name, attrs):
    """
    Build a matmul kernel written with the hybrid script front end.

    Args:
        data_m (int): rows of the output.
        data_n (int): columns of the output.
        data_k (int): reduction length.
        trans_a (bool): whether the first operand is stored transposed.
        trans_b (bool): whether the second operand is stored transposed.
        dtype (str): "float16" or "float32".
        kernel_name (str): name given to the built kernel and its dumped source.
        attrs: attributes forwarded to akg.build.

    Returns:
        tuple: (module returned by akg.build, (m, n) output shape).

    Raises:
        TypeError: if dtype is not supported.
        ValueError: if both operands are transposed (or the flags match none of
            the supported combinations).
    """
    check_list = ["float16", "float32"]
    if dtype not in check_list:
        # The message previously named "softmax test", a leftover from another operator.
        raise TypeError("vector_matmul only support %s while dtype is %s" % (",".join(check_list), dtype))

    m = data_m
    n = data_n
    k = data_k
    data_shape, weight_shape = get_shape(m, n, k, trans_a, trans_b)
    output_shape = (m, n)

    A = akg.tvm.placeholder(data_shape, name='A', dtype=dtype)
    B = akg.tvm.placeholder(weight_shape, name='B', dtype=dtype)
    ZERO = akg.tvm.const(0.0, dtype=dtype)

    # The three hybrid-script kernels below are kept exactly as written: their
    # bodies are consumed by the hybrid front end rather than run as Python.

    # a: (m, k), b: (k, n)
    @script
    def matmul_hybrid_f_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_m, i_k] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    # a: (m, k), b: (n, k)
    @script
    def matmul_hybrid_f_t(a, b, zero):
        t_1 = allocate((m, n, k), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_n in range(0, n):
                t_2[i_m, i_n] = zero
                for i_k in range(0, k):
                    t_1[i_m, i_n, i_k] = a[i_m, i_k] * b[i_n, i_k]
                    t_2[i_m, i_n] = t_1[i_m, i_n, i_k] + t_2[i_m, i_n]
        return t_2

    # a: (k, m), b: (k, n)
    @script
    def matmul_hybrid_t_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_k, i_m] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    # Explicit comparisons are kept so that non-bool flags still fall through
    # to the ValueError exactly as before.
    if trans_a == False and trans_b == False:
        C = matmul_hybrid_f_f(A, B, ZERO)
    elif trans_a == False and trans_b == True:
        C = matmul_hybrid_f_t(A, B, ZERO)
    elif trans_a == True and trans_b == False:
        C = matmul_hybrid_t_f(A, B, ZERO)
    else:
        raise ValueError('Not support both transpose yet')

    forward_s = akg.tvm.create_schedule(C.op)
    op_vars = [A, B, C]
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(forward_s, op_vars, "cce", name=kernel_name, attrs=attrs, polyhedral=True)

    source_code = mod.imported_modules[0].get_source()
    create_code(kernel_name, "./", source_code)
    return mod, output_shape
def group_conv_ad(_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w, pad_h, pad_w, _s_h, _s_w,
                  cut_h, cut_co, cut_m, cut_k, cut_n, block_size, use_bias=False, kernel_name='group_conv'):
    """
    Build the kernels needed for the backward pass of a grouped convolution.

    Six float16 kernels are built: four layout helpers (group-flipped weight,
    strided head, transposed/regrouped input, transposed/converted head) and
    the two autodiff kernels (gradient w.r.t. data and w.r.t. weights).

    Args:
        _n, _h, _w, _c_i: batch, height, width and channel count of the input.
        _c_o: number of output channels.
        group: number of convolution groups.
        _k_h, _k_w: kernel height and width.
        pad_h, pad_w: padding, forwarded to group_conv_forward.
        _s_h, _s_w: strides.
        cut_h, cut_co, cut_m, cut_k, cut_n: tiling sizes forwarded to
            group_conv_forward.
        block_size: block size used by the 5-D and fractal layouts.
        use_bias: not referenced in this function.
        kernel_name: not referenced in this function.

    Returns:
        tuple: (mod_ad_data, mod_ad_weights, mod_b_group_flip,
        mod_head_strided, mod_transposed_nc, mod_transposed_convert).
    """
    conv_dtype = 'float16'
    _a = akg.tvm.placeholder((_n, _c_i // block_size, _h, _w, block_size), name="input0", dtype=conv_dtype)
    _b = akg.tvm.placeholder(((_c_i // group) // block_size * _k_h * _k_w, _c_o // block_size, block_size, block_size),
                             name="input1", dtype=conv_dtype)

    # Despite its name, mod_forward is used as a tensor: its .shape is read and
    # it is the expression handed to akg.differentiate below.
    mod_forward = group_conv_forward(_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w, _a, _b, None,
                                     pad_h, pad_w, _s_h, _s_w, cut_h, cut_co, cut_m, cut_k, cut_n, block_size)
    _o_h = mod_forward.shape[2].value
    _o_w = mod_forward.shape[3].value

    head = akg.tvm.placeholder(mod_forward.shape, name="head", dtype=conv_dtype)
    # (_n,_c_o,_o_h,_o_w)--(stride)-->(_n,_c_o,(_o_h-1)*_s_h+1,
    # (_o_w-1)*_s_w+1)--(5d)-->(_n,_c_o/16,(_o_h-1)*_s_h+1,(_o_w-1)*_s_w+1,16)
    pld_head_strided = akg.tvm.placeholder((_n, _c_o // block_size, (_o_h - 1) * _s_h + 1, (_o_w - 1) * _s_w + 1, block_size),
                                           name="head_strided_5d", dtype=conv_dtype)

    # (_c_o,_c_i//group,_k_h,_k_w)--(flip)-->
    # (_c_i,_c_o//group,_k_h,_k_w)--(Fractal)-->((_c_o//group)/16*_k_h*_k_w, _c_i/16,16,16)
    pld_b_flipped = akg.tvm.placeholder(((_c_o // group) // block_size * _k_h * _k_w, _c_i // block_size, block_size, block_size),
                                        name="b_flip", dtype=conv_dtype)

    # Helper kernel 1: group-wise flip of the weight.
    # b in Fractal format; result in Fractal format
    b_group_flipped = group_flip_weight(_b, _k_h, _k_w, group, _c_o // group // block_size, _c_i // group // block_size, block_size)
    s_gr_fl = akg.tvm.create_schedule([b_group_flipped.op])
    info = dim.Dim()
    info.setdim(index=0, axis=0, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=1, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=2, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=3, tilel1=1, tilel0=1)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        mod_b_group_flip = akg.build(s_gr_fl, [_b, b_group_flipped], "cce", name="b_group_flip",
                                     attrs={"dim": str(info)}, polyhedral=True)

    # Helper kernel 2: strided head; reuses the tiling info set up above.
    head_strided = strided_head(head, _s_h, _s_w)
    s_striding = akg.tvm.create_schedule(head_strided.op)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        mod_head_strided = akg.build(s_striding, [head, head_strided], "cce", name="h_strided",
                                     attrs={"dim": str(info)}, polyhedral=True)

    # Helper kernel 3: transposed and regrouped input.
    a_transposed = transpose_regroup(_a, block_size, group)
    s_transposed_nc = akg.tvm.create_schedule(a_transposed.op)
    info = dim.Dim()
    info.setdim(index=0, axis=0, tilel1=16, tilel0=16)
    info.setdim(index=0, axis=1, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=2, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=3, tilel1=1, tilel0=1)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_transposed_nc = akg.build(s_transposed_nc, [_a, a_transposed], "cce", name="a_transposed",
                                      attrs={"dim": str(info)}, polyhedral=True)

    # Helper kernel 4: transposed and converted head.
    head_transposed_convert = transpose_convert_head(head, block_size)
    s_transposed_convert = akg.tvm.create_schedule(head_transposed_convert.op)
    info = dim.Dim()
    info.setdim(index=0, axis=0, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=1, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=2, tilel1=1, tilel0=1)
    info.setdim(index=0, axis=3, tilel1=1, tilel0=1)
    # NOTE(review): this kernel is also named "a_transposed", the same name as
    # the previous build - confirm whether a distinct name was intended.
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_transposed_convert = akg.build(s_transposed_convert, [head, head_transposed_convert], "cce", name="a_transposed",
                                           attrs={"dim": str(info)}, polyhedral=True)

    # Begin with the ad kernels
    ad_attrs = {"ad_conv_enable": 1}
    # Gradient w.r.t. the input data, expressed on the strided head and the
    # flipped weight placeholders.
    _jacs_data = list(akg.differentiate(mod_forward, [_a], head, ad_attrs, [pld_head_strided, pld_b_flipped, None]))

    cut_h_e, cut_co_e, cut_m_e, cut_k_e, cut_n_e = ((_o_h - 1) * _s_h + 1 + 2 * (_k_h - 1 - pad_h), 16, _h * _w, 48, 16)
    # Round the m tile up to a multiple of block_size.
    cut_m_e = ((cut_m_e + block_size - 1) // block_size) * block_size

    info = set_dims_group(cut_h_e, cut_co_e, cut_m_e, cut_k_e, cut_n_e,
                          expr_to_int(_a.shape), _c_o, _c_i, group, _k_h, _k_w, _s_h, block_size)

    s_data = akg.tvm.create_schedule([_jacs_data[0].op])
    # low_data = akg.lower(s_data, [pld_head_strided, pld_b_flipped, _jacs_data[0]], simple_mode=True)
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        mod_ad_data = akg.build(s_data, [pld_head_strided, pld_b_flipped, _jacs_data[0]], "cce", name="conv_ad_data",
                                attrs={"dim": info}, polyhedral=True)

    # (_n,_c_i,_h,_w)--(trans)-->(_c_i,_n,_h,_w)--(regroup)-->
    # (_c_i//group,_n*group,_h,_w)--(5d)-->(_c_i//group,(_n*group)/16,_h,_w,16)
    pld_x_trans = akg.tvm.placeholder((_c_i // group, (_n * group) // block_size, _h, _w, block_size),
                                      name="x_trans_5d", dtype=conv_dtype)

    # (_n,_c_o,_o_h,_o_w)--(trans)-->
    # (_c_o,_n,_o_h,_o_w)--(Fractal)-->(_n/16*_o_h*_o_w, _c_o/16,16,16)
    pld_head_trans_converted = akg.tvm.placeholder((_n // block_size * _o_h * _o_w, _c_o // block_size, block_size, block_size),
                                                   name="head_trans_convert", dtype=conv_dtype)

    # ad_attrs = {"ad_conv_enable": 1}
    # Gradient w.r.t. the weights, expressed on the transposed input and the
    # transposed/converted head placeholders.
    _jacs_weights = list(akg.differentiate(mod_forward, [_b], head, ad_attrs, [pld_x_trans, pld_head_trans_converted, None]))

    cut_h_e, cut_co_e, cut_m_e, cut_k_e, cut_n_e = (_h + 2 * pad_h, 16, _k_h * _k_w, 48, 16)
    # Round the m tile up to a multiple of block_size.
    cut_m_e = ((cut_m_e + block_size - 1) // block_size) * block_size

    info = set_dims_group(cut_h_e, cut_co_e, cut_m_e, cut_k_e, cut_n_e,
                          (_c_i // group, _c_o // block_size, _k_h, _k_w, block_size),
                          _n * group, _c_o, group, _o_h, _o_w, 1, block_size)

    s_weights = akg.tvm.create_schedule([_jacs_weights[0].op])
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_ad_weights = akg.build(s_weights, [pld_x_trans, pld_head_trans_converted, _jacs_weights[0]], "cce",
                                   name="conv_ad_weights", attrs={"dim": info}, polyhedral=True)

    # Summary of every tensor shape involved in the forward and backward kernels.
    print("Forward input data shape: ", _a.shape)
    print("Forward input weight shape: ", _b.shape)
    print("Forward output shape: ", mod_forward.shape)
    print("Backward wrt. DATA input data shape: ", pld_head_strided.shape)
    print("Backward wrt. DATA input weight shape: ", pld_b_flipped.shape)
    print("Backward wrt. DATA output shape: ", _jacs_data[0].shape)
    print("Backward wrt. WEIGHT input data shape: ", pld_x_trans.shape)
    print("Backward wrt. WEIGHT input weight shape: ", pld_head_trans_converted.shape)
    print("Backward wrt. WEIGHT output shape: ", _jacs_weights[0].shape)

    return mod_ad_data, mod_ad_weights, mod_b_group_flip, mod_head_strided, mod_transposed_nc, mod_transposed_convert
def conv_01(fmap_shape, filter_shape, pad_, stride_, dilation_,
            tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0,
            use_bias=False, block_size=16, conv_dtype='float16'):
    """Build the backward kernels of a convolution w.r.t. its data and its weight.

    The forward convolution is created with ``Conv`` and differentiated twice
    with ``akg.differentiate``: once w.r.t. the feature map ``A`` and once
    w.r.t. the filter ``B``.

    Args:
        fmap_shape: 4-element feature map shape, unpacked as (n, c, h, w).
        filter_shape: 4-element filter shape, unpacked as (n, c, h, w).
        pad_: padding, forwarded unchanged to ``Conv``.
        stride_: 2-element stride, unpacked as (s_h, s_w).
        dilation_: dilation, forwarded unchanged to ``Conv``.
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn: tiling values forwarded
            to ``set_dims``.
        use_bias (bool): add a bias placeholder to the forward convolution.
        block_size (int): channel block size used for the 5D/fractal layouts.
        conv_dtype (str): dtype of the forward placeholders.

    Returns:
        Tuple ``(mod_backward, mod_backward2)``: the built module for the
        gradient w.r.t. data and the one for the gradient w.r.t. weight.
    """
    # input shape (NCHW -> NC1HWC0); channels are rounded up to a multiple of block_size
    in_n, in_c, in_h, in_w = fmap_shape
    in_c = (in_c + block_size - 1) // block_size * block_size
    # kernel shape (NCHW -> NC1HWC0 -> Fractal); both channel dims are rounded up as well
    k_n, k_c, k_h, k_w = filter_shape
    k_c = (k_c + block_size - 1) // block_size * block_size
    k_n = (k_n + block_size - 1) // block_size * block_size
    k_hw = k_h * k_w
    const_shift = k_hw - 1

    input_shape_nc1hwc0 = (in_n, in_c // block_size, in_h, in_w, block_size)
    kernel_shape_fractal = (k_c // block_size * k_h * k_w, k_n // block_size, block_size, block_size)

    # A placeholder (NC1HWCO)
    A = akg.tvm.placeholder(input_shape_nc1hwc0, dtype=conv_dtype, name="input0")
    # B_placeholder (fractal)
    B = akg.tvm.placeholder(kernel_shape_fractal, dtype=conv_dtype, name="input1")
    data = [A, B]
    if use_bias:
        bias_shape_nc1hwc0 = (1, k_n // block_size, 1, 1, block_size)
        data.append(akg.tvm.placeholder(bias_shape_nc1hwc0, dtype=conv_dtype, name="input2"))

    conv, _ = Conv(data, fmap_shape, filter_shape, pad_, stride_, dilation_, use_bias)

    kernel_name = 'conv_ad'

    # B in Fractal format; result in Fractal format
    def flip_weight(B, k_c, k_hw, const_shift):
        out_shape = (B.shape[1].value * k_hw, k_c // block_size, block_size, block_size)
        B_flip = akg.tvm.compute(
            out_shape,
            lambda i0, i1, i2, i3: B[i1 * k_hw + const_shift - truncmod(i0, k_hw),
                                     floordiv(i0, k_hw), i3, i2],
            name=B.name + "_flipped")
        return B_flip

    # Insert zeros between the elements of H along h and w so that its spatial
    # size becomes (h - 1) * s_h + 1 by (w - 1) * s_w + 1.
    def strided_head(H, s_h, s_w):
        n, c1, h, w, c0 = H.shape
        out_shape = (n, c1, (h - 1) * s_h + 1, (w - 1) * s_w + 1, c0)
        H_strided = akg.tvm.compute(
            out_shape,
            lambda i0, i1, i2, i3, i4: akg.tvm.expr.Select(
                akg.tvm.any(truncmod(i2, s_h) != 0, truncmod(i3, s_w) != 0),
                akg.tvm.const(0.0, dtype="float16"),
                H[i0, i1, floordiv(i2, s_h), floordiv(i3, s_w), i4]),
            name=H.name + "_strided")
        return H_strided

    # ---- gradient w.r.t. the input data ----
    B_flip = flip_weight(B, k_c, k_hw, const_shift)
    pld_B_flip = akg.tvm.placeholder(B_flip.shape, name="inp1_flipped", dtype='float16')
    HEAD = akg.tvm.placeholder(conv.shape, name="Head", dtype='float16')

    s_h, s_w = stride_
    if (s_h == 1) and (s_w == 1):
        # unit stride: the head is consumed as is
        head_input = HEAD
    else:
        # otherwise the kernel consumes a placeholder shaped like the zero-strided head
        Head_strided = strided_head(HEAD, s_h, s_w)
        head_input = akg.tvm.placeholder(Head_strided.shape, name="head_strided", dtype='float16')

    ad_attrs = {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1}
    jacs = list(
        akg.differentiate(conv, [A], HEAD, ad_attrs, [head_input, pld_B_flip, None]))
    sjac = akg.tvm.create_schedule([jacs[0].op])
    op_vars = [head_input, pld_B_flip, jacs[0]]
    h_n, h_c1, h_h, h_w, h_c0 = head_input.shape
    info = set_dims(
        (h_n.value, h_c1.value * h_c0.value, h_h.value, h_w.value),
        (k_c, k_n, k_h, k_w), (k_h - 1, k_w - 1), (1, 1), (1, 1),
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, block_size)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_backward = akg.build(sjac, op_vars, "cce", name=kernel_name,
                                 attrs={"dim": str(info)}, polyhedral=True)

    # ---- gradient w.r.t. the weight ----
    def transpose_data(A):
        out_shape = (A.shape[1] * block_size, truncdiv(A.shape[0], block_size),
                     A.shape[2], A.shape[3], block_size)
        A_transpose = akg.tvm.compute(
            out_shape,
            lambda j0, j1, j2, j3, j4: A[j1 * block_size + j4, truncdiv(j0, block_size),
                                         j2, j3, truncmod(j0, block_size)],
            name=A.name + "_transposed")
        return A_transpose

    # Head is in 5D format
    # Output is in Fractal format
    def transpose_convert_head(Head):
        out_shape = ((floordiv(Head.shape[0].value, block_size)) * Head.shape[2].value * Head.shape[3].value,
                     Head.shape[1].value, block_size, block_size)
        tmp_6D_shape = (floordiv(Head.shape[0].value, block_size), block_size,
                        Head.shape[1].value, Head.shape[2].value, Head.shape[3].value, block_size)
        Head_6D = akg.topi.reshape(Head, tmp_6D_shape)
        # Transpose from (N//block_size_N, block_size_N, C//block_size_C, H, W, block_size_C)
        # to (N//block_size_N, H, W, C//block_size_C, block_size_C, block_size_N,)
        Head_6D_transpose = akg.topi.transpose(Head_6D, (0, 3, 4, 2, 5, 1))
        Head_transpose_convert = akg.topi.reshape(Head_6D_transpose, out_shape)
        return Head_transpose_convert

    X_transposed = transpose_data(A)
    pld_X_transposed = akg.tvm.placeholder(X_transposed.shape, name="inp0_transposed", dtype='float16')

    if (s_h > 1) or (s_w > 1):
        Head_transposed_converted = strided_head(HEAD, s_h, s_w)
    else:
        Head_transposed_converted = HEAD

    # remember the 5D (possibly strided) head shape before it is converted to fractal
    strided_head_n, strided_head_c1, strided_head_h, strided_head_w, strided_head_c0 = \
        Head_transposed_converted.shape
    Head_transposed_converted = transpose_convert_head(Head_transposed_converted)

    # the schedule created here is discarded (bound to `_`), as in the original code
    _ = akg.tvm.create_schedule(Head_transposed_converted.op)

    pld_Head_transposed_converted = akg.tvm.placeholder(
        Head_transposed_converted.shape, name="head_transposed", dtype='float16')
    ad_attrs = {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1}
    jacs = list(
        akg.differentiate(
            conv, [B], HEAD, ad_attrs,
            [pld_X_transposed, pld_Head_transposed_converted, None]))
    sjac = akg.tvm.create_schedule([jacs[0].op])

    op_vars = [HEAD, pld_X_transposed, pld_Head_transposed_converted, jacs[0]]
    in_n, in_c1, in_h, in_w, in_c0 = A.shape
    info = set_dims(
        (in_c1.value * in_c0.value, in_n.value, in_h.value, in_w.value),
        (strided_head_c1.value * strided_head_c0.value, strided_head_n.value,
         strided_head_h.value, strided_head_w.value),
        (0, 0), (1, 1), (1, 1),
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn, block_size)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod_backward2 = akg.build(sjac, op_vars, "cce", name="conv_backward_weight",
                                  attrs={"dim": str(info)}, polyhedral=True)

    return mod_backward, mod_backward2
def psroialign_compute(fm_shape, roi_shape, class_num, group_size, sample_h, sample_w, scale):
    """Build the "psroialign" kernel: bilinear weights, max pooling per bin, average over bins.

    Args:
        fm_shape: feature map shape (n, c_dim, h, w) where
            c_dim = group_size * group_size * (class_num + 1).
        roi_shape: roi shape (roi_num, 16, 1, 1). There are 5 values on dim C:
            score, x1, y1, x2, y2. The other 11 values are padding.
        class_num (int): number of classes; class_num + 1 channels groups are produced.
        group_size (int): number of bins per roi side; bin_num = group_size * group_size.
        sample_h (int): number of samples per bin along h.
        sample_w (int): number of samples per bin along w.
        scale: factor applied to the roi coordinates.

    Returns:
        The module built by ``akg.build`` with arguments [fm_data, roi_data, out],
        where ``out`` has shape (class_num + 1, aligned_roi_num).
    """
    dtype = "float16"

    fm_data = akg.tvm.placeholder(fm_shape, name="fm_data", dtype=dtype)
    roi_data = akg.tvm.placeholder(roi_shape, name="roi_data", dtype=dtype)
    scale_const = akg.tvm.const(scale, dtype=dtype)
    sample_h_const = akg.tvm.const(sample_h, "int32")
    sample_w_const = akg.tvm.const(sample_w, "int32")
    two_const = akg.tvm.const(2, "float16")
    one_const = akg.tvm.const(1, "float16")
    group_size_const = akg.tvm.const(group_size, "int32")
    bin_num = group_size * group_size

    # ==============================================================
    # step 1: scale coordinates size in original image to size in feature map
    # ==============================================================
    COSIZE = 16
    roi_num = roi_shape[0]
    aligned_roi_num = do_align(roi_num, COSIZE)

    # 4 means x1, y1, x2, y2 (index 0 on dim C of roi_data is the score and is skipped)
    # roi_shape[0] must be equal to COSIZE
    scaled_coors = akg.tvm.compute(
        (4, aligned_roi_num, 1, 1),
        lambda n, c, h, w: roi_data[c, 1 + n, h, w] * scale_const,
        name='scaled_coors')

    # ==============================================================
    # step 2: compute the width and height of roi
    # ==============================================================
    width_shape = (aligned_roi_num, )
    # width = x2 - x1
    width_of_rois = akg.tvm.compute(
        width_shape,
        lambda n: scaled_coors[2, n, 0, 0] - scaled_coors[0, n, 0, 0],
        name='width_of_rois')
    # height = y2 - y1, same orientation as the width. The previous code
    # subtracted in the opposite order (y1 - y2), which gave a negative height.
    height_of_rois = akg.tvm.compute(
        width_shape,
        lambda n: scaled_coors[3, n, 0, 0] - scaled_coors[1, n, 0, 0],
        name='height_of_rois')

    # ==============================================================
    # step 3: compute the bias of the coordinates of all samples
    # ==============================================================
    # NOTE(review): the expressions below evaluate as
    # (length / sample_count) * group_size because of operator precedence;
    # confirm this is the intended unit length.
    unit_w_lengths = akg.tvm.compute(
        width_shape,
        lambda n: width_of_rois(n) / sample_w_const * group_size_const,
        name='uint_w_lengths')
    unit_h_lengths = akg.tvm.compute(
        width_shape,
        lambda n: height_of_rois(n) / sample_h_const * group_size_const,
        name='uint_h_lengths')

    sample_w_bias_shape = (1, group_size, sample_w, aligned_roi_num)
    sample_h_bias_shape = (1, group_size, sample_h, aligned_roi_num)

    @akg.tvm.hybrid.script(capture=locals())
    def gen_bias(h_value, unit_lengths, ratio):
        output = output_tensor((1, group_size, h_value, aligned_roi_num), 'float16')
        # strides is zero-initialised but not read anywhere in this script
        strides = allocate((aligned_roi_num, ), 'float16', 'local')
        for w in range(0, aligned_roi_num):
            strides[w] = half(0.0)

        for c in range(0, group_size):
            # first sample: one unit length
            for h in range(0, 1):
                for w in range(0, aligned_roi_num):
                    output[0, c, h, w] = unit_lengths[w]
            # following samples: previous value plus ratio * unit length
            for h in range(1, h_value):
                for w in range(0, aligned_roi_num):
                    output[0, c, h, w] = output[0, c, h - 1, w] + ratio * unit_lengths[w]
        return output

    sample_w_bias = gen_bias(sample_w_const, unit_w_lengths, two_const)
    sample_h_bias = gen_bias(sample_h_const, unit_h_lengths, two_const)

    # sample coordinates = bias + x1 (resp. y1) of the roi
    samples_x_coors = akg.tvm.compute(
        sample_w_bias_shape,
        lambda n, c, h, w: sample_w_bias(n, c, h, w) + scaled_coors(0, w, 0, 0),
        name='samples_x_coors')
    samples_y_coors = akg.tvm.compute(
        sample_h_bias_shape,
        lambda n, c, h, w: sample_h_bias(n, c, h, w) + scaled_coors(1, w, 0, 0),
        name='samples_y_coors')

    # ==============================================================
    # step 4: compute the low and high coordinates of samples for bilinear
    # ==============================================================
    samples_x_coors_low = akg.lang.ascend.floor(samples_x_coors)
    samples_x_coors_high = akg.lang.ascend.ceil(samples_x_coors)
    samples_y_coors_low = akg.lang.ascend.floor(samples_y_coors)
    samples_y_coors_high = akg.lang.ascend.ceil(samples_y_coors)

    # ==============================================================
    # step 5: compute the weight of low and high coordinates for bilinear
    # ==============================================================
    wlx = akg.tvm.compute(
        sample_w_bias_shape,
        lambda *indices: samples_x_coors_high(*indices) - samples_x_coors(*indices),
        name='wlx')
    whx = akg.tvm.compute(
        sample_w_bias_shape,
        lambda *indices: one_const - wlx(*indices),
        name='whx')
    wly = akg.tvm.compute(
        sample_h_bias_shape,
        lambda *indices: samples_y_coors_high(*indices) - samples_y_coors(*indices),
        name='wly')
    why = akg.tvm.compute(
        sample_h_bias_shape,
        lambda *indices: one_const - wly(*indices),
        name='why')

    # products of the x and y weights, indexed (bin_h, bin_w, sample_h, sample_w, roi)
    samples_shape = (group_size, group_size, sample_h, sample_w, aligned_roi_num)
    wlxXwly = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: wlx(0, j, n, k) * wly(0, i, m, k),
        name='wlxXwly')
    whxXwly = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: whx(0, j, n, k) * wly(0, i, m, k),
        name='whxXwly')
    wlxXwhy = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: wlx(0, j, n, k) * why(0, i, m, k),
        name='wlxXwhy')
    whxXwhy = akg.tvm.compute(
        samples_shape,
        lambda i, j, m, n, k: whx(0, j, n, k) * why(0, i, m, k),
        name='whxXwhy')

    boundaries_values_shape = (4, sample_h, sample_w, aligned_roi_num)
    bin_values_shape = (1, class_num + 1, bin_num, aligned_roi_num)
    gap_values_shape = (class_num + 1, aligned_roi_num)

    @akg.tvm.hybrid.script
    def fetch_data(shape, fm_in, c_idx, bin_idx, bin_num, group_size, sample_h, sample_w,
                   roi_num, x_low, x_high, y_low, y_high, one_value):
        boundaries_values = output_tensor(shape, 'float16')
        for i in range(0, sample_h):
            for j in range(0, sample_w):
                for k in range(0, roi_num):
                    # The four corner values (x_low/y_low, x_high/y_low,
                    # x_low/y_high, x_high/y_high) are all filled with one_value;
                    # fm_in and the low/high index tensors are not read here.
                    boundaries_values[0, i, j, k] = one_value
                    boundaries_values[1, i, j, k] = one_value
                    boundaries_values[2, i, j, k] = one_value
                    boundaries_values[3, i, j, k] = one_value
        return boundaries_values

    @akg.tvm.hybrid.script(capture=locals())
    def compute_bilinear_maxpool_gap(fm_in, x_low, x_high, y_low, y_high,
                                     wlxXwly_, whxXwly_, wlxXwhy_, whxXwhy_, one_value):
        bin_values = allocate(bin_values_shape, 'float16', 'local')
        # global average result
        gap_values = output_tensor(gap_values_shape, 'float16')

        for c in range(0, class_num + 1):
            for b in range(0, bin_num):
                boundaries_values = fetch_data(boundaries_values_shape, fm_in, c, b, bin_num,
                                               group_size, sample_h, sample_w, roi_num,
                                               x_low, x_high, y_low, y_high, one_value)
                k_w = b % group_size
                k_h = b // group_size

                for n in range(0, roi_num):
                    bin_values[0, c, b, n] = half(0.0)

                for h in range(0, sample_h):
                    for w in range(0, sample_w):
                        for n in range(0, roi_num):
                            # bilinear
                            tmp = boundaries_values[0, h, w, n] * wlxXwly_[k_h, k_w, h, w, n] + \
                                boundaries_values[1, h, w, n] * whxXwly_[k_h, k_w, h, w, n] + \
                                boundaries_values[2, h, w, n] * wlxXwhy_[k_h, k_w, h, w, n] + \
                                boundaries_values[3, h, w, n] * whxXwhy_[k_h, k_w, h, w, n]
                            # maxpooling
                            if tmp > bin_values[0, c, b, n]:
                                bin_values[0, c, b, n] = tmp

            # global average pooling
            for j in range(0, roi_num):
                tmp1 = bin_values[0, c, 0, j]
                for k in range(1, bin_num):
                    tmp1 += bin_values[0, c, k, j]
                gap_values[c, j] = tmp1 / bin_num
        return gap_values

    # ==============================================================
    # step 6: compute results of bilinear, maxpooling and global average pooling
    # ==============================================================
    out = compute_bilinear_maxpool_gap(fm_data, samples_x_coors_low, samples_x_coors_high,
                                       samples_y_coors_low, samples_y_coors_high,
                                       wlxXwly, whxXwly, wlxXwhy, whxXwhy, one_const)

    s = akg.tvm.create_schedule(out.op)
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [fm_data, roi_data, out], "cce", name="psroialign", polyhedral=True)
    return mod
def group_conv_forward(_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w, _a, _b, bias_value, pad_h, pad_w, _s_h, _s_w,
                       cut_h, cut_co, cut_m, cut_k, cut_n, block_size, use_bias=False, kernel_name='group_conv'):
    """Define and build the forward group convolution, returning its output tensor.

    Args:
        _n, _h, _w, _c_i, _c_o: batch, height, width, input channels and output
            channels of the convolution.
        group: number of groups; must divide both _c_i and _c_o.
        _k_h, _k_w: kernel height and width.
        _a: feature map tensor, indexed as [n, c1, h, w, c0].
        _b: filter tensor, indexed as [(kc1 * _k_h + kh) * _k_w + kw, c1, c0, kc0].
        bias_value: bias tensor, only read when use_bias is true.
        pad_h, pad_w: padding applied on both sides of h and w.
        _s_h, _s_w: strides along h and w.
        cut_h, cut_co, cut_m, cut_k, cut_n: tiling cuts, stored in the conv
            pragmas and passed to ``set_dims_group``.
        block_size: channel block size of the 5D layout.
        use_bias (bool): add ``bias_value`` to the convolution result.
        kernel_name (str): name given to the built kernel.

    Returns:
        The output tensor (with bias added when use_bias is true). The module
        produced by ``akg.build`` is not returned.
    """
    if not isinstance(_n, int):
        # sizes were passed as expressions: turn every one of them into an int
        _n, _h, _w, _c_i, _c_o, group, _k_h, _k_w = expr_to_int(
            (_n, _h, _w, _c_i, _c_o, group, _k_h, _k_w))
        pad_h, pad_w, _s_h, _s_w = expr_to_int((pad_h, pad_w, _s_h, _s_w))
        cut_h, cut_co, cut_m, cut_k, cut_n, block_size = expr_to_int(
            (cut_h, cut_co, cut_m, cut_k, cut_n, block_size))

    dtype_name = 'float16'

    # a cut covering the whole height also has to cover the padding rows
    if cut_h == _h:
        cut_h += 2 * pad_h

    assert _c_o % group == 0
    assert _c_i % group == 0
    assert _c_o % block_size == 0
    assert (_c_i // group) % block_size == 0

    out_height = (_h + 2 * pad_h - _k_h) // _s_h + 1
    out_width = (_w + 2 * pad_w - _k_w) // _s_w + 1

    kc1 = akg.tvm.reduce_axis((0, _c_i // block_size // group), name='kc1')
    kh = akg.tvm.reduce_axis((0, _k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, _k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, block_size), name='kc0')

    p_top = p_bottom = pad_h
    p_left = p_right = pad_w
    output_name = 'output'
    output_bias_name = 'output_bias'

    # number of c1 blocks owned by one group on the output and on the input side
    out_c1_per_group = (_c_o // block_size) // group
    in_c1_per_group = (_c_i // block_size) // group

    conv_pragmas = {
        "pragma_conv_kernel_n": _c_o,
        "pragma_conv_kernel_h": _k_h,
        "pragma_conv_kernel_w": _k_w,
        "pragma_conv_padding_top": p_top,
        "pragma_conv_padding_bottom": p_bottom,
        "pragma_conv_padding_left": p_left,
        "pragma_conv_padding_right": p_right,
        "pragma_conv_bypass_l1": 1,
        "pragma_conv_stride_h": _s_h,
        "pragma_conv_stride_w": _s_w,
        "pragma_conv_fm_n": _n,
        "pragma_conv_fm_c": _c_i,
        "pragma_conv_fm_h": _h,
        "pragma_conv_fm_w": _w,
        "pragma_conv_dilation_h": 1,
        "pragma_conv_dilation_w": 1,
        "pragma_conv_h_cut": cut_h,
        "pragma_conv_w_cut": _w + 2 * pad_w,
        "pragma_conv_co_cut": cut_co,
        "pragma_conv_m_cut": cut_m,
        "pragma_conv_k_cut": cut_k,
        "pragma_conv_n_cut": cut_n,
        "feature": _a.op.name,
        "filter": _b.op.name,
        "bias": 'bias',
        "res": output_name,
        "res_bias": output_bias_name,
    }

    def conv_body(n, c1, h, w, c0):
        # position of the current tap in the padded feature map
        row = h * _s_h + kh
        col = w * _s_w + kw
        in_padding = akg.tvm.any(row < p_top, row > (_h + p_top - 1),
                                 col < p_left, col > (_w + p_left - 1))
        # taps falling into the padding contribute zero
        fmap_value = akg.tvm.if_then_else(
            in_padding,
            akg.tvm.const(0.0, dtype_name),
            _a[n, c1 // out_c1_per_group * in_c1_per_group + kc1,
               (row - p_top), (col - p_left), kc0])
        return akg.lang.ascend.mmad(
            fmap_value * _b[(kc1 * _k_h + kh) * _k_w + kw, c1, c0, kc0],
            axis=[kc1, kh, kw, kc0])

    C = akg.tvm.compute((_n, _c_o // block_size, out_height, out_width, block_size),
                        conv_body, attrs=conv_pragmas, name=output_name)

    if use_bias:
        out = akg.tvm.compute(
            C.shape,
            lambda n, c1, h, w, c0: C[n, c1, h, w, c0] + bias_value[0, c1, 0, 0, c0],
            name=output_bias_name)
        bufs = [_a, _b, bias_value, out]
    else:
        out = C
        bufs = [_a, _b, out]

    # create schedule for cce
    sch = akg.tvm.create_schedule([out.op])

    # set dim
    dim_info = set_dims_group(cut_h, cut_co, cut_m, cut_k, cut_n, expr_to_int(out.shape),
                              _c_i, _c_o, group, _k_h, _k_w, _s_h, block_size)

    # build; only the output tensor is handed back to the caller
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=False):
        akg.build(sch, bufs, "cce", name=kernel_name, attrs={"dim": dim_info}, polyhedral=True)

    return out
def test_CCE_Conv(fmap_shape, filter_shape, pad_, stride_,
                  tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0,
                  bypass_l1=False, use_bias=False, kernel_name="quant_conv", cce_path='.'):
    """Build (and optionally run) a quantize + int8 convolution kernel.

    The float feature map is quantized to int8 with ``ScaleQ[0]`` and
    ``OffsetQ[0]``, convolved with an int8 fractal filter, optionally biased,
    and optionally followed by a leaky-relu with slope 0.0.

    The names ``block_size``, ``conv_dtype``, ``fusion``, ``run_cce``,
    ``h_window_cut``, ``dim`` and ``run_conv`` are not defined in this
    function; they must be provided by the enclosing module.

    Args:
        fmap_shape: 4-element feature map shape, unpacked as (n, c, h, w).
        filter_shape: 4-element filter shape, unpacked as (n, c, h, w).
        pad_: (pad_h, pad_w); each is applied on both sides.
        stride_: (stride_h, stride_w).
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn: tiling values.
        bypass_l1 (bool): value of the "pragma_conv_bypass_l1" attribute.
        use_bias (bool): add a bias placeholder to the convolution result.
        kernel_name (str): name given to the built kernel.
        cce_path (str): not used by this function.

    Returns:
        None.
    """
    # input shape (NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    in_c1 = in_c // block_size
    input_shape_nc1hwc0 = (in_n, in_c1, in_h, in_w, block_size)

    # kernel shape (NCHW -> NC1HWC0 -> Fractal); the int8 filter uses a c0 of 32
    k_n, k_c, k_h, k_w = filter_shape
    k_c1 = k_c // 32
    k_c0 = 32
    kernel_shape_fractal = (k_c // 32 * k_h * k_w, k_n // 16, 16, 32)

    # bias shape
    bias_shape_nc1hwc0 = (1, k_n // block_size, 1, 1, block_size)

    # padding ((padding_h, padding_w) -> (padding_top, padding_bottom, padding_left, padding_right))
    p_top, p_bottom, p_left, p_right = (pad_[0], pad_[0], pad_[1], pad_[1])

    # stride (stride_h, stride_w)
    s_h, s_w = stride_

    # A placeholder (NC1HWCO)
    A = akg.tvm.placeholder(input_shape_nc1hwc0, dtype=conv_dtype, name='FMap')
    # B_placeholder (fractal)
    B = akg.tvm.placeholder(kernel_shape_fractal, dtype='int8', name='Filter')
    ScaleQ = akg.tvm.placeholder((16,), dtype='float16', name='ScaleQ')
    OffsetQ = akg.tvm.placeholder((16,), dtype='float16', name='OffsetQ')

    # quantized feature map, laid out with a c0 of 32
    quant_shape_nc1hwc0 = (in_n, in_c // 32, in_h, in_w, 32)
    _, q_c1, q_h, q_w, q_c0 = quant_shape_nc1hwc0
    # NOTE(review): the source channel block is indexed as qc1 + qc0 // 16;
    # confirm this is intended when q_c1 > 1.
    Quant = akg.tvm.compute(
        quant_shape_nc1hwc0,
        lambda qn, qc1, qh, qw, qc0:
            (A[qn, qc1 + qc0 // 16, qh, qw, qc0 % 16] * ScaleQ[0] + OffsetQ[0]).astype('int8'),
        name='QuantOUT', attrs={'no_inline': 1})

    if use_bias:
        bias_name = 'bias'
        bias_value = akg.tvm.placeholder(bias_shape_nc1hwc0, dtype=conv_dtype, name=bias_name)
    else:
        bias_name = 'None'

    # Create reduction variables
    kc1 = akg.tvm.reduce_axis((0, k_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, k_c0), name='kc0')

    out_h = (in_h + p_top + p_bottom - k_h) // (s_h) + 1
    tile_out_h = (tile_hh - k_h) // s_h + 1
    out_w = (in_w + p_left + p_right - k_w) // (s_w) + 1

    out_shape_nc1hwc0 = (in_n, k_n // block_size, out_h, out_w, block_size)
    _, out_c1, out_h, out_w, out_c0 = out_shape_nc1hwc0

    if tile_coco > 0:
        c1_cut = tile_coco // block_size
    else:
        c1_cut = out_c1

    # set dim: index 0 tiles the quantize op, index 1 tiles the convolution
    index = 0
    info = dim.Dim()
    if q_c1 > 1:
        info.setdim(index=index, axis="KO", tilel1=q_c1, tilel0=q_c1)  # ko
    if q_h > 1:
        info.setdim(index=index, axis="C1", tilel1=tile_out_h, tilel0=tile_out_h)  # c1
    if q_w > 1:
        info.setdim(index=index, axis="C0", tilel1=q_w, tilel0=q_w)  # c0
    if q_c0 > 1:
        info.setdim(index=index, axis="KI", tilel1=q_c0, tilel0=q_c0)  # ki

    index += 1
    if out_c1 > 1:
        info.setdim(index=index, axis="C1", tilel1=c1_cut, tilel0=0)  # c1
    if out_h > 1:
        info.setdim(index=index, axis="H", tilel1=tile_out_h, tilel0=0)  # h
    if out_w > 1:
        info.setdim(index=index, axis="W", tilel1=out_w, tilel0=0)  # w
    if out_c0 > 1:
        info.setdim(index=index, axis="C0", tilel1=out_c0, tilel0=0)  # c0
    if in_c1 > 1:
        # floor division keeps the tile size an int under Python 3
        # (the previous `in_c1 / 2` produced a float)
        info.setdim(index=index, axis="KC1", tilel1=in_c1 // 2, tilel0=0)  # kc1
    if k_h > 1:
        info.setdim(index=index, axis="KH", tilel1=k_h, tilel0=0)  # kh
    if k_w > 1:
        info.setdim(index=index, axis="KW", tilel1=k_w, tilel0=0)  # kw
    info = str(info)

    # Compute the convolution
    output_name = "output0"
    output_bias_name = "output1"

    # NOTE(review): h_window_cut is not defined in this function; the locally
    # computed tile_out_h looks like the intended value - confirm.
    C = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0: akg.tvm.sum(
            akg.tvm.if_then_else(
                akg.tvm.any((h * s_h + kh) < p_top, (h * s_h + kh) > (in_h + p_top - 1),
                            (w * s_w + kw) < p_left, (w * s_w + kw) > (in_w + p_left - 1)),
                akg.tvm.const(0.0, 'int8'),
                Quant[n, kc1, (h * s_h + kh - p_top), (w * s_w + kw - p_left), kc0])
            * B[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0],
            axis=[kc1, kh, kw, kc0]),
        name=output_name,
        attrs={
            "pragma_conv_kernel_n": k_n,
            "pragma_conv_kernel_h": k_h,
            "pragma_conv_kernel_w": k_w,
            "pragma_conv_padding_top": p_top,
            "pragma_conv_padding_bottom": p_bottom,
            "pragma_conv_padding_left": p_left,
            "pragma_conv_padding_right": p_right,
            "pragma_conv_dilation_h": 1,
            "pragma_conv_dilation_w": 1,
            "pragma_conv_bypass_l1": 1 if bypass_l1 else 0,
            "pragma_conv_stride_h": s_h,
            "pragma_conv_stride_w": s_w,
            "pragma_conv_fm_n": in_n,
            "pragma_conv_fm_c": in_c,
            "pragma_conv_fm_h": in_h,
            "pragma_conv_fm_w": in_w,
            "pragma_conv_h_cut": (h_window_cut - 1) * s_h + k_h,
            "pragma_conv_w_cut": (in_w + p_left + p_right),
            "pragma_conv_co_cut": c1_cut * k_c0,
            "pragma_conv_m_cut": tile_mm,
            "pragma_conv_k_cut": tile_kk,
            "pragma_conv_n_cut": tile_nn,
            "feature": Quant.op.name,
            "filter": B.op.name,
            "bias": bias_name,
            "res": output_name,
            "res_bias": output_bias_name})

    if use_bias:
        cube = akg.tvm.compute(
            out_shape_nc1hwc0,
            lambda n, c1, h, w, c0: C[n, c1, h, w, c0] + bias_value[0, c1, 0, 0, c0],
            name=output_bias_name)
    else:
        cube = C

    if fusion:
        # leaky relu
        negative_slope = 0.0
        slope_tmp = akg.tvm.const(negative_slope, dtype=conv_dtype)
        # negative_slope*x
        out = akg.lang.ascend.vmuls(cube, slope_tmp)
        # max(x,negative_slope*x)
        out = akg.lang.ascend.vmax(out, cube)
    else:
        out = cube

    # schedule
    s = akg.tvm.create_schedule(out.op)

    # The kernel arguments depend only on use_bias; the fused and non-fused
    # variants are built with the same argument list.
    if use_bias:
        build_args = [A, B, ScaleQ, OffsetQ, bias_value, out]
    else:
        build_args = [A, B, ScaleQ, OffsetQ, out]
    with akg.build_config(add_lower_pass=utils.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, build_args, "cce", name=kernel_name,
                        attrs={"dim": info}, polyhedral=True)

    if run_cce:
        run_conv(mod, fmap_shape, filter_shape, pad_[0], stride_[0], use_bias)
def maxpool_ad_manual_schedule_no_overlap_all_max(shape, kernel, stride, pad, dtype, attrs=None, polyhedral=False):
    """Automatic differentiation of maxpool with a manual schedule, no-overlap case.

    The gradient is routed to every input position whose value equals the
    maximum of its pooling window. Windows must not overlap, i.e. the stride
    must equal the kernel size in both directions.

    Args:
        shape: 5-element input shape, unpacked as
            (batch_size, input_c1, input_h, input_w, input_c0).
        kernel: (kernel_h, kernel_w).
        stride: (stride_h, stride_w).
        pad: 4-element padding; only the first two entries (pad_h, pad_w) are used.
        dtype (str): dtype of the data, forward and head placeholders.
        attrs: attributes forwarded to ``akg.build``.
        polyhedral (bool): forwarded to ``akg.build``.

    Returns:
        The module built by ``akg.build``.

    Raises:
        RuntimeError: from the custom differentiation function, if the stride
            differs from the kernel size.
    """
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        if stride_w != kernel_w or stride_h != kernel_h:
            raise RuntimeError(
                "Only supports kernels with same dimensions as stride size!")

        # copy output to the shape of the padded input, copying the same value for the entire kernel size
        out_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: out(b, c1, akg.tvm.floordiv(h, stride_h),
                                        akg.tvm.floordiv(w, stride_w), c0),
            name="out_broadcast")

        # copy head to the shape of the padded input, copying the same value for the entire kernel size
        head_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: head_(b, c1, akg.tvm.floordiv(h, stride_h),
                                          akg.tvm.floordiv(w, stride_w), c0),
            name="head_broadcast")

        # check if value was a maximum and assign head of that position if it was
        # this is done for all the maximum values within one kernel
        result = akg.tvm.compute(
            in_data.shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                in_data(b, c1, h, w, c0) == out_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                head_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                akg.tvm.const(0, dtype=in_data.dtype)),
            name="result")
        return [result]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(forward, [data], head, None, None,
                                   override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations; input_tensors[1] and [2] are the two broadcast tensors
    # read by the Select in custom_maxpool_fdiff, in that order
    result = dl_ddata
    forward_broadcast = result.op.input_tensors[1]
    head_broadcast = result.op.input_tensors[2]

    # cache reads and writes
    result_ub = s.cache_write(result, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [result_ub])
    head_ub = s.cache_read(head, "local.UB", [head_broadcast])
    forward_ub = s.cache_read(forward, "local.UB", [forward_broadcast])

    s[head_broadcast].set_scope("local.UB")
    s[forward_broadcast].set_scope("local.UB")

    s[head_ub].compute_at(s[head_broadcast], head_broadcast.op.axis[0])
    s[forward_ub].compute_at(s[forward_broadcast], forward_broadcast.op.axis[0])
    s[data_ub].compute_at(s[result_ub], result_ub.op.axis[0])
    s[forward_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])
    s[head_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])

    _, c1, h, _, _ = result.op.axis

    # when a padded side exceeds 32, split h by 4 and attach at the outer part;
    # otherwise attach at the c1 axis
    if input_h + 2 * pad_h > 32 or input_w + 2 * pad_w > 32:
        h_outer, _ = s[result].split(h, 4)
        s[result_ub].compute_at(s[result], h_outer)
    else:
        s[result_ub].compute_at(s[result], c1)

    kernel_name = "maxpool_ad_manual_schedule_no_overlap_all_max"
    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata], "cce",
                        name=kernel_name, attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        create_code(kernel_name, './', source_code)
    return mod
def maxpool_ad_manual_schedule_all_max(shape, kernel, stride, pad, dtype, polyhedral=True, attrs=None):
    """
    Automatic differentiation of maxpool with a manual schedule, "all maximum" variant.

    The differentiation of the forward placeholder is overridden by a custom
    rule that regroups the zero-padded input by pooling window, compares every
    window element against the window maximum, assigns the adjoint value to
    each matching element, and sums the contributions of all windows.

    Args:
        shape (tuple): 5 entries unpacked as (batch, c1, h, w, c0) of the input data.
        kernel (tuple): (kernel_h, kernel_w).
        stride (tuple): (stride_h, stride_w).
        pad (tuple): 4 entries; only the first two are read, as (pad_h, pad_w).
        dtype (str): dtype of the placeholders and of the constants in the rule.
        polyhedral (bool): forwarded unchanged to akg.build. Default True.
        attrs (dict): forwarded unchanged to akg.build. Default None.

    Returns:
        The module returned by akg.build. Its first imported module's source is
        also written out through create_code.
    """
    ub_scope = "local.UB"

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape

    padded_h = input_h + 2 * pad_h
    padded_w = input_w + 2 * pad_w
    pad_shape = (batch_size, input_c1, padded_h, padded_w, input_c0)

    out_size_h = (padded_h - kernel_h) // stride_h + 1
    out_size_w = (padded_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        """Custom gradient rule handed to akg.differentiate through `override`."""
        in_data = inputs[0]
        window_major_shape = (kernel_h, kernel_w, batch_size, input_c1,
                              out_size_h, out_size_w, input_c0)
        window_sum_shape = (out_size_h, out_size_w, batch_size, input_c1,
                            padded_h, padded_w, input_c0)

        # input surrounded by a border of zeros
        pad_data = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.all(h >= pad_h, h < input_h + pad_h, w >= pad_w, w < input_w + pad_w),
                in_data(b, c1, h - pad_h, w - pad_w, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="pad_data")

        # padded input regrouped so the two leading axes index inside a window
        data_reshaped = akg.tvm.compute(
            window_major_shape,
            lambda wh, ww, b, c1, oh, ow, c0: pad_data(b, c1, oh * stride_h + wh, ow * stride_w + ww, c0),
            name="data_reshaped")

        # forward output repeated for every position of its window
        max_broadcast = akg.tvm.compute(
            window_major_shape,
            lambda wh, ww, b, c1, oh, ow, c0: out(b, c1, oh, ow, c0),
            name="max_broadcast")

        # adjoint value where the element equals the window maximum, else 0
        equal = akg.tvm.compute(
            window_major_shape,
            lambda wh, ww, b, c1, oh, ow, c0: akg.tvm.expr.Select(
                max_broadcast(wh, ww, b, c1, oh, ow, c0) == data_reshaped(wh, ww, b, c1, oh, ow, c0),
                head_(b, c1, oh, ow, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="equal")

        # per-window contribution laid out on the padded grid (0 outside the window)
        data_reorg = akg.tvm.compute(
            window_sum_shape,
            lambda oh, ow, b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(h < oh * stride_h,
                            h > oh * stride_h + kernel_h - 1,
                            w < ow * stride_w,
                            w > ow * stride_w + kernel_w - 1),
                akg.tvm.const(0, dtype=dtype),
                equal(h - oh * stride_h, w - ow * stride_w, b, c1, oh, ow, c0)),
            name="data_reorg")

        # accumulate over all windows, then crop the padding border away
        result_pad = akg.topi.sum(data_reorg, [0, 1])
        result = akg.tvm.compute(
            shape,
            lambda b, c1, h, w, c0: result_pad(b, c1, h + pad_h, w + pad_w, c0),
            name="result")
        return [result]

    # placeholders: input data, maxpool output and adjoint of the output
    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # differentiate, replacing the rule for `forward` by the custom one
    [dl_ddata] = akg.differentiate(forward, [data], head, None, None,
                                   override={forward: ([data], custom_maxpool_fdiff)})

    sch = akg.tvm.create_schedule([dl_ddata.op])

    # walk back from the gradient to recover the intermediate tensors of the rule
    grad = dl_ddata
    summed_pad = grad.op.input_tensors[0]
    reorg = summed_pad.op.input_tensors[0]
    matched = reorg.op.input_tensors[0]
    window_max = matched.op.input_tensors[0]
    window_data = matched.op.input_tensors[1]
    padded_data = window_data.op.input_tensors[0]

    data_ub = sch.cache_read(data, ub_scope, [padded_data])
    head_ub = sch.cache_read(head, ub_scope, [matched])
    forward_ub = sch.cache_read(forward, ub_scope, [window_max])
    grad_ub = sch.cache_write(grad, ub_scope)

    for tensor in (window_max, window_data, padded_data, matched, reorg, summed_pad):
        sch[tensor].set_scope(ub_scope)

    for tensor in (data_ub, grad_ub, padded_data):
        sch[tensor].compute_inline()

    # everything `equal` consumes is produced under its outermost axis
    for tensor in (forward_ub, window_max, window_data, head_ub):
        sch[tensor].compute_at(sch[matched], matched.op.axis[0])
    sch[matched].compute_at(sch[summed_pad], summed_pad.op.axis[0])

    # dependencies of the final result
    sch[reorg].compute_inline()

    sum_b, sum_c1, sum_h, sum_w, sum_c0 = summed_pad.op.axis
    red_oh, red_ow = summed_pad.op.reduce_axis
    sch[summed_pad].reorder(red_oh, red_ow, sum_b, sum_c1, sum_h, sum_w, sum_c0)

    _, _, grad_h, _, _ = grad.op.axis
    grad_h_outer, _ = sch[grad].split(grad_h, stride_h)
    sch[summed_pad].compute_at(sch[grad], grad_h_outer)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [head, data, forward, dl_ddata], "cce",
                        name="maxpool_ad_manual_schedule_all_max",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_all_max"
        create_code(kernel_name, './', source_code)
    return mod
def conv_02(fmap_shape, filter_shape, pad_, stride_, dilation_,
            tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0,
            bypass_l1=False, use_bias=False, block_size=16, conv_dtype='float16'):
    """
    Build the layout-helper kernels and the two differentiated kernels of a convolution.

    Eight "cce" builds are issued, in this order: flipped weight, transposed and
    converted head, strided head, transposed input, "conv_AD_data" (derivative
    with respect to the input), "conv_data", "conv_AD_weight" (derivative with
    respect to the weight) and "conv_weight". The modules of "conv_data" and
    "conv_weight" are built but not returned.

    Args:
        fmap_shape (tuple): feature map shape, unpacked as (n, c, h, w).
        filter_shape (tuple): filter shape, unpacked as (n, c, h, w).
        pad_: padding, forwarded to conv_compute_forward for the forward conv.
        stride_ (tuple): (stride_h, stride_w).
        dilation_: dilation, forwarded to conv_compute_forward for the forward conv.
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn (int): tiling values
            forwarded to conv_compute_forward and set_dims. Default 0.
        bypass_l1 (bool): forwarded to conv_compute_forward. Default False.
        use_bias (bool): when True a bias placeholder named "input2" is created
            for the forward conv. Default False.
        block_size (int): channel block size; channel counts are rounded up to a
            multiple of it. Default 16.
        conv_dtype (str): dtype of the placeholders. Default 'float16'.

    Returns:
        tuple of modules: (conv_AD_data, conv_AD_weight, A_transposed,
        H_trans_converted, H_strided, flipped weight).
    """
    tiles = (tile_hh, tile_coco, tile_mm, tile_kk, tile_nn)

    def round_up(value):
        """Round `value` up to the next multiple of block_size."""
        return (value + block_size - 1) // block_size * block_size

    def build_cce(sched, tensors, kernel_name, dim_info):
        """Build one schedule for "cce" with str(dim_info) as its "dim" attribute."""
        with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
            return akg.build(sched, tensors, "cce", name=kernel_name,
                             attrs={"dim": str(dim_info)}, polyhedral=True)

    # input shape (NCHW -> NC1HWC0)
    in_n, raw_in_c, in_h, in_w = fmap_shape
    in_c = round_up(raw_in_c)

    # kernel shape (NCHW -> NC1HWC0 -> Fractal)
    raw_k_n, raw_k_c, k_h, k_w = filter_shape
    k_c = round_up(raw_k_c)
    k_n = round_up(raw_k_n)
    k_hw = k_h * k_w

    # feature map placeholder (NC1HWC0)
    fmap = akg.tvm.placeholder((in_n, in_c // block_size, in_h, in_w, block_size),
                               dtype=conv_dtype, name="input0")
    # weight placeholder (fractal)
    weight = akg.tvm.placeholder(
        (k_c // block_size * k_h * k_w, k_n // block_size, block_size, block_size),
        dtype=conv_dtype, name="input1")

    bias = None
    if use_bias:
        bias = akg.tvm.placeholder((1, k_n // block_size, 1, 1, block_size),
                                   dtype=conv_dtype, name="input2")

    forward = conv_compute_forward(fmap_shape, filter_shape, pad_, stride_, dilation_,
                                   fmap, weight, bias, *tiles,
                                   bypass_l1, use_bias, block_size, conv_dtype)

    # adjoint of the forward output
    head = akg.tvm.placeholder(forward.shape, name="Head", dtype='float16')
    head_n, head_c1, head_h, head_w, head_c0 = [head.shape[i].value for i in range(5)]

    head_transposed_nchw = (head_c1 * head_c0, head_n, head_h, head_w)
    stride_h, stride_w = stride_
    head_strided_nchw = (head_n, head_c1 * head_c0,
                         (head_h - 1) * stride_h + 1, (head_w - 1) * stride_w + 1)
    fmap_transposed_nchw = (in_c, in_n, in_h, in_w)
    weight_flip_rot_nchw = (k_c, k_n, k_h, k_w)

    # Head is in 5D format; result in Fractal format
    head_6d = akg.topi.reshape(
        head, (head_n // block_size, block_size, head_c1, head_h, head_w, block_size))
    head_6d_transposed = akg.topi.transpose(head_6d, (0, 3, 4, 2, 5, 1))
    head_fractal = akg.topi.reshape(
        head_6d_transposed,
        ((head_n // block_size) * head_h * head_w, head_c1, block_size, block_size))
    pld_head_fractal = akg.tvm.placeholder(head_fractal.shape, name="Head_trans_fractal",
                                           dtype=conv_dtype)

    # A in 5D format; result in 5D format
    fmap_transposed = akg.tvm.compute(
        (fmap.shape[1].value * block_size, fmap.shape[0].value // block_size,
         fmap.shape[2].value, fmap.shape[3].value, block_size),
        lambda j0, j1, j2, j3, j4: fmap[j1 * block_size + j4, floordiv(j0, block_size),
                                        j2, j3, truncmod(j0, block_size)],
        name=fmap.name + "_transposed")
    pld_fmap_transposed = akg.tvm.placeholder(fmap_transposed.shape, name="A_trans",
                                              dtype=conv_dtype)

    # tiling description with every tile set to 1, shared by the four layout kernels
    unit_tiling = dim.Dim()
    for axis_index in range(4):
        unit_tiling.setdim(index=0, axis=axis_index, tilel1=1, tilel0=1)

    # B in Fractal format; result in Fractal format
    last_window_pos = k_hw - 1
    weight_flipped = akg.tvm.compute(
        (weight.shape[1].value * k_hw, k_c // block_size, block_size, block_size),
        lambda i0, i1, i2, i3: weight[i1 * k_hw + last_window_pos - truncmod(i0, k_hw),
                                      floordiv(i0, k_hw), i3, i2],
        name=weight.name + "_flipped")
    pld_weight_flipped = akg.tvm.placeholder(weight_flipped.shape, name="B_flip",
                                             dtype=conv_dtype)

    mod_weight_flipped = build_cce(akg.tvm.create_schedule(weight_flipped.op),
                                   [weight, weight_flipped],
                                   weight.name + "_flipped", unit_tiling)

    mod_head_transposed_converted = build_cce(akg.tvm.create_schedule(head_fractal.op),
                                              [head, head_fractal],
                                              "H_trans_converted", unit_tiling)

    # H in 5D format; result in 5D format
    h_n, h_c1, h_h, h_w, h_c0 = head.shape
    head_strided = akg.tvm.compute(
        (h_n, h_c1, (h_h - 1) * stride_h + 1, (h_w - 1) * stride_w + 1, h_c0),
        lambda i0, i1, i2, i3, i4: akg.tvm.expr.Select(
            akg.tvm.any(truncmod(i2, stride_h) != 0, truncmod(i3, stride_w) != 0),
            akg.tvm.const(0.0, dtype="float16"),
            head[i0, i1, floordiv(i2, stride_h), floordiv(i3, stride_w), i4]),
        name=head.name + "_strided")
    pld_head_strided = akg.tvm.placeholder(head_strided.shape, name="Head_trans_5D",
                                           dtype=conv_dtype)

    mod_head_strided = build_cce(akg.tvm.create_schedule(head_strided.op),
                                 [head, head_strided], "H_strided", unit_tiling)

    mod_transposed = build_cce(akg.tvm.create_schedule(fmap_transposed.op),
                               [fmap, fmap_transposed], "A_transposed", unit_tiling)

    # ---- derivative with respect to the feature map ----
    data_grad = list(akg.differentiate(
        forward, [fmap], head,
        {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1},
        [pld_head_strided, pld_weight_flipped, None]))[0]
    data_tiling = set_dims(head_strided_nchw, (k_c, k_n, k_h, k_w), (k_h - 1, k_w - 1),
                           (1, 1), (1, 1), *tiles, block_size)
    mod_ad_data = build_cce(akg.tvm.create_schedule([data_grad.op]),
                            [pld_head_strided, pld_weight_flipped, data_grad],
                            "conv_AD_data", data_tiling)

    # the same computation written directly as a forward conv; its module is discarded
    conv_data = conv_compute_forward(head_strided_nchw, weight_flip_rot_nchw,
                                     (k_h - 1, k_h - 1, k_w - 1, k_w - 1), (1, 1), (1, 1),
                                     pld_head_strided, pld_weight_flipped, None, *tiles,
                                     bypass_l1, use_bias, block_size, conv_dtype)
    data_tiling = set_dims(head_strided_nchw, (k_c, k_n, k_h, k_w), (k_h - 1, k_w - 1),
                           (1, 1), (1, 1), *tiles, block_size)
    build_cce(akg.tvm.create_schedule(conv_data.op),
              [pld_head_strided, pld_weight_flipped, conv_data],
              "conv_data", data_tiling)

    # ---- derivative with respect to the weight ----
    weight_grad = list(akg.differentiate(
        forward, [weight], head,
        {"ad_conv_enable": 1, "ad_conv_reuse_conv": 1},
        [pld_fmap_transposed, pld_head_fractal, None]))[0]
    weight_tiling = set_dims(fmap_transposed_nchw, head_transposed_nchw, (0, 0), (1, 1),
                             (stride_h, stride_w), *tiles, block_size)
    mod_ad_weight = build_cce(akg.tvm.create_schedule([weight_grad.op]),
                              [pld_fmap_transposed, pld_head_fractal, weight_grad],
                              "conv_AD_weight", weight_tiling)

    # forward-conv formulation of the weight gradient; its module is discarded
    conv_weight = conv_compute_forward(fmap_transposed_nchw, head_transposed_nchw,
                                       (0, 0, 0, 0), (1, 1), (stride_h, stride_w),
                                       pld_fmap_transposed, pld_head_fractal, None, *tiles,
                                       bypass_l1, use_bias, block_size, conv_dtype)
    weight_tiling = set_dims(fmap_transposed_nchw, head_transposed_nchw, (0, 0), (1, 1),
                             (stride_h, stride_w), *tiles, block_size)
    build_cce(akg.tvm.create_schedule(conv_weight.op),
              [pld_fmap_transposed, pld_head_fractal, conv_weight],
              "conv_weight", weight_tiling)

    return (mod_ad_data, mod_ad_weight, mod_transposed,
            mod_head_transposed_converted, mod_head_strided, mod_weight_flipped)
def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Differentiate reduce_max with a custom rule and a manual schedule.

    The custom rule computes in float16: elements equal to the maximum along
    `axis` receive the adjoint value, all others 0. When `dtype` is not
    'float16' the inputs are cast to float16 first and the result is cast back
    to `dtype`.

    Args:
        input_shape (tuple): shape of the input data placeholder.
        dtype (str): dtype of the input data placeholder.
        axis: reduction axis, forwarded to reduce_max.
        keepdims (bool): forwarded to reduce_max and get_reduced_indices.
        polyhedral (bool): forwarded unchanged to akg.build. Default True.
        attrs (dict): must not be None and must hold a 'tile' entry with one
            split factor per dimension of the gradient; also forwarded to
            akg.build.

    Returns:
        The module returned by akg.build. Its first imported module's source is
        also written out through create_code.

    Raises:
        Exception: when `attrs` is None.
        AssertionError: when the number of tiling factors differs from the
            number of dimensions of the gradient.
    """
    ub_scope = "local.UB"
    needs_cast = dtype != 'float16'

    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        """Custom gradient rule handed to akg.differentiate through `override`."""
        data_ = inputs[0]
        shape = data_.shape

        # maximum along the reduced axis, then repeated back to the input shape
        max_ = akg.lang.ascend.reduce_max(data_, axis=axis, keepdims=True)
        max_broadcast = akg.lang.ascend.broadcast(max_, shape)

        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape,
            lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))

        # keep the adjoint where the element is a maximum, zero everywhere else
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")

        if needs_cast:
            # cast data back to the original dtype
            return [Cast(max_values_and_zeros, dtype, target=utils.CCE)]
        return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # forward reduce max; not scheduled, it is only the op being differentiated
    reduced = reduce_max(data, axis, keepdims, target=utils.CCE)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(reduced.shape, name="head", dtype=reduced.dtype)

    # inputs as seen by the custom rule (float16)
    if needs_cast:
        data_cast = Cast(data, "float16", target=utils.CCE)
        head_cast = Cast(head, "float16", target=utils.CCE)
    else:
        data_cast, head_cast = data, head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        reduced, [data_cast], head_cast, None, None,
        override={reduced: ([data_cast], custom_reduce_max_fdiff)})

    # recover the tensors created inside the custom rule
    if needs_cast:
        # with a cast, the gradient is the Cast op and the Select sits one level below
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        select_op_inputs = max_values_and_zeros.op.input_tensors
    else:
        select_op_inputs = dl_ddata.op.input_tensors
    max_broadcast = select_op_inputs[1]
    max_ = max_broadcast.op.input_tensors[0]
    head_broadcast = select_op_inputs[2]

    # schedule for the differentiation operation (inputs: data and head)
    sch = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if needs_cast:
        head_ub = sch.cache_read(head, ub_scope, [head_cast])
        data_ub = sch.cache_read(data, ub_scope, [data_cast])
    else:
        # no cast operation
        head_ub = sch.cache_read(head_cast, ub_scope, [head_broadcast])
        data_ub = sch.cache_read(data_cast, ub_scope, [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = sch.cache_write(dl_ddata, ub_scope)

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    assert len(tiling_factors) == len(dl_ddata.shape)

    # split every axis of the final compute; only the outer part of axis 0 is used below
    split_iterators = [
        sch[dl_ddata].split(dl_ddata.op.axis[index], factor)
        for index, factor in enumerate(tiling_factors)
    ]
    outer_axis = split_iterators[0][0]

    # intermediate stages that only exist when there is a cast
    if needs_cast:
        for tensor in (data_cast, head_cast, max_values_and_zeros):
            sch[tensor].compute_at(sch[dl_ddata], outer_axis)
            sch[tensor].set_scope(ub_scope)

    # cache reads and writes
    for tensor in (data_ub, head_ub, dl_ddata_ub):
        sch[tensor].compute_at(sch[dl_ddata], outer_axis)

    # stages of the differentiation itself
    for tensor in (max_, max_broadcast, head_broadcast):
        sch[tensor].compute_at(sch[dl_ddata], outer_axis)
        sch[tensor].set_scope(ub_scope)

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [head, data, dl_ddata], "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        create_code(kernel_name, './', source_code)
    return mod