def logsoftmax_ad(shape, dtype, axis, kernel_name, attrs):
    """Compute the gradient of logsoftmax by autodiff.

    The input is viewed as a 2-D tensor (every leading dimension is folded
    into the first one), logsoftmax is differentiated along the last axis and
    the gradient kernel is built for the "cce" target.

    Args:
        shape (Union[list, tuple]): shape of the input, at least 2-D.
        dtype (str): data type of the input; only "float16" is accepted.
        axis (int): reduction axis; it must designate the last axis (negative
            values count from the end).
        kernel_name (str): not used, the kernel is always built as "test2".
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        RuntimeError: if dtype is unsupported, axis is not the last axis, or
            shape has fewer than two dimensions.
    """
    check_list = ["float16"]
    if dtype.lower() not in check_list:
        raise RuntimeError("logsoftmax test only support %s while dtype is %s" % (",".join(check_list), dtype))

    rank = len(shape)
    if axis < 0:
        axis = rank + axis
    if axis >= rank:
        raise RuntimeError("axis should be less than dimension")
    if axis != rank - 1:
        raise RuntimeError("Only support the last axis currently")
    if rank < 2:
        # shape[-2] below would otherwise fail with a bare IndexError
        raise RuntimeError("shape should have at least 2 dimensions")

    # Fold all leading dimensions into the row dimension of a 2-D view.
    rows = shape[-2]
    for extent in shape[:-2]:
        rows = rows * extent
    shape = [rows, shape[-1]]
    # The reduction axis has to follow the reshape: the last axis of the
    # original tensor is axis 1 of the 2-D view, not `rank - 1`.
    axis = len(shape) - 1

    a_up = akg.tvm.placeholder(shape, dtype=dtype, name="input")
    b_up = logsoftmax.logsoftmax_op(a_up, shape, axis)

    head = akg.tvm.placeholder(b_up.shape, name="head", dtype=dtype)
    _jacs = list(akg.differentiate(b_up, [a_up], head))
    sjac = akg.tvm.create_schedule([_jacs[0].op])
    sjac[_jacs[0].op.input_tensors[1]].compute_inline()
    op_vars = [head, a_up, _jacs[0]]

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sjac, op_vars, "cce", name="test2", attrs=attrs, polyhedral=True)
    return mod
def invert_permutation_run(shape, dtype, attrs):
    """Build the invert_permutation kernel, launch it and check the result.

    Args:
        shape (Union[list, tuple]): shape of the index tensor; ``shape[0]``
            is the length of the generated permutation.
        dtype (str): data type of the indices; only "int32" is accepted.
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        tuple: ``((input_data, ), output, expect, compare_result)`` where
        ``compare_result`` is what ``compare_tensor`` returns for
        ``output`` against ``expect``.

    Raises:
        RuntimeError: if dtype is not "int32".
    """
    # check shapes
    vc_util.check_shape(shape)

    # Compare for equality: the former `dtype.lower() in "int32"` was a
    # substring test and let values such as "int" or "" through.
    if dtype.lower() != "int32":
        raise RuntimeError(
            "indices_dtype only support int32 while dtype is %s" % dtype)

    A = akg.tvm.placeholder(shape, dtype, name="A")
    op = invert_permutation.invert_permutation(A)
    s = akg.tvm.create_schedule(op.op)

    kernel_name = utils.gen_name_kernel("invert_permutation", dtype, shape)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [A, op], "cce", name=kernel_name, attrs=attrs, polyhedral=True)

    # A random permutation of 0..shape[0]-1 and its inverse as reference.
    input_data = np.random.permutation(np.arange(shape[0])).astype(np.int32)
    expect = np.full([shape[0]], 0, np.int32)
    for i, e in enumerate(input_data):
        expect[e] = i

    output = np.full([shape[0]], 0, np.int32)
    output = utils.mod_launch(mod, (input_data, output), expect=expect)
    return (input_data, ), output, expect, compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
def gather(params_shape, indices_shape, params_dtype, indices_dtype, axis, kernel_name, cce_path="./"):
    """
    Build the gather kernel and dump its generated source.

    Args:
        params_shape: shape of the data tensor (checked with length=2).
        indices_shape: shape of the index tensor (checked with length=1).
        params_dtype (str): dtype of the data tensor.
        indices_dtype (str): dtype of the index tensor.
        axis (int): checked against 0 through ``vc_util.check_equal``.
        kernel_name (str): name given to the built kernel and to the dumped code.
        cce_path (str): directory passed to ``utils.create_code``.

    Returns:
        The module returned by ``akg.build``.
    """
    vc_util.check_shape(params_shape, length=2)
    vc_util.check_shape(indices_shape, length=1)
    vc_util.ops_dtype_check(params_dtype, vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.ops_dtype_check(indices_dtype, vc_util.DtypeForDavinci.INT32)
    vc_util.check_equal("axis", "zero", axis, 0)

    def _emit(ins, outs):
        # the extern body is produced by kernel_ir(out, params, indices)
        return kernel_ir(outs[0], ins[0], ins[1])

    # one output row per index, each as wide as a row of the params
    out_shape = (indices_shape[0], params_shape[1])
    params = akg.tvm.placeholder(params_shape, dtype=params_dtype, name="X")
    indices = akg.tvm.placeholder(indices_shape, dtype=indices_dtype, name="Y")
    res = akg.tvm.extern(out_shape, [params, indices], _emit, name="res", dtype=params_dtype)
    sch = akg.tvm.create_schedule(res.op)

    build_attrs = {"enable_multicore": False}
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [params, indices, res], "cce", name=kernel_name, attrs=build_attrs)

    utils.create_code(kernel_name, cce_path, mod.imported_modules[0].get_source())
    return mod
def focalloss_ad_run2(shape, dtype, attrs):
    """Build the focal-loss autodiff kernel and launch it on generated data.

    Args:
        shape: shape used for both the logits and the labels placeholders;
            shape[0] and shape[1] are also used to name the kernel.
        dtype (str): dtype of the logits placeholder and of the output buffer.
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        tuple: ``((input_np, head_np), output, expect, compare_result)``.

    NOTE(review): the data-generation half of this function cannot run as
    written. ``expect`` is read (``expect.shape``) before it is assigned in
    this function, and ``batchsize`` and ``input_np`` are never bound here;
    unless they are module-level names (not visible in this block) they raise
    as well. The intended shapes must be confirmed before this can be fixed.
    """
    logits_pld = akg.tvm.placeholder(shape, dtype=dtype, name='logits')
    # labels are always int32, whatever `dtype` is
    labels_pld = akg.tvm.placeholder(shape, dtype='int32', name='labels')
    d_labels, d_logits, head = focalloss_ad.focalloss_ad(labels_pld, logits_pld)
    print("autodiff d_logits:\n", akg.tvm.PrintTensorRecursively(d_logits))
    print("autodiff d_labels:\n", akg.tvm.PrintTensorRecursively(d_labels))

    # build autodiff kernels
    io = [labels_pld, logits_pld, head, d_labels, d_logits]
    s = akg.tvm.create_schedule([e.op for e in io])
    kernel_name = utils.gen_name_kernel("focalloss_ad", dtype, (shape[0], shape[1], ))
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, io, "cce", name=kernel_name, attrs=attrs, polyhedral=True)

    # NOTE(review): `batchsize` is not defined in this function, and the
    # generated arrays are 1-D while the placeholders above use `shape`.
    labels_np = RANGEFILL((batchsize, ))
    logits_np = RANGEFILL((batchsize, ), 2)
    head_np = RANGEFILL((batchsize, ), 2)
    # NOTE(review): `expect` is a local that is only assigned two statements
    # below, so this line raises UnboundLocalError.
    output = np.full(expect.shape, np.nan, dtype)
    # NOTE(review): four buffers are passed although `io` lists five tensors.
    output = utils.mod_launch(mod, (labels_np, logits_np, head_np, output), expect=output)
    expect = output  # hack: the output is compared against itself

    # NOTE(review): `input_np` is not defined in this function.
    return (input_np, head_np), output, expect, compare_tensor(output, expect, atol=0.1)
def topk(shape, k, dtype, kernel_name, attrs):
    """
    Build the top-k test kernel.

    The arguments are only validated: the kernel itself is always built on
    fixed float16 tensors of shape (16, 16), with a (16, 16 * 18) temporary.

    Args:
        shape: input shape, used to validate ``k`` against ``shape[-1]``.
        k (int): number of elements requested; must not exceed ``shape[-1]``.
        dtype (str): "float16" or "int32" (case-insensitive).
        kernel_name (str): name of the built kernel.
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        RuntimeError: if dtype is unsupported or k is greater than shape[-1].
    """
    supported = ["float16", "int32"]
    if dtype.lower() not in supported:
        raise RuntimeError("tile_cce only support %s while dtype is %s" % (",".join(supported), dtype))
    if k > shape[-1]:
        raise RuntimeError("k should not be greater than shape[-1]")

    # From here on the caller's shape is ignored in favour of fixed sizes.
    fixed_shape = (16, 16)
    scratch_shape = (16, 16 * 18)

    inputs = akg.tvm.placeholder(fixed_shape, name="input", dtype="float16")
    output = akg.tvm.placeholder(fixed_shape, name="output", dtype="float16")
    temp = akg.tvm.placeholder(scratch_shape, name="temp", dtype="float16")

    sorted_values = compute_topk(output, inputs, temp)
    result = compute_get_last(sorted_values, temp)

    sch = akg.tvm.create_schedule([result.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [inputs, result], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def matmul_ad(data_shape, weight_shape, dtype, attrs=None):
    """
    Build the autodiff kernel of a matmul with respect to its first operand.

    Args:
        data_shape: 2-D shape (m, k) of the first operand.
        weight_shape: 2-D shape (k, n) of the second operand.
        dtype (str): only "float16" is accepted (case-insensitive).
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        The module returned by ``akg.build``; its arguments are the head
        gradient, the second operand and the computed gradient.

    Raises:
        RuntimeError: if dtype is unsupported.
        AssertionError: if the shapes are not 2-D or their inner sizes differ.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        raise RuntimeError("matmul test only support %s while dtype is %s" % (",".join(supported), dtype))

    assert (len(data_shape) == 2)
    assert (len(weight_shape) == 2)
    assert (data_shape[1] == weight_shape[0])

    rows, inner = data_shape
    _, cols = weight_shape

    lhs = akg.tvm.placeholder((rows, inner), name='a', dtype=dtype)
    rhs = akg.tvm.placeholder((inner, cols), name='b', dtype=dtype)
    red = akg.tvm.reduce_axis((0, inner), name='kk')
    product = akg.tvm.compute(
        (rows, cols),
        lambda i, j: akg.lang.cce.mmad(lhs[i, red] * rhs[red, j], axis=red),
        name="c")

    # gradient flowing into the matmul output
    head = akg.tvm.placeholder(product.shape, name="Head", dtype='float16')
    grads = list(akg.differentiate(product, [lhs], head))
    grad_lhs = grads[0]

    sch = akg.tvm.create_schedule([grad_lhs.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [head, rhs, grad_lhs], "cce", name="test2", attrs=attrs, polyhedral=True)
    return mod
def globalavgpool(n, c, h, w, pool_type, attrs, kernel_name="global_pool"):
    r"""
    Performs the global average pooling on the input.

    For each feature map we can define the formula as:
    \f[
     res = \frac{1}{W * H} \sum X_{i,j}
    \f]

    Note:
        The real input is created by akg.tvm.placeholder as a float16 tensor
        of shape (n, c, h, w).

    Args:
        n (int): input batchsize.
        c (int): input channel.
        h (int): input height.
        w (int): input width.
        pool_type (str): pooling mode, forwarded to akg.topi.nn.global_pool.
        attrs: attributes forwarded unchanged to akg.build.
        kernel_name (str): name of the built kernel.

    Returns:
        The module returned by akg.build for the pooling kernel.
    """
    # The docstring is raw so that the Doxygen markers (\f[, \frac) are kept
    # verbatim instead of being read as form-feed escapes.
    # `data` instead of `input`, to avoid shadowing the builtin.
    data = akg.tvm.placeholder((n, c, h, w), name='input', dtype="float16")
    output = akg.topi.nn.global_pool(data, pool_type=pool_type)
    s = akg.tvm.create_schedule(output.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, output], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_name, attrs):
    """
    Schedule the given computes and build them to a function.

    Args:
        opnames: not used by this function.
        computes: tensors whose ``.op`` are the outputs to schedule.
        args: arguments passed to ``akg.build_to_func``.
        custom_schedule: optional callable applied to the schedule; when it
            is given, polyhedral scheduling is turned off.
        device (str): build target, "aicore" or "aicpu".
        kernel_name (str): name of the built function.
        attrs: optional mapping of build attributes. An entry under ``BINDS``
            is taken out and passed as ``binds``; the caller's mapping is
            left untouched.

    Returns:
        The result of ``akg.build_to_func``, or None when the device is not
        supported or the build raises (the traceback is logged).
    """
    if device not in ("aicore", "aicpu"):
        logging.error("Device %s is not in [aicore, aicpu].", device)
        return None

    polyhedral = True
    dump_ir = os.getenv(MS_AKG_DUMP_IR) == "on"

    try:
        tmp_outputs = [x.op for x in computes]
        s = akg.tvm.create_schedule(tmp_outputs)
        if custom_schedule:
            polyhedral = False
            custom_schedule(s)

        with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=dump_ir):
            if attrs:
                # Work on a shallow copy: popping from the caller's dict would
                # silently drop the binds if the same attrs is reused.
                build_attrs = dict(attrs)
                binds = build_attrs.pop(BINDS, None)
                rst = akg.build_to_func(s, args, name=kernel_name, attrs=build_attrs, polyhedral=polyhedral,
                                        binds=binds, target=device)
            else:
                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral, target=device)
    except Exception:
        # best effort by design: report the failure and let the caller see None
        logging.error(traceback.format_exc())
        return None
    return rst
def case_1(data_shape, dtype, kernel_name, attrs):
    """
    elemwise chain case 1: E[i, j] = A[i, j] * (B[j] + C[i, j]).

    Args:
        data_shape: 2-D shape (m, k) of A, C and E; B has shape (k, ).
        dtype (str): checked against ``vc_util.DtypeForDavinci.FLOAT16``.
        kernel_name (str): only fed to ``utils.gen_name_kernel``; the kernel
            itself is built under the fixed name "test".
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.
    """
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape_length_equal("data", data_shape, 2)

    rows, cols = data_shape
    A = akg.tvm.placeholder((rows, cols), name='A', dtype=dtype)
    B = akg.tvm.placeholder((cols, ), name='B', dtype=dtype)
    C = akg.tvm.placeholder((rows, cols), name='C', dtype=dtype)

    def _chain(i, j):
        return A[i, j] * (B[j] + C[i, j])

    E = akg.tvm.compute((rows, cols), _chain, name="E")

    sch = akg.tvm.create_schedule(E.op)
    tensors = [A, B, C, E]
    # the lowered form is produced but not used afterwards
    akg.lower(sch, tensors, simple_mode=True, polyhedral=True)

    kernel_name = utils.gen_name_kernel(kernel_name, dtype, data_shape)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, tensors, "cce", name="test", attrs=attrs, polyhedral=True)
        # the generated source is fetched but not used afterwards
        mod.imported_modules[0].get_source()
    return mod
def test_CCE_Conv(FMap_shape, Filter_shape, Pad, Stride,
                  Tile_h=0, Tile_co=0, Tile_m=0, Tile_k=0, Tile_n=0,
                  use_bias=False, fp32_mad = True, kernel_name="conv"):
    """
    Build a convolution kernel with a manual schedule (adjusted to TilingApi).

    Args:
        FMap_shape: feature map shape as (n, c, h, w).
        Filter_shape: filter shape as (n, c, h, w).
        Pad: padding; Pad[2] and Pad[3] enter the output-width computation.
        Stride: (stride_h, stride_w).
        Tile_h, Tile_co, Tile_m, Tile_k, Tile_n (int): tiling sizes.
        use_bias (bool): whether a bias placeholder is added to the inputs.
        fp32_mad (bool): forwarded to ``conv_dsl``.
        kernel_name (str): name of the built kernel.

    Returns:
        The module returned by ``akg.build``.
    """
    fmap_n, fmap_c, fmap_h, fmap_w = FMap_shape
    filter_n, filter_c, filter_h, filter_w = Filter_shape
    stride = Stride  # (stride_h, stride_w)

    # feature map: NCHW -> NC1HWC0
    fmap_5d = (fmap_n, fmap_c // block_size, fmap_h, fmap_w, block_size)
    # filter: NCHW -> C1HWNC0
    filter_5d = (filter_c // block_size, filter_h, filter_w, filter_n, block_size)
    # filter: C1HWNC0 -> fractal
    filter_fractal = (filter_c * filter_h * filter_w // block_size,
                      filter_n // block_size, block_size, block_size)

    # the feature map is declared in NC1HWC0, the filter in fractal layout
    placeholders = [
        akg.tvm.placeholder(fmap_5d, dtype=conv_dtype, name='fmap'),
        akg.tvm.placeholder(filter_fractal, dtype=conv_dtype, name='filter'),
    ]
    if use_bias:
        bias_shape = (1, filter_n // block_size, 1, 1, block_size)
        placeholders.append(akg.tvm.placeholder(bias_shape, dtype=conv_dtype, name='bias'))
    conv_dsl_input = tuple(placeholders)

    conv_dsl_outputs = conv_dsl(conv_dsl_input, fmap_5d, filter_5d, Pad, stride, use_bias, fp32_mad)

    # Tiling factors. To fit TilingApi they are converted to the form in
    # which they are consumed: tiling_factor_h applies in L1, where the
    # height tile is expressed as rows (Ho * Wo) of the im2col row-major
    # matrix, rounded down to a multiple of the block size; the others are
    # converted likewise.
    out_w = (fmap_w + Pad[2] + Pad[3] - filter_w) // (stride[1]) + 1
    h_tiles = (Tile_h - filter_h) // (stride[0]) + 1
    tiling_factor_h = h_tiles * out_w // block_size * block_size
    tiling_factor_co = Tile_co // block_size  # computed but not passed to conv_sch
    tiling_factor_m = Tile_m // block_size * block_size
    tiling_factor_n = Tile_n // block_size
    tiling_factor_k = Tile_k // block_size

    # The last DSL output is the final result; schedule and build on it.
    final_out = conv_dsl_outputs[-1]
    sch = akg.tvm.create_schedule(final_out.op)
    conv_sch(sch, (conv_dsl_input, conv_dsl_outputs),
             tiling_factor_h=tiling_factor_h, tiling_factor_m=tiling_factor_m,
             tiling_factor_k=tiling_factor_k, tiling_factor_n=tiling_factor_n)

    build_args = list(conv_dsl_input) + [conv_dsl_outputs[-1]]
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, build_args, "cce", name=kernel_name, attrs={"loop_partition_unroll": True})
    return mod
def roipool(shape, roibox, pooled_shape, dtype, kernel_name="roipool_forward_output", attrs=None):
    """
    Build a max-pooling kernel over one region of interest.

    The region is cropped from the input, split into
    pooled_shape[0] x pooled_shape[1] windows (positions past the region
    count as 0) and the maximum of each window is taken.

    Args:
        shape: 4-D input shape (n, c, h, w).
        roibox: (top, bottom, left, right) of the region; must satisfy
            0 <= top < bottom < h and 0 <= left < right < w.
        pooled_shape: (pooled_h, pooled_w), each not larger than the region.
        dtype (str): only "float16" is accepted (case-insensitive).
        kernel_name (str): base name fed to ``utils.gen_name_kernel``.
        attrs: attributes forwarded unchanged to ``akg.build``.

    Returns:
        tuple: (module returned by ``akg.build``, output shape as a list).

    Raises:
        RuntimeError: if dtype is unsupported.
        AssertionError: if one of the shape/region constraints is violated.
    """
    supported = ["float16"]
    if dtype.lower() not in supported:
        raise RuntimeError("tile_cce only support %s while dtype is %s" % (",".join(supported), dtype))

    vc_util.check_shape(shape)
    assert (len(shape) == 4)
    assert (len(roibox) == 4)
    assert (len(pooled_shape) == 2)

    a_n, a_c, a_h, a_w = shape
    roi_t, roi_b, roi_l, roi_r = roibox
    assert 0 <= roi_t < roi_b < a_h
    assert 0 <= roi_l < roi_r < a_w

    roi_h = roi_b - roi_t
    roi_w = roi_r - roi_l

    a = akg.tvm.placeholder(shape, name="a", dtype=dtype)
    Crop = akg.tvm.compute([a_n, a_c, roi_h, roi_w],
                           lambda n, c, h, w: a[n, c, roi_t + h, roi_l + w])

    # window extents: region size divided by the pooled size, rounded up
    p_h, p_w = pooled_shape
    full_h, rest_h = divmod(roi_h, p_h)
    full_w, rest_w = divmod(roi_w, p_w)
    win_h = full_h + (1 if rest_h > 0 else 0)
    win_w = full_w + (1 if rest_w > 0 else 0)
    assert p_h <= roi_h and p_w <= roi_w

    # every pooled cell gets its own window; out-of-region positions read 0
    Unpooled = akg.tvm.compute(
        [a_n, a_c, p_h, p_w, win_h, win_w],
        lambda n, c, h, w, wh, ww: akg.tvm.expr.Select(
            akg.tvm.all(h * win_h + wh < roi_h, w * win_w + ww < roi_w),
            Crop[n, c, h * win_h + wh, w * win_w + ww],
            akg.tvm.const(0, a.dtype)))

    rh = akg.tvm.reduce_axis((0, win_h))
    rw = akg.tvm.reduce_axis((0, win_w))
    output_shape = [a_n, a_c, p_h, p_w]
    res = akg.tvm.compute(
        output_shape,
        lambda n, c, h, w: akg.tvm.max(Unpooled[n, c, h, w, rh, rw], axis=[rh, rw]))

    sch = akg.tvm.create_schedule(res.op)
    for stage in (Crop, Unpooled):
        sch[stage].compute_inline()

    kernel_name = utils.gen_name_kernel(kernel_name, dtype, shape)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [a, res], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod, output_shape
def test_select():
    """
    Build a kernel that counts, for every pair (i, j), the positions k where
    actual[k] == i and predict[k] == j, over two int32 vectors of length 128.
    """
    size = 128
    actual = akg.tvm.placeholder((size, ), name='actual', dtype='int32')
    predict = akg.tvm.placeholder((size, ), name='predict', dtype='int32')
    k = akg.tvm.reduce_axis((0, size), name='k')

    def _count_hits(i, j):
        hit = akg.tvm.all(i == actual[k], j == predict[k])
        return akg.tvm.sum(akg.tvm.expr.Select(hit, 1.0, 0.0), axis=k)

    output = akg.tvm.compute((size, size), _count_hits)
    sch = akg.tvm.create_schedule(output.op)

    # build the cce kernel
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [actual, predict, output], "cce", polyhedral=True)
def concat_ad_run(shapes, dtype, axis, attrs):
    """
    Build the concat autodiff kernel and, outside tuning mode, run and check it.

    Args:
        shapes: shapes of the tensors to concatenate.
        dtype (str): data type of the inputs (lower-cased before use).
        axis (int): concatenation axis.
        attrs (dict): build attributes; the presence of a "tuning" key
            selects the tuning path, which also reads "kernel_name".

    Returns:
        - tuning key present and truthy: ``(mod, expect, tuple(args))``;
        - tuning key present and falsy: ``mod``;
        - otherwise: ``(inputs + (head_data, ), output, expect, compare_result)``.
    """
    inp_dtype = dtype.lower()

    # prepare inputs placeholder
    data = [
        akg.tvm.placeholder(one_shape, name="data_%d" % idx, dtype=inp_dtype)
        for idx, one_shape in enumerate(shapes)
    ]

    kernel_name = utils.genKernelName("concat", inp_dtype, shapes)
    res, head = concat_ad.concat_ad(data, axis)
    opvars = [head] + data + [res]
    s = akg.tvm.create_schedule(res.op)
    op_attrs = [axis]

    if 'tuning' not in attrs:
        # build the cce kernel
        with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
            mod = akg.build(s, opvars, "cce", name=kernel_name, attrs=attrs, polyhedral=True)
            print(mod.imported_modules[0].get_source())
        args, expect, head_data, inputs = gen_data(dtype, head, shapes)
        output = utils.mod_launch(mod, tuple(args), expect=expect)
        compare_result = compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
        return tuple(inputs) + (head_data, ), output, expect, compare_result

    tuning_on = attrs.get("tuning", False)
    kernel_name = attrs.get("kernel_name", False)
    mod = utils.op_build_test(concat_ad.concat_ad, [shapes], [dtype.lower()], op_attrs,
                              kernel_name=kernel_name, attrs=attrs, tuning=tuning_on)
    if not tuning_on:
        return mod
    args, expect, head_data, inputs = gen_data(dtype, head, shapes)
    return mod, expect, tuple(args)
def floormod(shape, dtype, kernel_name, attrs):
    r"""
    Compute element-wise remainder of division.
    \f$res=a - floor(a/b) * b\f$

    The quotient a/b is obtained by multiplying a with an approximation of
    1/b, refined by three Newton iterations.

    Args:
        shape (list): shape shared by both operands, any number of dims.
        dtype (str): parameters' type.
        kernel_name (str): a str about kernel_name.
        attrs: attributes forwarded unchanged to akg.build.

    Returns:
        The module returned by akg.build for the floormod kernel.
    """
    # The docstring is raw so that the Doxygen markers (\f$) are kept verbatim
    # instead of being read as form-feed escapes.
    vc_util.ops_dtype_check(dtype, [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])
    vc_util.check_shape(shape)

    a = akg.tvm.placeholder(shape=shape, name="a", dtype=dtype)
    b = akg.tvm.placeholder(shape=shape, name="b", dtype=dtype)

    # res = a - floor(a/b) * b
    # Newton's Method for VREC: para <- para * (2 - b * para)
    para = akg.lang.cce.vrec(b)
    for _ in range(3):
        tmp1 = akg.lang.cce.vmul(b, para)
        tmp2 = akg.lang.cce.vmuls(tmp1, -1)
        tmp3 = akg.lang.cce.vadds(tmp2, 2)
        para = akg.lang.cce.vmul(tmp3, para)

    c = akg.lang.cce.vmul(a, para)  # a / b
    d = akg.lang.cce.floor(c)       # floor(a / b)
    e = akg.lang.cce.vmul(d, b)     # floor(a / b) * b
    res = akg.lang.cce.vsub(a, e)

    s = akg.tvm.create_schedule(res.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [a, b, res], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def test_vmadd():
    """
    Build (x + 2) * x + x + 1 on a (10, 256) float16 tensor and check that
    the generated source contains "vmadd".
    """
    dtype = 'float16'
    shape = (10, 256)
    x = akg.tvm.placeholder(shape, name="x", dtype=dtype)

    def compute_func(*indices):
        shifted = x(*indices) + akg.tvm.const(2.0, dtype)
        return shifted * x(*indices) + x(*indices) + akg.tvm.const(1.0, dtype)

    res = akg.tvm.compute(shape, compute_func)
    sch = akg.tvm.create_schedule(res.op)

    # build the cce kernel
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [x, res], "cce", polyhedral=True)

    generated = mod.imported_modules[0].get_source()
    assert "vmadd" in generated
def test_quant(fmap_shape):
    """
    Build a quantize kernel that repacks a float16 NC1HWC0 feature map
    (C0 = 16) into an int8 NC1HWC0 tensor (C0 = 32) and print its source.

    Every element is computed as ``x * ScaleQ[0] + OffsetQ[0]`` and then
    cast to int8.

    Args:
        fmap_shape: feature map shape as (n, c, h, w); c must be a multiple
            of 32.

    Raises:
        AssertionError: if the channel count is not a multiple of 32.
    """
    # input shape (NCHW -> NC1HWC0)
    in_n, in_c, in_h, in_w = fmap_shape
    assert in_c % 32 == 0
    input_shape_nc1hwc0 = (in_n, in_c // 16, in_h, in_w, 16)

    # placeholder (NC1HWC0)
    FMap = akg.tvm.placeholder(input_shape_nc1hwc0, dtype='float16', name='FMap')
    ScaleQ = akg.tvm.placeholder((16, ), dtype='float16', name='ScaleQ')
    OffsetQ = akg.tvm.placeholder((16, ), dtype='float16', name='OffsetQ')

    out_shape_nc1hwc0 = (in_n, in_c // 32, in_h, in_w, 32)
    print(out_shape_nc1hwc0)

    # quantize
    # Output channel c1 * 32 + c0 lives in input block (c1 * 32 + c0) // 16,
    # i.e. 2 * c1 + c0 // 16, at offset c0 % 16. Using `c1 + c0 // 16` would
    # read some input blocks twice and never read others.
    Quant = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0: (FMap[n, c1 * 2 + c0 // 16, h, w, c0 % 16] * ScaleQ[0]
                                 + OffsetQ[0]).astype('int8'),
        name='output')

    info = dim.Dim()
    info.setdim(index=0, axis=0, tilel1=2, tilel0=0)
    info.setdim(index=0, axis=0, tilel1=32, tilel0=0)
    info.setdim(index=0, axis=0, tilel1=32, tilel0=0)
    info.setdim(index=0, axis=0, tilel1=16, tilel0=0)

    # schedule
    s = akg.tvm.create_schedule(Quant.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [FMap, ScaleQ, OffsetQ, Quant], 'cce', name='cce_quant',
                        attrs={'dim': str(info)}, polyhedral=True)
    source_code = mod.imported_modules[0].get_source()
    print(source_code)
def reduce_min_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Build the gradient kernel of reduce_min with a hand-written schedule.

    The gradient of reduce_min is supplied through a custom differentiation
    rule, then every intermediate stage is placed in the "local.UB" scope and
    attached to the tiled output loops.

    Args:
        input_shape: shape of the input placeholder.
        dtype (str): dtype of the input placeholder.
        axis: reduction axis passed to ``reduce_min.reduce_min``.
        keepdims: not used by this function.
        polyhedral (bool): forwarded to ``akg.build``.
        attrs (dict): forwarded to ``akg.build``; ``attrs['tile']`` supplies
            one split factor per tiled output axis. The default None cannot
            be used, since ``attrs['tile']`` is read unconditionally.

    Returns:
        The module returned by ``akg.build``. The generated source is also
        written through ``utils.create_code`` under the name
        "reduce_min_ad_manual_schedule".
    """
    def get_shape(pld):
        # extents of the tensor as plain Python values
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # Only works for the last axis and 2-D inputs.
    # Needs to be extended to multiple dimensions and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        # Custom gradient rule: grad[i] goes to the positions of row i that
        # equal the row minimum, 0 everywhere else.
        # For inputs that are not 2-D nothing is returned (implicit None).
        data = inputs[0]
        shape = get_shape(data)
        if len(get_shape(data)) == 2:
            # add an extra stage to avoid alignment problem
            min_input = akg.tvm.compute(data.shape, lambda *i: data(*i), name="min_input")
            min_ = akg.lang.cce.reduce_min(min_input, axis=-1, keepdims=True)
            min_broadcast = akg.lang.cce.broadcast(min_, shape)
            if dtype != "float16":
                data = cast(data, "float16")
            return [
                akg.tvm.compute(shape,
                                lambda i, j: akg.tvm.expr.Select(
                                    data[i, j] == min_broadcast[i, j],
                                    grad[i],
                                    akg.tvm.const(0, dtype="float16")),
                                name="reduce_min_ad2")
            ]

    L = reduce_min.reduce_min(data, axis)
    head = akg.tvm.placeholder(L.shape, name="head", dtype=L.dtype)
    head_cast = cast(head, "float16")

    # differentiate L w.r.t. data, overriding the rule for L with the one above
    [dL_ddata] = akg.differentiate(L, [data], head_cast, None, None,
                                   override={L: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dL_ddata.op])

    # --- cache stages -------------------------------------------------------
    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if dtype == "float16":
        # no cast stage in between: the gradient reads `data` directly
        data_ub = s.cache_read(data, "local.UB", [dL_ddata])
    else:
        # the reader is the first input of the gradient
        # (presumably the float16 cast of `data` -- TODO confirm)
        data_ub = s.cache_read(data, "local.UB", [dL_ddata.op.input_tensors[0]])

    # NOTE(review): the chains below walk producers starting from the second
    # input of the gradient; judging by the custom rule these are the
    # broadcast / reduce_min / min_input stages, but the exact stage reached
    # at each depth depends on how akg lowers them -- confirm before editing.
    min_input_ub = s.cache_read(
        dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
        input_tensors[0].op.input_tensors[0].op.input_tensors[0],
        "local.UB",
        [
            dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
            input_tensors[0].op.input_tensors[0]
        ])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0].
      op.input_tensors[0]].set_scope("local.UB")

    dL_ddata_ub = s.cache_write(dL_ddata, "local.UB")

    # --- tiling -------------------------------------------------------------
    # split_axis["axis<i>"] holds the (outer, inner) pair of output axis i
    split_axis = {}
    for i in range(len(attrs['tile'])):
        split_axis["axis" + str(i)] = s[dL_ddata].split(dL_ddata.op.axis[i], attrs["tile"][i])

    # sorted by key, so [-1] is the last tiled axis and [0] the first one;
    # [..][1][0] is the outer iterator, [..][1][1] the inner one
    split_axis_sorted = sorted(split_axis.items())

    # --- attach stages to the tiled loops -----------------------------------
    if dtype == "float16":
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    else:
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].set_scope("local.UB")

    s[min_input_ub].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])
    s[head_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1]].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[dL_ddata.op.input_tensors[1]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].compute_at(
        s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].set_scope("local.UB")

    # L is not being used for computation, so it is not scheduled:
    # s[L].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    # s[L].set_scope("local.UB")
    s[dL_ddata_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dL_ddata], "cce",
                        name="reduce_min_ad_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        utils.create_code(kernel_name, './', source_code)
    return mod
def im2col_manual_schedule(shape, kernel, stride, pad, dtype, polyhedral=True, attrs=None):
    '''
    Build im2col by calling the cce load3d (im2col) intrinsic directly.

    Args:
        shape: shape of the data, (b, c1, h, w, c0)
        kernel: kernel sizes for im2col, (kernel_h, kernel_w)
        stride: stride sizes for im2col, (stride_h, stride_w)
        pad: padding sizes for im2col, including padding top, bottom, left, and right
        dtype: type of the data
        polyhedral: forwarded to akg.build
        attrs: forwarded to akg.build

    Return:
        the module built for the im2col kernel; its source is also written
        through utils.create_code as "im2col_manual_schedule"
    '''
    load3d = intrin_load3d(dtype)

    b, c1, h, w, c0 = shape
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad

    # fixed arguments of the intrinsic
    dilation_w, dilation_h = 1, 1
    jump_offset = 1
    repeat_mode = 0
    repeat_time = 1
    csize = 0
    block_size = 16

    # output size <=> number of windows
    ho = (h + pad_b + pad_t - kernel_h) // stride_h + 1
    wo = (w + pad_r + pad_l - kernel_w) // stride_w + 1

    # windows are grouped in batches of block_size (rounded up)
    window_batches = (ho * wo + block_size - 1) // block_size
    im2col_shape = (b, window_batches, c1 * kernel_h * kernel_w, block_size, c0)

    def _im2col_compute(i, j, k, data):
        # top-left corner, in the unpadded image, of the first window of batch j
        j_h = (((j*block_size) // wo)*stride_h)-pad_t
        j_w = (((j*block_size) % wo)*stride_w)-pad_l

        # position inside the kernel and channel block addressed by k
        w_idx_kernel = (k % kernel_w)
        h_idx_kernel = ((k // kernel_w) % kernel_h)
        c1_idx = (k // kernel_w) // kernel_h

        # effective top / bottom padding seen by this slice
        pad_t_3d = tvm.max(-j_h, 0)
        pad_b_3d = tvm.max(((j_h+kernel_h) - h), 0)
        # num rows in l1 for fmatrix is discounted by the amount of bottom padding
        h_3d = kernel_h - tvm.max(((j_h+kernel_h) - h), 0)

        w_idx = j_w
        # when this is < 0, the slice will start from row 0 so there is no
        # redundancy between base address and this param
        h_idx = tvm.min(j_h, 0)

        # The whole width is taken, together with the rows that cover every
        # window of the current window-batch (assume padding < kernel size).
        load3d_input = data[i,
                            c1_idx,
                            tvm.max(0, j_h):tvm.min(h, j_h+kernel_h),
                            0:w,
                            0:c0]
        return load3d(load3d_input,
                      w, h_3d, pad_l, pad_r, pad_t_3d, pad_b_3d,
                      w_idx_kernel, h_idx_kernel, w_idx, h_idx, 0,
                      stride_w, stride_h, kernel_w, kernel_h,
                      dilation_w, dilation_h,
                      jump_offset, repeat_mode, repeat_time, csize)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")
    res = tvm.compute(im2col_shape,
                      lambda i, j, k: _im2col_compute(i, j, k, data),
                      name='im2col_fractal')

    # schedule: the input is staged in "local.L1", the result in "local.UB"
    sch = tvm.create_schedule([res.op])
    data_l1 = sch.cache_read(data, "local.L1", [res])
    res_ub = sch.cache_write(res, "local.UB")
    sch[data_l1].compute_at(sch[res], res.op.axis[0])
    sch[res_ub].compute_at(sch[res], res.op.axis[2])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [data, res], "cce", name="im2col_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        utils.create_code("im2col_manual_schedule", './', source_code)
    return mod
def maxpool_manual_schedule(shape, kernel, stride, padding, dtype, attrs=None, polyhedral=False):
    """
    maxpool with manual schedule.

    Args:
        shape: input shape, checked as a 5-D "NC1HWC0" layout.
        kernel: (kernel_h, kernel_w).
        stride: (stride_h, stride_w).
        padding: either (pad_h, pad_w), or four values of which the first
            and third are used as pad_h and pad_w. Padded positions are
            filled with 0.
        dtype (str): checked against ``vc_util.DtypeForDavinci.ALL_FLOAT``.
        attrs (dict): required; ``attrs['tile']`` holds one split factor per
            output dimension. Also forwarded to ``akg.build``.
        polyhedral (bool): forwarded to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        RuntimeError: if padding has neither 2 nor 4 entries, or the number
            of tiling factors differs from the output rank.
        Exception: if attrs is None.
    """
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    maxpool_param_check(kernel, stride, padding)

    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    batch_size, in_c1, input_h, input_w, in_c0 = data.shape

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(padding) == 2:
        pad_h, pad_w = padding
    elif len(padding) == 4:
        pad_h, pad_w = padding[0], padding[2]
    else:
        # without this branch pad_h / pad_w would be unbound further down
        raise RuntimeError("padding should have 2 or 4 entries, but got %d" % len(padding))

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    # padding operation
    if pad_h != 0 or pad_w != 0:
        pad_shape = (batch_size, in_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, in_c0)

        padded_input = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(
                    h > input_h + pad_h - 1,
                    h < pad_h,
                    w > input_w + pad_w - 1,
                    w < pad_w,
                ),
                akg.tvm.const(0.0, dtype=dtype),
                data[n, c1, h - pad_h, w - pad_w, c0],
            ),
            name="padded_input")
    else:
        padded_input = data

    # reduce iterators
    it_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="iterator_reduction_height")
    it_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="iterator_reduction_width")

    out_shape = (batch_size, in_c1, out_size_h, out_size_w, in_c0)

    res = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: akg.tvm.max(
            padded_input[n, c1, (h * stride_h + it_kernel_h), (w * stride_w + it_kernel_w), c0],
            axis=[it_kernel_h, it_kernel_w]),
        name="maxpool_not_hybrid")

    s = akg.tvm.create_schedule([res.op])

    # re-fetch the padding stage from the result (or the result itself when
    # there is no padding stage)
    if pad_w != 0 or pad_h != 0:
        padded_input = res.op.input_tensors[0]
    else:
        padded_input = res

    # cache reads and writes
    # after this cache write: reference to res_ub to change the reduction axis
    res_ub = s.cache_write(res, "local.UB")
    if pad_w != 0 or pad_h != 0:
        data_ub = s.cache_read(data, "local.UB", [padded_input])
    else:
        data_ub = s.cache_read(data, "local.UB", [res_ub])

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    if len(tiling_factors) != len(res.shape):
        raise RuntimeError("tiling factors mismatch out shape")
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[res_ub].split(res_ub.op.axis[index], factor))

    # get iterators
    iterator_b_outer = split_iterators[0][0]
    iterator_b_inner = split_iterators[0][1]
    iterator_c1_outer = split_iterators[1][0]
    iterator_c1_inner = split_iterators[1][1]
    iterator_h_outer = split_iterators[2][0]
    iterator_h_inner = split_iterators[2][1]
    iterator_w_outer = split_iterators[3][0]
    iterator_w_inner = split_iterators[3][1]
    iterator_c0_outer = split_iterators[4][0]
    iterator_c0_inner = split_iterators[4][1]
    # reduction axis
    iterator_reduce_h = res_ub.op.reduce_axis[0]
    iterator_reduce_w = res_ub.op.reduce_axis[1]

    # move caches
    s[res_ub].compute_at(s[res], res.op.axis[0])
    s[data_ub].compute_at(s[res_ub], iterator_c1_outer)

    if pad_w != 0 or pad_h != 0:
        s[padded_input].compute_at(s[res_ub], iterator_c1_outer)
        s[padded_input].set_scope("local.UB")

    # reorder computation: the two reduction iterators go right before c0
    s[res_ub].reorder(iterator_b_outer, iterator_b_inner, iterator_c1_outer, iterator_c1_inner,
                      iterator_h_outer, iterator_h_inner, iterator_w_outer, iterator_w_inner,
                      iterator_reduce_h, iterator_reduce_w, iterator_c0_outer, iterator_c0_inner)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res], "cce", name="maxpool_manual_schedule", attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        # NOTE(review): the dump name ("maxpool_ad_...") differs from the
        # built kernel name, and this is the only helper here that calls
        # utils.create_cce rather than utils.create_code -- confirm both.
        kernel_name = "maxpool_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
def col2im_manual_schedule(shape, kernel, stride, pad, dtype, output_H_W, polyhedral=True, attrs=None):
    """
    Col2im operation with manual schedule.

    Args:
        shape (Union[list, tuple]): seven int numbers for the input's size,
            unpacked as (N, C1, KH, KW, OH, OW, C0).
        kernel (Union[list, tuple]): two int numbers for the sliding window's size.
        stride (Union[list, tuple]): two int numbers for the sliding window's stride.
        pad (Union[list, tuple]): four int numbers for padding's sizes: top, bottom, left, and right.
        dtype (str): parameters' type.
        output_H_W (Union[list, tuple]): two int numbers for the output's height and width.
        polyhedral (bool): forwarded to akg.build, default value is True.
        attrs (dict): forwarded to akg.build.

    Returns:
        The module built for the col2im kernel; its source is also written
        through utils.create_code as "col2im_manual_schedule".

    Raises:
        AssertionError: if output_H_W does not match the size implied by
            shape, kernel, stride and pad.
    """
    N, C1, KH, KW, OH, OW, C0 = shape
    H, W = output_H_W
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad

    expected_h = (OH - 1) * stride_h + kernel_h - (pad_t + pad_b)
    expected_w = (OW - 1) * stride_w + kernel_w - (pad_l + pad_r)
    assert H == expected_h, "Height of input and output do not match"
    assert W == expected_w, "Width of input and output do not match"

    output_shape = (N, C1, H, W, C0)
    col2im = intrin_col2im(shape, output_shape, kernel, stride, pad, dtype)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")

    # This compute only fixes the access pattern; the stage staged in
    # "local.UB" is replaced by the col2im intrinsic below.
    res = tvm.compute(output_shape,
                      lambda b, c1, h, w, c0: data(b, c1, h % KH, w % KW, h % OH, w % OW, c0),
                      name="col2im_intrinsic")

    # schedule: stage input and output in "local.UB" under the c1 loop
    sch = tvm.create_schedule([res.op])
    res_ub = sch.cache_write(res, "local.UB")
    data_ub = sch.cache_read(data, "local.UB", [res_ub])

    _, c1_axis, _, _, _ = res.op.axis
    for staged in (data_ub, res_ub):
        sch[staged].compute_at(sch[res], c1_axis)
    sch[res_ub].tensorize(res_ub.op.axis[0], col2im)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [data, res], "cce", name="col2im_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        utils.create_code("col2im_manual_schedule", "./", source_code)
    return mod
def vector_matmul(data_m, data_n, data_k, trans_a, trans_b, dtype, kernel_name, attrs):
    """
    Build a matrix multiplication kernel written with hybrid script.

    Args:
        data_m (int): number of rows of the result.
        data_n (int): number of columns of the result.
        data_k (int): size of the reduced dimension.
        trans_a (bool): whether the first operand is indexed as [k, m].
        trans_b (bool): whether the second operand is indexed as [n, k].
        dtype (str): "float16" or "float32".
        kernel_name (str): name given to the built kernel and the dumped source.
        attrs (dict): forwarded unchanged to akg.build.

    Returns:
        Tuple (mod, output_shape): the module returned by akg.build and the
        result shape (data_m, data_n).

    Raises:
        TypeError: if dtype is not in the supported list.
        ValueError: if both trans_a and trans_b are set.
    """
    check_list = ["float16", "float32"]
    if dtype not in check_list:
        raise TypeError("matmul test only support %s while dtype is %s" % (",".join(check_list), dtype))

    m = data_m
    n = data_n
    k = data_k
    # operand shapes come from the file-level helper get_shape
    data_shape, weight_shape = get_shape(m, n, k, trans_a, trans_b)
    output_shape = (m, n)

    A = akg.tvm.placeholder(data_shape, name='A', dtype=dtype)
    B = akg.tvm.placeholder(weight_shape, name='B', dtype=dtype)
    ZERO = akg.tvm.const(0.0, dtype=dtype)

    # NOTE: the bodies of the @script functions are kept exactly as they were;
    # they are consumed by the hybrid script front end, not run as plain Python.

    # a is [m, k], b is [k, n]
    @script
    def matmul_hybrid_f_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_m, i_k] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    # a is [m, k], b is [n, k]
    @script
    def matmul_hybrid_f_t(a, b, zero):
        t_1 = allocate((m, n, k), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_n in range(0, n):
                t_2[i_m, i_n] = zero
                for i_k in range(0, k):
                    t_1[i_m, i_n, i_k] = a[i_m, i_k] * b[i_n, i_k]
                    t_2[i_m, i_n] = t_1[i_m, i_n, i_k] + t_2[i_m, i_n]
        return t_2

    # a is [k, m], b is [k, n]
    @script
    def matmul_hybrid_t_f(a, b, zero):
        t_1 = allocate((m, k, n), a.dtype, 'local')
        t_2 = output_tensor((m, n), a.dtype)
        for i_m in range(0, m):
            for i_k in range(0, k):
                for i_n in range(0, n):
                    t_1[i_m, i_k, i_n] = a[i_k, i_m] * b[i_k, i_n]
            for i1_n in range(0, n):
                t_2[i_m, i1_n] = zero
            for i1_k in range(0, k):
                for i1_n in range(0, n):
                    t_2[i_m, i1_n] = t_2[i_m, i1_n] + t_1[i_m, i1_k, i1_n]
        return t_2

    # pick the variant matching the transpose flags
    if not trans_a and not trans_b:
        C = matmul_hybrid_f_f(A, B, ZERO)
    elif not trans_a and trans_b:
        C = matmul_hybrid_f_t(A, B, ZERO)
    elif trans_a and not trans_b:
        C = matmul_hybrid_t_f(A, B, ZERO)
    else:
        raise ValueError('Not support both transpose yet')

    forward_s = akg.tvm.create_schedule(C.op)
    op_vars = [A, B, C]
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(forward_s, op_vars, "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        # dump the generated source into the working directory
        source_code = mod.imported_modules[0].get_source()
        utils.create_code(kernel_name, "./", source_code)
    return mod, output_shape
def maxpool_ad_manual_schedule_no_overlap_all_max(shape, kernel, stride, pad, dtype, attrs=None, polyhedral=False):
    """
    Build the maxpool gradient kernel for non-overlapping windows with a manual schedule.

    The gradient is supplied through a custom override passed to
    akg.differentiate: every input position whose value equals the pooled
    output of its window receives the head value of that window.

    Args:
        shape (Union[list, tuple]): five ints, unpacked as
            (batch_size, input_c1, input_h, input_w, input_c0).
        kernel (Union[list, tuple]): two ints, window height and width.
        stride (Union[list, tuple]): two ints, window strides.
        pad (Union[list, tuple]): four values; only the first two are used,
            as pad_h and pad_w, the remaining two are ignored.
        dtype (str): data type of all placeholders.
        attrs (dict): forwarded unchanged to akg.build, default None.
        polyhedral (bool): forwarded unchanged to akg.build, default False.

    Returns:
        The module returned by akg.build.

    Raises:
        RuntimeError: raised by the override function when kernel and stride
            differ in either dimension.
    """
    kernel_name = "maxpool_ad_manual_schedule_no_overlap_all_max"

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        # windows must tile the input exactly, otherwise floordiv(h, stride)
        # would not map a position to a single window
        if stride_w != kernel_w or stride_h != kernel_h:
            raise RuntimeError("Only supports kernels with same dimensions as stride size!")

        # copy output to the shape of the padded input, copying the same value for the entire kernel size
        # (this tensor used to be built twice with identical arguments; the
        # first instance was discarded and never became part of the graph)
        out_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: out(b, c1, akg.tvm.floordiv(h, stride_h),
                                        akg.tvm.floordiv(w, stride_w), c0),
            name="out_broadcast")

        # copy head to the shape of the padded input, copying the same value for the entire kernel size
        head_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: head_(b, c1, akg.tvm.floordiv(h, stride_h),
                                          akg.tvm.floordiv(w, stride_w), c0),
            name="head_broadcast")

        # check if value was a maximum and assign head of that position if it was
        # this is done for all the maximum values within one kernel
        # NOTE: the operand order inside Select is relied on below, where the
        # broadcast tensors are fetched back through result.op.input_tensors
        result = akg.tvm.compute(
            in_data.shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                in_data(b, c1, h, w, c0) == out_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                head_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                akg.tvm.const(0, dtype=in_data.dtype)),
            name="result")
        return [result]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(forward, [data], head, None, None,
                                   override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    result = dl_ddata
    forward_broadcast = result.op.input_tensors[1]
    head_broadcast = result.op.input_tensors[2]

    # cache reads and writes
    result_ub = s.cache_write(result, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [result_ub])
    head_ub = s.cache_read(head, "local.UB", [head_broadcast])
    forward_ub = s.cache_read(forward, "local.UB", [forward_broadcast])

    s[head_broadcast].set_scope("local.UB")
    s[forward_broadcast].set_scope("local.UB")

    s[head_ub].compute_at(s[head_broadcast], head_broadcast.op.axis[0])
    s[forward_ub].compute_at(s[forward_broadcast], forward_broadcast.op.axis[0])
    s[data_ub].compute_at(s[result_ub], result_ub.op.axis[0])
    s[forward_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])
    s[head_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])

    _, c1, h, _, _ = result.op.axis

    # when a padded side exceeds 32, attach the UB stage to an h split of
    # factor 4; otherwise attach it directly under c1
    if input_h + 2 * pad_h > 32 or input_w + 2 * pad_w > 32:
        h_outer, _ = s[result].split(h, 4)
        s[result_ub].compute_at(s[result], h_outer)
    else:
        s[result_ub].compute_at(s[result], c1)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata], "cce",
                        name=kernel_name, attrs=attrs, polyhedral=polyhedral)
        # dump the generated source into the working directory
        source_code = mod.imported_modules[0].get_source()
        utils.create_cce(kernel_name, './', source_code)
    return mod
def maxpool_ad_manual_schedule_all_max(shape, kernel, stride, pad, dtype, polyhedral=True, attrs=None):
    """
    Build the maxpool gradient kernel (all maxima receive the gradient) with a manual schedule.

    The gradient is supplied through a custom override passed to
    akg.differentiate. Every input element that equals the pooled output of a
    window it belongs to receives that window's head value; contributions of
    all windows are summed.

    Args:
        shape (Union[list, tuple]): five ints, unpacked as
            (batch_size, input_c1, input_h, input_w, input_c0).
        kernel (Union[list, tuple]): two ints, window height and width.
        stride (Union[list, tuple]): two ints, window strides.
        pad (Union[list, tuple]): four values; only the first two are used,
            as pad_h and pad_w, the remaining two are ignored.
        dtype (str): data type of the placeholders and of the zero constants.
        polyhedral (bool): forwarded unchanged to akg.build, default True.
        attrs (dict): forwarded unchanged to akg.build, default None.

    Returns:
        The module returned by akg.build.
    """
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    # input shape with pad_h / pad_w added on both sides of H and W
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0)
    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        # Override handed to akg.differentiate; returns a one-element list
        # holding the gradient with respect to inputs[0].
        in_data = inputs[0]
        # layout: (window row, window col, batch, c1, out row, out col, c0)
        data_separated_by_windows = (kernel_h, kernel_w, batch_size, input_c1, out_size_h, out_size_w, input_c0)

        # input surrounded by a zero border of pad_h / pad_w
        pad_data = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.all(h >= pad_h, h < input_h + pad_h, w >= pad_w, w < input_w + pad_w),
                in_data(b, c1, h - pad_h, w - pad_w, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="pad_data")

        # padded input regrouped so each (wh, ww) slice holds one window offset
        data_reshaped = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: pad_data(
                b, c1, oh * stride_h + wh, ow * stride_w + ww, c0),
            name="data_reshaped")

        # pooled output repeated for every window offset
        max_broadcast = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: out(b, c1, oh, ow, c0),
            name="max_broadcast")

        # head value where the element equals its window maximum, zero elsewhere
        # NOTE: the operand order here is relied on by the schedule below,
        # which fetches tensors back through equal.op.input_tensors
        equal = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: akg.tvm.expr.Select(
                max_broadcast(wh, ww, b, c1, oh, ow, c0) == data_reshaped(
                    wh, ww, b, c1, oh, ow, c0),
                head_(b, c1, oh, ow, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="equal")

        # per output window (oh, ow): place its values at padded-input
        # coordinates, zero for positions outside that window
        data_reorg = akg.tvm.compute(
            (out_size_h, out_size_w, batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0),
            lambda oh, ow, b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(h < oh * stride_h, h > oh * stride_h + kernel_h - 1,
                            w < ow * stride_w, w > ow * stride_w + kernel_w - 1),
                akg.tvm.const(0, dtype=dtype),
                equal(h - oh * stride_h, w - ow * stride_w, b, c1, oh, ow, c0)
            ),
            name="data_reorg")

        # sum the contributions of all windows (axes 0 and 1 are oh, ow)
        result_pad = akg.topi.sum(data_reorg, [0, 1])

        # drop the padding border to get back to the input shape
        result = akg.tvm.compute(shape, lambda b, c1, h, w, c0: result_pad(
            b, c1, h + pad_h, w + pad_w, c0), name="result")
        return [result]

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")

    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata ] = akg.differentiate(forward, [data], head, None, None,
                                    override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations
    # the intermediate tensors are recovered by walking input_tensors; the
    # indices presumably follow the operand order of the compute expressions
    # in custom_maxpool_fdiff -- confirm before reordering anything there
    result = dl_ddata
    result_pad = result.op.input_tensors[0]
    data_reorg = result_pad.op.input_tensors[0]
    equal = data_reorg.op.input_tensors[0]
    max_broadcast = equal.op.input_tensors[0]
    data_reshaped = equal.op.input_tensors[1]
    pad_data = data_reshaped.op.input_tensors[0]

    # cache stages in "local.UB" for the three inputs and for the output
    data_ub = s.cache_read(data, "local.UB", [pad_data])
    head_ub = s.cache_read(head, "local.UB", [equal])
    forward_ub = s.cache_read(forward, "local.UB", [max_broadcast])
    result_ub = s.cache_write(result, "local.UB")

    # keep every intermediate tensor in "local.UB"
    s[max_broadcast].set_scope("local.UB")
    s[data_reshaped].set_scope("local.UB")
    s[pad_data].set_scope("local.UB")
    s[equal].set_scope("local.UB")
    s[data_reorg].set_scope("local.UB")
    s[result_pad].set_scope("local.UB")

    # these stages are inlined into their consumers
    s[data_ub].compute_inline()
    s[result_ub].compute_inline()
    s[pad_data].compute_inline()

    # equal dependencies
    s[forward_ub].compute_at(s[equal], equal.op.axis[0])
    s[max_broadcast].compute_at(s[equal], equal.op.axis[0])
    s[data_reshaped].compute_at(s[equal], equal.op.axis[0])
    s[head_ub].compute_at(s[equal], equal.op.axis[0])

    s[equal].compute_at(s[result_pad], result_pad.op.axis[0])

    # result dependencies
    s[data_reorg].compute_inline()
    # put the reduction axes (oh, ow) outermost in result_pad
    b, c1, h, w, c0 = result_pad.op.axis
    oh, ow = result_pad.op.reduce_axis
    s[result_pad].reorder(oh, ow, b, c1, h, w, c0)
    # s[result_pad].compute_at(s[result], result.op.axis[1])

    # split h of the final result by stride_h and attach result_pad there
    b, c1, h, w, c0 = result.op.axis
    h_out, _ = s[result].split(h, stride_h)
    s[result_pad].compute_at(s[result], h_out)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata], "cce",
                        name="maxpool_ad_manual_schedule_all_max", attrs=attrs, polyhedral=polyhedral)
        # dump the generated source into the working directory
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_all_max"
        utils.create_cce(kernel_name, './', source_code)
    return mod
def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Build the reduce_max gradient kernel with a manual schedule.

    The gradient is supplied through a custom override passed to
    akg.differentiate. The override works in float16; inputs of any other
    dtype are cast to float16 first and the result is cast back.

    Args:
        input_shape (Union[list, tuple]): shape of the input placeholder.
        dtype (str): data type of the input placeholder.
        axis: reduction axis/axes, forwarded to reduce_max.
        keepdims (bool): forwarded to reduce_max and get_reduced_indices.
        polyhedral (bool): forwarded unchanged to akg.build, default True.
        attrs (dict): must contain 'tile', holding one split factor per
            dimension of the gradient; also forwarded to akg.build.

    Returns:
        The module returned by akg.build.

    Raises:
        Exception: if attrs is None.
        AssertionError: if the number of tiling factors differs from the
            number of gradient dimensions.
    """
    kernel_name = "reduce_max_ad_manual_schedule"
    needs_cast = dtype != 'float16'

    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        src = inputs[0]
        full_shape = src.shape

        # maximum along the reduced axis, kept broadcastable
        reduced_max = akg.lang.cce.reduce_max(src, axis=axis, keepdims=True)

        # maximum expanded back to the input shape
        expanded_max = akg.lang.cce.broadcast(reduced_max, full_shape)

        # head expanded to the input shape, needed for the selection below
        expanded_head = akg.tvm.compute(
            full_shape,
            lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))

        # keep the head value where the input equals the maximum, zero elsewhere
        # (operand order matters: the tensors are fetched back by index later)
        selected = akg.tvm.compute(
            full_shape,
            lambda *indices: akg.tvm.expr.Select(
                src(*indices) == expanded_max(*indices),
                expanded_head(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")

        # go back to the caller's dtype when it is not float16
        if needs_cast:
            return [cast(selected, dtype)]
        return [selected]

    # input tensor
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # forward op; only used as the differentiation target, not scheduled
    fwd = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint of the forward output
    head = akg.tvm.placeholder(fwd.shape, name="head", dtype=fwd.dtype)

    # float16 views of both inputs
    if needs_cast:
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast, head_cast = data, head

    # differentiate with the custom rule
    [dl_ddata] = akg.differentiate(
        fwd, [data_cast], head_cast, None, None,
        override={fwd: ([data_cast], custom_reduce_max_fdiff)})

    # recover the tensors created inside the custom rule
    if needs_cast:
        # dl_ddata is the cast; the selection is its only input
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        select_op = max_values_and_zeros.op
    else:
        select_op = dl_ddata.op
    max_broadcast = select_op.input_tensors[1]
    max_ = max_broadcast.op.input_tensors[0]
    head_broadcast = select_op.input_tensors[2]

    # schedule of the gradient; inputs are data and head
    sch = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of the inputs
    if needs_cast:
        head_ub = sch.cache_read(head, "local.UB", [head_cast])
        data_ub = sch.cache_read(data, "local.UB", [data_cast])
    else:
        # without casts the inputs feed the gradient tensors directly
        head_ub = sch.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = sch.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write of the output
    dl_ddata_ub = sch.cache_write(dl_ddata, "local.UB")

    # tiling factors come from attrs
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    assert len(tiling_factors) == len(dl_ddata.shape)

    # split every axis of the final compute, remembering the (outer, inner) pairs
    split_iterators = [
        sch[dl_ddata].split(dl_ddata.op.axis[index], factor)
        for index, factor in enumerate(tiling_factors)
    ]

    # everything is attached under the outer part of the first axis
    attach_axis = split_iterators[0][0]

    # stages that only exist when casting
    if needs_cast:
        for stage in (data_cast, head_cast, max_values_and_zeros):
            sch[stage].compute_at(sch[dl_ddata], attach_axis)
            sch[stage].set_scope("local.UB")

    # cache reads and write
    for stage in (data_ub, head_ub, dl_ddata_ub):
        sch[stage].compute_at(sch[dl_ddata], attach_axis)

    # tensors of the gradient computation itself
    for stage in (max_, max_broadcast, head_broadcast):
        sch[stage].compute_at(sch[dl_ddata], attach_axis)
        sch[stage].set_scope("local.UB")

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [head, data, dl_ddata], "cce",
                        name=kernel_name, attrs=attrs, polyhedral=polyhedral)
        # dump the generated source into the working directory
        utils.create_cce(kernel_name, './', mod.imported_modules[0].get_source())
    return mod
def fc(fMapBatch, weight, fc_dtype, block_size, attrs, kernel_name="Fully_Connected"):
    """
    Build a fully connected kernel.

    Args:
        fMapBatch: object whose `shape` unpacks into four values, read as NCHW.
        weight: object whose `shape` unpacks into four values, read as NCHW.
        fc_dtype (str): data type of both placeholders.
        block_size (int): block size used to split the channel dimensions.
        attrs (dict): forwarded unchanged to akg.build.
        kernel_name (str): name of the built kernel.

    Returns:
        The module returned by akg.build.

    Raises:
        RuntimeError: if C, H or W of the two shapes differ, or if the
            weight's N is smaller than 32.
    """
    # both shapes are NCHW
    fmap_n, fmap_c, fmap_h, fmap_w = fMapBatch.shape
    wgt_n, wgt_c, wgt_h, wgt_w = weight.shape

    if fmap_c != wgt_c or fmap_h != wgt_h or fmap_w != wgt_w or wgt_n < 32:
        raise RuntimeError("invalid input shape")

    # feature map placeholder in NC1HWC0 layout
    fmap = akg.tvm.placeholder(
        (fmap_n, fmap_c // block_size, fmap_h, fmap_w, block_size),
        dtype=fc_dtype, name='fmap')

    # weight placeholder in fractal layout
    wgt = akg.tvm.placeholder(
        (wgt_c // block_size * wgt_h * wgt_w, wgt_n // block_size, block_size, block_size),
        dtype=fc_dtype, name='weight')

    result_shape = (fmap_n, wgt_n // block_size, 1, 1, block_size)

    # reduction extents: the C1, H, W and C0 parts of the NC1HWC0 weight shape
    red_c1, red_h, red_w, red_c0 = wgt_c // block_size, wgt_h, wgt_w, block_size
    kc1 = akg.tvm.reduce_axis((0, red_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, red_h), name='kh')
    kw = akg.tvm.reduce_axis((0, red_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, red_c0), name='kc0')

    def _mad(n, c1, h, w, c0):
        # one output element: multiply-accumulate over all four reduce axes
        product = fmap[n, kc1, (h + kh), (w + kw), kc0] * wgt[(kc1 * red_h + kh) * red_w + kw, c1, c0, kc0]
        return akg.lang.cce.mmad(product, axis=[kc1, kh, kw, kc0])

    res = akg.tvm.compute(result_shape, _mad, name="res")

    sch = akg.tvm.create_schedule(res.op)
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [fmap, wgt, res], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="", attrs=None,
             log_cce=False, dump_ir=True, dump_cce=True, polyhedral=True, tuning=False):
    """
    Return module built from op_func with given inputs.

    Args:
        op_func (function returning an op or (op, [op_vars])): The op build function.
        input_shapes(iterable of iterable of int): the dim sizes for input for op.
            An entry may also be a list of shapes (one placeholder is created
            per shape) or an existing tensor (used as is).
        input_types (iterable of iterable of str): the dtypes for each input.
        op_attrs (list or tuple): extra attributes for the op.
        kernel_name (str): name of op; when empty, op_func.__name__ (or the
            schedule template's 'op_name') is used.
        attrs (dict): tiling parameter. NOTE: the dict is modified in place
            ('dim' and keys returned by op_func may be set, BINDS is popped).
        log_cce (bool): False by default.
        dump_ir (bool): True by default.
        dump_cce (bool): True by default.
        polyhedral (bool): True by default; forced to False when op_func
            returns a schedule function.
        tuning (bool): False by default.

    Return:
        module. When tuning is set or attrs requests tiling help, a tuple
        (spaces, set_dim_key) is returned instead. None is returned when
        akg.build yields None.
    """
    var_type = akg.tvm.expr.Var

    # ---- create one placeholder (or list of placeholders) per input ----
    inputs = []
    set_dim_key = ""
    shape_params = []
    for i, (shape, dtype) in enumerate(zip(input_shapes, input_types)):
        if isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], (list, tuple)):
            # a list of shapes: one placeholder per shape, grouped in a list
            tmp_input = []
            for j, tmp_shape in enumerate(shape):
                tmp_input.append(akg.tvm.placeholder(tmp_shape, dtype, "input_%d_%d" % (i + 1, j + 1)))
                shape_params.extend(tmp for tmp in tmp_shape if isinstance(tmp, var_type))
            inputs.append(tmp_input)
        elif isinstance(shape, (list, tuple)) and shape and isinstance(shape[0], var_type):
            # shape starting with a symbolic dimension
            inputs.append(akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))
            shape_params.extend(dim for dim in shape if isinstance(dim, var_type))
        elif isinstance(shape, akg.tvm.tensor.Tensor):
            # an existing tensor is passed through; all of its dims are recorded
            inputs.append(shape)
            shape_params.extend(shape.shape)
        else:
            inputs.append(akg.tvm.placeholder(shape, dtype, "input_%d" % (i + 1)))

    # ---- collect symbolic values that appear in the op attributes ----
    attrs_params = []
    if op_attrs is not None:
        # list() so that a tuple of attributes (allowed by the docstring)
        # can be concatenated with the list of inputs
        args = inputs + list(op_attrs)
        for tmp_attr in op_attrs:
            if isinstance(tmp_attr, (list, tuple)) and tmp_attr and isinstance(tmp_attr[0], var_type):
                attrs_params.extend(p for p in tmp_attr if isinstance(p, var_type))
            elif isinstance(tmp_attr, var_type):
                attrs_params.append(tmp_attr)
    else:
        args = inputs

    # backup inputs because the tensor names may be updated inside op_func
    inputs_backup = recursive_copy(inputs)
    output = op_func(*args)
    # restore inputs to make sure that tensor names are not changed by op_func
    inputs = inputs_backup

    # ---- fill in attrs['dim'] from the registered set_dim map if missing ----
    if attrs is None or 'dim' not in attrs or not attrs['dim']:
        dim_info = ""
        if attrs is None:
            attrs = dict()
        if op_func.__name__ in ct_util.set_dim_func_map:
            value = ct_util.set_dim_func_map[op_func.__name__]
            if inspect.isfunction(value):
                dim_info = value(*args)
            elif isinstance(value, dict):
                key = [ft_util.convert_to_list(input_shapes), ft_util.convert_to_list(input_types)]
                if op_attrs is not None:
                    key.append(op_attrs)
                key = str(tuple(key))
                if key in value:
                    dim_info = ct_util.set_dims(value[key])
            else:
                raise RuntimeError("Registered set_dim_map is invalid. Must be a function or a dict!")
        if isinstance(dim_info, (list, tuple)):
            dim_info = dim_info[0]
        attrs['dim'] = dim_info

    # ---- interpret what op_func returned ----
    compute_func = None  # func which is defined in dsl for doing compute_inline or other
    sch_tmpl = None
    if isinstance(output, (list, tuple)):
        new_outputs = []
        for elem in output:
            if inspect.isfunction(elem):
                # schedule callback supplied by the dsl
                compute_func = elem
            elif isinstance(elem, dict):
                # extra attrs; values already set by the caller win
                for key, value in elem.items():
                    if key not in attrs or not attrs[key]:
                        attrs[key] = value
            elif isinstance(elem, (list, tuple)):
                new_outputs += elem
            else:
                new_outputs.append(elem)
        output = new_outputs
    elif isinstance(output, dict):
        # a dict is treated as a schedule template
        sch_tmpl = output
        output = sch_tmpl['output']

    binds = None if not attrs else attrs.pop(BINDS, None)

    # ---- build the argument list of the kernel: inputs, then outputs ----
    op_var = []
    for xx in inputs:
        if isinstance(xx, list):
            op_var.extend(xx)
        else:
            op_var.append(xx)

    # symbolic parameters without duplicates, attribute ones first
    shape_var = []
    for param in attrs_params:
        if param not in shape_var:
            shape_var.append(param)
    for param in shape_params:
        if param not in shape_var:
            shape_var.append(param)

    if isinstance(output, (list, tuple)):
        op_var = op_var + [i for i in output if TensorUtils.is_output_value(i)]
    elif TensorUtils.is_output_value(output):
        op_var = op_var + [output]

    # ---- schedule template path (cuda only) ----
    if sch_tmpl is not None:
        assert (sch_tmpl['target'] == 'cuda')
        kernel_name = kernel_name if kernel_name != "" else sch_tmpl['op_name']
        with akg.tvm.target.cuda() as target:
            s = sch_tmpl['schedule'](sch_tmpl['output'])
            with akg.tvm.build_config(dump_pass_ir=True):
                mod = akg.tvm.build(s, op_var, target, target_host='stackvm', name=kernel_name)
                dump_cuda_meta.dump(mod, kernel_name, s, op_var)
                return mod

    # ---- create the schedule ----
    if isinstance(output, (list, tuple)):
        tmp = []
        for x in output:
            if isinstance(x, tuple):
                tmp.append(x[0].op)
            else:
                tmp.append(x.op)
        s = akg.tvm.create_schedule(tmp)
    else:
        s = akg.tvm.create_schedule(output.op)
    if compute_func is not None:
        # a manual schedule was supplied, so auto-scheduling is switched off
        compute_func(s)
        polyhedral = False

    kernel_name = kernel_name if kernel_name != "" else op_func.__name__
    mode = get_runtime_mode()

    # ---- tuning / tiling-help path: lower only and return the spaces ----
    level = attrs.get("help_tiling")
    if tuning or (level is not None and level > help_tiling_level['None']):
        if op_func.__name__ in ct_util.set_dim_func_map:
            func_ = ct_util.set_dim_func_map[op_func.__name__]
            if inspect.isfunction(func_):
                set_dim_key = func_(*args)[1]
        elif op_func.__name__ in ct_util.gen_key_func_map:
            func_ = ct_util.gen_key_func_map[op_func.__name__]
            if inspect.isfunction(func_):
                set_dim_key = func_(*args)
        with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
            spaces = akg.lower(s, op_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, tuning=tuning)
            if set_dim_key == "":
                set_dim_key = str(args)
            return spaces, set_dim_key

    # ---- cpu path: build with llvm and dump the lowered ir ----
    if mode == "cpu":
        mod = akg.tvm.build(s, op_var, "llvm")
        os.makedirs("./cpu/ir/", exist_ok=True)
        # the dump file is created with mode 0o400
        with os.fdopen(os.open("./cpu/ir/" + kernel_name + ".cc", os.O_WRONLY | os.O_CREAT, 0o400), 'w') as irf:
            irf.write(akg.tvm.lower(s, op_var, shape_var, simple_mode=True))
        return mod

    # ---- default path: build for cce ----
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=dump_ir):
        mod = akg.build(s, op_var, "cce", shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, binds=binds)

    if mod is None:
        return None
    source_code = mod.imported_modules[0].get_source()
    if log_cce:
        logging.debug("#################cce code####################")
        logging.debug(source_code)
    if dump_cce:
        cce_path = "./"
        create_cce(kernel_name, cce_path, source_code)
    return mod