def gather(params_shape, indices_shape, params_dtype, indices_dtype, axis, kernel_name, cce_path="./"):
    """
    Build the gather kernel for the "cce" target and dump its generated source.

    Args:
        params_shape: shape of the params tensor; validated with ``length=2``.
        indices_shape: shape of the indices tensor; validated with ``length=1``.
        params_dtype: dtype of the params tensor, also used for the output.
        indices_dtype: dtype of the indices tensor; validated against
            ``DtypeForDavinci.INT32``.
        axis: gather axis; ``vc_util.check_equal`` is asked to compare it with 0.
        kernel_name: name of the built kernel and of the dumped source.
        cce_path: directory passed to ``utils.create_cce``.

    Returns:
        The module returned by ``akg.build``.
    """
    # Argument validation (same order as the individual checks are declared).
    vc_util.check_shape(params_shape, length=2)
    vc_util.check_shape(indices_shape, length=1)
    vc_util.ops_dtype_check(params_dtype, vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.ops_dtype_check(indices_dtype, vc_util.DtypeForDavinci.INT32)
    vc_util.check_equal("axis", "zero", axis, 0)

    def _emit_kernel(ins, outs):
        # extern callback: output buffer first, then the two input buffers
        return kernel_ir(outs[0], ins[0], ins[1])

    # One output row per index, each row as wide as a params row.
    out_shape = (indices_shape[0], params_shape[1])
    params = akg.tvm.placeholder(params_shape, dtype=params_dtype, name="X")
    indices = akg.tvm.placeholder(indices_shape, dtype=indices_dtype, name="Y")
    gathered = akg.tvm.extern(out_shape, [params, indices], _emit_kernel,
                              name="res", dtype=params_dtype)
    sch = akg.tvm.create_schedule(gathered.op)

    # Build for cce and write the generated source next to the kernel name.
    build_attrs = {"enable_multicore": False}
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [params, indices, gathered], "cce", name=kernel_name, attrs=build_attrs)
        utils.create_cce(kernel_name, cce_path, mod.imported_modules[0].get_source())
    return mod
def nms_run(shape_tensor, thres, dtype, kernel_name, attrs):
    """
    Build the nms operator and either hand it to the tuner or launch and check it.

    Args:
        shape_tensor: input shape; its first two entries are forwarded to ``gen_data``.
        thres: threshold, passed to the operator as its only op attribute.
        dtype: input data type.
        kernel_name: kernel name (replaced by ``attrs["kernel_name"]`` when
            ``"tuning"`` is a key of ``attrs``).
        attrs: build attributes; must provide ``keys()``.

    Returns:
        * ``"tuning"`` in attrs and truthy: ``(mod, expect, (anchor, output))``.
        * ``"tuning"`` in attrs and falsy: ``mod``.
        * otherwise: ``(anchor, output, expect, all_equal)`` where output and
          expect have been reinterpreted as uint16 before comparison.
    """
    op_attrs = [thres]

    if 'tuning' not in attrs.keys():
        mod = utils.op_build_test(nms.nms, [shape_tensor], [dtype], op_attrs=op_attrs,
                                  kernel_name=kernel_name, attrs=attrs)
        anchor, expect, output, out_shape = gen_data(shape_tensor[0], shape_tensor[1], thres, dtype)
        launched = utils.mod_launch(mod, (anchor, output), expect=expect)
        # Compare raw bit patterns: view both buffers as uint16.
        output = np.frombuffer(launched.tobytes(), np.uint16).reshape(out_shape)
        utils.create_cce(kernel_name, "./", mod.imported_modules[0].get_source())
        expect = np.frombuffer(expect.tobytes(), np.uint16).reshape(out_shape)
        return anchor, output, expect, np.all(output == expect)

    # Tuning request: the kernel name comes from attrs in this mode.
    tuning = attrs.get("tuning", False)
    kernel_name = attrs.get("kernel_name", False)
    mod = utils.op_build_test(nms.nms, [shape_tensor], [dtype], op_attrs=op_attrs,
                              kernel_name=kernel_name, attrs=attrs, tuning=tuning)
    if not tuning:
        return mod
    anchor, expect, output, out_shape = gen_data(shape_tensor[0], shape_tensor[1], thres, dtype)
    return mod, expect, (anchor, output)
def iou_for_train_run(shape_tensor, shape_tensor1, dtype, kernel_name, attrs):
    """
    Build the iou_for_train operator and either hand it to the tuner or launch and check it.

    Args:
        shape_tensor: shape of the first input (named ``anchor`` by ``gen_output_data``).
        shape_tensor1: shape of the second input (named ``ground_truth``).
        dtype: data type used for both inputs.
        kernel_name: kernel name (replaced by ``attrs["kernel_name"]`` when
            ``"tuning"`` is a key of ``attrs``).
        attrs: build attributes; must provide ``keys()``.

    Returns:
        * ``"tuning"`` in attrs and truthy: ``(mod, expect, (anchor, ground_truth, output))``.
        * ``"tuning"`` in attrs and falsy: ``mod``.
        * otherwise: ``((anchor, ground_truth), output, expect, compare_result)``.
    """
    op_shapes = [shape_tensor, shape_tensor1]
    op_dtypes = [dtype, dtype]

    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(IOU_for_train.iou_for_train, op_shapes, op_dtypes,
                                  kernel_name=kernel_name, attrs=attrs, tuning=t)
        if not t:
            return mod
        anchor, expect, ground_truth, output = gen_output_data(dtype, shape_tensor, shape_tensor1)
        return mod, expect, (anchor, ground_truth, output)

    mod = utils.op_build_test(IOU_for_train.iou_for_train, op_shapes, op_dtypes,
                              kernel_name=kernel_name, attrs=attrs)
    anchor, expect, ground_truth, output = gen_output_data(dtype, shape_tensor, shape_tensor1)
    output = utils.mod_launch(mod, (anchor, ground_truth, output), expect=expect)
    source_code = mod.imported_modules[0].get_source()
    utils.create_cce(kernel_name, "./", source_code)
    # Return the actual input tensors; the previous code returned the Python
    # builtin ``input`` function here, which carries no test data at all.
    return (anchor, ground_truth), output, expect, compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
def square_difference_run(shape1, shape2, dtype, kernel_name, attrs, cce_path="./"):
    """
    Build the square_difference operator and either hand it to the tuner or launch and check it.

    Args:
        shape1: shape of the first input.
        shape2: shape of the second input.
        dtype: data type used for both inputs.
        kernel_name: kernel name (replaced by ``attrs["kernel_name"]`` when
            ``"tuning"`` is a key of ``attrs``).
        attrs: build attributes; must provide ``keys()``.
        cce_path: directory passed to ``utils.create_cce`` on the launch path.

    Returns:
        * ``"tuning"`` in attrs and truthy: ``(mod, expect, (input1, input2, output))``.
        * ``"tuning"`` in attrs and falsy: ``mod``.
        * otherwise: ``((input1, input2), output, expect, compare_result)``.
    """
    if 'tuning' not in attrs.keys():
        mod = utils.op_build_test(square_difference.square_difference,
                                  input_shapes=[shape1, shape2], input_types=[dtype, dtype],
                                  kernel_name=kernel_name, attrs=attrs)
        expect, input1, input2, output = gen_data(dtype, shape1, shape2)
        # The generated source is dumped before the kernel is launched.
        utils.create_cce(kernel_name, cce_path, mod.imported_modules[0].get_source())
        output = utils.mod_launch(mod, (input1, input2, output), expect=expect)
        passed = compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
        return (input1, input2), output, expect, passed

    tuning = attrs.get("tuning", False)
    kernel_name = attrs.get("kernel_name", False)
    mod = utils.op_build_test(square_difference.square_difference,
                              input_shapes=[shape1, shape2], input_types=[dtype, dtype],
                              kernel_name=kernel_name, attrs=attrs, tuning=tuning)
    if not tuning:
        return mod
    expect, input1, input2, output = gen_data(dtype, shape1, shape2)
    return mod, expect, (input1, input2, output)
def add_a_conv(fmap_shape, filter_shape, pad_, stride_, dilation_,
               tile_hh=0, tile_coco=0, tile_mm=0, tile_kk=0, tile_nn=0, bypass_l1=False,
               use_bias=False, block_size=16, conv_dtype='float16'):
    """
    Build the add_a_conv kernel for the "cce" target and dump its generated source.

    All arguments are forwarded unchanged to ``add_a_conv_compute``, which
    supplies the output tensor, the input placeholders, the kernel name and the
    ``dim`` tiling string used for the build.

    Returns:
        The module returned by ``akg.build``.
    """
    conv, a_value, b_value, bias_value, kernel_name, dim_info = add_a_conv_compute(
        fmap_shape, filter_shape, pad_, stride_, dilation_,
        tile_hh, tile_coco, tile_mm, tile_kk, tile_nn,
        bypass_l1, use_bias, block_size, conv_dtype)

    sch = akg.tvm.create_schedule(conv.op)
    print(conv, a_value, b_value, bias_value)

    build_attrs = {
        "pragma_reschedule": True,
        "pragma_rmselfdep": False,
        'dim': dim_info,
    }
    # The bias placeholder is a build argument only when a bias is used.
    if use_bias:
        build_args = [a_value, b_value, bias_value, conv]
    else:
        build_args = [a_value, b_value, conv]

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, build_args, "cce", name=kernel_name, attrs=build_attrs, polyhedral=True)
        utils.create_cce(kernel_name, '.', mod.imported_modules[0].get_source())
    return mod
def vector_matmul_run(case_index, m, n, k, trans_a, trans_b, read_data, dump_data, dtype, kernel_name, attrs):
    """
    Build, launch and check the vector_matmul operator.

    Args:
        case_index: case identifier forwarded to ``vector_matmul_data``.
        m, n, k: matrix dimensions; used as given (no rounding is applied here).
        trans_a, trans_b: transpose flags forwarded to the operator and the data generator.
        read_data, dump_data: forwarded to ``vector_matmul_data``.
        dtype: data type of the operator and of the output buffer.
        kernel_name: name of the built kernel and of the dumped source.
        attrs: build attributes forwarded to the operator builder.

    Returns:
        ``((m_a, m_b), output, bench_mark, compare_result)``.
    """
    batch_tuple = (1,)

    mod, out_shape = vector_matmul.vector_matmul(m, n, k, trans_a, trans_b, dtype, kernel_name, attrs)
    utils.create_cce(kernel_name, "./", mod.imported_modules[0].get_source())

    # Generate data
    m_a, m_b, bench_mark = vector_matmul_data(case_index, m, n, k, trans_a, trans_b, read_data, dump_data, dtype)

    # Launch with a NaN-filled output buffer so untouched elements are detectable.
    output = np.full(out_shape, np.nan, dtype=dtype)
    # ``expect`` must be the reference result (as in the sibling *_run helpers);
    # the batch tuple was being passed here by mistake.
    output = utils.mod_launch(mod, (m_a, m_b, output), expect=bench_mark)

    # compare result
    compare_result = result_compare(output, bench_mark, batch_tuple, m, n, k, r_tol=1e-2)
    return (m_a, m_b), output, bench_mark, compare_result
def dropout_execute(shape_tensor, keep_prob, dtype, kernel_name, attrs=None):
    """
    Build the dropout operator and either hand it to the tuner or launch and check it.

    Args:
        shape_tensor: input shape.
        keep_prob: keep probability forwarded to the operator and the data generator.
        dtype: input data type.
        kernel_name: kernel name (replaced by ``attrs["kernel_name"]`` when
            ``"tuning"`` is a key of ``attrs``).
        attrs: optional build attributes. ``None`` is accepted and forwarded
            unchanged to ``dropout_compile``.

    Returns:
        * ``"tuning"`` in attrs and truthy: ``(mod, expect, (input, mask, output))``.
        * ``"tuning"`` in attrs and falsy: ``mod``.
        * otherwise: ``((input, mask), output, expect, compare_result)``.
    """
    # attrs defaults to None, so guard before looking for the tuning key
    # (the unguarded ``attrs.keys()`` raised AttributeError for the default).
    if attrs is not None and 'tuning' in attrs:
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = dropout_compile(shape_tensor, keep_prob, dtype, kernel_name, attrs, tuning=t)
        if not t:
            return mod
        expect, input_data, output, mask = gen_data(dtype, shape_tensor, keep_prob)
        return mod, expect, (input_data, mask, output)

    mod = dropout_compile(shape_tensor, keep_prob, dtype, kernel_name, attrs)
    expect, input_data, output, mask = gen_data(dtype, shape_tensor, keep_prob)
    output = utils.mod_launch(mod, (input_data, mask, output), expect=expect)
    source_code = mod.imported_modules[0].get_source()
    utils.create_cce(kernel_name, "./", source_code)
    rtol, atol = get_rtol_atol("dropout", dtype)
    return (input_data, mask), output, expect, compare_tensor(output, expect, rtol=rtol, atol=atol, equal_nan=True)
def conv_run_mansch(FMap_shape, Filter_shape, Pad, Stride, Dilation=None, use_bias=False, bypass_L1=False,
                    dump_data=False, Tile=None, attrs=None):
    """
    Build, launch and check the manually scheduled convolution kernel "conv_mansch".

    Args:
        FMap_shape, Filter_shape, Pad, Stride: forwarded to the kernel builder
            and to ``gen_data``.
        Dilation: forwarded to ``gen_data`` only.
        use_bias: when True a bias tensor is passed to the kernel.
        bypass_L1, dump_data: accepted but not used by this function.
        Tile: sequence whose first five entries are forwarded to the kernel
            builder as tiling factors; required despite the ``None`` default.
        attrs: optional dict; ``attrs["fp32mmad"]`` overrides ``fp32_mad`` (default True).

    Returns:
        ``((A, B), out_data, expect, assert_res)`` where ``assert_res`` is the
        verdict of ``result_compare``.

    Raises:
        TypeError: if ``Tile`` is None.

    Side effects:
        Writes ``actual.txt`` and ``expect.txt`` into the working directory when
        the comparison raises.
    """
    if Tile is None:
        # Same exception type as subscripting None, but with a usable message.
        raise TypeError("Tile must be a sequence of five tiling factors, got None")

    conv_dtype = 'float16'
    fp32_mad = True
    if attrs is not None and 'fp32mmad' in attrs:
        fp32_mad = attrs['fp32mmad']

    mod = conv_mansch.test_CCE_Conv(FMap_shape, Filter_shape, Pad, Stride,
                                    Tile[0], Tile[1], Tile[2], Tile[3], Tile[4],
                                    use_bias=use_bias, fp32_mad=fp32_mad, kernel_name="conv_mansch")
    source_code = mod.imported_modules[0].get_source()
    utils.create_cce("conv_mansch", ".", source_code)

    A, B, bias_data, expect = gen_data(FMap_shape, Filter_shape, Pad, Stride, Dilation, use_bias)
    # Merge axes 2 and 3 of the 5-D reference so it matches the 4-D kernel output.
    expect = expect.reshape((expect.shape[0], expect.shape[1], expect.shape[2] * expect.shape[3], expect.shape[4]))
    out_data = 60000.0 * np.ones(expect.shape).astype(conv_dtype)

    launch_args = [A.astype(conv_dtype), B.astype(conv_dtype)]
    if use_bias:
        launch_args.append(bias_data.astype(conv_dtype))
    launch_args.append(out_data.astype(conv_dtype))
    out_data = utils.mod_launch(mod, launch_args, expect=expect)

    np.set_printoptions(threshold=sys.maxsize)
    # Start pessimistic: if result_compare itself raises, the run must not be
    # reported as a pass.
    assert_res = False
    try:
        assert_res = result_compare(out_data, expect, r_tol=5e-3)
        # Element-wise check used only to trigger the diagnostic dump below;
        # the returned verdict stays the one from result_compare.
        np.testing.assert_allclose(out_data, expect, rtol=5e-02, atol=1e-2, equal_nan=True, verbose=True)
        print("conv_test_Succeed")
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        data_len = expect.size
        np.savetxt("actual.txt", out_data.reshape(data_len))
        np.savetxt("expect.txt", expect.reshape(data_len))
        print(str(e))
    return (A, B), out_data, expect, assert_res
def maxpool_ad_manual_schedule_no_overlap_all_max(shape, kernel, stride, pad, dtype, attrs=None, polyhedral=False):
    """
    Automatic differentiation of maxpool with a manual schedule for the no-overlap case.

    The gradient is overridden by a custom function that gives the head value
    to every input element equal to its window's forward output, and zero to
    the others.

    Args:
        shape: five-element input shape, unpacked as
            (batch_size, input_c1, input_h, input_w, input_c0).
        kernel: (kernel_h, kernel_w).
        stride: (stride_h, stride_w); must equal the kernel sizes.
        pad: four-element padding; only the first two entries are used,
            as pad_h and pad_w.
        dtype: data type of the placeholders.
        attrs: build attributes forwarded to ``akg.build``.
        polyhedral: forwarded to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        RuntimeError: from the custom differentiation function when stride and
            kernel sizes differ.
    """
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        if stride_w != kernel_w:
            raise RuntimeError(
                "Only supports kernels with same dimensions as stride size!")
        if stride_h != kernel_h:
            raise RuntimeError(
                "Only supports kernels with same dimensions as stride size!")

        # copy output to the shape of the padded input, copying the same value for the entire kernel size
        # (this stage used to be built twice; the first, unused copy was removed)
        out_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: out(b, c1, akg.tvm.floordiv(h, stride_h), akg.tvm.floordiv(w, stride_w), c0),
            name="out_broadcast")

        # copy head to the shape of the padded input, copying the same value for the entire kernel size
        head_broadcast = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: head_(b, c1, akg.tvm.floordiv(h, stride_h), akg.tvm.floordiv(w, stride_w), c0),
            name="head_broadcast")

        # check if value was a maximum and assign head of that position if it was
        # this is done for all the maximum values within one kernel
        result = akg.tvm.compute(
            in_data.shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                in_data(b, c1, h, w, c0) == out_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                head_broadcast(b, c1, h + pad_h, w + pad_w, c0),
                akg.tvm.const(0, dtype=in_data.dtype)),
            name="result")
        return [result]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)
    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(forward, [data], head, None, None,
                                   override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # recover the broadcast stages from the result's inputs
    # (order follows their first use in the Select: in_data, out_broadcast, head_broadcast)
    result = dl_ddata
    forward_broadcast = result.op.input_tensors[1]
    head_broadcast = result.op.input_tensors[2]

    # cache reads and writes
    result_ub = s.cache_write(result, "local.UB")
    data_ub = s.cache_read(data, "local.UB", [result_ub])
    head_ub = s.cache_read(head, "local.UB", [head_broadcast])
    forward_ub = s.cache_read(forward, "local.UB", [forward_broadcast])

    s[head_broadcast].set_scope("local.UB")
    s[forward_broadcast].set_scope("local.UB")

    s[head_ub].compute_at(s[head_broadcast], head_broadcast.op.axis[0])
    s[forward_ub].compute_at(s[forward_broadcast], forward_broadcast.op.axis[0])
    s[data_ub].compute_at(s[result_ub], result_ub.op.axis[0])
    s[forward_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])
    s[head_broadcast].compute_at(s[result_ub], result_ub.op.axis[0])

    _, c1, h, _, _ = result.op.axis

    # padded inputs larger than 32 in either dimension are split along h by 4;
    # smaller ones are computed once per c1 iteration
    if input_h + 2 * pad_h > 32 or input_w + 2 * pad_w > 32:
        h_outer, _ = s[result].split(h, 4)
        s[result_ub].compute_at(s[result], h_outer)
    else:
        s[result_ub].compute_at(s[result], c1)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata], "cce",
                        name="maxpool_ad_manual_schedule_no_overlap_all_max",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_no_overlap_all_max"
        utils.create_cce(kernel_name, './', source_code)
    return mod
def maxpool_ad_manual_schedule_all_max(shape, kernel, stride, pad, dtype, polyhedral=True, attrs=None):
    """
    Automatic differentiation of maxpool with a manual schedule, all maxima receive gradient.

    The gradient is overridden by a custom function: every input element that
    equals the forward output of a window it belongs to receives that window's
    head value, and the contributions of all windows are summed.

    Args:
        shape: five-element input shape, unpacked as
            (batch_size, input_c1, input_h, input_w, input_c0).
        kernel: (kernel_h, kernel_w).
        stride: (stride_h, stride_w).
        pad: four-element padding; only the first two entries are used,
            as pad_h and pad_w.
        dtype: data type of the placeholders and of the zero constants.
        polyhedral: forwarded to ``akg.build``.
        attrs: build attributes forwarded to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.
    """
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_h, pad_w, _, _ = pad
    batch_size, input_c1, input_h, input_w, input_c0 = shape
    pad_shape = (batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0)
    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1
    out_shape = (batch_size, input_c1, out_size_h, out_size_w, input_c0)

    def custom_maxpool_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        in_data = inputs[0]

        # layout with the in-window offsets (wh, ww) as the two leading axes
        data_separated_by_windows = (kernel_h, kernel_w, batch_size, input_c1, out_size_h, out_size_w, input_c0)

        # input surrounded by zeros: positions outside the original h/w range select 0.0
        pad_data = akg.tvm.compute(
            pad_shape,
            lambda b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.all(h >= pad_h, h < input_h + pad_h, w >= pad_w, w < input_w + pad_w),
                in_data(b, c1, h - pad_h, w - pad_w, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="pad_data")

        # padded input re-indexed per window: element (wh, ww) of window (oh, ow)
        data_reshaped = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: pad_data(
                b, c1, oh * stride_h + wh, ow * stride_w + ww, c0),
            name="data_reshaped")

        # forward output repeated for every in-window offset
        max_broadcast = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: out(b, c1, oh, ow, c0),
            name="max_broadcast")

        # head value where the window element equals the window output, 0.0 elsewhere
        equal = akg.tvm.compute(
            data_separated_by_windows,
            lambda wh, ww, b, c1, oh, ow, c0: akg.tvm.expr.Select(
                max_broadcast(wh, ww, b, c1, oh, ow, c0) == data_reshaped(
                    wh, ww, b, c1, oh, ow, c0),
                head_(b, c1, oh, ow, c0),
                akg.tvm.const(0.0, dtype=dtype)),
            name="equal")

        # per-window contribution laid out on the padded input grid:
        # zero outside window (oh, ow), the matching `equal` entry inside it
        data_reorg = akg.tvm.compute(
            (out_size_h, out_size_w, batch_size, input_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, input_c0),
            lambda oh, ow, b, c1, h, w, c0: akg.tvm.expr.Select(
                akg.tvm.any(h < oh * stride_h,
                            h > oh * stride_h + kernel_h - 1,
                            w < ow * stride_w,
                            w > ow * stride_w + kernel_w - 1),
                akg.tvm.const(0, dtype=dtype),
                equal(h - oh * stride_h, w - ow * stride_w, b, c1, oh, ow, c0)
            ),
            name="data_reorg")

        # sum the contributions of all windows (reduce over the oh and ow axes)
        result_pad = akg.topi.sum(data_reorg, [0, 1])

        # drop the padding border to get back to the input shape
        result = akg.tvm.compute(shape,
                                 lambda b, c1, h, w, c0: result_pad(
                                     b, c1, h + pad_h, w + pad_w, c0),
                                 name="result")
        return [result]

    # tensor for the input data
    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    # maxpool output
    forward = akg.tvm.placeholder(out_shape, name="forward", dtype=dtype)
    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(out_shape, name="head", dtype=dtype)

    # override differentiation computation with custom function
    [dl_ddata ] = akg.differentiate(forward, [data], head, None, None,
                                    override={forward: ([data], custom_maxpool_fdiff)})

    # schedule for differentiation operation
    s = akg.tvm.create_schedule([dl_ddata.op])

    # get computations: walk back from the result through the stages built in
    # custom_maxpool_fdiff (the indices rely on the order in which each stage
    # references its inputs)
    result = dl_ddata
    result_pad = result.op.input_tensors[0]
    data_reorg = result_pad.op.input_tensors[0]
    equal = data_reorg.op.input_tensors[0]
    max_broadcast = equal.op.input_tensors[0]
    data_reshaped = equal.op.input_tensors[1]
    pad_data = data_reshaped.op.input_tensors[0]

    # cache reads of the three placeholders and cache write of the result
    data_ub = s.cache_read(data, "local.UB", [pad_data])
    head_ub = s.cache_read(head, "local.UB", [equal])
    forward_ub = s.cache_read(forward, "local.UB", [max_broadcast])
    result_ub = s.cache_write(result, "local.UB")

    # every intermediate stage is placed in the "local.UB" scope
    s[max_broadcast].set_scope("local.UB")
    s[data_reshaped].set_scope("local.UB")
    s[pad_data].set_scope("local.UB")
    s[equal].set_scope("local.UB")
    s[data_reorg].set_scope("local.UB")
    s[result_pad].set_scope("local.UB")

    s[data_ub].compute_inline()
    s[result_ub].compute_inline()
    s[pad_data].compute_inline()

    # equal dependencies
    s[forward_ub].compute_at(s[equal], equal.op.axis[0])
    s[max_broadcast].compute_at(s[equal], equal.op.axis[0])
    s[data_reshaped].compute_at(s[equal], equal.op.axis[0])
    s[head_ub].compute_at(s[equal], equal.op.axis[0])

    s[equal].compute_at(s[result_pad], result_pad.op.axis[0])

    # result dependencies
    s[data_reorg].compute_inline()
    # put the two reduce axes outermost in result_pad
    b, c1, h, w, c0 = result_pad.op.axis
    oh, ow = result_pad.op.reduce_axis
    s[result_pad].reorder(oh, ow, b, c1, h, w, c0)
    # alternative placement kept for reference:
    # s[result_pad].compute_at(s[result], result.op.axis[1])

    # names are rebound to the axes of the final result; only h is used below
    b, c1, h, w, c0 = result.op.axis
    h_out, _ = s[result].split(h, stride_h)
    s[result_pad].compute_at(s[result], h_out)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, forward, dl_ddata], "cce",
                        name="maxpool_ad_manual_schedule_all_max", attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "maxpool_ad_manual_schedule_all_max"
        utils.create_cce(kernel_name, './', source_code)
    return mod
def reduce_min_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Automatic differentiation of reduce_min with a manual schedule.

    The gradient is overridden by a custom function that only handles 2D input
    reduced over the last axis: every element equal to its row minimum receives
    the head value of that row, all others receive zero (in float16).

    Args:
        input_shape: shape of the input placeholder.
        dtype: data type of the input placeholder.
        axis: forwarded to ``reduce_min.reduce_min`` for the forward op; the
            custom gradient itself always reduces over axis -1.
        keepdims: accepted but not used by this function.
        polyhedral: forwarded to ``akg.build``.
        attrs: build attributes; ``attrs["tile"]`` supplies one split factor per
            output axis and is required despite the ``None`` default.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        TypeError: if ``attrs`` is None.
        RuntimeError: from the custom differentiation function when the input is not 2D.
    """
    if attrs is None:
        # Same exception type as subscripting None, but raised early with a usable message.
        raise TypeError("attrs is None; attrs['tile'] is required by the manual schedule")

    is_fp16 = dtype == "float16"

    def get_shape(pld):
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # only works for last axis and 2D. Need to extend to multiple dimension and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        if len(shape) != 2:
            # previously this case fell through and returned None silently
            raise RuntimeError("reduce_min_ad manual schedule only supports 2D input")

        # add an extra stage to avoid alignment problem
        min_input = akg.tvm.compute(data.shape, lambda *i: data(*i), name="min_input")
        min_ = akg.lang.cce.reduce_min(min_input, axis=-1, keepdims=True)
        min_broadcast = akg.lang.cce.broadcast(min_, shape)
        if dtype != "float16":
            data = cast(data, "float16")
        return [
            akg.tvm.compute(shape,
                            lambda i, j: akg.tvm.expr.Select(
                                data[i, j] == min_broadcast[i, j],
                                grad[i],
                                akg.tvm.const(0, dtype="float16")),
                            name="reduce_min_ad2")
        ]

    L = reduce_min.reduce_min(data, axis)
    head = akg.tvm.placeholder(L.shape, name="head", dtype=L.dtype)
    head_cast = cast(head, "float16")

    [dL_ddata] = akg.differentiate(L, [data], head_cast, None, None,
                                   override={L: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dL_ddata.op])

    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if is_fp16:
        data_ub = s.cache_read(data, "local.UB", [dL_ddata])
        cast_stage = None
    else:
        # first input of the result is the float16 cast of the input data
        cast_stage = dL_ddata.op.input_tensors[0]
        data_ub = s.cache_read(data, "local.UB", [cast_stage])

    # Producer chain behind the second input of the result:
    # chain[0] is that input, chain[i + 1] is the first input of chain[i].
    # NOTE(review): presumably chain[0] is the min broadcast and chain[3] /
    # chain[4] are min_input and its source; the depth depends on the stages
    # emitted by akg.lang.cce — confirm before changing.
    chain = [dL_ddata.op.input_tensors[1]]
    for _ in range(4):
        chain.append(chain[-1].op.input_tensors[0])

    min_input_ub = s.cache_read(chain[4], "local.UB", [chain[3]])
    s[chain[3]].set_scope("local.UB")

    dL_ddata_ub = s.cache_write(dL_ddata, "local.UB")

    # tiling: split every output axis by its factor, keeping (outer, inner) pairs
    splits = [s[dL_ddata].split(dL_ddata.op.axis[index], factor)
              for index, factor in enumerate(attrs["tile"])]
    first_inner = splits[0][1]
    last_outer = splits[-1][0]

    s[data_ub].compute_at(s[dL_ddata], last_outer)
    if not is_fp16:
        s[cast_stage].compute_at(s[dL_ddata], last_outer)
        s[cast_stage].set_scope("local.UB")

    s[min_input_ub].compute_at(s[dL_ddata], first_inner)

    s[head_ub].compute_at(s[dL_ddata], last_outer)
    s[head_cast].compute_at(s[dL_ddata], last_outer)
    s[head_cast].set_scope("local.UB")

    s[chain[0]].compute_at(s[dL_ddata], last_outer)
    s[chain[0]].set_scope("local.UB")

    s[chain[1]].compute_at(s[dL_ddata], first_inner)
    s[chain[1]].set_scope("local.UB")
    s[chain[2]].compute_at(s[dL_ddata], first_inner)
    s[chain[2]].set_scope("local.UB")

    # L is not being used for computation, so it is not scheduled here.

    s[dL_ddata_ub].compute_at(s[dL_ddata], last_outer)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dL_ddata], "cce",
                        name="reduce_min_ad_manual_schedule", attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
def col2im_manual_schedule(shape, kernel, stride, pad, dtype, output_H_W, polyhedral=True, attrs=None):
    """
    Col2im operation with manual schedule.

    Args:
        shape (Union[list, tuple]): seven int numbers for the input's size,
            unpacked as (N, C1, KH, KW, OH, OW, C0).
        kernel (Union[list, tuple]): two int numbers for the sliding window's size.
        stride (Union[list, tuple]): two int numbers for the sliding window's stride.
        pad (Union[list, tuple]): four int numbers for padding's sizes: top, bottom, left, and right.
        dtype (str): parameters' type.
        output_H_W (Union[list, tuple]): two int numbers for the output's height and width.
        polyhedral (bool): forwarded to ``akg.build``, default value is True.
        attrs (dict): build attributes forwarded to ``akg.build``.

    Returns:
        The module built by ``akg.build`` for the col2im kernel.
    """
    batch, chan1, win_h, win_w, out_h, out_w, chan0 = shape
    img_h, img_w = output_H_W
    output_shape = (batch, chan1, img_h, img_w, chan0)
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    pad_t, pad_b, pad_l, pad_r = pad

    # The requested image size must agree with the window grid, stride and padding.
    expected_h = (out_h - 1) * stride_h + kernel_h - (pad_t + pad_b)
    assert img_h == expected_h, "Height of input and output do not match"
    expected_w = (out_w - 1) * stride_w + kernel_w - (pad_l + pad_r)
    assert img_w == expected_w, "Width of input and output do not match"

    col2im = intrin_col2im(shape, output_shape, kernel, stride, pad, dtype)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")

    # Element-wise stand-in computation; the cached write stage is tensorized
    # with the col2im intrinsic below.
    res = tvm.compute(
        output_shape,
        lambda b, c1, h, w, c0: data(b, c1, h % win_h, w % win_w, h % out_h, w % out_w, c0),
        name="col2im_intrinsic")

    # manual schedule: cache stages are attached at the c1 axis of the result
    sch = tvm.create_schedule([res.op])
    res_ub = sch.cache_write(res, "local.UB")
    data_ub = sch.cache_read(data, "local.UB", [res_ub])

    _, c1_axis, _, _, _ = res.op.axis
    sch[data_ub].compute_at(sch[res], c1_axis)
    sch[res_ub].compute_at(sch[res], c1_axis)
    sch[res_ub].tensorize(res_ub.op.axis[0], col2im)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [data, res], "cce", name="col2im_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        utils.create_cce("col2im_manual_schedule", "./", mod.imported_modules[0].get_source())
    return mod
def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Automatic differentiation of reduce_max with a manual schedule.

    The gradient is overridden by a custom function: every element equal to
    the maximum along ``axis`` receives the matching head value, all others
    receive zero. The selection is done in float16; inputs of another dtype
    are cast to float16 first and the result is cast back.

    Args:
        input_shape: shape of the input placeholder.
        dtype: data type of the input placeholder.
        axis: reduction axis.
        keepdims: forwarded to the forward op and used when indexing the head.
        polyhedral: forwarded to ``akg.build``.
        attrs: build attributes; ``attrs['tile']`` supplies one split factor per output axis.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        Exception: if ``attrs`` is None.
    """
    needs_cast = dtype != 'float16'

    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        data_ = inputs[0]
        shape = data_.shape
        # maximum along the reduction axis, then repeated back to the input shape
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)
        max_broadcast = akg.lang.cce.broadcast(max_, shape)
        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape,
            lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))
        # keep the adjoint where the input equals the maximum, zero everywhere else
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast data back to the original dtype
        if needs_cast:
            return [cast(max_values_and_zeros, dtype)]
        return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # forward op; only needed as the differentiation target, it is not scheduled
    l = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # the custom gradient works on float16 tensors
    if needs_cast:
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast, head_cast = data, head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        l, [data_cast], head_cast, None, None,
        override={l: ([data_cast], custom_reduce_max_fdiff)})

    # Recover the stages created inside the custom function. With a cast the
    # select stage sits one level below the result, otherwise it is the result.
    if needs_cast:
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        select_stage = max_values_and_zeros
    else:
        select_stage = dl_ddata
    max_broadcast = select_stage.op.input_tensors[1]
    max_ = max_broadcast.op.input_tensors[0]
    head_broadcast = select_stage.op.input_tensors[2]

    # schedule for the differentiation operation (inputs: data and head)
    sch = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if needs_cast:
        head_ub = sch.cache_read(head, "local.UB", [head_cast])
        data_ub = sch.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation: the placeholders are read directly by the gradient stages
        head_ub = sch.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = sch.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = sch.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    assert len(tiling_factors) == len(dl_ddata.shape)

    # split every axis of the final compute and keep the (outer, inner) pairs
    split_iterators = [sch[dl_ddata].split(dl_ddata.op.axis[index], factor)
                       for index, factor in enumerate(tiling_factors)]

    # everything is attached at the outer iterator of the first axis
    attach_at = split_iterators[0][0]
    out_stage = sch[dl_ddata]

    # cast stages only exist when the input is not float16
    if needs_cast:
        for cast_related in (data_cast, head_cast, max_values_and_zeros):
            sch[cast_related].compute_at(out_stage, attach_at)
            sch[cast_related].set_scope("local.UB")

    # move cache reads and writes
    sch[data_ub].compute_at(out_stage, attach_at)
    sch[head_ub].compute_at(out_stage, attach_at)
    sch[dl_ddata_ub].compute_at(out_stage, attach_at)

    # move computation of the differentiation
    for grad_stage in (max_, max_broadcast, head_broadcast):
        sch[grad_stage].compute_at(out_stage, attach_at)
        sch[grad_stage].set_scope("local.UB")

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [head, data, dl_ddata], "cce",
                        name="reduce_max_ad_manual_schedule", attrs=attrs, polyhedral=polyhedral)
        utils.create_cce("reduce_max_ad_manual_schedule", './', mod.imported_modules[0].get_source())
    return mod
def maxpool_manual_schedule(shape, kernel, stride, padding, dtype, attrs=None, polyhedral=False):
    """
    Maxpool with manual schedule.

    Args:
        shape: input shape, checked against the "NC1HWC0" format with 5 dimensions.
        kernel: (kernel_h, kernel_w).
        stride: (stride_h, stride_w).
        padding: either (pad_h, pad_w), or four entries of which the first and
            third are used as pad_h and pad_w.
        dtype: data type, checked against ``DtypeForDavinci.ALL_FLOAT``.
        attrs: build attributes; ``attrs['tile']`` supplies one split factor per
            output axis and is required despite the ``None`` default.
        polyhedral: forwarded to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        RuntimeError: if padding has neither 2 nor 4 entries, or if the number
            of tiling factors differs from the output rank.
        Exception: if ``attrs`` is None.
    """
    kernel_name = "maxpool_manual_schedule"

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    maxpool_param_check(kernel, stride, padding)

    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    batch_size, in_c1, input_h, input_w, in_c0 = data.shape

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(padding) == 2:
        pad_h, pad_w = padding
    elif len(padding) == 4:
        pad_h, pad_w = padding[0], padding[2]
    else:
        # previously fell through and failed later with an unbound pad_h/pad_w
        raise RuntimeError("padding must have 2 or 4 elements")
    has_padding = pad_h != 0 or pad_w != 0

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    # padding operation: zeros outside the original h/w range
    if has_padding:
        pad_shape = (batch_size, in_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, in_c0)
        padded_input = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(
                    h > input_h + pad_h - 1,
                    h < pad_h,
                    w > input_w + pad_w - 1,
                    w < pad_w,
                ),
                akg.tvm.const(0.0, dtype=dtype),
                data[n, c1, h - pad_h, w - pad_w, c0],
            ),
            name="padded_input")
    else:
        padded_input = data

    # reduce iterators
    it_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="iterator_reduction_height")
    it_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="iterator_reduction_width")

    out_shape = (batch_size, in_c1, out_size_h, out_size_w, in_c0)
    res = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: akg.tvm.max(
            padded_input[n, c1, (h * stride_h + it_kernel_h), (w * stride_w + it_kernel_w), c0],
            axis=[it_kernel_h, it_kernel_w]),
        name="maxpool_not_hybrid")

    s = akg.tvm.create_schedule([res.op])

    # cache reads and writes
    # after this cache write: reference to res_ub to change the reduction axis
    res_ub = s.cache_write(res, "local.UB")
    if has_padding:
        # the padding stage is the only input of the pooling stage
        padded_stage = res.op.input_tensors[0]
        data_ub = s.cache_read(data, "local.UB", [padded_stage])
    else:
        padded_stage = None
        data_ub = s.cache_read(data, "local.UB", [res_ub])

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    if len(tiling_factors) != len(res.shape):
        raise RuntimeError("tiling factors mismatch out shape")

    # split the cached compute and save the (outer, inner) iterators per axis
    split_iterators = [s[res_ub].split(res_ub.op.axis[index], factor)
                       for index, factor in enumerate(tiling_factors)]
    iterator_b_outer, iterator_b_inner = split_iterators[0]
    iterator_c1_outer, iterator_c1_inner = split_iterators[1]
    iterator_h_outer, iterator_h_inner = split_iterators[2]
    iterator_w_outer, iterator_w_inner = split_iterators[3]
    iterator_c0_outer, iterator_c0_inner = split_iterators[4]

    # reduction axis
    iterator_reduce_h = res_ub.op.reduce_axis[0]
    iterator_reduce_w = res_ub.op.reduce_axis[1]

    # move caches
    s[res_ub].compute_at(s[res], res.op.axis[0])
    s[data_ub].compute_at(s[res_ub], iterator_c1_outer)

    if has_padding:
        s[padded_stage].compute_at(s[res_ub], iterator_c1_outer)
        s[padded_stage].set_scope("local.UB")

    # reorder computation: the window reduction runs just outside the c0 axis
    s[res_ub].reorder(iterator_b_outer, iterator_b_inner,
                      iterator_c1_outer, iterator_c1_inner,
                      iterator_h_outer, iterator_h_inner,
                      iterator_w_outer, iterator_w_inner,
                      iterator_reduce_h, iterator_reduce_w,
                      iterator_c0_outer, iterator_c0_inner)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res], "cce", name=kernel_name,
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        # dump under the kernel's own name (was the copy-pasted
        # "maxpool_ad_manual_schedule", which belongs to a different kernel)
        utils.create_cce(kernel_name, './', source_code)
    return mod
def im2col_manual_schedule(shape, kernel, stride, pad, dtype, polyhedral=True, attrs=None):
    '''
    Compute im2col by calling the cce load3d intrinsic directly.

    Args:
        shape: shape of the data, unpacked as (b, c1, h, w, c0)
        kernel: kernel sizes for im2col, (kernel_h, kernel_w)
        stride: stride sizes for im2col, (stride_h, stride_w)
        pad: padding sizes for im2col: top, bottom, left, and right
        dtype: type of the data
        polyhedral: forwarded to akg.build
        attrs: build attributes forwarded to akg.build

    Return:
        The module returned by akg.build for the im2col kernel.
    '''
    load3d = intrin_load3d(dtype)

    batch, chan1, height, width, chan0 = shape
    stride_h, stride_w = stride
    kernel_h, kernel_w = kernel
    pad_t, pad_b, pad_l, pad_r = pad

    # fixed arguments of the load3d call
    dilation_w = 1
    dilation_h = 1
    jump_offset = 1
    repeat_mode = 0
    repeat_time = 1
    csize = 0
    block_size = 16

    # output size <=> number of windows
    ho = (height + pad_b + pad_t - kernel_h) // stride_h + 1
    wo = (width + pad_r + pad_l - kernel_w) // stride_w + 1

    # windows are grouped into blocks of block_size (count rounded up)
    window_blocks = (ho * wo + block_size - 1) // block_size
    im2col_shape = (batch, window_blocks, chan1 * kernel_h * kernel_w, block_size, chan0)

    def _load_window_block(i, j, k, data):
        # top-left corner (in unpadded coordinates) of the first window of block j
        j_h = (((j * block_size) // wo) * stride_h) - pad_t
        j_w = (((j * block_size) % wo) * stride_w) - pad_l

        # num rows in l1 for fmatrix is discounted by the amount of bottom padding
        h_3d = kernel_h - tvm.max(((j_h + kernel_h) - height), 0)
        pad_t_3d = tvm.max(-j_h, 0)
        pad_b_3d = tvm.max(((j_h + kernel_h) - height), 0)

        # position inside the kernel and channel-group index encoded in k
        w_idx_kernel = (k % kernel_w)
        h_idx_kernel = ((k // kernel_w) % kernel_h)
        c1_idx = (k // kernel_w) // kernel_h

        # when this is < 0, the slice will start from row 0 so there is no
        # redundancy between base address and this param
        h_idx = tvm.min(j_h, 0)

        # rows covered by the window, full width and c0 (assume padding < kernel size)
        load3d_input = data[i,
                            c1_idx,
                            tvm.max(0, j_h):tvm.min(height, j_h + kernel_h),
                            0:width,
                            0:chan0]

        return load3d(load3d_input,
                      width, h_3d,
                      pad_l, pad_r, pad_t_3d, pad_b_3d,
                      w_idx_kernel, h_idx_kernel,
                      j_w, h_idx, 0,
                      stride_w, stride_h,
                      kernel_w, kernel_h,
                      dilation_w, dilation_h,
                      jump_offset, repeat_mode, repeat_time, csize)

    # tensor for the input data
    data = tvm.placeholder(shape, dtype, name="input_data")

    # the lambda covers the three leading axes of im2col_shape
    res = tvm.compute(im2col_shape,
                      lambda i, j, k: _load_window_block(i, j, k, data),
                      name='im2col_fractal')

    # manual schedule: input cached in "local.L1", output cached in "local.UB"
    sch = tvm.create_schedule([res.op])
    data_ub = sch.cache_read(data, "local.L1", [res])
    res_ub = sch.cache_write(res, "local.UB")
    sch[data_ub].compute_at(sch[res], res.op.axis[0])
    sch[res_ub].compute_at(sch[res], res.op.axis[2])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(sch, [data, res], "cce", name="im2col_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        utils.create_cce("im2col_manual_schedule", './', mod.imported_modules[0].get_source())
    return mod
def matmul_run_mansch(MatrixShape, l1_tiling, l0_tiling, kernel_name, attrs=None):
    """
    Build, launch and check the manually scheduled matmul kernel.

    Args:
        MatrixShape: sequence whose first three entries are read as (m, k, n).
        l1_tiling, l0_tiling: tiling parameters forwarded to ``matmul_mansch.gemm_dsl``.
        kernel_name: name of the built kernel and of the dumped source.
        attrs: accepted but not used by this function.

    Returns:
        ``((A, B), out_data, C, assert_res)`` where A and B are the inputs
        reshaped to (m, k) and (k, n), C is the numpy reference and
        ``assert_res`` tells whether the kernel result matched it.
        NOTE(review): ``out_data`` is the buffer handed to the launcher, not the
        value it returned — confirm this is what callers expect.
    """
    mShape = MatrixShape[0]
    kShape = MatrixShape[1]
    nShape = MatrixShape[2]

    mBurstSize = cce.BLOCK_IN
    kBurstSize = cce.BLOCK_REDUCE
    nBurstSize = cce.BLOCK_OUT

    res_dtype = "float%d" % cce.OUT_WIDTH
    A_dtype = "float%d" % cce.INP_WIDTH
    B_dtype = "float%d" % cce.WGT_WIDTH

    # compute matrix shape as cube
    AShape = (mShape // mBurstSize, kShape // kBurstSize, mBurstSize, kBurstSize)
    BShape = (kShape // kBurstSize, nShape // nBurstSize, nBurstSize, kBurstSize)
    CShape = (nShape // nBurstSize, mShape // mBurstSize, mBurstSize, nBurstSize)

    # generate data
    A = random_gaussian(AShape, miu=0.5, sigma=0.01).astype(A_dtype)
    B = random_gaussian(BShape, miu=0.5, sigma=0.01).astype(B_dtype)
    out_data = np.zeros(CShape).astype(res_dtype)

    # launch the kernel
    mod = matmul_mansch.gemm_dsl(MatrixShape, l1_tiling, l0_tiling, kernel_name)
    source_code = mod.imported_modules[0].get_source()
    utils.create_cce(kernel_name, ".", source_code)
    res = utils.mod_launch(mod, [A, B, out_data])

    # transform the blocked numpy data to plain (m, k) / (k, n) matrices for the benchmark
    A = A.swapaxes(1, 2)
    B = B.swapaxes(1, 3)
    B = B.swapaxes(2, 3)
    A = A.reshape((mShape, kShape))
    B = B.reshape((kShape, nShape))
    C = np.zeros((mShape, nShape)).astype(np.float16)
    np.matmul(A, B, out=C)

    # transform CCE output to (m, n) form
    res = res.swapaxes(0, 2)
    res = res.swapaxes(0, 1)
    res = res.reshape((mShape, nShape))

    # compare with numpy; only a comparison failure counts as a mismatch, so
    # Ctrl-C and unrelated errors are no longer swallowed by a bare except
    assert_res = True
    try:
        np.testing.assert_allclose(res, C, rtol=1e-2, equal_nan=True, verbose=True)
    except AssertionError:
        assert_res = False
    return (A, B), out_data, C, assert_res