def _scalar_dequant_v100_v2(x_l0c, deq_ub, align_shape, x_shape,
                            relu_flag, sqrt_mode):
    """
    dequant for scale in v100
    """
    res = tvm.compute(
        align_shape,
        lambda i, j, k, l: (x_l0c(i, j, k, l).astype("float16") *
                            deq_ub(0, 0, 0, 0)),
        name='dequant_to_fp16')

    if sqrt_mode:
        res = tvm.compute(x_shape,
                          lambda i, j, k, l: (res(i, j, k, l) *
                                              deq_ub(0, 0, 0, 0)),
                          name='dequant_sqrt')

    if relu_flag:
        res = tvm.compute(x_shape,
                          lambda *indices: tvm.relu(res(*indices)),
                          name="dequant_relu")

    res = tvm.compute(x_shape,
                      lambda *indice: res(*indice),
                      name="res",
                      tag='dequant_res',
                      attrs={'sqrt_mode': sqrt_mode,
                             'relu_mode': relu_flag,
                             'is_scalar': 1})
    return res
def _transform(theta, input_dim, out_size, input_shape, dtype):
    num_batch = input_shape[0]
    height = input_shape[1]
    width = input_shape[2]
    num_channels = input_shape[3]

    theta = topi.reshape(theta, (num_batch, 2, 3))
    theta = topi.cast(theta, dtype)

    out_height = out_size[0]
    out_width = out_size[1]
    grid = _meshgrid(out_height, out_width)
    grid = topi.reshape(grid, (num_batch, 3, out_height * out_width))
    grid = topi.cast(grid, dtype=dtype)

    k = tvm.reduce_axis((0, 3), 'k')
    T_g = tvm.compute((num_batch, 2, out_height * out_width),
                      lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x],
                                              axis=k),
                      name='T_g')
    x_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, k: T_g[i, 0, k], name='x_s')
    y_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, k: T_g[i, 1, k], name='y_s')
    x_s_flat = topi.reshape(x_s, (num_batch * out_height * out_width,))
    y_s_flat = topi.reshape(y_s, (num_batch * out_height * out_width,))

    input_transformed = _interpolate(input_dim, input_shape, x_s_flat,
                                     y_s_flat, out_size, dtype)
    output = topi.reshape(input_transformed,
                          [num_batch, out_height, out_width, num_channels])
    return output
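# Illustrative reference only, not part of the operator code: _transform above
# is the grid-generator half of a spatial-transformer-style sampling step. It
# reshapes theta into a batch of 2x3 affine matrices, multiplies them with the
# homogeneous grid produced by _meshgrid, and passes the resulting x/y sample
# coordinates to _interpolate for bilinear sampling. Below is a minimal NumPy
# sketch of the batched matmul step only; the function name is hypothetical.
def _affine_grid_reference(theta, grid):
    """NumPy sketch: theta (N, 2, 3) x grid (N, 3, H*W) -> coords (N, 2, H*W)."""
    import numpy as np  # reference code only
    return np.einsum('byk,bkx->byx', np.asarray(theta), np.asarray(grid))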
def _dequant_v200_v2(x_l0c, deq_ub, align_shape, x_shape, relu_flag,
                     tensor_flag):
    """
    dequant for vector in v200
    """
    if tensor_flag:
        res_f16 = tvm.compute(
            align_shape,
            lambda i, j, k, l: tvm.vdeq_cast(x_l0c(i, j, k, l),
                                             deq_ub(0, j, 0, l),
                                             dtype="float16",
                                             do_relu=relu_flag),
            name='dequant_to_fp16',
            tag="dequant_vector")
    else:
        res_f16 = tvm.compute(
            align_shape,
            lambda i, j, k, l: tvm.deq_cast(x_l0c(i, j, k, l),
                                            deq_ub(0, 0, 0, 0),
                                            dtype="float16"),
            name='dequant_to_fp16',
            tag="dequant_scale")

    is_scalar = 1
    if tensor_flag:
        is_scalar = 0

    res = tvm.compute(x_shape,
                      lambda *indice: res_f16(*indice),
                      name='res',
                      tag="dequant_res",
                      attrs={'is_scalar': is_scalar})
    return res
def _scalar_dequant_v100(x, x_shape, align_shape, deq_scale, relu_flag,
                         sqrt_mode):
    """
    dequant for scale in v100
    """
    res_f16 = tvm.compute(
        align_shape,
        lambda i, j, k, l: (x(i, j, k, l).astype("float16") *
                            deq_scale(0, 0, 0, 0, 0)),
        name='dequant1',
        tag="dequant1_scale")

    res = tvm.compute(x_shape,
                      lambda *indice: res_f16(*indice),
                      name='dequant_remove_pad',
                      tag="dequant_remove_pad")

    if relu_flag:
        res = tvm.compute(x_shape,
                          lambda *indices: tvm.relu(res(*indices)),
                          name="dequant_relu",
                          tag="dequant_relu")
    if sqrt_mode:
        res = tvm.compute(
            x_shape,
            lambda i, j, k, l: (res(i, j, k, l) *
                                deq_scale(0, 0, 0, 0, 0)),
            name='dequant2',
            tag='dequant2_scale')
    return res
def _vector_depthwise_fused_v200(x, x_shape, align_shape, deq_scale,
                                 relu_flag):
    """
    depthwise dequant for vector in v200
    """
    res_f16 = tvm.compute(
        align_shape,
        lambda i, j, a, k, l: tvm.vdeq_cast(x(i, j // 2, j % 2, k, l),
                                            deq_scale(0, j, 0, 0, l),
                                            dtype="float16",
                                            do_relu=relu_flag),
        name='dequant1',
        tag="dequant1_vector",
        attrs={"relu_flag": relu_flag})

    align_shape[3] = x_shape[3].value

    res = tvm.compute(align_shape,
                      lambda *indice: res_f16(*indice),
                      name='dequant_remove_pad',
                      tag="dequant_remove_pad",
                      attrs={"sqrt_flag": 0})
    return res
def _unpack_compute_copy(input_place, y, num, axis, kernel_name="unpack"):
    """
    unpack a tensor into `num` tensors along the `axis` dimension.

    Parameters
    ----------
    input_place: TVM tensor
        the tensor of input.
    y: tuple or list
        the list of output tensors.
    num: int.
        the length of the dim axis.
    axis: int.
        the axis to unpack along.
    kernel_name: str.
        cce kernel name, default value is "unpack".

    Returns
    -------
    gm2ub_tensor_list: list
        the list of gm2ub tensors, tensor type is TVM tensor.
    ub2gm_tensor_list: list
        the list of ub2gm tensors, tensor type is TVM tensor.
    virtual_node:
        the tensor of the virtual output node, tensor type is TVM tensor.
    """
    input_shape = te.lang.cce.util.shape_to_list(input_place.shape)
    output_shape = input_shape
    for index, _ in enumerate(output_shape):
        output_shape[index] = output_shape[index] if index != axis else 1

    offset = 0
    gm2ub_tensor_list = []
    ub2gm_tensor_list = []
    for i in range(num):
        gm2ub_tensor = tvm.compute(
            output_shape,
            lambda *index: input_place(*_index_offset(
                output_shape, axis, offset, *index)),
            name=''.join(['tensor', str(i)]))
        gm2ub_tensor_list.append(gm2ub_tensor)

        ub2gm_tensor = tvm.compute(output_shape,
                                   lambda *index: gm2ub_tensor(*index),
                                   name=''.join(['res', str(i)]))
        ub2gm_tensor_list.append(ub2gm_tensor)
        offset = offset + output_shape[axis]

    # create a virtual node
    def _add_compute(*index):
        virtual_tensor = ub2gm_tensor_list[0](*index)
        for ub2gm_tensor in ub2gm_tensor_list[1:]:
            virtual_tensor += ub2gm_tensor(*index)
        return virtual_tensor

    virtual_node = tvm.compute(output_shape,
                               lambda *index: _add_compute(*index),
                               name="virtual_node")

    return gm2ub_tensor_list, ub2gm_tensor_list, virtual_node
def newton_iteration(shape, tensor_x_rec, tensor_x, symbol, iter_num):
    """
    the function of newton_iteration

    Parameters
    ----------
    shape: tensor shape
    tensor_x_rec: tensor
    tensor_x: tensor
    symbol: tensor symbol
    iter_num: number of Newton iterations

    Returns
    -------
    tensor_list: dict
    scope_list: dict
    emit_list: dict
    """
    dtype_c = tensor_x_rec.dtype
    num_two = tvm.const(2, dtype=dtype_c)
    neg_one = tvm.const(-1, dtype=dtype_c)
    tmp = tensor_x_rec

    tensor_list = {}
    scope_list = {}
    emit_list = {}
    tmp_mul = None
    tmp_neg = None
    tmp_add = None
    for index in range(0, iter_num):
        key = "tmp_mul_" + symbol + str(index)
        tmp_mul = tvm.compute(shape,
                              lambda *i: tensor_x(*i) * tmp(*i),
                              name=key)
        tensor_list[key] = tmp_mul
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_mul"

        key = "tmp_neg_" + symbol + str(index)
        tmp_neg = tvm.compute(shape,
                              lambda *i: tmp_mul(*i) * neg_one,
                              name=key)
        tensor_list[key] = tmp_neg
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_muls"

        key = "tmp_add_" + symbol + str(index)
        tmp_add = tvm.compute(shape,
                              lambda *i: tmp_neg(*i) + num_two,
                              name=key)
        tensor_list[key] = tmp_add
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_adds"

        key = "tmp_" + symbol + str(index)
        tmp = tvm.compute(shape,
                          lambda *i: tmp_add(*i) * tmp(*i),
                          name=key)
        tensor_list[key] = tmp
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_mul"

    return tensor_list, scope_list, emit_list
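# Illustrative reference only, not part of the operator code: each pass of the
# loop above emits one Newton-Raphson step for the reciprocal of tensor_x,
# tmp <- tmp * (2 - tensor_x * tmp), split into mul / muls / adds / mul stages
# so that each stage maps onto a single vector intrinsic. A minimal NumPy
# sketch of the same recurrence follows; the function name is hypothetical.
def _newton_reciprocal_reference(x, x_rec_init, iter_num):
    """NumPy sketch of the recurrence built by newton_iteration."""
    import numpy as np  # reference code only
    x = np.asarray(x, dtype=np.float32)
    tmp = np.asarray(x_rec_init, dtype=np.float32)  # initial estimate of 1 / x
    for _ in range(iter_num):
        tmp = (x * tmp * (-1.0) + 2.0) * tmp  # tmp * (2 - x * tmp)
    return tmp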
def _compute_update(logbase, sign_decay, sign_gm, grad):
    vmul_tmp = tvm.compute(sign_gm.shape,
                           lambda *indice: sign_gm(*indice) * sign_decay[0],
                           tag='elewise_single_VS_mul')
    vmul_tmp = tvm.compute(vmul_tmp.shape,
                           lambda *indice: vmul_tmp(*indice) * logbase[0],
                           tag='elewise_single_VS_mul')
    exp_tmp = te.lang.cce.vexp(vmul_tmp)
    update = te.lang.cce.vmul(exp_tmp, grad)

    return update
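# Illustrative reference only, not part of the operator code: _compute_update
# above evaluates update = exp(logbase * sign_decay * sign_gm) * grad with
# scalar logbase and sign_decay, which appears to correspond to a
# PowerSign-style update rule. A minimal NumPy sketch, with a hypothetical name:
def _sign_update_reference(logbase, sign_decay, sign_gm, grad):
    """NumPy sketch: exp(logbase * sign_decay * sign_gm) * grad."""
    import numpy as np  # reference code only
    return np.exp(logbase * sign_decay * np.asarray(sign_gm)) * np.asarray(grad)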
def apply_keras_momentum_d_compute(var, accum, lr, grad, momentum,
                                   out_var, out_accum, use_nesterov,
                                   kernel_name="apply_keras_momentum_d"):
    """
    the operator's compute

    :param var: weight, placeholder
    :param accum: accum, placeholder
    :param lr: learning rate, placeholder
    :param grad: gradient, placeholder
    :param momentum: nesterov momentum, placeholder
    :param out_var: updated var
    :param out_accum: updated accum
    :param use_nesterov: bool
    :return: out_var, out_accum
    """
    inp_dtype = var.dtype

    # check whether the platform supports the instruction
    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if inp_dtype == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, which is not supported on this platform")

    # update var and accum according to the momentum scheme
    # accum = accum * momentum - grad * lr
    accum_momen = tvm.compute(accum.shape,
                              lambda *indices: accum(*indices) * momentum[0],
                              tag='elewise_single_VS_mul')
    grad_lr = tvm.compute(grad.shape,
                          lambda *indices: grad(*indices) * lr[0],
                          tag='elewise_single_VS_mul')
    out_accum = te.lang.cce.vsub(accum_momen, grad_lr)

    # var = var + accum * momentum - grad * lr
    if use_nesterov is True:
        accum_momen2 = tvm.compute(
            accum.shape,
            lambda *indices: out_accum(*indices) * momentum[0],
            tag='elewise_single_VS_mul')
        add_var_am = te.lang.cce.vadd(var, accum_momen2)
        out_var = te.lang.cce.vsub(add_var_am, grad_lr)
    # var = var + accum
    else:
        out_var = te.lang.cce.vadd(var, out_accum)

    def _compute(*index):
        return out_var(*index), out_accum(*index)

    return tvm.compute(var.shape, _compute, name='outputs')
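# Illustrative reference only, not part of the operator code: a NumPy sketch of
# the Keras momentum update built above, assuming lr and momentum are scalars.
# With use_nesterov set, the variable is advanced by one extra momentum step,
# matching the accum_momen2 branch. The function name is hypothetical.
def _keras_momentum_reference(var, accum, lr, grad, momentum, use_nesterov):
    """NumPy sketch: returns (out_var, out_accum)."""
    import numpy as np  # reference code only
    out_accum = np.asarray(accum) * momentum - np.asarray(grad) * lr
    if use_nesterov:
        out_var = np.asarray(var) + out_accum * momentum - np.asarray(grad) * lr
    else:
        out_var = np.asarray(var) + out_accum
    return out_var, out_accum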
def avg_pool_compute1(x, y, ksize, strides, padding="VALID",
                      data_format="NHWC", is_fused_compute=True,
                      kernel_name="avg_pool"):
    """
    describe compute
    return: tensor
    """
    # create window and stride for pooling2d
    if data_format in ("NHWC",):
        window = [ksize[1], ksize[2]]
        stride = [strides[1], strides[2]]
    else:
        window = [ksize[2], ksize[3]]
        stride = [strides[2], strides[3]]

    window = list(window)
    stride = list(stride)

    # l1 fusion and l2 fusion
    l1_fusion_type = x.op.attrs["L1_fusion_type"].value \
        if "L1_fusion_type" in x.op.attrs else -1

    fusion_params = get_fusion_params(x, y, is_fused_compute)
    in_select_read_flag = fusion_params.get("in_select_read_flag")
    in_valid_shape = fusion_params.get("in_valid_shape")
    in_slice_offset = fusion_params.get("in_slice_offset")

    if in_select_read_flag:
        select_tensor_in = tvm.compute(
            in_valid_shape,
            lambda n, c1, h, w, c0: x(n, c1, h + in_slice_offset[2], w, c0),
            name="tensor_read_select",
            attrs=x.op.attrs)
        res = te.lang.cce.pooling2d(select_tensor_in, window, stride, "AVG",
                                    padding, fusion_params=fusion_params)
    elif l1_fusion_type == 1:
        x.op.attrs["addr_type"].value = 1
        in_l1_flag = True
        fusion_params["in_l1_flag"] = in_l1_flag

        l1_width_fusion_in = tvm.compute(
            x.shape,
            lambda n, c1, h, w, c0: x(n, c1, h, w, c0),
            name="l1_width_fusion_tensor_in",
            attrs=x.op.attrs)
        res = te.lang.cce.pooling2d(l1_width_fusion_in, window, stride,
                                    "AVG", padding,
                                    fusion_params=fusion_params)
    else:
        res = te.lang.cce.pooling2d(x, window, stride, "AVG", padding,
                                    fusion_params=fusion_params)

    return res
def _reform_by_vadds(input_tensor, input_shape, output_shape, offset_val,
                     nz_format_flag):
    """
    5 dim input tensor C0 change

    Parameters
    ----------
    input_tensor : input tensor
    input_shape : the shape of input tensor
    output_shape : the shape of output tensor
    offset_val : the val of offset
    nz_format_flag : the format of input tensor

    Returns
    -------
    res tensor
    """
    vadds_vector = tvm.compute(output_shape,
                               _reform_compute_generate(
                                   input_tensor, input_shape, output_shape,
                                   (True, offset_val, -1), nz_format_flag),
                               name='reform_by_vadds')

    return vadds_vector
def write_select_compute(input_tensor, output_x, kernel_name="write_select"):
    """
    calculating data

    Parameters
    ----------
    input_tensor : TVM tensor
        the input tensor
    output_x : dict
        dict of output_x, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "write_select"

    Returns
    -------
    output tensor
    """
    # input_shape = output_x.get("shape")
    input_shape = input_tensor.shape
    valid_shape = output_x.get("valid_shape")
    if len(valid_shape) != PARA_LIST_LEN:
        raise RuntimeError("the len of valid shape should be 5")

    _, _, h_valid, w_valid, c0_valid = valid_shape

    compute_name = "res_write_select" + "_" + str(NAME_INDEX[0])
    NAME_INDEX[0] += 1

    res = tvm.compute(input_shape, lambda *indice: input_tensor(*indice),
                      name=compute_name,
                      attrs={"HWC0": h_valid * w_valid * c0_valid},
                      tag=WRITE_SELECT_TAG)

    return res
def strided_write_compute(x, y, axis, stride, kernel_name='strided_write'):
    """
    write data to tensor by stride.

    Parameters:
    ----------
    x: placeholder of input tensor.
    y: dict of output tensor.
    axis: which axis to write data by stride.
    stride: data write stride.
    kernel_name: cce kernel name, default value is "strided_write".

    Returns:
    ----------
    output_y: result tensor.
    """
    shape_y = tuple(i.value for i in x.shape)
    output_y = tvm.compute(shape_y,
                           lambda *indice: x(*indice),
                           name="strided_write",
                           attrs={"stride": stride},
                           tag=STRIDED_WRITE_TAG)
    return output_y
def _format_transfer_nz(shape, x, c1_index):
    """
    C0 from 16 to 32 for FRACTAL_NZ
    """
    trans_shape = shape[:]
    trans_shape[c1_index] = trans_shape[c1_index] // 2
    trans_shape[-1] = trans_shape[-1] * 2
    res = tvm.compute(trans_shape,
                      _format_compute(x, trans_shape, c1_index),
                      name='data_transfer',
                      tag="requant_data_transfer")
    res = tvm.compute(trans_shape, lambda *i: res[i], name='res',
                      tag='requant_NZ')
    return res
def _muti_output(var_out, m_out, output_data, m_output_data, shape):
    """
    this compute is for multiple outputs

    Parameters:
    ----------
    var_out: the value of var_out
    m_out: the value of m_out
    output_data: the dict of output_data
    m_output_data: the dict of m_output_data
    shape: the shape of var

    Returns
    -------
    the new values of out_var and out_m
    the outputs
    """
    # this compute is for multiple outputs
    def _compute(*index):
        return var_out(*index), m_out(*index), output_data(
            *index), m_output_data(*index)

    out_var, out_m, out_data, m_out_data = tvm.compute(shape, _compute,
                                                       name="outputs")

    return out_var, out_m, out_data, m_out_data
def _reform_by_vmuls(input_tensor, input_shape, output_shape, scale_val,
                     nz_format_flag):
    """
    5 dim input tensor C0 change

    Parameters
    ----------
    input_tensor : input tensor
    input_shape : the shape of input tensor
    output_shape : the shape of output tensor
    scale_val : the val of scale
    nz_format_flag : the format of input tensor

    Returns
    -------
    res tensor
    """
    vmuls_vector = tvm.compute(output_shape,
                               _reform_compute_generate(
                                   input_tensor, input_shape, output_shape,
                                   (False, -1, scale_val), nz_format_flag),
                               name='reform_by_vmuls')

    return vmuls_vector
def strided_read_compute(x, y, axis, stride, kernel_name='strided_read'):
    """
    read data from tensor by stride.

    Parameters:
    ----------
    x: placeholder of input tensor.
    y: dict of output tensor.
    axis: which axis to read data by stride.
    stride: data read stride.
    kernel_name: cce kernel name, default value is "strided_read".

    Returns:
    ----------
    output_y: result tensor.
    """
    output_y = tvm.compute(
        y.get("shape"),
        lambda batch_idx, c1_idx, h_idx, w_idx, c0_idx:
        x[batch_idx, c1_idx, h_idx, w_idx, c0_idx],
        name="strided_read",
        tag=STRIDED_READ_TAG,
        attrs=x.op.attrs)
    return output_y
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal",
                 need_build=False, need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x
    shape_y : shape of input y
    dtype : source data type, support float16, float32, int32, int8, uint8, bool
    kernel_name : cce kernel name, default value is "cce_tf_equal"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i),
                      name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
def _vector_depthwise_fused_v100(x, x_shape, align_shape, deq_scale,
                                 relu_flag, sqrt_mode):
    """
    dequant for vector in v100
    """
    if relu_flag:
        res_f16 = tvm.compute(
            align_shape,
            lambda i, j, a, k, l: tvm.relu(
                x(i, j // 2, j % 2, k, l).astype("float16") *
                deq_scale(0, j, 0, 0, l)),
            name='dequant1',
            tag="dequant1_vector",
            attrs={"relu_flag": 1})
    else:
        res_f16 = tvm.compute(
            align_shape,
            lambda i, j, a, k, l: x(i, j // 2, j % 2, k, l).astype(
                "float16") * deq_scale(0, j, a, 0, l),
            name='dequant1',
            tag="dequant1_vector",
            attrs={"relu_flag": 0})

    align_shape[3] = x_shape[3].value

    if not sqrt_mode:
        res = tvm.compute(align_shape,
                          lambda *indice: res_f16(*indice),
                          name='dequant_remove_pad',
                          tag="dequant_remove_pad",
                          attrs={"sqrt_flag": 0})
    else:
        res_sqrt = tvm.compute(
            align_shape,
            lambda i, j, a, k, l: (res_f16(i, j, a, k, l) *
                                   deq_scale(0, j, a, 0, l)),
            name='dequant2',
            tag='dequant2_vector')

        res = tvm.compute(align_shape,
                          lambda *indice: res_sqrt(*indice),
                          name='dequant2_remove_pad',
                          tag="dequant2_remove_pad",
                          attrs={"sqrt_flag": 1})
    return res
def _input_compute_generate(x, in_shape, read_shape, c1_dim, c1_index):
    """
    generate lambda func
    """
    x_shape = te.lang.cce.util.shape_to_list(x.shape)
    dtype = x.dtype
    x_slice_offset = _get_input_attr(x, "slice_offset", [], True)
    l1_fusion_flag = _get_input_attr(x, "l1_fusion_flag", -1, False)
    if not x_slice_offset:
        x_slice_offset = [0, 0, 0, 0, 0]

    if l1_fusion_flag != -1:
        x_w = x_shape[3]
        n_offset, _, h_offset, w_offset, _ = x_slice_offset
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(
                in_shape,
                lambda n, c1, m, c0: x(n + n_offset, c1,
                                       (m // x_w) + h_offset,
                                       (m % x_w) + w_offset, c0),
                name="input_ub",
                attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda n, c1, m, c0: tvm.select(
                    c1 <= in_shape[c1_index] - 1,
                    x(n + n_offset, c1, (m // x_w) + h_offset,
                      (m % x_w) + w_offset, c0),
                    tvm.const(0, dtype=dtype)),
                name='input_ub',
                attrs={"c_out": c1_dim})
    else:
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(in_shape, lambda *i: x(*i),
                                   name="input_ub",
                                   attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda *indice: tvm.select(
                    indice[c1_index] <= in_shape[c1_index] - 1,
                    x(*indice),
                    tvm.const(0, dtype=dtype)),
                name='input_ub',
                attrs={"c_out": c1_dim})
    return input_ub
def _scalar_dequant_v200(x, x_shape, align_shape, deq_scale):
    """
    dequant for scale in v200
    """
    res_f16 = tvm.compute(
        align_shape,
        lambda i, j, k, l: tvm.deq_cast(x(i, j, k, l),
                                        deq_scale(0, 0, 0, 0, 0),
                                        dtype="float16"),
        name='dequant',
        tag="dequant_scale")

    res = tvm.compute(x_shape,
                      lambda *indice: res_f16(*indice),
                      name='dequant_remove_pad',
                      tag="dequant_remove_pad")
    return res
def _compute_m_t(m, beta, grad):
    beta_tmp = tvm.compute(m.shape,
                           lambda *indice: m(*indice) * beta[0],
                           tag='elewise_single_VS_mul')
    beta_na = tvm.compute(
        beta.shape,
        lambda *indice: beta(*indice) * tvm.const(CONST_ONE_NA, beta.dtype),
        tag='elewise_single_VS_mul')
    beta_na = tvm.compute(
        beta_na.shape,
        lambda *indice: beta_na(*indice) + tvm.const(CONST_ONE, beta_na.dtype),
        tag='elewise_single_VS_add')
    beta_sub_tmp = tvm.compute(grad.shape,
                               lambda *indice: grad(*indice) * beta_na[0],
                               tag='elewise_single_VS_mul')
    m_t = te.lang.cce.vadd(beta_tmp, beta_sub_tmp)
    return m_t
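# Illustrative reference only, not part of the operator code: assuming the
# constants CONST_ONE_NA and CONST_ONE (defined elsewhere in this module) are
# -1 and 1, _compute_m_t evaluates the running average
# m_t = beta * m + (1 - beta) * grad with a scalar beta. A NumPy sketch under
# that assumption, with a hypothetical name:
def _compute_m_t_reference(m, beta, grad):
    """NumPy sketch: m_t = beta * m + (1 - beta) * grad."""
    import numpy as np  # reference code only
    return beta * np.asarray(m) + (1.0 - beta) * np.asarray(grad)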
def _assign_sub_compute(data_var, data_value, out, kernel_name='assign_sub'):
    """
    assign_sub compute function

    Parameters
    ----------
    data_var : tvm.tensor
        tensor of var
    data_value : tvm.tensor
        tensor of value
    out : dict
        dict of out.
    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    sch : tvm.schedule
        the compute schedule
    res : tvm.tensor
        tensor of result
    """
    shape = data_var.shape
    shape = [i.value for i in shape]

    data_var_ub = tvm.compute(shape, lambda *i: data_var(*i),
                              name='data_var_ub')
    data_value_ub = tvm.compute(shape, lambda *i: data_value(*i),
                                name='data_value_ub')

    if data_var.dtype == "int8" or data_var.dtype == "uint8":
        data_var_cast = tvm.compute(
            shape, lambda *i: data_var_ub(*i).astype("float16"),
            name="data_var_cast")
        data_value_cast = tvm.compute(
            shape, lambda *i: data_value_ub(*i).astype("float16"),
            name="data_value_cast")
    else:
        data_var_cast = data_var_ub
        data_value_cast = data_value_ub

    res_ub = tvm.compute(shape,
                         lambda *i: data_var_cast(*i) - data_value_cast(*i),
                         name='res_ub.local.UB')

    if data_var.dtype == "int8" or data_var.dtype == "uint8":
        res_ub_cast = tvm.compute(shape,
                                  lambda *i: res_ub(*i).astype(data_var.dtype),
                                  name="res_ub_cast")
    else:
        res_ub_cast = res_ub

    res = tvm.compute(shape, lambda *i: res_ub_cast(*i), name='res')

    schedule_list = (data_var_ub, data_value_ub, data_var_cast,
                     data_value_cast, res_ub, res_ub_cast)
    sch = _assign_sub_schedule(schedule_list, res, shape,
                               data_var.dtype, data_var)

    return sch, res
def max_pool3d_compute(x, y, ksize, strides, padding="VALID",
                       data_format="NDHWC", kernel_name="max_pool3d"):
    """
    describe compute
    return: tensor
    """
    shape = x.shape

    # copy gm to ub
    tensor_in_ub = tvm.compute(shape, lambda *i: x[i], name="tensor_in_ub")

    # vmax in W
    shape_w = (shape[0], shape[1], shape[2], shape[3] // 2, shape[4])
    tensor_w = tvm.compute(
        shape_w,
        lambda n, d, h, w, c: tvm.max(tensor_in_ub[n, d, h, 2 * w, c],
                                      tensor_in_ub[n, d, h, 2 * w + 1, c]),
        name='tensor_w')

    # vmax in H
    shape_h = (shape[0], shape[1], shape[2] // 2, shape[3] // 2, shape[4])
    tensor_h = tvm.compute(
        shape_h,
        lambda n, d, h, w, c: tvm.max(tensor_w[n, d, 2 * h, w, c],
                                      tensor_w[n, d, 2 * h + 1, w, c]),
        name='tensor_h')

    # vmax in D
    shape_d = (shape[0], shape[1] // 2, shape[2] // 2, shape[3] // 2,
               shape[4])
    tensor_d = tvm.compute(
        shape_d,
        lambda n, d, h, w, c: tvm.max(tensor_h[n, 2 * d, h, w, c],
                                      tensor_h[n, 2 * d + 1, h, w, c]),
        name='tensor_d')

    # copy ub to gm
    res = tvm.compute(shape_d, lambda *i: tensor_d[i], name='res')

    return res
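# Illustrative reference only, not part of the operator code: the compute above
# reduces W, then H, then D pairwise, which corresponds to max pooling with a
# fixed 2x2x2 window and stride 2 on an NDHWC tensor whose D/H/W are even; the
# ksize, strides and padding arguments are not consulted in this body. A NumPy
# sketch of the same reduction, with a hypothetical name:
def _max_pool3d_2x2x2_reference(x):
    """NumPy sketch: pairwise max over W, then H, then D of an NDHWC tensor."""
    import numpy as np  # reference code only
    x = np.asarray(x)
    x = np.maximum(x[:, :, :, 0::2, :], x[:, :, :, 1::2, :])  # vmax in W
    x = np.maximum(x[:, :, 0::2, :, :], x[:, :, 1::2, :, :])  # vmax in H
    x = np.maximum(x[:, 0::2, :, :, :], x[:, 1::2, :, :, :])  # vmax in D
    return x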
def newton_iteration(shape, tensor_x_rec, tensor_x, symbol, tensor_list,
                     scope_list, operation_list):
    """
    the function of newton_iteration

    Parameters
    ----------
    shape : tensor shape
    tensor_x_rec : tensor
    tensor_x : tensor
    symbol : tensor symbol
    tensor_list : dict of generated tensors
    scope_list : dict of tensor scopes
    operation_list : dict of emit instructions

    Returns
    -------
    tensor_newton_mul2 : tensor holding the Newton step result
    """
    dtype_c = tensor_x_rec.dtype
    const_num_neg_two = tvm.const(-2, dtype=dtype_c)
    const_num_neg_one = tvm.const(-1, dtype=dtype_c)

    tensor_newton_mul0 = tvm.compute(
        shape,
        lambda *i: tensor_x_rec(*i) * tensor_x(*i),
        name="tensor_newton_mul0_" + symbol)
    tensor_list["tensor_newton_mul0_" + symbol] = tensor_newton_mul0
    scope_list["tensor_newton_mul0_" + symbol] = cce.scope_ubuf
    operation_list["tensor_newton_mul0_" + symbol] = "vector_mul"

    tensor_newton_add = tvm.compute(
        shape,
        lambda *i: tensor_newton_mul0(*i) + const_num_neg_two,
        name="tensor_newton_add_" + symbol)
    tensor_list["tensor_newton_add_" + symbol] = tensor_newton_add
    scope_list["tensor_newton_add_" + symbol] = cce.scope_ubuf
    operation_list["tensor_newton_add_" + symbol] = "vector_add"

    tensor_newton_mul1 = tvm.compute(
        shape,
        lambda *i: tensor_newton_add(*i) * tensor_x_rec(*i),
        name="tensor_newton_mul1_" + symbol)
    tensor_list["tensor_newton_mul1_" + symbol] = tensor_newton_mul1
    scope_list["tensor_newton_mul1_" + symbol] = cce.scope_ubuf
    operation_list["tensor_newton_mul1_" + symbol] = "vector_mul"

    tensor_newton_mul2 = tvm.compute(
        shape,
        lambda *i: tensor_newton_mul1(*i) * const_num_neg_one,
        name="tensor_newton_mul2_" + symbol)
    return tensor_newton_mul2
def im2col_fractal_v2(A_im2col_shape, A, config, compute_dtype):
    """
    calculate im2col_fractal tensor

    Parameters
    ----------
    A_im2col_shape : shape of A_im2col
    A : feature map
    config : the config of cube
    compute_dtype : dtype of compute result

    Returns
    -------
    A_im2col_fractal tensor
    """
    def _im2col_fractal_indices(indices, A):
        """
        calculate im2col_fractal tvm lambda function

        Parameters
        ----------
        indices : indices in lambda function
        A : feature map

        Returns
        -------
        im2col_fractal tvm lambda function
        """
        block_size = config['mac'][1]
        block_size_M = config['mac'][0]
        n, hw, c1, kernel_h, kernel_w, c0 = A.shape
        batch_size, i1, j1, i0, j0 = indices
        n_index = batch_size
        hw_index = i1 * block_size_M + i0
        c1_index = (((j1 * block_size + j0) // c0.value) //
                    kernel_w.value) // kernel_h.value
        kh_index = (((j1 * block_size + j0) // c0.value) //
                    kernel_w.value) % kernel_h.value
        kw_index = ((j1 * block_size + j0) // c0.value) % kernel_w.value
        c0_index = (j1 * block_size + j0) % c0.value
        dtype = compute_dtype
        return tvm.select(
            tvm.any(hw_index < 0, hw_index > hw.value - 1),
            tvm.const(0.0, dtype),
            A(n_index, hw_index, c1_index, kh_index, kw_index, c0_index))

    return tvm.compute(A_im2col_shape,
                       lambda *indices: _im2col_fractal_indices(indices, A),
                       name='im2col_fractal',
                       tag='im2col_fractal')
def custom_logical_not(shape, dtype, kernel_name="cce_tf_logical_not",
                       need_build=False, need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data
    dtype : the data type, support bool
    kernel_name : cce kernel name, default value is "cce_tf_logical_not"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():
        result = tvm.compute(
            shape,
            lambda *i: tvm.select(data[i] is True, False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data, result], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data, result], "cce", name=kernel_name)
def _compute_offset(in_tensor, in_shape, out_shape, attr_list,
                    nz_format_flag):
    """
    the compute of offset

    Parameters
    ----------
    in_tensor : input tensor
    in_shape : the shape of input tensor
    out_shape : the shape of output tensor
    attr_list : the attr list
    nz_format_flag : the format of input tensor

    Returns
    -------
    res tensor
    """
    offset = attr_list[0]
    reform_flag = attr_list[1]
    scale = attr_list[2]
    if offset != 0 or scale == 1:
        offset_value = tvm.const(offset, "float16")
        if reform_flag:
            offset_ub = _reform_by_vadds(in_tensor, in_shape, out_shape,
                                         offset_value, nz_format_flag)
        else:
            offset_ub = tvm.compute(
                out_shape,
                lambda *indice: in_tensor(*indice) + offset_value,
                name="offset_ub")
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(offset_ub(*indice), "int8"),
            name='cast_i8_ub')
    else:
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(in_tensor(*indice), "int8"),
            name='cast_i8_ub')
    return cast_i8_ub
def _s32_to_s8_normal_compute(x, req_scale, align_shape, c1_index,
                              tensor_flag, relu_flag):
    """
    generate s32_to_s8 compute
    """
    if tensor_flag:
        res_ub = tvm.compute(align_shape,
                             _deq_cast_compute(x, req_scale, align_shape,
                                               c1_index, tensor_flag,
                                               relu_flag),
                             name='s32_to_s8',
                             tag="requant_vector")
    else:
        res_ub = tvm.compute(align_shape,
                             _deq_cast_compute(x, req_scale, align_shape,
                                               c1_index, tensor_flag,
                                               relu_flag),
                             name='s32_to_s8',
                             tag="requant_scale")
    return res_ub
def apply_proximal_gradient_descent_compute(
        var, alpha, l1, l2, delta, out,
        kernel_name="apply_proximal_gradient_descent"):
    """
    the operator's compute
    prox_v = var - alpha * delta
    if l1 > 0:
        var = sign(prox_v) / (1 + alpha * l2) * max{|prox_v| - alpha * l1, 0}
    else:
        var = prox_v / (1 + alpha * l2)

    Parameters:
    ----------
    var: the dict of var, only support float16, float32
    alpha: the dict of alpha, only support float16, float32
    l1: the dict of l1, only support float16, float32
    l2: the dict of l2, only support float16, float32
    delta: the dict of delta, only support float16, float32
    out: the dict of output, only support float16, float32

    Returns
    -------
    the value of out_var, output_data
    """
    dtype = var.dtype
    if dtype == "float16":
        var = te.lang.cce.cast_to(var, "float32")
        alpha = te.lang.cce.cast_to(alpha, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        delta = te.lang.cce.cast_to(delta, "float32")

    alpha_broad = te.lang.cce.broadcast(alpha, var.shape)
    l1_broad = te.lang.cce.broadcast(l1, var.shape)
    l2_broad = te.lang.cce.broadcast(l2, var.shape)

    var_out = _compute_process(var, alpha_broad, l1_broad, l2_broad, delta)

    if dtype == "float16":
        var_out = te.lang.cce.cast_to(var_out, "float16")
    else:
        var_out = te.lang.cce.cast_to(var_out, "float32")

    # this compute is for multiple outputs
    def _compute(*index):
        return var_out(*index), var_out(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
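# Illustrative reference only, not part of the operator code: a NumPy sketch of
# the proximal gradient descent update documented above, assuming alpha, l1 and
# l2 are scalars broadcast against var. The function name is hypothetical.
def _proximal_gd_reference(var, alpha, l1, l2, delta):
    """NumPy sketch: soft-thresholded proximal update of var."""
    import numpy as np  # reference code only
    prox_v = np.asarray(var) - alpha * np.asarray(delta)
    if l1 > 0:
        return (np.sign(prox_v) / (1.0 + alpha * l2) *
                np.maximum(np.abs(prox_v) - alpha * l1, 0.0))
    return prox_v / (1.0 + alpha * l2)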