def _tan_2x_multi(input_x, times):
    """Calculate tan x by computing tan(x/2^times) and applying the double angle formula `times` times."""
    # calculate tan(x/2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0 / (2.0 ** times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0 / (2.0 ** times))
        res = _tan_expand(input_x_divide)

    while times != 0:
        # using double angle formula: tan 2x = 2*tan x/(1 - tan x*tan x)
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)),
                                       tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1
    return res

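# Illustrative standalone sketch (NumPy, not the kernel) of the same scheme:
# seed with a short Taylor expansion of tan on the shrunken argument, then apply
# tan 2x = 2*tan x / (1 - tan x * tan x) repeatedly. `times=6` is an assumed
# stand-in for TAN_2X_TIMES.
import numpy as np

def tan_by_doubling(x, times=6):
    t = x / 2.0 ** times
    t = t + t ** 3 / 3.0 + 2.0 * t ** 5 / 15.0  # Taylor seed, accurate for small t
    for _ in range(times):
        t = 2.0 * t / (1.0 - t * t)
    return t

x = np.linspace(-1.0, 1.0, 5)
print(np.max(np.abs(tan_by_doubling(x) - np.tan(x))))  # very small on this range
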
def matrix_set_diag_compute(input_matrix, input_diagonal, input_help):
    """matrix_set_diag compute implementation"""
    shape_input = get_shape(input_matrix)
    input_dtype = input_matrix.dtype

    if input_dtype == "int8" or input_dtype == "uint8":
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if input_dtype == "int32" and product_is_mini():
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if input_dtype == "int32" and not product_is_mini():
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")

    diag_tmp = topi.broadcast_to(input_diagonal, shape_input)
    help_tmp = topi.add(input_help, -1)
    help_y = topi.abs(help_tmp)

    # input_help is 1 on the diagonal and 0 elsewhere, so help_y keeps the
    # off-diagonal entries of input_matrix while input_help selects the new diagonal
    res_vmul_x = topi.multiply(input_matrix, help_y)
    res_vmul_y = topi.multiply(diag_tmp, input_help)
    res = topi.add(res_vmul_x, res_vmul_y)

    if input_dtype == "int32" and product_is_mini():
        res = topi.cast(res, "float16")

    res = topi.cast(res, input_dtype)
    return res

def less(data1, data2): """ compute tensor with smaller value in data1 and data2 elementwisely. Args: data1 (tvm.tensor.Tensor): Tensor of type float16, float32 and int32. data2 (tvm.tensor.Tensor): Tensor of type float16, float32 and int32. Returns: tvm.tensor.Tensor. If data1 less than data2, return True, else return False. """ vc_util.check_shape(data1.shape) vc_util.check_shape(data2.shape) # check types vc_util.elemwise_dtype_check( data1.dtype, data2.dtype, [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32]) # check runtime mode, and change dtype if utils.product_is_mini() and data1.dtype != "float16": data1 = akg.topi.cast(data1, "float16") data2 = akg.topi.cast(data2, "float16") if (not utils.product_is_mini()) and data1.dtype == "int32": data1 = akg.topi.cast(data1, "float32") data2 = akg.topi.cast(data2, "float32") res = akg.topi.less(data1, data2) return res
def matrix_diag_part_compute(input_diagonal, input_help):
    """matrix_diag_part compute implementation"""
    shape_input_diagonal = get_shape(input_diagonal)
    dtype_input_diagonal = input_diagonal.dtype

    if dtype_input_diagonal == "int8" or dtype_input_diagonal == "uint8":
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if dtype_input_diagonal == "int32" and product_is_mini():
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if dtype_input_diagonal == "int32" and not product_is_mini():
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")

    # input_help masks the diagonal, so summing over the shorter of the last
    # two axes leaves only the diagonal entries
    res_vmul = topi.multiply(input_help, input_diagonal)

    if shape_input_diagonal[-2] < shape_input_diagonal[-1]:
        res = topi.sum(res_vmul, -1)
    else:
        res = topi.sum(res_vmul, -2)

    if dtype_input_diagonal == "int32" and product_is_mini():
        res = topi.cast(res, "float16")

    res = topi.cast(res, dtype_input_diagonal)
    return res

def kldiv_loss_grad(pre_deriv, inputs, outputs):
    """
    Do backprop for kldiv loss.

    Args:
        pre_deriv (tvm.tensor.Tensor): Gradient tensor for forward output.
        inputs (tvm.tensor.Tensor): Forward input tensor.
        outputs (tvm.tensor.Tensor): Forward output tensor.

    Returns:
        Gradient tensor for forward input.
    """
    inputs_dtype = inputs.dtype
    target_dtype = outputs.dtype
    pre_deriv_dtype = pre_deriv.dtype
    utils.ops_dtype_check([inputs_dtype, target_dtype, pre_deriv_dtype],
                          utils.DtypeForDavinci.ALL_FLOAT)

    if get_const_tuple(outputs.shape) != get_const_tuple(inputs.shape):
        raise RuntimeError(
            "Please ensure inputs have the same size.", outputs.shape, inputs.shape)

    inputs_dtype_old = inputs_dtype

    if product_is_mini() and inputs_dtype == 'float32':
        inputs = akg.topi.cast(inputs, "float16")
        outputs = akg.topi.cast(outputs, "float16")
        inputs_dtype = "float16"

    cur_deriv = akg.topi.divide(outputs, inputs)
    cur_deriv = akg.topi.multiply(cur_deriv, pre_deriv)
    if product_is_mini() and inputs_dtype_old == 'float32':
        cur_deriv = akg.topi.cast(cur_deriv, inputs_dtype_old)
    return cur_deriv

def exp(in_data):
    """
    Compute the exponential of in_data element-wise, :math:`e^x`.

    Args:
        in_data (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor of same type and shape as in_data.

    Raises:
        ValueError: If the type of input is invalid.
    """
    dtype = in_data.dtype
    vc_util.check_shape(in_data.shape)

    if dtype == "float32" and utils.product_is_mini():
        in_data = akg.tvm.compute(in_data.shape,
                                  lambda *indice: in_data(*indice).astype("float16"),
                                  name='type_cast')

    output = akg.tvm.compute(in_data.shape,
                             lambda *index: akg.tvm.exp(in_data(*index)),
                             name='exp')

    if dtype == "float32" and utils.product_is_mini():
        output = akg.tvm.compute(in_data.shape,
                                 lambda *indice: output(*indice).astype("float32"),
                                 name='res')
    return output

def tan_compute(input_x):
    """tan compute implementation"""
    dtype = input_x.dtype

    # cast to float32 when dtype is float16 (cloud and mini) or int32 (cloud)
    if dtype == FLOAT_16 or dtype == FLOAT_32 or (dtype == INT_32 and not product_is_mini()):
        input_x = topi.cast(input_x, FLOAT_32)
        # adjust x to [-pi/2, pi/2] using x = x - round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_32)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_32)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_32)))
    # cast to float16 when dtype is int32 on mini
    elif dtype == INT_32 and product_is_mini():
        input_x = topi.cast(input_x, FLOAT_16)
        # adjust x to [-pi/2, pi/2] using x = x - round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_16)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_16)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_16)))

    res = _tan_2x_multi(input_x, TAN_2X_TIMES)
    # cast back to the original dtype
    res = topi.cast(res, dtype)
    return res

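# Standalone NumPy illustration of the range reduction above: since tan has
# period pi, x - round(x/pi)*pi maps any argument into [-pi/2, pi/2] without
# changing the value of tan(x).
import numpy as np

x = np.array([3.0, -7.5, 100.0])
x_reduced = x - np.round(x / np.pi) * np.pi
print(np.allclose(np.tan(x), np.tan(x_reduced)))  # True
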
def less(data1, data2, target=utils.CCE):
    """
    Compare data1 and data2 elementwise, returning True where data1 is smaller.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32 or int32.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32 or int32.

    Returns:
        tvm.tensor.Tensor. True where data1 is less than data2, False elsewhere.

    Supported Platforms:
        'Ascend', 'GPU', 'CPU'
    """
    utils.check_supported_target(target)
    utils.check_shape(data1.shape)
    utils.check_shape(data2.shape)

    if target == utils.CCE:
        # check types
        utils.elemwise_dtype_check(
            data1.dtype, data2.dtype,
            [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32])

        # check runtime mode, and change dtype
        if product_is_mini() and data1.dtype != "float16":
            data1 = akg.topi.cast(data1, "float16")
            data2 = akg.topi.cast(data2, "float16")
        if (not product_is_mini()) and data1.dtype == "int32":
            data1 = akg.topi.cast(data1, "float32")
            data2 = akg.topi.cast(data2, "float32")

    res = akg.topi.less(data1, data2)
    return res

def tanh_ad(head, in_data):
    """
    Compute gradient of tanh operator using automatic differentiation.

    Args:
        head (tvm.tensor.Tensor): Tensor of type float16, float32.
        in_data (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor with the same shape as the input.
    """
    in_dtype = in_data.dtype

    # On cloud environments, casting from 'float16' to 'float32' and casting the
    # result back to 'float16' achieves higher precision.
    if in_dtype == 'float16' and not utils.product_is_mini():
        in_data = akg.topi.cast(in_data, "float32")
        head = akg.topi.cast(head, "float32")

    out_data = tanh.tanh(in_data)
    jacs = list(akg.differentiate(out_data, [in_data], head))
    jacs_res = jacs[0]
    if in_dtype == 'float16' and not utils.product_is_mini():
        jacs_res = akg.topi.cast(jacs_res, 'float16')
    return jacs_res

def _log_ascend(data):
    """
    Compute natural logarithm of x element-wise.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor of same type and shape as data.
    """
    in_data = data
    dtype = in_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.ALL_FLOAT)

    if dtype == "float32" and product_is_mini():
        in_data = akg.tvm.compute(in_data.shape,
                                  lambda *indice: in_data(*indice).astype("float16"),
                                  name='type_cast')

    output = akg.tvm.compute(in_data.shape,
                             lambda *index: akg.tvm.log(in_data(*index)),
                             name='log')

    if dtype == "float32" and product_is_mini():
        output = akg.tvm.compute(in_data.shape,
                                 lambda *indice: output(*indice).astype("float32"),
                                 name='res')
    return output

def _equal_ascend(input1, input2, target=utils.CCE):
    """Check elementwise equality of input1 and input2 on Ascend."""
    # check shapes
    shape1 = [x.value for x in input1.shape]
    shape2 = [x.value for x in input2.shape]
    shapes = [shape1, shape2]
    for _, shp in enumerate(shapes):
        utils.check_shape(shp)

    utils.ops_dtype_check([input1.dtype, input2.dtype],
                          [utils.DtypeForDavinci.ALL_FLOAT,
                           utils.DtypeForDavinci.INT32,
                           utils.DtypeForDavinci.INT8,
                           utils.DtypeForDavinci.UINT8])

    dtype = input1.dtype
    orig_dtype = dtype
    if product_is_mini() and dtype != "float16":
        dtype = "float16"
    if (not product_is_mini()) and dtype not in ("float16", "float32"):
        # for int32, casting to float16 may overflow
        dtype = "float32"

    if orig_dtype == "float32" and dtype == "float16":
        input_sub = sub(input1, input2, target)
        input_sub = Cast(input_sub, dtype, target)
        zero = akg.tvm.const(0.0, dtype)
        res = akg.topi.equal(input_sub, zero)
    else:
        input1 = Cast(input1, dtype, target)
        input2 = Cast(input2, dtype, target)
        res = akg.topi.equal(input1, input2)
    return res

def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = log(x+\sqrt{x*x+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype

    # It is known that asinh(x) = log(x + sqrt(x*x + 1)) and asinh(-x) = -asinh(x).
    # If x is a large negative number, (x + sqrt(x*x + 1)) will be close to zero.
    # So the evaluation uses asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1)).
    compute_dtype = dtype
    if dtype == "float16":
        # To avoid overflow and achieve higher accuracy, x is cast to float32.
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1, topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res

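# Illustrative NumPy check (not the kernel) of the symmetric identity used above:
# asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1)) avoids the catastrophic
# cancellation of log(x + sqrt(x*x + 1)) for large negative x.
import numpy as np

x = np.array([-1e6, -2.0, 0.0, 0.5, 1e6])
res = np.sign(x) * np.log(np.abs(x) + np.sqrt(x * x + 1.0))
print(np.allclose(res, np.arcsinh(x)))  # True
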
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xlogy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): gradient of xlogy with respect to x1
        output_y2 (tvm.tensor.Tensor): gradient of xlogy with respect to x2
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    # x1/(x1 + esp_min) acts as a mask: ~0 where x1 == 0, ~1 elsewhere
    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addespmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        not_zero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_addespmin))
        log_x2 = tvm.compute(
            x2.shape,
            lambda *i: (tvm.log(x2(*i).astype("float16"))).astype("float32"),
            name="log_x2")
    else:
        not_zero_x1 = div(x1, x1_addespmin)
        log_x2 = akg.lang.cce.vlog(x2)

    partial_x1 = akg.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    partial_x2 = div(x1, x2) if not utils.product_is_mini() else \
        akg.lang.cce.vmul(x1, reciprocal(x2))
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2

def kldiv_loss(inputs, outputs, reduction='none'):
    """
    Computes Kullback-Leibler divergence loss between outputs and inputs.

    By default, loss = outputs*(log(outputs) - log(inputs)); how the loss is
    reduced is defined by `reduction`.

    Args:
        inputs (tvm.tensor.Tensor): Tensor with type float16, float32.
        outputs (tvm.tensor.Tensor): Tensor with same type as inputs.
        reduction (str): one of ['none', 'sum', 'mean', 'batchmean']

    Returns:
        Tensor with same type as input tensors.
    """
    inputs_dtype = inputs.dtype
    target_dtype = outputs.dtype
    utils.ops_dtype_check([inputs_dtype, target_dtype],
                          utils.DtypeForDavinci.ALL_FLOAT)

    if get_const_tuple(outputs.shape) != get_const_tuple(inputs.shape):
        raise RuntimeError(
            "Please ensure inputs have the same size.", outputs.shape, inputs.shape)

    inputs_dtype_old = inputs_dtype

    if product_is_mini() and inputs_dtype == 'float32':
        inputs = akg.topi.cast(inputs, "float16")
        outputs = akg.topi.cast(outputs, "float16")
        inputs_dtype = "float16"

    log_inputs = akg.topi.log(inputs)
    log_target = akg.topi.log(outputs)
    loss = akg.topi.subtract(log_target, log_inputs)
    loss = akg.topi.multiply(outputs, loss)

    if reduction == 'sum':
        loss = akg.topi.sum(loss)
    if reduction == 'mean':
        loss = akg.topi.sum(loss)
        deno = 1.0
        for num in inputs.shape:
            deno = deno * num
        deno = akg.topi.cast(deno, dtype=inputs_dtype)
        loss = akg.topi.divide(loss, deno)
    if reduction == 'batchmean':
        reduce_axis = tuple(numpy.arange(1, len(inputs.shape)))
        loss = akg.topi.sum(loss, axis=reduce_axis, keepdims=False)
        deno = 1.0
        for num in inputs.shape[1:]:
            deno = deno * num
        deno = akg.topi.cast(deno, dtype=inputs_dtype)
        loss = akg.topi.divide(loss, deno)

    if product_is_mini() and inputs_dtype_old == 'float32':
        loss = akg.topi.cast(loss, inputs_dtype_old)
    return loss

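# A rough NumPy reference (illustrative, with the semantics implied above) for
# the loss and its reduction modes:
import numpy as np

def kldiv_loss_ref(inputs, outputs, reduction="none"):
    loss = outputs * (np.log(outputs) - np.log(inputs))
    if reduction == "sum":
        return loss.sum()
    if reduction == "mean":
        return loss.sum() / loss.size
    if reduction == "batchmean":
        # sum over all non-batch axes, then divide by the per-sample element count
        return loss.reshape(loss.shape[0], -1).sum(axis=1) / loss[0].size
    return loss

p = np.array([[0.4, 0.6], [0.5, 0.5]])  # "outputs" (target distribution)
q = np.array([[0.3, 0.7], [0.5, 0.5]])  # "inputs"
print(kldiv_loss_ref(q, p, reduction="batchmean"))  # per-sample mean KL terms
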
def select(l1, tmp_val, gradient_accum):
    """Returns tmp_val if l1 > 0 else gradient_accum."""
    if product_is_mini():
        l1 = topi.cast(l1, "float16")
        tmp_val = topi.cast(tmp_val, "float16")
        gradient_accum = topi.cast(gradient_accum, "float16")
    tmp_val = akg.tvm.compute(
        tmp_val.shape,
        lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i), gradient_accum(*i)))
    return topi.cast(tmp_val, "float32") if product_is_mini() else tmp_val

def _exp_ascend(in_data):
    dtype = in_data.dtype
    utils.check_shape(in_data.shape)

    if dtype == "float32" and product_is_mini():
        in_data = akg.tvm.compute(in_data.shape,
                                  lambda *indice: in_data(*indice).astype("float16"),
                                  name='type_cast')

    output = akg.tvm.compute(in_data.shape,
                             lambda *index: akg.tvm.exp(in_data(*index)),
                             name='exp')

    if dtype == "float32" and product_is_mini():
        output = akg.tvm.compute(in_data.shape,
                                 lambda *indice: output(*indice).astype("float32"),
                                 name='res')
    return output

def Tanh(in_data, target=utils.CCE):
    """
    Compute tanh function. This version avoids exp(x) overflow when x is large.

    .. math::
        res = sign(in\_data) * (1 - exp(-2*abs(in\_data))) / (1 + exp(-2*abs(in\_data)))

    Args:
        in_data (tvm.tensor.Tensor): input tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as in_data.

    Supported Platforms:
        'Ascend'
    """
    utils.check_shape(in_data.shape)
    dtype = in_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.ALL_FLOAT)
    ori_dtype = dtype

    in_data_compute = in_data
    if ori_dtype == "float32" and product_is_mini():
        in_data_compute = akg.tvm.compute(
            in_data.shape,
            lambda *indice: in_data(*indice).astype("float16"),
            name='type_cast')
        dtype = 'float16'

    in_data_abs = akg.lang.ascend.vabs(in_data_compute)
    exponent = akg.lang.ascend.vmuls(in_data_abs, akg.tvm.const(-2, dtype))
    exp_value = akg.lang.ascend.vexp(exponent)

    exp_value_add_one = akg.lang.ascend.vadds(exp_value, akg.tvm.const(1, dtype))
    one_sub_exp_value = akg.topi.subtract(akg.tvm.const(1, dtype), exp_value)
    exp_value_add_one_rec = RecPositive(exp_value_add_one, target)
    tanh_value_pos = akg.topi.multiply(one_sub_exp_value, exp_value_add_one_rec)

    output_shape = in_data_compute.shape
    sign = akg.tvm.compute(
        output_shape,
        lambda *indice: akg.tvm.expr.Select(
            in_data_compute(*indice) < akg.tvm.const(0, dtype),
            akg.tvm.const(-1, dtype),
            akg.tvm.const(1, dtype)))
    tanh_value = akg.topi.multiply(sign, tanh_value_pos)

    if ori_dtype == "float32" and product_is_mini():
        tanh_value = akg.tvm.compute(
            tanh_value.shape,
            lambda *indice: tanh_value(*indice).astype("float32"),
            name='res')
    return tanh_value

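# Minimal NumPy sketch (not the kernel) of the overflow-safe form used above:
# working with exp(-2*|x|) keeps the exponent non-positive, so exp never
# overflows even for very large |x|.
import numpy as np

x = np.array([-1000.0, -1.0, 0.0, 2.0, 1000.0])
e = np.exp(-2.0 * np.abs(x))
res = np.sign(x) * (1.0 - e) / (1.0 + e)
print(np.allclose(res, np.tanh(x)))  # True
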
def acosh(x, target=utils.CCE):
    r"""
    Compute acosh function.

    .. math:: acosh(x) = log(x+\sqrt{x*x-1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.
            Each entry in it must be in `[1, inf)`.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype
    if dtype == "float16":
        # To avoid overflow and achieve higher accuracy, x is cast to float32.
        x = akg.topi.cast(x, "float32")

    # acosh(x) = log(x + sqrt(x*x - 1))
    x_square = akg.topi.multiply(x, x)
    x_square_sub = akg.topi.subtract(x_square, 1)
    if product_is_mini():
        sqrt_value = _sqrt_mini_vsqrt_newton_iter(x_square_sub)
    else:
        sqrt_value = akg.topi.sqrt(x_square_sub)

    sqrt_add = akg.topi.add(sqrt_value, x)
    if product_is_mini():
        res = log_compute_mini_impl(sqrt_add, target)
    else:
        res = akg.topi.log(sqrt_add)

    if res.dtype != dtype:
        res = akg.topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res

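# Illustrative NumPy sanity check of acosh(x) = log(x + sqrt(x*x - 1)) on [1, inf):
import numpy as np

x = np.array([1.0, 1.5, 10.0, 1e4])
print(np.allclose(np.log(x + np.sqrt(x * x - 1.0)), np.arccosh(x)))  # True
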
def cosh_call(x):
    """Compute cosh by the call method."""
    dtype = x.dtype
    shape = get_shape(x)

    # cast to float16 on mini in order to get a precise result
    if product_is_mini() and dtype == "float32":
        x = akg.lang.ascend.cast_to(x, "float16")

    res = akg.tvm.compute(shape,
                          lambda *indice: akg.lang.ascend.cosh(x(*indice)),
                          name="res")

    if product_is_mini() and dtype == "float32":
        res = akg.lang.ascend.cast_to(res, "float32")

    return res, get_attrs()

def l1_loss_grad(pre_deriv, inputs, target):
    """Do backprop for L1 loss (MAE)."""
    inputs_dtype = inputs.dtype
    target_dtype = target.dtype
    pre_deriv_dtype = pre_deriv.dtype

    # check inputs data types
    check_list = ["float16", "float32"]
    if not inputs_dtype.lower() in check_list:
        raise RuntimeError("inputs only support %s while dtype is %s" % (
            ",".join(check_list), inputs_dtype))
    if not target_dtype.lower() in check_list:
        raise RuntimeError("target only support %s while dtype is %s" % (
            ",".join(check_list), target_dtype))
    if not pre_deriv_dtype.lower() in check_list:
        raise RuntimeError("prev derivative only support %s while dtype is %s" % (
            ",".join(check_list), pre_deriv_dtype))

    if not get_const_tuple(target.shape) == get_const_tuple(inputs.shape):
        raise RuntimeError(
            "Please ensure inputs have the same size.", target.shape, inputs.shape)

    inputs_dtype_old = inputs_dtype

    if utils.product_is_mini() and inputs_dtype == 'float32':
        inputs = akg.topi.cast(inputs, "float16")
        target = akg.topi.cast(target, "float16")
        inputs_dtype = "float16"

    def grad_dsl(inputs, target, pre_deriv):
        # broadcast is done outside because tvm needs a shape check and cannot
        # check shapes that are not fixed:
        # pre_deriv = akg.topi.broadcast_to(pre_deriv, inputs.shape)
        coefficient = akg.tvm.const(-1.0, dtype=inputs_dtype)
        res = akg.tvm.compute(
            inputs.shape,
            lambda *i: akg.tvm.if_then_else(
                inputs(*i) >= target(*i),
                pre_deriv(*i), coefficient * pre_deriv(*i))
        )
        return res

    cur_deriv = grad_dsl(inputs, target, pre_deriv)
    if utils.product_is_mini() and inputs_dtype_old == 'float32':
        cur_deriv = akg.topi.cast(cur_deriv, inputs_dtype_old)
    return cur_deriv

def smooth_l1_loss_grad_run(shape, dtype, attrs=None, kernel_name="smooth_l1_loss_grad"):
    assert len(shape) >= 2, \
        "last dimension of the shape will be reduced, so the shape length should be >= 2"
    sample_shape = shape[:-1]
    anchor_samples_dtype = "int32"
    # sigma is a constant parameter
    sigma = 1.0
    anchor_sample_correct = 0

    if not utils.product_is_mini():
        attrs['enable_align_fix'] = True
        attrs['enable_multicore'] = True

    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(smooth_l1_loss_grad.smooth_l1_loss_grad,
                                  [sample_shape, shape, shape, sample_shape],
                                  [dtype, dtype, dtype, anchor_samples_dtype],
                                  op_attrs=[sigma, anchor_sample_correct],
                                  attrs=attrs, kernel_name=kernel_name,
                                  dump_code=True, tuning=t)
        if t:
            anchor_samples, dloss, expect, output, prediction, prediction_, target, target_ = gen_data(
                anchor_sample_correct, anchor_samples_dtype, dtype, sample_shape, shape, sigma)
            return mod, expect, (dloss, prediction, target, anchor_samples, output)
        else:
            return mod
    else:
        anchor_samples, dloss, expect, output, prediction, prediction_, target, target_ = gen_data(
            anchor_sample_correct, anchor_samples_dtype, dtype, sample_shape, shape, sigma)
        mod = utils.op_build_test(smooth_l1_loss_grad.smooth_l1_loss_grad,
                                  [sample_shape, shape, shape, sample_shape],
                                  [dtype, dtype, dtype, anchor_samples_dtype],
                                  op_attrs=[sigma, anchor_sample_correct],
                                  attrs=attrs, kernel_name=kernel_name, dump_code=True)
        output = utils.mod_launch(mod, (dloss, prediction, target, anchor_samples, output),
                                  expect=expect)
        return (dloss, prediction, target, anchor_samples), output, expect, \
            compare_tensor(output, expect, atol=5e-3, rtol=5e-3)

def _asin_grad_compute(x, dy):
    """Compute asin_grad."""
    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use Newton's method for a high-accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high-efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")
    return res

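# Illustrative NumPy check that the formula above matches d/dx asin(x) = 1/sqrt(1 - x^2),
# compared against a central finite difference:
import numpy as np

x = np.array([-0.9, -0.3, 0.0, 0.5, 0.8])
h = 1e-6
analytic = 1.0 / np.sqrt(1.0 - x * x)
numeric = (np.arcsin(x + h) - np.arcsin(x - h)) / (2.0 * h)
print(np.max(np.abs(analytic - numeric)))  # ~1e-9 or smaller
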
def truncate_div(input_x1, input_x2):
    """
    Calculate truncate_div, i.e. res = floor(x1/x2) if x1/x2 > 0 else ceil(x1/x2).

    Args:
        input_x1 (tvm.tensor.Tensor): Input tensor, supports float16, float32 on
            mini devices, and int32, int8, uint8, float16, float32 on cloud ones.
        input_x2 (tvm.tensor.Tensor): Input tensor, with same dtype as input_x1.

    Returns:
        A tvm.tensor.Tensor as result of truncate_div.
    """
    vc_util.check_shape(get_shape(input_x1))
    vc_util.check_shape(get_shape(input_x2))
    vc_util.elemwise_dtype_check(input_x1.dtype, input_x2.dtype)
    vc_util.ops_dtype_check(
        input_x1.dtype,
        (vc_util.DtypeForDavinci.ALL_FLOAT) if utils.product_is_mini()
        else (vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32,
              vc_util.DtypeForDavinci.INT8, vc_util.DtypeForDavinci.UINT8))

    return truncate_div_compute(input_x1, input_x2)

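# Illustrative NumPy model of the semantics described above: division truncated
# toward zero, i.e. floor for positive quotients and ceil for negative ones.
import numpy as np

x1 = np.array([7, -7, 7, -7], dtype=np.int32)
x2 = np.array([2, 2, -2, -2], dtype=np.int32)
print(np.trunc(x1 / x2))  # [ 3. -3. -3.  3.]
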
def floordiv(data1, data2):
    """
    Calculate x/y, always returning an integer that is floored.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has type of int32.
    """
    vc_util.ops_dtype_check([data1.dtype, data2.dtype],
                            vc_util.DtypeForDavinci.ALL_FLOAT)
    shape1 = [x.value for x in data1.shape]
    vc_util.check_shape(shape1)
    shape2 = [x.value for x in data2.shape]
    vc_util.check_shape(shape2)

    if utils.product_is_mini():
        rec = reciprocal(data2, high_precision=True)
        res = data1 * rec
    else:
        res = akg.topi.divide(data1, data2)
    res = akg.lang.cce.floor(res)
    return res

def reciprocal(data, high_precision=True):
    """
    Computes the reciprocal of data element-wise.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        high_precision (bool): whether to use the high-precision version.

    Returns:
        tvm.tensor.Tensor of same type and shape as data.
    """
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    shape = [x.value for x in data.shape]
    vc_util.check_shape(shape)

    res = akg.tvm.compute(shape,
                          lambda *indice: akg.tvm.const(1, data.dtype) / data(*indice),
                          name="res")

    # When product is mini, use Newton's iteration to achieve higher precision.
    if utils.product_is_mini() and high_precision:
        steps = 1
        for _ in range(steps):
            # Newton step for f(r) = 1/r - data: r <- r * (2 - data*r)
            temp1 = data * res
            temp2 = temp1 * akg.tvm.const(-1, data.dtype)
            temp3 = temp2 + akg.tvm.const(2, data.dtype)
            res = temp3 * res
    return res

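# Standalone NumPy sketch of the Newton step used above: for f(r) = 1/r - d,
# Newton's method gives r <- r * (2 - d*r), which the kernel writes as
# ((-d*r) + 2) * r; the error shrinks quadratically each iteration.
import numpy as np

d = np.array([0.5, 3.0, 10.0])
r = 1.0 / d + 1e-2          # rough initial guess with some deliberate error
for _ in range(4):
    r = r * (2.0 - d * r)
print(np.max(np.abs(r - 1.0 / d)))  # converges to near machine precision
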
def _bessel_i1e_compute(input_data):
    """bessel_i1e compute"""
    shape = vc_util.get_shape(input_data)
    dtype = input_data.dtype

    # choose the compute dtype at the beginning
    if dtype == "float16":
        input_data = cast(input_data, "float32")

    abs_data = abs_value(input_data)
    # compute bessel_i1e for data in (-3.75, 3.75)
    before_res = _before_res_compute(abs_data)
    # compute bessel_i1e for data outside that domain
    after_res = _after_res_compute(abs_data)

    # The vcmp_lt and vsel instructions don't support fp32 on mini, so the
    # select is done in fp16 there ("auto cast" could simplify this).
    if utils.product_is_mini():
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(
                abs_data[indice].astype("float16") < akg.tvm.const(CONST_LIMIT, "float16"),
                before_res[indice].astype("float16"),
                after_res[indice].astype("float16")))
        res = cast(res, "float32")
    else:
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(
                abs_data[indice] < CONST_LIMIT,
                before_res[indice], after_res[indice]))

    data_sign = sign(input_data)
    res = mul(res, data_sign)
    if dtype == "float16":
        res = cast(res, "float16")
    return res

def floor_div(data1, data2, target=utils.CCE):
    """
    Calculate x/y, always returning an integer that is floored.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has type of int32.

    Supported Platforms:
        'Ascend'
    """
    utils.ops_dtype_check([data1.dtype, data2.dtype],
                          utils.DtypeForDavinci.ALL_FLOAT)
    shape1 = [x.value for x in data1.shape]
    utils.check_shape(shape1)
    shape2 = [x.value for x in data2.shape]
    utils.check_shape(shape2)

    if product_is_mini():
        rec = reciprocal(data2, high_precision=True, target=target)
        res = data1 * rec
    else:
        res = akg.topi.divide(data1, data2)
    res = akg.lang.ascend.floor(res)
    return res

def _atan_compute(data):
    """Compute for atan."""
    dtype = data.dtype
    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate atan for data less than one
    res = _do_atan_taylor(abs_data)
    # calculate atan for data greater than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)
    if dtype == "float16":
        res = topi.cast(res, "float16")
    return res

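# NumPy check (illustrative) of the reduction behind abs_data2 above: for t >= 0,
# atan(t) = pi/4 + atan((t - 1)/(t + 1)), which maps any t into an argument in
# [-1, 1] where the Taylor expansion converges.
import numpy as np

t = np.array([0.0, 0.5, 1.0, 3.0, 100.0])
print(np.allclose(np.arctan(t), np.pi / 4 + np.arctan((t - 1.0) / (t + 1.0))))  # True
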
def compute_blockdim(shape, axis, dtype):
    # strategy: all of the shape except the reduce axes can be used for multicore
    blockdim_limit = 2 if utils.product_is_mini() else 32
    blockdim = 1
    if isinstance(shape, int):
        shape = [shape]
    if not isinstance(axis, list):
        axis = list(axis)
    # normalize negative axes
    axis = [a + len(shape) if a < 0 else a for a in axis]
    axis = sorted(axis)
    red_sh = 1
    if isinstance(shape, (list, tuple)):
        for i, sh in enumerate(shape):
            if not isinstance(sh, int):
                raise TypeError(
                    "Shape to compute blockdim must be a list/tuple of integer")
            if i in axis:
                red_sh *= sh
            else:
                blockdim = blockdim * sh
    else:
        raise TypeError(
            "Shape to compute blockdim must be a list/tuple of integer")
    if red_sh < 32 / get_bytes(dtype):
        # when the reduce axes are too small, multicore may not always improve performance
        blockdim = 1
    return min(blockdim_limit, blockdim)

def atanh(input_data):
    """
    Return atanh(x) = 0.5*ln((1+x)/(1-x)) if abs(x) < 1.

    Args:
        input_data (tvm.tensor.Tensor): Input tensor, only supports float16, float32.

    Returns:
        A tvm.tensor.Tensor as result of atanh.

    Supported Platforms:
        'Ascend'
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    inp_dtype = input_data.dtype
    utils.ops_dtype_check(inp_dtype, utils.DtypeForDavinci.ALL_FLOAT)

    if inp_dtype == "float16":
        input_data = topi.cast(input_data, "float32")

    if product_is_mini():
        res = _compute_mini(input_data, shape)
    else:
        res = _compute_cloud(input_data)

    res = topi.cast(res, inp_dtype)
    return res

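# Illustrative NumPy check of the formula in the docstring:
import numpy as np

x = np.array([-0.9, -0.5, 0.0, 0.3, 0.99])
print(np.allclose(0.5 * np.log((1.0 + x) / (1.0 - x)), np.arctanh(x)))  # True
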