def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Compute ada_max."""
    # cast to float32 for improved accuracy
    inp_dtype = var.dtype
    if inp_dtype == 'float16':
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    rhs = tvm.compute(beta1.shape, lambda *i: beta1(*i) * neg_one_const("float32"))
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) + one_const("float32"))
    lhs = topi.subtract(grad, m)
    rhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) * rhs[0])
    m = topi.add(m, rhs)

    # v = max(beta2 * v, abs(grad))
    lhs = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    rhs = topi.abs(grad)
    v = topi.maximum(lhs, rhs)

    # var -= lr / (1 - beta1_power) * (m / (v + epsilon))
    # v + epsilon
    rhs = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    # 1 - beta1_power
    lhs = tvm.compute(beta1_power.shape, lambda *i: beta1_power(*i) * neg_one_const("float32"))
    lhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) + one_const("float32"))
    # (1 - beta1_power) * (v + epsilon)
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) * lhs[0])
    # lr * m
    lhs = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # lr * m / ((1 - beta1_power) * (v + epsilon))
    rhs = reciprocal(rhs)
    rhs = topi.multiply(lhs, rhs)
    var = topi.subtract(var, rhs)

    if inp_dtype == 'float16':
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
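
# A minimal NumPy cross-check of the AdaMax step above (a sketch only;
# numpy and the helper name are assumptions, not part of this module).
import numpy as np

def _ada_max_reference(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Plain-array version of the update performed by _apply_ada_max_compute."""
    m = beta1 * m + (1.0 - beta1) * grad      # m += (grad - m) * (1 - beta1)
    v = np.maximum(beta2 * v, np.abs(grad))   # exponentially weighted infinity norm
    var = var - lr / (1.0 - beta1_power) * m / (v + epsilon)
    return var, m, v
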
def _asin_compute(data_input):
    """Compute asin."""
    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # Change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # Sign mask
    data_sign = sign(data_input)

    # All positive
    data1 = topi.multiply(data_input, data_sign)

    # x belongs to (0, 2^(-0.5))
    choice_1 = topi.minimum(data1, boundary)
    choice_1 = topi.subtract(choice_1, boundary)
    choice_1_floor = akg.lang.cce.floor(choice_1)
    # the dtype of choice_1_floor is int32, need to be cast to fp32.
    if utils.product_is_mini():
        choice_1_floor = topi.cast(choice_1_floor, "float16")
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    else:
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    choice_1 = topi.multiply(choice_1_floor, neg_one_const("float32"))

    taylor1 = _taylor_compute(data1)
    res_1 = topi.multiply(taylor1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    data2 = topi.subtract(one_const("float32"), topi.multiply(data1, data1))
    data2_sqrt = _sqrt(data2)
    taylor2 = _taylor_compute(data2_sqrt, data2)
    res_2 = topi.multiply(taylor2, neg_one_const("float32"))
    res_2 = topi.add(res_2, tvm.const(HALF_PI, "float32"))
    res_2 = topi.multiply(res_2, choice_2)

    # Restore sign
    res_1 = topi.add(res_1, res_2)
    res_1 = topi.multiply(res_1, data_sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = topi.cast(res_1, "float16")

    return res_1
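
# Sanity sketch (NumPy, illustrative only) of the range split above: for
# |x| in (2^(-0.5), 1) the kernel evaluates the series at sqrt(1 - x^2)
# and relies on the identity asin(x) = pi/2 - asin(sqrt(1 - x^2)).
import numpy as np

x = np.linspace(2 ** -0.5 + 1e-3, 0.999, 5)
assert np.allclose(np.arcsin(x), np.pi / 2 - np.arcsin(np.sqrt(1 - x * x)))
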
def _compute_mini(data_input, shape):
    """
    Use log and taylor to compute arctanh.

    arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    """
    data_abs = topi.abs(data_input)
    result_ln = _compute_log(data_abs)
    result_taylor = _compute_taylor(data_abs)

    data_abs = topi.cast(data_abs, "float16")
    data_input = topi.cast(data_input, "float16")
    result_taylor = topi.cast(result_taylor, "float16")
    result_ln = topi.cast(result_ln, "float16")
    # when |x| < 0.5 use the taylor branch; when 0.5 <= |x| < 1 use log()
    data_res = tvm.compute(shape,
                           lambda *i: akg.tvm.expr.Select(data_abs(*i) < dc.half_const("float16"),
                                                          result_taylor(*i),
                                                          result_ln(*i)),
                           name="le")

    # arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    data_res_neg = topi.multiply(data_res, dc.neg_one_const("float16"))
    data_res = tvm.compute(shape,
                           lambda *i: akg.tvm.expr.Select(data_input(*i) < dc.zero_const("float16"),
                                                          data_res_neg(*i),
                                                          data_res(*i)),
                           name="neg")

    return data_res
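
# NumPy sketch of the branch selection above (illustrative only; the
# truncated series stands in for _compute_taylor):
import numpy as np

def _atanh_reference(x):
    a = np.abs(x)
    taylor_branch = a + a**3 / 3 + a**5 / 5 + a**7 / 7   # small |x|
    log_branch = 0.5 * np.log((1 + a) / (1 - a))         # 0.5 <= |x| < 1
    res = np.where(a < 0.5, taylor_branch, log_branch)
    return np.where(x < 0, -res, res)                    # restore sign
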
def _newton_iter(data, init_x):
    """Do element-wise Newton compute."""
    # Newton iteration for rsqrt: x(n+1) = x(n) * (3 - a * x(n)^2) / 2
    init_square = topi.multiply(init_x, init_x)
    newton_res = topi.multiply(init_square, data)
    newton_res = topi.multiply(newton_res, neg_one_const("float32"))
    newton_res = topi.add(newton_res, tvm.const(3, "float32"))
    newton_res = topi.multiply(newton_res, init_x)
    newton_res = topi.multiply(newton_res, tvm.const(0.5, "float32"))
    return newton_res
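
# NumPy sketch showing that repeating this step converges to 1/sqrt(a)
# (for intuition only; the starting estimate below is an assumption, the
# kernel receives init_x from its caller):
import numpy as np

a = np.array([0.5, 2.0, 9.0])
x = 1.0 / a  # rough starting estimate
for _ in range(8):
    x = x * (3.0 - a * x * x) * 0.5
assert np.allclose(x, 1.0 / np.sqrt(a))
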
def _compute_m_t(m, beta, grad):
    """Update m: m_t = beta * m + (1 - beta) * grad."""
    beta_tmp = tvm.compute(m.shape,
                           lambda *indice: m(*indice) * beta[0])
    beta_na = tvm.compute(
        beta.shape, lambda *indice: beta(*indice) * neg_one_const("float32"))
    beta_na = tvm.compute(
        beta_na.shape, lambda *indice: beta_na(*indice) + one_const("float32"))
    beta_sub_tmp = tvm.compute(grad.shape,
                               lambda *indice: grad(*indice) * beta_na[0])
    m_t = topi.add(beta_tmp, beta_sub_tmp)
    return m_t
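
# The same update written out in plain array arithmetic for reference
# (a sketch; beta is the scalar decay factor read via beta[0] above):
def _m_t_reference(m, beta, grad):
    return beta * m + (1.0 - beta) * grad
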
def _compute_log(data_input, target=utils.CCE):
    """atanh(x) = 0.5 * log((1 + x) / (1 - x))"""
    data_1_sum_x = topi.add(data_input, dc.one_const(data_input.dtype))
    data_sub_x = topi.multiply(data_input, dc.neg_one_const(data_input.dtype))
    data_1_sub_x = topi.add(data_sub_x, dc.one_const(data_input.dtype))
    data_x_mul = data_1_sum_x / data_1_sub_x
    data_x_log = log(data_x_mul, target)
    data_res = topi.multiply(data_x_log, dc.half_const(data_input.dtype))
    return data_res
def _compute_log(data_input):
    """Atanh(x) = 0.5 * log((1 + x) / (1 - x))"""
    data_1_sum_x = topi.add(data_input, dc.one_const(data_input.dtype))
    data_sub_x = topi.multiply(data_input, dc.neg_one_const(data_input.dtype))
    data_1_sub_x = topi.add(data_sub_x, dc.one_const(data_input.dtype))
    data_x_mul = data_1_sum_x / data_1_sub_x
    data_x_log = log.log(data_x_mul)
    data_res = topi.multiply(data_x_log, dc.half_const(data_input.dtype))
    return data_res
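
# Sanity sketch for the identity used by both _compute_log variants
# (NumPy here is illustrative; the kernels use the TVM/topi ops above):
import numpy as np

x = np.linspace(-0.9, 0.9, 7)
assert np.allclose(0.5 * np.log((1 + x) / (1 - x)), np.arctanh(x))
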
def _init_atan2_mask(data_y_, data_x_):
    """
    Compute mask for atan2.

    Args:
        data_y_ (tvm.tensor.Tensor): The y of atan2(y, x).
        data_x_ (tvm.tensor.Tensor): The x of atan2(y, x).

    Returns:
        mask (tvm.tensor.Tensor): The mask of x's and y's value.
    """
    is_cast_for_mini = utils.product_is_mini() and data_y_.dtype == "float32"

    # in mini, select only supports float16
    if is_cast_for_mini:
        data_x = topi.cast(data_x_, "float16")
        data_y = topi.cast(data_y_, "float16")
    else:
        data_x = data_x_
        data_y = data_y_

    dtype_input = data_y.dtype

    tensor_one = dc.one_const(dtype_input)
    tensor_zero = dc.zero_const(dtype_input)
    tensor_neg_one = dc.neg_one_const(dtype_input)

    y_ge_zero = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_y(*i) >= tensor_zero, tensor_one, tensor_neg_one),
        name="y_ge_zero")

    x_lt_zero_y_mask = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_x(*i) < tensor_zero, y_ge_zero(*i), tensor_zero),
        name="xlt0_y_mask")

    if is_cast_for_mini:
        x_lt_zero_y_mask = topi.cast(x_lt_zero_y_mask, "float32")
        y_ge_zero = topi.cast(y_ge_zero, "float32")

    return (x_lt_zero_y_mask, y_ge_zero)
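
# NumPy sketch of the two masks produced above (illustrative only):
# y_ge_zero is +1 where y >= 0 and -1 otherwise; x_lt_zero_y_mask keeps
# that sign only where x < 0, i.e. the quadrants where atan(y/x) needs
# a +/- pi correction to become atan2(y, x).
import numpy as np

def _atan2_masks_reference(y, x):
    y_ge_zero = np.where(y >= 0, 1.0, -1.0)
    x_lt_zero_y_mask = np.where(x < 0, y_ge_zero, 0.0)
    return x_lt_zero_y_mask, y_ge_zero
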
def erfc(input_x):
    r"""
    Computes the complementary error function of input_x.

    .. math::
        \operatorname{erfc}(x) = 1 - \operatorname{erf}(x)

    Args:
        input_x (tvm.tensor.Tensor): Input tensor, only supports float16, float32.

    Returns:
        tvm.tensor.Tensor with the same shape and dtype as input_x.
    """
    dtype = input_x.dtype
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.check_shape(input_x.shape)

    erfc_res = topi.add(dc.one_const(dtype),
                        topi.multiply(dc.neg_one_const(dtype), erf(input_x)))

    return erfc_res
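
# The identity itself, checked with the standard library (a sketch;
# math.erf/math.erfc are not used by the kernel):
import math

assert math.isclose(math.erfc(0.7), 1.0 - math.erf(0.7))
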
def _erf_compute(input_x):
    r"""
    Compute erf.

    .. math::
        \operatorname{erf}(x) = \operatorname{sign}(x)\left(1 -
            (a_1 t + a_2 t^2 + a_3 t^3 + a_4 t^4 + a_5 t^5) e^{-x^2}
            + \epsilon(|x|)\right), \quad
        t = \frac{1}{1 + p|x|}, \quad
        \left|\epsilon(|x|)\right| \le 1.5 \times 10^{-7},

    where :math:`p = 0.3275911`, :math:`a_1 = 0.254829592`,
    :math:`a_2 = -0.284496736`, :math:`a_3 = 1.421413741`,
    :math:`a_4 = -1.453152027`, :math:`a_5 = 1.061405429`.

    Args:
        input_x (tvm.tensor.Tensor): Input tensor.

    Returns:
        tvm.tensor.Tensor as rational approximation.
    """
    dtype = input_x.dtype
    shape = get_shape(input_x)

    cst_one = dc.one_const("float32")
    cst_neg_one = dc.neg_one_const("float32")
    cst_p = tvm.const(SCALER_P, "float32")
    cst_a1 = tvm.const(SCALER_A1, "float32")
    cst_a2 = tvm.const(SCALER_A2, "float32")
    cst_a3 = tvm.const(SCALER_A3, "float32")
    cst_a4 = tvm.const(SCALER_A4, "float32")
    cst_a5 = tvm.const(SCALER_A5, "float32")
    fp16_max = tvm.const(SCALER_FP16_MAX, "float32")
    fp16_min = tvm.const(SCALER_FP16_MIN, "float32")

    if dtype == "float16":
        input_x = topi.cast(input_x, "float32")

    # calculate: sign = floor[(x * fp16_max) / (|x * fp16_max| + fp16_min)]
    data_sign_vmuls = topi.multiply(input_x, fp16_max)
    data_sign_abs = topi.abs(data_sign_vmuls)
    data_adds = topi.add(data_sign_abs, fp16_min)
    data_sign_div = div(data_sign_vmuls, data_adds)
    data_round = round_value(data_sign_div)
    # mini device should cast to fp16 first
    if utils.product_is_mini():
        data_round = topi.cast(data_round, "float16")
    tensor_sign = topi.cast(data_round, "float32")

    # t = 1 / (1 + p|x|)
    tensor_abs = topi.abs(input_x)
    one_plus_px = topi.add(cst_one, topi.multiply(tensor_abs, cst_p))
    data_t = div(topi.full(shape, "float32", 1.0), one_plus_px)

    # e^{-x^2}
    abs_square = topi.multiply(tensor_abs, tensor_abs)
    neg_square = topi.multiply(abs_square, cst_neg_one)
    exp_neg_square = exp(neg_square)

    # a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5, evaluated in Horner form:
    # ((((a5*t + a4)*t + a3)*t + a2)*t + a1)*t
    tmp_a5 = topi.multiply(cst_a5, data_t)
    tmp_a5a4 = topi.multiply(topi.add(tmp_a5, cst_a4), data_t)
    tmp_a5a4a3 = topi.multiply(topi.add(tmp_a5a4, cst_a3), data_t)
    tmp_a5a4a3a2 = topi.multiply(topi.add(tmp_a5a4a3, cst_a2), data_t)
    data_muladd = topi.multiply(topi.add(tmp_a5a4a3a2, cst_a1), data_t)

    # erf = sign(x) * (1 - data_muladd * e^{-x^2})
    erf_res = topi.multiply(
        tensor_sign,
        topi.add(
            cst_one,
            topi.multiply(cst_neg_one,
                          topi.multiply(data_muladd, exp_neg_square))))

    if dtype == "float16":
        erf_res = topi.cast(erf_res, dtype)

    return erf_res
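
# NumPy sketch of the same Abramowitz & Stegun 7.1.26 approximation
# (the constants are restated literally; the module's SCALER_* values
# are assumed to match them):
import math
import numpy as np

def _erf_reference(x):
    p = 0.3275911
    a1, a2, a3, a4, a5 = (0.254829592, -0.284496736, 1.421413741,
                          -1.453152027, 1.061405429)
    t = 1.0 / (1.0 + p * np.abs(x))
    poly = t * (a1 + t * (a2 + t * (a3 + t * (a4 + t * a5))))
    return np.sign(x) * (1.0 - poly * np.exp(-x * x))

# the stated error bound is 1.5e-7, so this should agree with math.erf
assert abs(_erf_reference(np.float64(0.5)) - math.erf(0.5)) < 2e-7
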