def relu_compute(x, y, kernel_name="relu"):
    """
    Algorithm: relu(x) = max(x, 0)

    Parameters
    ----------
    x: the placeholder of data input
    y : the dict of output
    kernel_name : cce kernel name

    Returns
    -------
    res : result of relu
    """
    inp_dtype = x.dtype
    shape = x.shape
    compatible_dtype = x.dtype

    if inp_dtype == 'int8' and api_check_support('te.lang.cce.cast_to',
                                                 's82f16'):
        x = te.lang.cce.cast_to(x, 'float16')
        compatible_dtype = 'float16'

    if api_check_support('te.lang.cce.vrelu', compatible_dtype):
        data_res = te.lang.cce.vrelu(x)
    else:
        tensor_zero = te.lang.cce.broadcast(
            tvm.const(CONST_ZERO, compatible_dtype), shape)
        data_res = te.lang.cce.vmax(x, tensor_zero)

    data_res = te.lang.cce.cast_to(data_res, inp_dtype)

    return data_res

def atan2_compute(y, x, output_dict, kernel_name="atan2"):
    """
    Algorithm: atan2
    ----------------------------------
    Parameters:

    y: Input data y.

    x: Input data x.

    output_dict: the dict of output.

    kernel_name: cce kernel name, default value is "atan2"
    ----------------------------------
    Returns:

    A Tensor of atan2(y, x).
    """
    shape_y = y.shape
    dtype_y = y.dtype
    shape_x = x.shape

    shape_y = te.lang.cce.util.shape_to_list(shape_y)
    shape_x = te.lang.cce.util.shape_to_list(shape_x)
    shape_y, shape_x, shape_broadcast = broadcast_shapes(
        shape_y, shape_x, param_name_input1="x1", param_name_input2="x2")
    y = te.lang.cce.broadcast(y, shape_broadcast)
    x = te.lang.cce.broadcast(x, shape_broadcast)

    if dtype_y == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        x = te.lang.cce.cast_to(x, "float32")

    mask = _init_atan2_mask(y, x)

    # calculate atan(y/x) for the x > 0 quadrants
    res = te.lang.cce.vdiv(y, x)
    res = _atan_compute(res)

    y_cmp_zero = te.lang.cce.vmuls(mask[CONST_ONE],
                                   tvm.const(CONST_PI_BY_TWO, y.dtype))
    res_x_lt_zero = te.lang.cce.vmuls(mask[CONST_ZERO],
                                      tvm.const(CONST_PI, y.dtype))

    if x.dtype == res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", x.dtype):
        res = te.lang.cce.vcmpsel(x, tvm.const(CONST_ZERO, x.dtype), 'eq',
                                  y_cmp_zero, res)
    else:
        tensor_zero = te.lang.cce.broadcast(tvm.const(CONST_ZERO, x.dtype),
                                            shape_broadcast)
        x_equal_zero = te.lang.cce.vcmp(x, tensor_zero, 'eq')
        res = te.lang.cce.vsel(x_equal_zero, y_cmp_zero, res)

    res = te.lang.cce.vadd(res, res_x_lt_zero)

    if dtype_y == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

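# A minimal NumPy cross-check for atan2_compute above (an illustrative
# sketch, not part of the operator library; assumes NumPy is available).
# The kernel builds atan2 from atan(y/x) plus quadrant corrections
# (+/-pi/2 where x == 0, +/-pi where x < 0), which is exactly the
# convention np.arctan2 implements.
def _atan2_numpy_reference(y, x):
    import numpy as np
    return np.arctan2(y, x)  # e.g. _atan2_numpy_reference(1.0, -1.0) = 3*pi/4
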
def bessel_i1e_compute(x, y, kernel_name="bessel_i1e"):
    """
    Algorithm:
    I0 = 1 + ((z/2) / 1!)^2 + ((z/2)^2 / 2!)^2 + ... + ((z/2)^n / n!)^2
    I0e = I0 / exp(x)
    I1e = I0e * z / (2 * (k + 1))
    u = 4 * v^2
    Ive = (1 - (u-1)/(8*z) + (u-1)*(u-9)/(2! * (8*z)^2)
           - (u-1)*(u-9)*(u-25)/(3! * (8*z)^3)) / sqrt(2*pi*z)

    Parameters
    ----------
    x: the placeholder of data input
    y: the dict of output
    kernel_name: cce kernel name, default value is "bessel_i1e"

    Returns
    -------
    A tensor. Has the same type as x.
    """
    shape_input = x.shape
    dtype_input = x.dtype

    # choose the compute dtype at the beginning
    if dtype_input == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    abs_data = te.lang.cce.vabs(x)
    broad_const_limit = te.lang.cce.broadcast(
        tvm.const(CONST_LIMIT, x.dtype), shape_input)
    before_res = _before_res_compute(abs_data, broad_const_limit)
    after_res = _after_res_compute(abs_data, broad_const_limit)

    if abs_data.dtype == before_res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", abs_data.dtype):
        res = te.lang.cce.vcmpsel(abs_data, broad_const_limit, 'lt',
                                  before_res, after_res)
    else:
        select_index = te.lang.cce.vcmp(abs_data, broad_const_limit, 'lt')
        res = te.lang.cce.vsel(select_index, before_res, after_res)

    data_sign = util_compute.sign(x)
    res = te.lang.cce.vmul(res, data_sign)

    if dtype_input == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

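# A minimal SciPy cross-check for bessel_i1e_compute above (an illustrative
# sketch, not part of the operator library; assumes SciPy is available).
# i1e(x) = exp(-|x|) * I1(x) is the exponentially scaled modified Bessel
# function of the first kind of order 1, which the piecewise series above
# approximates.
def _bessel_i1e_reference(x):
    from scipy import special
    return special.i1e(x)
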
def _accumulate_nv2_compute(x, y, num, kernel_name='accumulate_nv2'):
    """
    Process accumulate_nv2 operator.

    Parameters:
    ----------
    x : the list of input tensors.
    y : the dict of output.
    num : the number of inputs.
    kernel_name : cce kernel name, default value is "accumulate_nv2".

    Returns:
    -------
    result : result of accumulate.
    """
    dtype = x[0].dtype
    shape = x[0].shape
    length = len(x)

    result = x[0]
    # in order to improve the accuracy, convert float16 to float32
    if dtype == 'float16' and length > 1 and \
            api_check_support("te.lang.cce.vadd", "float32"):
        result = te.lang.cce.cast_to(result, 'float32')

    for i in range(1, length):
        rhs = x[i]
        if dtype == 'float16' and \
                api_check_support("te.lang.cce.vadd", "float32"):
            rhs = te.lang.cce.cast_to(x[i], 'float32')
        result = te.lang.cce.vadd(result, rhs)

    if length == 1:
        # te.lang.cce.vmuls supports float16 and float32; int8, uint8 and
        # int32 would be converted to float16, which truncates the data,
        # so use te.lang.cce.vmul for int32.
        if dtype == "int32":
            value_one = tvm.const(NUM_ONE, dtype=dtype)
            value_one_tensor = te.lang.cce.broadcast(value_one, shape)
            result = te.lang.cce.vmul(result, value_one_tensor)
        else:
            result = te.lang.cce.vmuls(result, NUM_ONE)

    # in order to improve the accuracy, convert float32 back to float16
    if dtype == 'float16' and length > 1:
        result = te.lang.cce.cast_to(result, 'float16')

    return result

def asinh_grad_compute(y, dy, output_res, kernel_name="cce_asinh_grad"):
    """
    do element-wise asinh_grad compute

    Parameters:
    ----------
    y : the placeholder of input y
    dy : the placeholder of input dy
    output_res : output dict
    kernel_name : cce kernel name, default value is "cce_asinh_grad"

    Return :
    -------
    dy * (1 / cosh(y))
    """
    dtype = y.dtype

    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")

    if api_check_support('te.lang.cce.vexp', 'float32'):
        # use the vexp and vdiv APIs for high-efficiency computation:
        # cosh(y) = (e^y + e^-y) / 2, so
        # dy / cosh(y) = 2 * dy * e^y / (e^2y + 1)
        exp_pos = te.lang.cce.vexp(y)
        res = te.lang.cce.vmul(exp_pos, exp_pos)
        res = te.lang.cce.vadds(res, tvm.const(NUM_ONE, y.dtype))
        data_dy1 = te.lang.cce.vmuls(dy, tvm.const(NUM_TWO, y.dtype))
        data_dy1 = te.lang.cce.vmul(data_dy1, exp_pos)
        res = te.lang.cce.vdiv(data_dy1, res)
    else:
        # use Taylor's method for a high-accuracy result
        y = te.lang.cce.vmuls(y, tvm.const(NUM_REPEAT, y.dtype))
        cosh_value_0 = _cosh_taylor_compute(y)
        # repeat 3 times
        cosh_value_1 = _cosh_repeat(cosh_value_0)
        cosh_value_2 = _cosh_repeat(cosh_value_1)
        cosh_value = _cosh_repeat(cosh_value_2)
        res = te.lang.cce.vrec(cosh_value)
        res = te.lang.cce.vmul(res, dy)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

def acosh_grad_compute(y, dy, z, kernel_name="acosh_grad"):
    """
    do acosh_grad compute

    Parameters:
    ----------------
    y: input tensor y
    dy: input tensor dy
    z: output dict
    kernel_name: cce kernel name, default value is "acosh_grad"

    return: dy * (1 / sinh(y))
    ----------------
    """
    dtype = y.dtype
    dtype_1 = dtype
    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")
        dtype = "float32"

    data_y = te.lang.cce.vmuls(y, tvm.const(NUM_REPEAT, dtype))
    sinh_value_0 = _taylor_sinh_compute(data_y)
    sinh_value_1 = _sinh_repeat_with_sqrt(sinh_value_0)
    sinh_value_2 = _sinh_repeat_with_sqrt(sinh_value_1)
    data_sinh = _sinh_repeat_with_sqrt(sinh_value_2)
    res = te.lang.cce.vdiv(dy, data_sinh)

    if dtype_1 == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

def acosh_compute(input_data, output_res, kernel_name="acosh"):
    """
    do element-wise acosh compute

    f(x) = log(x + sqrt(x^2 - 1)), for x >= 1

    Parameters:
    ----------
    input_data: the placeholder of data input
    output_res : the dict of output
    kernel_name : cce kernel name, default value is "acosh"

    Returns : A Tensor. Has the same type as input_data.
    -------
    """
    data = input_data

    input_dtype = data.dtype.lower()
    if input_dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        data = te.lang.cce.cast_to(data, "float32")

    res = te.lang.cce.vmul(data, data)
    res = te.lang.cce.vadds(res, tvm.const(CONST_NEG_ONE, data.dtype))
    res = te.lang.cce.vsqrt(res, 1)
    res = te.lang.cce.vadd(res, data)
    res = te.lang.cce.vlog(res, 1)

    if input_dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

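# A minimal NumPy sketch of the identity used by acosh_compute above
# (illustrative only, not part of the operator library; assumes NumPy):
# acosh(x) = log(x + sqrt(x^2 - 1)) for x >= 1, matching np.arccosh.
def _acosh_reference(x):
    import numpy as np
    return np.log(x + np.sqrt(x * x - 1.0))
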
def _less_compare_float32(data_x, data_y):
    """
    Compare data_x and data_y to determine whether data_x is less than
    data_y. If the element in data_x is less than the one in data_y,
    return 1, else return 0.

    The max num of float32 is 2**126, but cce can only support 2**62,
    so split 126 as 62/62/2:
    (2**(-126)) * (2**62) * (2**62) * (2**2) = 1
    i.e. min_value * max_value * max_value * factor_value = 1
    """
    shape_inputs = te.lang.cce.util.shape_to_list(data_x.shape)
    min_value = tvm.const(2 ** (-126), dtype=D_TYPE)
    max_value = tvm.const(2 ** 62, dtype=D_TYPE)
    factor_value = tvm.const(2 ** 2, dtype=D_TYPE)

    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmins(res_sub, min_value)
        res_max = te.lang.cce.vmaxs(res_min, tvm.const(0, dtype=D_TYPE))
    else:
        data_zero = te.lang.cce.vmuls(data_x, 0)
        min_value_tensor = te.lang.cce.vadds(data_zero, min_value)
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmin(res_sub, min_value_tensor)
        res_max = te.lang.cce.vmax(res_min, data_zero)

    res_max_mul = te.lang.cce.vmuls(res_max, max_value)
    res_max_mul_max = te.lang.cce.vmuls(res_max_mul, max_value)
    res = te.lang.cce.vmuls(res_max_mul_max, factor_value)

    return res

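# An illustrative NumPy check of the scaling trick in _less_compare_float32
# (a sketch only, not part of the operator library; assumes NumPy).
# Clamping (y - x) into [0, 2**-126] and multiplying by
# 2**62 * 2**62 * 2**2 maps any difference of at least 2**-126 to exactly
# 1.0 and any non-positive difference to 0.0.
def _less_compare_float32_reference(x, y):
    import numpy as np
    clamped = np.maximum(np.minimum(np.float32(y) - np.float32(x),
                                    np.float32(2.0 ** -126)),
                         np.float32(0.0))
    return clamped * np.float32(2.0 ** 62) * np.float32(2.0 ** 62) \
        * np.float32(2.0 ** 2)  # 1.0 if x < y else 0.0
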
def acos_grad_compute(y, dy, z, kernel_name="acos_grad"):
    """
    do acos_grad compute with sqrt and div

    Parameters:
    ----------------
    y: input tensor y
    dy: input tensor dy
    z: output dict
    kernel_name: cce kernel name, default value is "acos_grad"

    return: dy * (-1 / sqrt(1 - y^2))
    ----------------
    """
    dtype = y.dtype
    dtype_1 = dtype
    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")
        dtype = "float32"

    data1_square = te.lang.cce.vmul(y, y)
    data1_square = te.lang.cce.vmuls(data1_square,
                                     tvm.const(NUM_MINUS_ONE, dtype=dtype))
    data1_square = te.lang.cce.vadds(data1_square,
                                     tvm.const(NUM_ONE, dtype=dtype))

    data1_reciprocal = te.lang.cce.vsqrt(data1_square, 1)
    data1_reciprocal = te.lang.cce.vdiv(dy, data1_reciprocal)
    res = te.lang.cce.vmuls(data1_reciprocal,
                            tvm.const(NUM_MINUS_ONE, dtype=dtype))

    if dtype_1 == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

def atanh_compute(x, y, kernel_name="atanh"):
    """
    Algorithm: atanh(x) = 0.5 * log((1 + x) / (1 - x)) if abs(x) < 1

    Parameters
    ----------
    x: the placeholder of data input
    y : the dict of output
    kernel_name : cce kernel name

    Returns
    -------
    res : result of atanh
    """
    inp_dtype = x.dtype
    shape = x.shape

    if inp_dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    data_res = _compute(x, shape)

    if inp_dtype == "float16":
        data_res = te.lang.cce.cast_to(data_res, "float16")
    else:
        data_res = te.lang.cce.cast_to(data_res, "float32")

    return data_res

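# A minimal NumPy sketch of the identity behind atanh_compute above
# (illustrative only, not part of the operator library; assumes NumPy):
# atanh(x) = 0.5 * log((1 + x) / (1 - x)) for |x| < 1.
def _atanh_reference(x):
    import numpy as np
    return 0.5 * np.log((1.0 + x) / (1.0 - x))
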
def rsqrt_compute(x, y, kernel_name="rsqrt_cce"):
    """
    Algorithm: rsqrt(x) = 1 / sqrt(x) where x > 0

    Parameters
    ----------
    x: the placeholder of data input
    y : the dict of output
    kernel_name : cce kernel name

    Returns
    -------
    res : result of rsqrt
    """
    inp_dtype = x.dtype

    if inp_dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    data_res = _compute(x)

    if inp_dtype == "float16":
        data_res = te.lang.cce.cast_to(data_res, "float16")

    return data_res

def elu_grad_compute(grads, activations, y, kernel_name="elu_grad"):
    """
    elu_grad_compute

    f(x) = vmul(add(min(activation, 0), 1), gradient)

    Parameters:
    ----------
    grads : the placeholder of gradient data
    activations : the placeholder of activation data
    y : the dict of output
    kernel_name : cce kernel name, default value is "elu_grad"

    Returns : A Tensor. Has the same type as grads.
    -------
    """
    dtype = grads.dtype
    shape = grads.shape

    if dtype.lower() == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        grads = te.lang.cce.cast_to(grads, "float32")
        activations = te.lang.cce.cast_to(activations, "float32")

    if api_check_support("te.lang.cce.vmins", "float32"):
        min_res = te.lang.cce.vmins(activations, NUM_ZERO)
        add_res = te.lang.cce.vadds(min_res, NUM_ONE)
        res = te.lang.cce.vmul(add_res, grads)
    else:
        input_border = tvm.const(NUM_ZERO, grads.dtype)
        scalar_param_one = tvm.const(NUM_ONE, grads.dtype)
        tensor_input_border = te.lang.cce.broadcast(input_border, shape)
        tensor_scalar_param_one = te.lang.cce.broadcast(scalar_param_one,
                                                        shape)

        min_res = te.lang.cce.vmin(activations, tensor_input_border)
        add_res = te.lang.cce.vadd(min_res, tensor_scalar_param_one)
        res = te.lang.cce.vmul(add_res, grads)

    if dtype.lower() == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

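# A minimal NumPy cross-check for elu_grad_compute above (illustrative
# only, not part of the operator library; assumes NumPy). For ELU with
# alpha = 1, the forward output satisfies y = e^x - 1 for x <= 0, so the
# derivative e^x equals y + 1 there and 1 for x > 0; hence
# grad_in = grads * (min(activations, 0) + 1).
def _elu_grad_reference(grads, activations):
    import numpy as np
    return grads * (np.minimum(activations, 0.0) + 1.0)
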
def asinh_compute_mini(input_x, output_y, kernel_name="asinh"):
    """
    Algorithm: asinh(x) = log(x + sqrt(x^2 + 1))

    Parameters
    ----------
    input_x: the placeholder of data input
    output_y : the dict of output
    kernel_name : cce kernel name, default value is "asinh"

    Returns
    -------
    res : result of asinh
    """
    inp_dtype = input_x.dtype.lower()
    shape = input_x.shape
    has_improve_precision = False

    if inp_dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vrec",
                                                    "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        has_improve_precision = True

    input_x1 = te.lang.cce.vabs(input_x)
    # add a tiny offset so the reciprocal is defined when the input is 0.0
    input_x1 = te.lang.cce.vadds(input_x1, MIN_FP16)
    data_1_x = te.lang.cce.vrec(input_x1)
    data_1_x_square = te.lang.cce.vmul(data_1_x, data_1_x)
    data_1_x_square = te.lang.cce.vadds(data_1_x_square,
                                        tvm.const(CONST_ONE, "float32"))
    data_s_1_sqrt = _newton_sqrt(data_1_x_square, inp_dtype)
    data_res = te.lang.cce.vmul(data_s_1_sqrt, input_x1)
    data_res = te.lang.cce.vadd(input_x1, data_res)
    result = _log_taylor(data_res, shape)
    res_neg = te.lang.cce.vmuls(result, tvm.const(CONST_NEG_ONE, inp_dtype))

    if input_x.dtype == result.dtype and api_check_support(
            "te.lang.cce.vcmpsel", input_x.dtype):
        res = te.lang.cce.vcmpsel(input_x,
                                  tvm.const(CONST_ZERO, input_x.dtype),
                                  'le', res_neg, result)
    else:
        const_zero_tensor = te.lang.cce.broadcast(
            tvm.const(CONST_ZERO, input_x.dtype), shape)
        compare_one = te.lang.cce.vcmp(input_x, const_zero_tensor, "le")
        res = te.lang.cce.vsel(compare_one, res_neg, result)

    if has_improve_precision:
        res = te.lang.cce.cast_to(res, "float16")
    else:
        res = te.lang.cce.cast_to(res, "float32")

    return res

def _log_compute(data_x, res, shape):
    """
    When data > 2, use vlog directly.
    When data > 32768, float16 will overflow, so use
    log(x/2.5) + log(2.5) instead.

    Parameters
    ----------
    data_x: input tensor whose log we want to calculate
    res: intermediate result from the Taylor expansion
    shape: shape of the input

    Returns
    -------
    res : return of log
    """
    # if data > 2, use vlog
    if data_x.dtype == res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", data_x.dtype):
        res = te.lang.cce.vcmpsel(data_x,
                                  tvm.const(CONST_TWO, data_x.dtype),
                                  'ge', te.lang.cce.vlog(data_x), res)
    else:
        threshold_3 = te.lang.cce.broadcast(tvm.const(CONST_TWO, "float32"),
                                            shape)
        index_3 = te.lang.cce.vcmp(data_x, threshold_3, 'ge')
        res = te.lang.cce.vsel(index_3, te.lang.cce.vlog(data_x), res)

    # if data > 32768, use log(x/2.5) + log(2.5)
    overflow_value = te.lang.cce.vmuls(data_x, CONST_FIVE_TWO)
    res_overflow = te.lang.cce.vadds(te.lang.cce.vlog(overflow_value),
                                     LOG_FIVE_TWO)
    if data_x.dtype == res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", data_x.dtype):
        res = te.lang.cce.vcmpsel(data_x,
                                  tvm.const(FLOAT_16_MAX, data_x.dtype),
                                  'ge', res_overflow, res)
    else:
        float_16_max_tensor = te.lang.cce.broadcast(
            tvm.const(FLOAT_16_MAX, "float32"), shape)
        index_4 = te.lang.cce.vcmp(data_x, float_16_max_tensor, 'ge')
        res = te.lang.cce.vsel(index_4, res_overflow, res)
    res = te.lang.cce.cast_to(res, "float32")

    return res

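# An illustrative NumPy check of the rescaling used in _log_compute above
# (a sketch only, not part of the operator library; assumes NumPy, and
# assumes CONST_FIVE_TWO = 1/2.5 and LOG_FIVE_TWO = log(2.5) as the
# comments suggest): for large x, log(x) = log(x / 2.5) + log(2.5), which
# keeps the vlog argument below the overflow threshold a while longer.
def _log_overflow_identity_reference(x):
    import numpy as np
    return np.log(x / 2.5) + np.log(2.5)  # == np.log(x)
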
def relu_v2_compute(x, y, mask, kernel_name="relu_v2_cce"):
    """
    Algorithm: relu_v2(x) = (x, 1) when x > 0, else (0, 0)

    Parameters
    ----------
    x: the placeholder of data input
    y : the dict of output
    mask : the dict of output mask
    kernel_name : cce kernel name

    Returns
    -------
    res : result of relu_v2
    mask : result of relu_v2 mask
    """
    inp_dtype = x.dtype
    shape = x.shape
    compatible_dtype = x.dtype

    if inp_dtype == 'int8' and api_check_support('te.lang.cce.cast_to',
                                                 's82f16'):
        x = te.lang.cce.cast_to(x, 'float16')
        compatible_dtype = 'float16'

    if api_check_support('te.lang.cce.vrelu', compatible_dtype):
        data_res = te.lang.cce.vrelu(x)
    else:
        tensor_zero = te.lang.cce.broadcast(
            tvm.const(CONST_ZERO, compatible_dtype), shape)
        data_res = te.lang.cce.vmax(x, tensor_zero)

    data_res = te.lang.cce.cast_to(data_res, inp_dtype)
    mask = te.lang.cce.vcmp(x, CONST_ZERO, "gt", "bit")

    return data_res, mask

def _less_equal_compare_float32(data_x, data_y):
    """
    if x is less than or equal to y, then return 1, else return 0.

    Parameters:
    ----------
    data_x : TVM tensor
        tensor x
    data_y : TVM tensor
        tensor y

    Returns
    -------
    the compare result
    """
    scalar_min_fp32 = tvm.const(2 ** (-126), dtype="float32")
    scalar_mul_fp32_first = tvm.const(2 ** 50, dtype="float32")
    scalar_mul_fp32_second = tvm.const(2 ** 26, dtype="float32")
    scalar_one_fp32 = tvm.const(1.0, dtype="float32")
    scalar_one_fp32_neg = scalar_one_fp32 * tvm.const(-1.0, dtype="float32")

    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        data_max = te.lang.cce.vmax(data_x, data_y)
        data_sub = te.lang.cce.vsub(data_y, data_max)
        data_abs = te.lang.cce.vabs(data_sub)
        data_min = te.lang.cce.vmins(data_abs, scalar_min_fp32)
        data_mul = te.lang.cce.vmuls(data_min, scalar_mul_fp32_first)
        data_mul_first = te.lang.cce.vmuls(data_mul, scalar_mul_fp32_first)
        data_mul_second = te.lang.cce.vmuls(data_mul_first,
                                            scalar_mul_fp32_second)
        data_sub_first = te.lang.cce.vadds(data_mul_second,
                                           scalar_one_fp32_neg)
        data_out = te.lang.cce.vabs(data_sub_first)
    else:
        tensor_zero = te.lang.cce.vmuls(data_x, 0)
        tensor_min_fp32 = te.lang.cce.vadds(tensor_zero, scalar_min_fp32)

        data_max = te.lang.cce.vmax(data_x, data_y)
        data_sub = te.lang.cce.vsub(data_y, data_max)
        data_abs = te.lang.cce.vabs(data_sub)
        data_min = te.lang.cce.vmin(data_abs, tensor_min_fp32)
        data_mul = te.lang.cce.vmuls(data_min, scalar_mul_fp32_first)
        data_mul_first = te.lang.cce.vmuls(data_mul, scalar_mul_fp32_first)
        data_mul_second = te.lang.cce.vmuls(data_mul_first,
                                            scalar_mul_fp32_second)
        data_sub_first = te.lang.cce.vadds(data_mul_second,
                                           scalar_one_fp32_neg)
        data_out = te.lang.cce.vabs(data_sub_first)

    return data_out

def atan_compute(x, y, kernel_name="atan"):
    """
    Algorithm: atan
    ----------------------------------
    Parameters:

    x: Input data.

    y : the dict of output.

    kernel_name: cce kernel name, default value is "atan"
    ----------------------------------
    Returns:

    A Tensor of atan(x).
    """
    dtype = x.dtype
    shape = x.shape

    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    abs_data = te.lang.cce.vabs(x)

    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, x.dtype),
                                       shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # calculate the data whose absolute value is less than one
    res = _do_taylor(abs_data)

    # calculate the data whose absolute value is greater than one
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

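# An illustrative NumPy check of the range reduction used in atan_compute
# above (a sketch only, not part of the operator library; assumes NumPy):
# for x > -1, atan(x) = pi/4 + atan((x - 1) / (x + 1)), which maps
# |x| > 1 into a range where the Taylor expansion converges.
def _atan_identity_reference(x):
    import numpy as np
    return np.pi / 4.0 + np.arctan((x - 1.0) / (x + 1.0))  # == np.arctan(x)
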
def _between_nudged_min_max_compute(x, nudged_min, nudged_max):
    """
    Compare x with nudged_min and nudged_max. If the element in x is
    greater than nudged_min and less than nudged_max, return 1,
    else return 0.

    The max num of float32 is 2**126, but cce can only support 2**62,
    so split 126 as 62/62/2:
    (2**(-126)) * (2**62) * (2**62) * (2**2) = 1
    """
    shape_inputs = te.lang.cce.util.shape_to_list(x.shape)
    min_value = tvm.const(2 ** (-126), dtype=D_TYPE)
    max_value = tvm.const(2 ** 62, dtype=D_TYPE)
    factor_value = tvm.const(2 ** 2, dtype=D_TYPE)

    if api_check_support("te.lang.cce.vmaxs", x.dtype):
        sub_tensor_min = te.lang.cce.vsub(x, nudged_min)
        sub_min = te.lang.cce.vadds(sub_tensor_min, min_value)
        more_nudged_min_tensor = te.lang.cce.vmaxs(
            sub_min, tvm.const(0, dtype=D_TYPE))
        sub_tensor_max = te.lang.cce.vsub(nudged_max, x)
        sub_max = te.lang.cce.vadds(sub_tensor_max, min_value)
        less_nudged_max_tensor = te.lang.cce.vmaxs(
            sub_max, tvm.const(0, dtype=D_TYPE))
        between_nudged_tensor = te.lang.cce.vmul(more_nudged_min_tensor,
                                                 less_nudged_max_tensor)
        between_nudged_element = te.lang.cce.vmins(between_nudged_tensor,
                                                   min_value)
    else:
        data_zero = te.lang.cce.vmuls(x, 0)
        min_value_tensor = te.lang.cce.vadds(data_zero, min_value)

        sub_tensor_min = te.lang.cce.vsub(x, nudged_min)
        sub_min = te.lang.cce.vadds(sub_tensor_min, min_value)
        more_nudged_min_tensor = te.lang.cce.vmax(sub_min, data_zero)
        sub_tensor_max = te.lang.cce.vsub(nudged_max, x)
        sub_max = te.lang.cce.vadds(sub_tensor_max, min_value)
        less_nudged_max_tensor = te.lang.cce.vmax(sub_max, data_zero)
        between_nudged_tensor = te.lang.cce.vmul(more_nudged_min_tensor,
                                                 less_nudged_max_tensor)
        between_nudged_element = te.lang.cce.vmin(between_nudged_tensor,
                                                  min_value_tensor)

    vmul_max_value = te.lang.cce.vmuls(between_nudged_element, max_value)
    vmul_factor_value = te.lang.cce.vmuls(vmul_max_value, max_value)
    between_nudged = te.lang.cce.vmuls(vmul_factor_value, factor_value)

    return between_nudged

def approximate_equal_compute(input_x, input_y, output_z, tolerance,
                              kernel_name="approximate_equal"):
    """
    algorithm: approximate_equal

    calculating abs(x - y) <= tolerance

    Parameters
    ----------
    input_x : the placeholder of input data x
    input_y : the placeholder of input data y
    output_z: shape and dtype of output
    tolerance: default 1e-5
    kernel_name: cce kernel name, default value is "approximate_equal"

    Returns
    -------
    res: the result of approximate_equal_compute
    """
    input_dtype = input_x.dtype

    if input_dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        input_y = te.lang.cce.cast_to(input_y, "float32")

    res_vsub = te.lang.cce.vsub(input_x, input_y)
    res_vabs = te.lang.cce.vabs(res_vsub)

    res_vabs = te.lang.cce.cast_to(res_vabs, input_x.dtype)
    tol_tensor = te.lang.cce.broadcast(tvm.const(tolerance, input_x.dtype),
                                       input_x.shape)
    res_cmp = te.lang.cce.vcmp(res_vabs, tol_tensor, 'le')
    zero_rb_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, "float16"),
                                           input_x.shape)
    one_rb_tensor = te.lang.cce.broadcast(tvm.const(NUM_ONE, "float16"),
                                          input_x.shape)
    res = te.lang.cce.vsel(res_cmp, one_rb_tensor, zero_rb_tensor)
    res = te.lang.cce.cast_to(res, "int8")

    return res

def _atan_compute(input_x):
    """
    Algorithm: atan
    ----------------------------------
    Parameters:

    input_x: Input data.
    ----------------------------------
    Returns:

    A Tensor of atan(x).
    """
    shape = input_x.shape
    dtype = input_x.dtype

    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")

    abs_data = te.lang.cce.vabs(input_x)

    tensor_one = te.lang.cce.broadcast(
        tvm.const(CONST_POS_ONE, input_x.dtype), shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # calculate the data whose absolute value is less than one
    res = _do_taylor(abs_data)

    # calculate the data whose absolute value is greater than one
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(input_x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

def _less_compare_float32(data_x, data_y):
    """
    if x is less than y, then return 1, else return 0.

    Parameters:
    ----------
    data_x : TVM tensor
        tensor x
    data_y : TVM tensor
        tensor y

    Returns
    -------
    the compare result
    """
    shape_inputs = te.lang.cce.util.shape_to_list(data_x.shape)
    # minimum num of float32 is 2**(-126)
    min_value = tvm.const(2 ** (-126), dtype="float32")

    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmins(res_sub, min_value)
        res_max = te.lang.cce.vmaxs(res_min, tvm.const(0, dtype="float32"))
    else:
        data_zero = te.lang.cce.vmuls(data_x, 0)
        data_min = te.lang.cce.vadds(data_zero, min_value)
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmin(res_sub, data_min)
        res_max = te.lang.cce.vmax(res_min, data_zero)

    # the max num of float32 is 2**126,
    # but cce can only support 2**62, so split 126 as 50/50/26
    res_mul_first = te.lang.cce.vmuls(res_max,
                                      tvm.const(2 ** 50, dtype="float32"))
    res_mul_second = te.lang.cce.vmuls(res_mul_first,
                                       tvm.const(2 ** 50, dtype="float32"))
    res = te.lang.cce.vmuls(res_mul_second,
                            tvm.const(2 ** 26, dtype="float32"))

    return res

def add_n_compute(datas, output, tensor_num, kernel_name="add_n"):
    """
    calculate the element-wise sum of the inputs, z = a + b + c ...

    Parameters
    ----------
    datas : list of placeholders
        all input data
    output : dict
        dict of output
    tensor_num: number of inputs
    kernel_name : string
        cce kernel name, default value is add_n

    Returns
    -------
    res : output of the data's add_n
    """
    data_type = datas[0].dtype
    has_covert_float32 = (data_type == "float16" and
                          api_check_support("te.lang.cce.vadd", "float32"))

    first_data = datas[0] if not has_covert_float32 else \
        te.lang.cce.cast_to(datas[0], "float32")
    res = first_data

    for i, data_n in enumerate(datas):
        if i == 0:
            continue
        temp_data = data_n if not has_covert_float32 else \
            te.lang.cce.cast_to(data_n, "float32")
        res = te.lang.cce.vadd(res, temp_data)

    if has_covert_float32:
        res = te.lang.cce.cast_to(res, "float16")

    return res

def asin_grad_compute(y, dy, z, kernel_name="asin_grad"):
    """
    do element-wise asin_grad compute

    Parameters:
    ----------
    y : the placeholder of input y
    dy : the placeholder of input dy
    z : output dict
    kernel_name : cce kernel name, default value is "asin_grad"

    return : dy * (1 / sqrt(1 - y^2))
    -------
    """
    dtype = y.dtype
    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - y^2
    data = te.lang.cce.vmul(y, y)
    data = te.lang.cce.vmuls(data, tvm.const(NUM_MINUS_ONE, y.dtype))
    num_to_vrsqrt = te.lang.cce.vadds(data, tvm.const(NUM_ONE, y.dtype))

    # step 2: calculate dy * (1 / sqrt(1 - y^2))
    vsqrt_res = te.lang.cce.vsqrt(num_to_vrsqrt, 1)
    res = te.lang.cce.vdiv(dy, vsqrt_res)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

def atan_grad_compute(y, dy, z, kernel_name="atan_grad"):
    """
    Calculation for backward gradient

    Parameters:
    ----------
    y: the placeholder of input data
    dy: the placeholder of input dy
    z : dict of output
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    res = dy / (1 + y^2)

    Returns
    ----------
    result res
    """
    scalar_one = tvm.const(CONST_ONE, "float32")
    dtype = y.dtype

    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")

    data_square = te.lang.cce.vmul(y, y)
    sum_tmp = te.lang.cce.vadds(data_square, scalar_one)
    res = te.lang.cce.vdiv(dy, sum_tmp)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res

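# A minimal cross-check for atan_grad_compute above (illustrative only,
# not part of the operator library): d(atan(y))/dy = 1 / (1 + y^2), so the
# backward gradient is dy / (1 + y^2). Works on scalars or NumPy arrays.
def _atan_grad_reference(y, dy):
    return dy / (1.0 + y * y)
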
def _log_taylor(data_x, shape):
    """
    use Taylor expansion to calculate log

    Parameters
    ----------
    data_x: input tensor whose log we want to calculate
    shape: shape of the input tensor

    Returns
    -------
    res : return of log
    """
    data = te.lang.cce.vadds(data_x, tvm.const(CONST_NEG_ONE, "float32"))
    data_1 = te.lang.cce.vadds(
        data, tvm.const(CONST_NEG_ONE * CONST_LOG_THRESHOLD_1, "float32"))
    if api_check_support("te.lang.cce.vcmpsel", "float32"):
        data_sel = te.lang.cce.vcmpsel(
            data, tvm.const(CONST_LOG_THRESHOLD_1, data.dtype), 'ge',
            te.lang.cce.vmuls(data_1, tvm.const(CONST_DOT_SIX, "float32")),
            data)
        data_sel = te.lang.cce.cast_to(data_sel, "float32")

        data_2 = te.lang.cce.vadds(
            data_sel,
            tvm.const(CONST_NEG_ONE * CONST_LOG_THRESHOLD_2, "float32"))
        data_vmuls = te.lang.cce.vmuls(
            data_2, tvm.const(CONST_THREE_FOUR, "float32"))
        data_sel_1 = te.lang.cce.vcmpsel(
            data_sel, tvm.const(CONST_LOG_THRESHOLD_2, data_sel.dtype),
            'ge', data_vmuls, data_sel)
        data_sel_1 = te.lang.cce.cast_to(data_sel_1, "float32")

        taylor = _taylor_compute(data_sel_1)

        # add log(4/3)
        res = te.lang.cce.vcmpsel(
            data_sel, tvm.const(CONST_LOG_THRESHOLD_2, data_sel.dtype),
            'ge',
            te.lang.cce.vadds(taylor, tvm.const(LOG_FOUR_THREE, "float32")),
            taylor)
        res = te.lang.cce.cast_to(res, "float32")

        # add log(5/3)
        data = te.lang.cce.cast_to(data, "float32")
        res = te.lang.cce.vcmpsel(
            data, tvm.const(CONST_LOG_THRESHOLD_1, data.dtype), 'ge',
            te.lang.cce.vadds(taylor, tvm.const(LOG_FIVE_THREE, "float32")),
            res)
    else:
        threshold_1 = te.lang.cce.broadcast(
            tvm.const(CONST_LOG_THRESHOLD_1, "float32"), shape)
        index_1 = te.lang.cce.vcmp(data, threshold_1, 'ge')
        data_sel = te.lang.cce.vsel(
            index_1,
            te.lang.cce.vmuls(data_1, tvm.const(CONST_DOT_SIX, "float32")),
            data)
        data_sel = te.lang.cce.cast_to(data_sel, "float32")

        threshold_2 = te.lang.cce.broadcast(
            tvm.const(CONST_LOG_THRESHOLD_2, "float32"), shape)
        index_2 = te.lang.cce.vcmp(data_sel, threshold_2, 'ge')
        data_2 = te.lang.cce.vadds(
            data_sel,
            tvm.const(CONST_NEG_ONE * CONST_LOG_THRESHOLD_2, "float32"))
        data_vmuls = te.lang.cce.vmuls(
            data_2, tvm.const(CONST_THREE_FOUR, "float32"))
        data_sel = te.lang.cce.vsel(index_2, data_vmuls, data_sel)
        data_sel = te.lang.cce.cast_to(data_sel, "float32")

        taylor = _taylor_compute(data_sel)

        # add log(4/3)
        res = te.lang.cce.vsel(
            index_2,
            te.lang.cce.vadds(taylor, tvm.const(LOG_FOUR_THREE, "float32")),
            taylor)
        res = te.lang.cce.cast_to(res, "float32")

        # add log(5/3)
        res = te.lang.cce.vsel(
            index_1,
            te.lang.cce.vadds(taylor, tvm.const(LOG_FIVE_THREE, "float32")),
            res)
        res = te.lang.cce.cast_to(res, "float32")

    # handle data > 2 and the float16 overflow range with vlog
    res = _log_compute(data_x, res, shape)

    return res

def apply_centered_rms_prop_d_compute(var, mg, ms, mom, lr, rho, momentum,
                                      epsilon, grad, var_out, mg_out,
                                      ms_out, mom_out,
                                      kernel_name="apply_centered_rms_prop_d"):
    """
    Update '*var' according to the centered RMSProp algorithm.

    mean_square = decay * mean_square + (1-decay) * gradient ** 2
    mean_grad = decay * mean_grad + (1-decay) * gradient
    delta = learning_rate * gradient /
            sqrt(mean_square + epsilon - mean_grad ** 2)

    mg_{t} <- rho * mg_{t-1} + (1-rho) * grad
    ms_{t} <- rho * ms_{t-1} + (1-rho) * grad * grad
    mom_{t} <- momentum * mom_{t-1}
               + lr * grad / sqrt(ms_{t} - mg_{t} * mg_{t} + epsilon)
    var_{t} <- var_{t-1} - mom_{t}

    Parameters:
    ----------
    var: dict of tensor var, include shape and dtype,
        dtype support float16 and float32.
    mg: dict of tensor mg(mean_grad), include shape and dtype,
        dtype support float16 and float32.
    ms: dict of tensor ms(mean_square), include shape and dtype,
        dtype support float16 and float32.
    mom: dict of tensor mom, include shape and dtype,
        dtype support float16 and float32.
    lr: dict of scalar lr(learning rate). Must have the same dtype as var.
    rho: dict of scalar rho(decay rate). Must have the same dtype as var.
    momentum: dict of scalar momentum. Must have the same dtype as var.
    epsilon: dict of scalar epsilon. Must have the same dtype as var.
    grad: dict of tensor grad. Must have the same dtype as var.
    var_out, mg_out, ms_out, mom_out: dicts of the outputs.
    kernel_name : cce kernel name, default value is
        "apply_centered_rms_prop_d".

    Returns
    -------
    the multi-output compute of the updated mg, ms, mom and var.
    """
    inp_dtype = var.dtype
    if inp_dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        var = te.lang.cce.cast_to(var, "float32")
        mg = te.lang.cce.cast_to(mg, "float32")
        ms = te.lang.cce.cast_to(ms, "float32")
        mom = te.lang.cce.cast_to(mom, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        rho = te.lang.cce.cast_to(rho, "float32")
        momentum = te.lang.cce.cast_to(momentum, "float32")
        epsilon = te.lang.cce.cast_to(epsilon, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")

    # tensor_one_rho = rho * (-1) + 1, i.e. (1 - rho)
    tensor_one_rho = tvm.compute(
        rho.shape,
        lambda *indices: rho(*indices) * tvm.const(NUM_ONE_NA, rho.dtype),
        tag='elewise_single_VS_mul')
    tensor_one_rho = tvm.compute(
        tensor_one_rho.shape,
        lambda *indices: tensor_one_rho(*indices) +
        tvm.const(NUM_ONE, tensor_one_rho.dtype),
        tag='elewise_single_VS_add')

    # out_mg = rho * mg + (1 - rho) * grad
    mg_rho = tvm.compute(mg.shape,
                         lambda *indices: mg(*indices) * rho[0],
                         tag='elewise_single_VS_mul')
    rhs = tvm.compute(grad.shape,
                      lambda *indices: grad(*indices) * tensor_one_rho[0],
                      tag='elewise_single_VS_mul')
    out_mg = te.lang.cce.vadd(mg_rho, rhs)

    # out_ms = rho * ms + (1 - rho) * grad * grad
    ms_rho = tvm.compute(ms.shape,
                         lambda *indices: ms(*indices) * rho[0],
                         tag='elewise_single_VS_mul')
    rhs = te.lang.cce.vmul(grad, grad)
    rhs = tvm.compute(rhs.shape,
                      lambda *indices: rhs(*indices) * tensor_one_rho[0],
                      tag='elewise_single_VS_mul')
    out_ms = te.lang.cce.vadd(ms_rho, rhs)

    # out_mom = momentum * mom
    #           + lr * grad / sqrt(out_ms - out_mg^2 + epsilon)
    lhs_mom = tvm.compute(mom.shape,
                          lambda *indices: mom(*indices) * momentum[0],
                          tag='elewise_single_VS_mul')
    lr_grad = tvm.compute(grad.shape,
                          lambda *indices: grad(*indices) * lr[0],
                          tag='elewise_single_VS_mul')
    rhs = te.lang.cce.vmul(out_mg, out_mg)
    rhs = te.lang.cce.vsub(out_ms, rhs)
    rhs_eps = tvm.compute(rhs.shape,
                          lambda *indices: rhs(*indices) + epsilon[0],
                          tag='elewise_single_VS_add')
    rhs_eps = te.lang.cce.vsqrt(rhs_eps)
    rhs_eps = te.lang.cce.vdiv(lr_grad, rhs_eps)
    out_mom = te.lang.cce.vadd(lhs_mom, rhs_eps)

    # out_var = var - out_mom
    out_var = te.lang.cce.vsub(var, out_mom)

    if inp_dtype == "float16":
        out_var = te.lang.cce.cast_to(out_var, "float16")
        out_mg = te.lang.cce.cast_to(out_mg, "float16")
        out_ms = te.lang.cce.cast_to(out_ms, "float16")
        out_mom = te.lang.cce.cast_to(out_mom, "float16")

    mg_output_data = te.lang.cce.vadds(out_mg, NUM_ZERO)
    ms_output_data = te.lang.cce.vadds(out_ms, NUM_ZERO)
    mom_output_data = te.lang.cce.vadds(out_mom, NUM_ZERO)

    # this compute is for multi output
    def _compute(*index):
        return out_mg(*index), out_ms(*index), out_mom(*index), \
            out_var(*index), out_var(*index), mg_output_data(*index), \
            ms_output_data(*index), mom_output_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")

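# A minimal NumPy sketch of the centered RMSProp update implemented above
# (illustrative only, not part of the operator library; assumes NumPy):
def _centered_rms_prop_reference(var, mg, ms, mom, lr, rho, momentum,
                                 epsilon, grad):
    import numpy as np
    mg_t = rho * mg + (1.0 - rho) * grad
    ms_t = rho * ms + (1.0 - rho) * grad * grad
    mom_t = momentum * mom + lr * grad / np.sqrt(ms_t - mg_t * mg_t
                                                 + epsilon)
    var_t = var - mom_t
    return var_t, mg_t, ms_t, mom_t
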
def fake_quant_with_min_max_args_compute(x, y, min=-6, max=6, num_bits=8,
                                         narrow_range=False,
                                         kernel_name="fake_quant_with_min_"
                                                     "max_args"):
    """
    Fake-quantize the 'x' tensor of type float32 to a 'y' tensor of the
    same type.

    calculating data's:
    y = floor(clamped_shifted * inv_nudged_scale + 0.5) * scale + nudged_min
    scale = (max - min) / (quant_max - quant_min)

    Parameters
    ----------
    x: TVM tensor
        the placeholder of input data, type is float32
    y: dict
        the dict of output data
    min: scalar
        float or int. Defaults to -6
    max: scalar
        float or int. Defaults to 6
        [min; max] define the clamping range for the x data
    num_bits: int
        Defaults to 8. num_bits is the bitwidth of the quantization,
        between 2 and 16
    narrow_range: bool
        True or False. If None, narrow_range=False
        if True, x values are quantized into the quantization range
        [1; 2^num_bits - 1]
        if False, x values are quantized into the quantization range
        [0; 2^num_bits - 1]
    kernel_name: str
        cce kernel name, default value is "fake_quant_with_min_max_args"

    Returns
    -------
    res: TVM tensor
        the result of fake_quant_with_min_max_args_compute
    """
    shape_x = te.lang.cce.util.shape_to_list(x.shape)
    output_dtype = x.dtype
    nudged_min, nudged_max, scale = _nudge_min_max(min, max, num_bits,
                                                   narrow_range)

    if api_check_support("te.lang.cce.vmaxs", x.dtype):
        nudged_min_neg = nudged_min * (-1.0)
        inv_nudged_scale = 1.00 / scale

        # Transform the input between nudged_max and nudged_min
        clamped_vmin = te.lang.cce.vmins(x, nudged_max)
        clamped = te.lang.cce.vmaxs(clamped_vmin, nudged_min)

        # Calculate the quantized and dequantized results
        clamped_shifted = te.lang.cce.vadds(clamped, nudged_min_neg)
        vmul_shifted = te.lang.cce.vmuls(clamped_shifted, inv_nudged_scale)
        vadds_shifted = te.lang.cce.vadds(vmul_shifted,
                                          tvm.const(0.5, dtype="float32"))
        floor_vadds_shifted = te.lang.cce.floor(vadds_shifted)
        floor_cast = te.lang.cce.cast_to(floor_vadds_shifted, output_dtype)
        res_scale = te.lang.cce.vmuls(floor_cast, scale)
        res = te.lang.cce.vadds(res_scale, nudged_min)
    else:
        zero_tensor = te.lang.cce.vmuls(x, 0)
        nudged_max_tensor = te.lang.cce.vadds(zero_tensor, nudged_max)
        nudged_min_tensor = te.lang.cce.vadds(zero_tensor, nudged_min)
        inv_nudged_scale = 1.00 / scale
        inv_nudged_scale_const = tvm.const(inv_nudged_scale,
                                           dtype=output_dtype)

        # Transform the input between nudged_max and nudged_min
        clamped_vmin = te.lang.cce.vmin(x, nudged_max_tensor)
        clamped = te.lang.cce.vmax(clamped_vmin, nudged_min_tensor)

        # Calculate the quantized and dequantized results
        clamped_shifted = te.lang.cce.vsub(clamped, nudged_min_tensor)
        vmul_shifted = te.lang.cce.vmuls(clamped_shifted,
                                         inv_nudged_scale_const)
        vadds_shifted = te.lang.cce.vadds(vmul_shifted,
                                          tvm.const(0.5, dtype="float32"))
        floor_vadds_shifted = te.lang.cce.floor(vadds_shifted)
        floor_cast = te.lang.cce.cast_to(floor_vadds_shifted, output_dtype)
        res_scale = te.lang.cce.vmuls(floor_cast, scale)
        res = te.lang.cce.vadd(res_scale, nudged_min_tensor)

    return res

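# A minimal NumPy sketch of the quantize/dequantize step above
# (illustrative only, not part of the operator library; assumes NumPy and
# that nudged_min, nudged_max and scale come from _nudge_min_max):
def _fake_quant_reference(x, nudged_min, nudged_max, scale):
    import numpy as np
    clamped = np.clip(x, nudged_min, nudged_max)
    quantized = np.floor((clamped - nudged_min) / scale + 0.5)
    return quantized * scale + nudged_min
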
def bessel_i0e_compute(x, y, kernel_name="bessel_i0e"):
    """
    Algorithm:
    I0 = 1 + ((z/2) / 1!)^2 + ((z/2)^2 / 2!)^2 + ... + ((z/2)^n / n!)^2
    I0e = I0 / exp(x)

    t = x / 3.75
    I0(x) = e^-|x| * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6
            + 0.2659732t^8 + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    I0(x) = (1 / sqrt(|x|)) * (0.39894228 + 0.01328592t^-1 + 0.00225319t^-2
            - 0.00157565t^-3 + 0.00916281t^-4 - 0.02057706t^-5
            + 0.02635537t^-6 - 0.01647633t^-7 + 0.00392377t^-8), |x| >= 3.75

    Parameters
    ----------
    x: the placeholder of data input
    y : the dict of output
    kernel_name : cce kernel name, default value is "bessel_i0e"

    Returns
    -------
    A tensor. Has the same type as x.
    """
    shape_input = x.shape
    dtype_input = x.dtype

    # choose the compute dtype at the beginning
    if dtype_input == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")
    abs_data = te.lang.cce.vabs(x)

    # compute bessel_i0e for data in (-3.75, 3.75)
    broad_const_limit = te.lang.cce.broadcast(
        tvm.const(CONST_LIMIT, x.dtype), shape_input)
    before_abs_data = te.lang.cce.vmin(abs_data, broad_const_limit)
    data = te.lang.cce.vdiv(before_abs_data, broad_const_limit)
    square_data = te.lang.cce.vmul(data, data)

    before_res = te.lang.cce.vmuls(square_data,
                                   tvm.const(ITR_BEFORE[LEN_BEFORE - 1]))
    before_res = te.lang.cce.vadds(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for index in reversed(range(LEN_BEFORE - 2)):
        before_res = te.lang.cce.vmul(before_res, square_data)
        before_res = te.lang.cce.vadds(before_res, ITR_BEFORE[index])
    exp_data = te.lang.cce.vexp(before_abs_data)
    before_res = te.lang.cce.vdiv(before_res, exp_data)

    # compute bessel_i0e for data in the other domain
    data = te.lang.cce.vdiv(broad_const_limit, abs_data)

    after_res = te.lang.cce.vmuls(data, tvm.const(ITR_AFTER[LEN_AFTER - 1]))
    after_res = te.lang.cce.vadds(after_res, ITR_AFTER[LEN_AFTER - 2])
    for index in reversed(range(LEN_AFTER - 2)):
        after_res = te.lang.cce.vmul(after_res, data)
        after_res = te.lang.cce.vadds(after_res, ITR_AFTER[index])
    sqrt_data = te.lang.cce.vsqrt(abs_data, 1)
    after_res = te.lang.cce.vdiv(after_res, sqrt_data)
    after_res = te.lang.cce.vmin(before_res, after_res)

    # choose the output dtype at the end
    if dtype_input == "float16":
        after_res = te.lang.cce.cast_to(after_res, "float16")

    return after_res

def __init__(self, x, cont, w_xh_x_static, h_0, w_xh, bias_h, w_hh, w_ho,
             bias_o, o_t, h_t, expose_hidden=False, num_output=0,
             kernel_name="basicrnn_cell", impl_mode="high_performance"):
    """
    Init BasicRNNCell base parameters

    Parameters
    ----------
    x: dict
        data of input
    cont: dict
        data of cont
    w_xh_x_static: dict
        data of w_xh_x_static
    h_0: dict
        data of h_0
    w_xh: dict
        data of w_xh
    bias_h: dict
        data of bias_h
    w_hh: dict
        data of w_hh
    w_ho: dict
        data of w_ho
    bias_o: dict
        data of bias_o
    o_t: dict
        data of o_t
    h_t: dict
        data of h_t
    expose_hidden: bool
        whether to expose the hidden state
    num_output: int
        number of outputs
    kernel_name: str
        the name of the operator
    impl_mode: str
        impl mode

    Returns
    -------
    None
    """
    self.kernel_name = kernel_name
    self.impl_mode = impl_mode
    self.tensor_list1 = {}
    self.tensor_list2 = {}
    self.emit_cmd = {}
    self.scope_list = {}
    self.tanh_ht_tensor = None
    self.tanh_ot_tensor = None
    self.expose_hidden = expose_hidden
    self.num_output = num_output

    self.has_static = True
    if w_xh_x_static is None:
        self.has_static = False

    dtypes = {
        "x": x.get("dtype").lower(),
        "w_xh": w_xh.get("dtype").lower(),
        "w_ho": w_ho.get("dtype").lower(),
        "bias_h": bias_h.get("dtype").lower(),
        "bias_o": bias_o.get("dtype").lower(),
        "o_t": o_t.get("dtype").lower(),
        "h_t": h_t.get("dtype").lower()
    }

    shapes = {
        "x": x.get("shape"),
        "w_xh": w_xh.get("shape"),
        "w_ho": w_ho.get("shape"),
        "bias_h": (math.ceil(float(bias_h.get("shape")[0]) / 16), 16),
        "bias_o": (math.ceil(float(bias_o.get("shape")[0]) / 16), 16),
        "o_t": o_t.get("shape"),
        "h_t": h_t.get("shape")
    }

    datas = {
        "x": tvm.placeholder(shapes["x"], name="x", dtype=dtypes["x"]),
        "w_xh": tvm.placeholder(shapes["w_xh"], name="w_xh",
                                dtype=dtypes["w_xh"]),
        "w_ho": tvm.placeholder(shapes["w_ho"], name="w_ho",
                                dtype=dtypes["w_ho"]),
        "bias_h": tvm.placeholder(shapes["bias_h"], name="bias_h",
                                  dtype=dtypes["bias_h"]),
        "bias_o": tvm.placeholder(shapes["bias_o"], name="bias_o",
                                  dtype=dtypes["bias_o"])
    }

    dims = {
        "batch_dim": shapes["x"][1],
        "input_dim": shapes["x"][0],
        "hidden_dim": shapes["w_xh"][1]
    }

    if self.has_static:
        dtypes["w_xh_x_static"] = w_xh_x_static.get("dtype").lower()
        shapes["w_xh_x_static"] = w_xh_x_static.get("shape")
        datas["w_xh_x_static"] = tvm.placeholder(
            shapes["w_xh_x_static"], name="w_xh_x_static",
            dtype=dtypes["w_xh_x_static"])

    if self.expose_hidden:
        dtypes["h_0"] = h_0.get("dtype").lower()
        dtypes["cont"] = cont.get("dtype").lower()
        dtypes["w_hh"] = w_hh.get("dtype").lower()
        shapes["cont"] = (math.ceil(float(cont.get("shape")[0]) / 16), 16)
        shapes["h_0"] = h_0.get("shape")
        shapes["w_hh"] = w_hh.get("shape")
        datas["cont"] = tvm.placeholder(shapes["cont"], name="cont",
                                        dtype=dtypes["cont"])
        datas["h_0"] = tvm.placeholder(shapes["h_0"], name="h_0",
                                       dtype=dtypes["h_0"])
        datas["w_hh"] = tvm.placeholder(shapes["w_hh"], name="w_hh",
                                        dtype=dtypes["w_hh"])

    self.check_input_parameters(dtypes, shapes, dims)

    self.shapes = shapes
    self.dtypes = dtypes
    self.datas = datas
    self.dims = dims

    self.device = "mini"
    if not cce_conf.api_check_support("te.lang.cce.vadd", "float32"):
        self.device = "hisi_es"

def apply_momentum_compute_d(var, accum, lr, grad, momentum, var_out,
                             accum_out, use_nesterov,
                             kernel_name='apply_momentum_d'):
    """
    Update '*var' according to the ApplyMomentum algorithm.

    accum = accum * momentum + grad
    if use_nesterov is True:
        var -= grad * lr + accum * momentum * lr
    else:
        var -= accum * lr

    Parameters:
    ----------
    var : mutable tensor var.
    accum: mutable tensor accum.
    lr : scalar lr.
    grad : tensor grad.
    momentum : scalar momentum.
    var_out : the dict of output var.
    accum_out : the dict of output accum.
    use_nesterov: bool. If true, use the nesterov form of the update,
        default value is False.
    kernel_name : cce kernel name, default value is "apply_momentum_d".

    Returns:
    -------
    the multi-output compute of the updated accum and var.
    """
    # cast to float32 for higher accuracy
    dtype = var.dtype
    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        var = te.lang.cce.cast_to(var, "float32")
        accum = te.lang.cce.cast_to(accum, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        momentum = te.lang.cce.cast_to(momentum, "float32")

    # update accum
    accum_delta = tvm.compute(accum.shape,
                              lambda *indice: accum(*indice) * momentum[0],
                              tag='elewise_single_VS_mul')
    accum_t = te.lang.cce.vadd(accum_delta, grad)

    # update var
    if use_nesterov:
        var_delta = tvm.compute(grad.shape,
                                lambda *indice: grad(*indice) * lr[0],
                                tag='elewise_single_VS_mul')
        var_delta_2 = tvm.compute(
            accum_t.shape,
            lambda *indice: accum_t(*indice) * momentum[0],
            tag='elewise_single_VS_mul')
        var_delta_2 = tvm.compute(
            var_delta_2.shape,
            lambda *indice: var_delta_2(*indice) * lr[0],
            tag='elewise_single_VS_mul')
        var_delta = te.lang.cce.vadd(var_delta, var_delta_2)
        var_t = te.lang.cce.vsub(var, var_delta)
    else:
        var_delta = tvm.compute(accum_t.shape,
                                lambda *indice: accum_t(*indice) * lr[0],
                                tag='elewise_single_VS_mul')
        var_t = te.lang.cce.vsub(var, var_delta)

    if dtype == "float16":
        var_t = te.lang.cce.cast_to(var_t, "float16")
        accum_t = te.lang.cce.cast_to(accum_t, "float16")

    var_out_data = te.lang.cce.vadds(var_t, tvm.const(NUM_ZERO, var_t.dtype))
    accum_out_data = te.lang.cce.vadds(accum_t,
                                       tvm.const(NUM_ZERO, accum_t.dtype))

    # this compute is for multi output
    def _compute(*index):
        return accum_t(*index), var_t(*index), var_out_data(*index), \
            accum_out_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")

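# A minimal sketch of the ApplyMomentum update implemented above
# (illustrative only, not part of the operator library). Works on scalars
# or NumPy arrays.
def _apply_momentum_reference(var, accum, lr, grad, momentum,
                              use_nesterov=False):
    accum_t = accum * momentum + grad
    if use_nesterov:
        var_t = var - (grad * lr + accum_t * momentum * lr)
    else:
        var_t = var - accum_t * lr
    return var_t, accum_t
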