def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, layout='NHWC'):
    if layout == "NCHW":
        # Transpose NCHW inputs to NHWC so the channel axis is last.
        data_2, data_4, data_5, data_6, data_7 = [
            topi.transpose(t, axes=(0, 2, 3, 1)) for t in (data_2, data_4, data_5, data_6, data_7)]
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {}'.format(layout))

    # ReLU gradient mask: pass (data_5 + data_6) where data_7 > 0, zero elsewhere,
    # then reduce over N, H, W.
    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, c = data_7.shape

    # First branch: subtract the broadcast per-channel mean (data_1 / (n*h*w)) from data_2,
    # weight by the masked gradient, and reduce over N, H, W.
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0 / (n * h * w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    # Second branch: same reduction with data_3 / data_4 in place of data_1 / data_2.
    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]
def bitwise_and(x1, x2):
    """
    Computes the bitwise and of `x1` and `x2`.

    Args:
        x1 (tvm.tensor.Tensor): tensor x1, only supports int16, uint16.
        x2 (tvm.tensor.Tensor): tensor x2, only supports int16, uint16.

    Returns:
        A tvm.tensor.Tensor as the result of bitwise and.
    """
    _check_parameters(x1, x2)
    shape_x = get_shape(x1)
    shape_y = get_shape(x2)
    _, _, shape_max = produce_shapes(shape_x, shape_y)

    # Broadcast both inputs to the common shape, then AND them elementwise.
    data_x = topi.broadcast_to(x1, shape_max)
    data_y = topi.broadcast_to(x2, shape_max)
    res = tvm.compute(data_x.shape, lambda *i: data_x(*i) & data_y(*i), name="and_res")
    return res
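# Usage sketch (added for illustration, not part of the original module): drives
# bitwise_and from two int16 placeholders. The helper name and shapes are assumptions,
# and `tvm` is assumed to be the same module object used by bitwise_and above.
def _bitwise_and_example():
    x1 = tvm.placeholder((4, 16), name="x1", dtype="int16")
    x2 = tvm.placeholder((1, 16), name="x2", dtype="int16")
    # x2 is broadcast to (4, 16) inside bitwise_and before the elementwise AND.
    return bitwise_and(x1, x2)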
def fake_quant_with_min_max_vars_per_channel_gradient(input_gradients, input_data, input_min, input_max,
                                                      num_bits=8, narrow_range=False):
    """
    Computes gradients of the fake-quantize operation on the `input_data` tensor:
    output_backprops = input_gradients * (1 if nudged_min <= input_data <= nudged_max else 0)

    Args:
        input_gradients (tvm.tensor.Tensor): input gradients from the previous operation.
        input_data (tvm.tensor.Tensor): input of fake-quantize, only supports "float32".
        input_min (tvm.tensor.Tensor): only supports "float32". input_min must have the same
            shape as input_max, and both must match the last dimension of input_data.
        input_max (tvm.tensor.Tensor): only supports "float32".
        num_bits (int): bit width of the quantization, between 2 and 16. Defaults to 8.
        narrow_range (bool): if True, quantize into the range [1, 2^num_bits - 1];
            if False, quantize into the range [0, 2^num_bits - 1].

    Returns:
        tvm.tensor.Tensor
    """
    input_gradients_shape = get_shape(input_gradients)
    input_data_shape = get_shape(input_data)
    input_min_shape = get_shape(input_min)
    input_max_shape = get_shape(input_max)

    utils.check_shape(input_gradients_shape)
    utils.check_shape(input_data_shape)
    utils.check_shape(input_min_shape)
    utils.check_shape(input_max_shape)
    utils.elemwise_shape_check(input_gradients.shape, input_data.shape)
    utils.elemwise_shape_check(input_min_shape, input_max_shape)
    if input_min_shape[0] != input_data_shape[-1]:
        raise RuntimeError(
            "The shapes of input_min and input_max must match the last dimension of input_data")

    utils.ops_dtype_check(input_gradients.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_data.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_min.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_max.dtype, utils.DtypeForDavinci.FLOAT32)

    if num_bits > 16 or num_bits < 2:
        raise RuntimeError("num_bits should be in the range [2, 16]")

    # Broadcast the per-channel min/max to the full input shape before computing the gradient mask.
    input_min_broadcast = topi.broadcast_to(input_min, input_data_shape)
    input_max_broadcast = topi.broadcast_to(input_max, input_data_shape)

    res = fake_quant_with_min_max_vars_per_channel_gradient_compute(
        input_gradients, input_data, input_min_broadcast, input_max_broadcast, num_bits, narrow_range)
    return res
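# NumPy reference sketch (added for illustration): mirrors the gradient formula in the
# docstring above. It takes the nudged min/max directly as inputs and does not reproduce
# the nudging step; the helper name is an assumption.
def _fake_quant_grad_reference(input_gradients, input_data, nudged_min, nudged_max):
    import numpy as np
    inside = np.logical_and(input_data >= nudged_min, input_data <= nudged_max)
    # Gradients pass through unchanged inside [nudged_min, nudged_max] and are zeroed outside.
    return input_gradients * inside.astype(np.float32)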
def fused_bn_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    if layout == 'NCHW':
        # Transpose the 4-D inputs to NHWC so the channel axis is last.
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {}'.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'

    # out1 = data4 * data5 / (N*H*W), broadcast to the full NHWC shape.
    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    # out2 = data3 * (N*H*W) - data2
    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    # out3 = data1 * (data7 - data6 / (N*H*W)) / data0
    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    # output = (out2 - out3) * out1, cast to the requested output dtype.
    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)
    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))
    return output
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta."""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
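# NumPy reference sketch (added for illustration): the same Adadelta update expressed on
# plain arrays, matching the commented formulas above. The helper name is an assumption;
# inputs are expected to be float32 ndarrays, with lr, rho and epsilon as scalars.
def _adadelta_reference(var, accum, accum_update, grad, lr, rho, epsilon):
    import numpy as np
    accum = rho * accum + (1.0 - rho) * grad * grad
    update = np.sqrt(accum_update + epsilon) / np.sqrt(accum + epsilon) * grad
    var = var - lr * update
    accum_update = rho * accum_update + (1.0 - rho) * update * update
    return var, accum, accum_update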
def broadcast_to(x, shape, target=utils.CCE):
    """
    Broadcast a tensor to a compatible shape.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float32, float16, int8, uint8, int32.
        shape (list, tuple): The shape of the output tensor.

    Returns:
        A tvm.tensor.Tensor with the same type as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)
    utils.check_shape(shape)

    # check dtype
    dtype = x.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.ALL_TYPES)

    # The vector_dup instruction doesn't support int8 and uint8, so broadcast such scalars
    # through float16 and cast back afterwards. (This could be simplified by other means,
    # such as "auto cast".)
    x_shape = get_shape(x)
    if len(x_shape) == 1 and x_shape[0] == 1 and dtype in ["int8", "uint8"]:
        x = Cast(x, "float16", target)

    res = topi.broadcast_to(x, shape)
    if res.dtype != dtype:
        res = Cast(res, dtype, target)
    return res
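# Usage sketch (added for illustration, not part of the original module): broadcasts a
# (1, 16) float16 placeholder to (32, 16). The helper name, shapes and dtype are
# assumptions; `tvm` is assumed to be the module object used elsewhere in this file.
def _broadcast_to_example():
    x = tvm.placeholder((1, 16), name="x", dtype="float16")
    return broadcast_to(x, (32, 16))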
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input: five tensors
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d
        layout: (N, H, W, C)

    output:
        beta + gamma * xi_variance * (xi - xi_mean / (N*H*W))
    """
    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    # Broadcast the per-channel mean (xi_mean / (N*H*W)) to the full NHWC shape.
    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)
    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)
    add0 = topi.add(multiply2, data0)

    return add0
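# NumPy reference sketch (added for illustration): the formula from the docstring above on
# plain float32 arrays. The helper name is an assumption; beta, gamma and xi_variance are
# per-channel vectors of length C, xi_mean is the per-channel reduction that gets divided
# by N*H*W, and xi is an NHWC tensor.
def _fused_bn_follow_reference(beta, gamma, xi_variance, xi_mean, xi):
    import numpy as np
    n, h, w, c = xi.shape
    xi = xi.astype(np.float32)
    return beta + gamma * xi_variance * (xi - xi_mean / float(n * h * w))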
def matrix_set_diag_compute(input_matrix, input_diagonal, input_help):
    """matrix_set_diag compute implementation"""
    shape_input = get_shape(input_matrix)
    input_dtype = input_matrix.dtype

    if input_dtype == "int8" or input_dtype == "uint8":
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if input_dtype == "int32" and product_is_mini():
        # On mini, int32 has to go through float16 before it can be cast to float32.
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if input_dtype == "int32" and not product_is_mini():
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")

    # input_help is 1 on diagonal positions and 0 elsewhere: keep off-diagonal entries of
    # input_matrix and take diagonal entries from the broadcast diagonal tensor.
    diag_tmp = topi.broadcast_to(input_diagonal, shape_input)
    help_tmp = topi.add(input_help, -1)
    help_y = topi.abs(help_tmp)

    res_vmul_x = topi.multiply(input_matrix, help_y)
    res_vmul_y = topi.multiply(diag_tmp, input_help)
    res = topi.add(res_vmul_x, res_vmul_y)

    if input_dtype == "int32" and product_is_mini():
        res = topi.cast(res, "float16")
    res = topi.cast(res, input_dtype)

    return res
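# NumPy reference sketch (added for illustration): the same select-by-help-tensor trick on
# plain arrays. The helper name is an assumption; input_help holds 1 on diagonal positions
# and 0 elsewhere, and input_diagonal is broadcastable to the matrix shape.
def _matrix_set_diag_reference(input_matrix, input_diagonal, input_help):
    import numpy as np
    keep_off_diag = np.abs(input_help - 1)  # 1 off the diagonal, 0 on it
    return input_matrix * keep_off_diag + input_diagonal * input_help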