def _elu_mini_compute(exp_res, data, shape):
    """
    Pick, element-wise, between the exponential and the Taylor result of e^x - 1.

    f(x) = exp_res(x),                  x <= TAYLOR_THRESHOLD or x >= 0
    f(x) = _elu_taylor_compute(x),      TAYLOR_THRESHOLD < x < 0

    Args:
        exp_res (tvm.tensor.Tensor): the tensor of e^x - 1, float16.
        data (tvm.tensor.Tensor): input, float16.
        shape (list): the shape of input.

    Returns:
        tvm.tensor.Tensor
    """
    TAYLOR_THRESHOLD = -0.7

    # Both borders are materialised as full tensors of constants.
    upper_const = tvm.const(0.0, "float16")
    upper = tvm.compute(shape, lambda *idx: upper_const)
    taylor = _elu_taylor_compute(data)
    lower_const = tvm.const(TAYLOR_THRESHOLD, "float16")
    lower = tvm.compute(shape, lambda *idx: lower_const)

    def _above_threshold(*idx):
        # Taylor result where data > threshold, exponential result otherwise.
        return tvm.expr.Select(data(*idx) > lower(*idx), taylor(*idx), exp_res(*idx))

    negative_side = tvm.compute(shape, _above_threshold, name="gt")

    def _below_zero(*idx):
        # For data < 0 use the mix built above; for data >= 0 keep exp_res.
        return tvm.expr.Select(data(*idx) < upper(*idx), negative_side(*idx), exp_res(*idx))

    return tvm.compute(shape, _below_zero, name="lt")
def _compute_mini(data_input, shape):
    """
    Compute arctanh from a log-based and a Taylor-based result.

    Both results are evaluated on abs(x); the sign is restored afterwards
    using arctanh(-abs(x)) = -arctanh(abs(x)).

    Args:
        data_input (tvm.tensor.Tensor): input tensor.
        shape (list): shape used for the select computes.

    Returns:
        tvm.tensor.Tensor, float16.
    """
    abs_x = topi.abs(data_input)
    ln_branch = _compute_log(abs_x)
    taylor_branch = _compute_taylor(abs_x)

    # The selects below operate on float16 operands.
    abs_x = topi.cast(abs_x, "float16")
    data_input = topi.cast(data_input, "float16")
    taylor_branch = topi.cast(taylor_branch, "float16")
    ln_branch = topi.cast(ln_branch, "float16")

    def _pick_branch(*idx):
        # |x| < 0.5 -> Taylor result, otherwise -> log result
        return akg.tvm.expr.Select(abs_x(*idx) < dc.half_const("float16"),
                                   taylor_branch(*idx),
                                   ln_branch(*idx))

    magnitude = tvm.compute(shape, _pick_branch, name="le")

    # Negated copy, used wherever the original input is negative.
    negated = topi.multiply(magnitude, dc.neg_one_const("float16"))

    def _restore_sign(*idx):
        return akg.tvm.expr.Select(data_input(*idx) < dc.zero_const("float16"),
                                   negated(*idx),
                                   magnitude(*idx))

    return tvm.compute(shape, _restore_sign, name="neg")
def ReLU6Grad(y_grad, x, target=utils.CUDA):
    """
    Compute the gradient of Rectified Linear 6.

    The incoming gradient is passed through where 0 < x < 6 and replaced by
    zero everywhere else.

    Args:
        y_grad (tvm.tensor.Tensor): Tensor of type float16, float32, gradients
            backpropagated to the ReLU6 op.
        x (tvm.tensor.Tensor): Tensor of type float16/float32, inputs that were
            passed to the ReLU6 op, or its outputs.
        target (str): only utils.CUDA is accepted.

    Returns:
        tvm.tensor.Tensor, has same type and shape as x.

    Raises:
        RuntimeError: if target is not utils.CUDA.

    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)

    dtype = x.dtype
    shape = x.shape
    zero, six = tvm.const(0, dtype), tvm.const(6, dtype)

    def _clip_low(*idx):
        # max(x, 0)
        return tvm.if_then_else(x(*idx) >= zero, x(*idx), zero)

    clipped_low = tvm.compute(shape, _clip_low)

    def _zero_high(*idx):
        # zero out everything at or above 6
        return tvm.if_then_else(x(*idx) >= six, zero, clipped_low(*idx))

    active = tvm.compute(shape, _zero_high)

    def _gate(*idx):
        # the gradient flows only where the activation mask is non-zero
        return tvm.if_then_else(active(*idx) == zero, zero, y_grad(*idx))

    return tvm.compute(shape, _gate)
def batch_matmul_4D(data1, data2, bias=None, out_dtype="float32", layout1="NHDT", layout2="NHDT",
                    layout_out="NHDT"):
    """
    Batched matrix multiplication for 4-D operands.

    The layout strings are translated to axis roles: 'N' -> outer batch 'B',
    'H' -> inner batch 'b', 'T' -> reduction 'k', and 'D' -> 'm' for data1 or
    'n' for data2. The last two axes of each operand may appear in either order.

    Args:
        data1 (tvm.tensor.Tensor): left operand.
        data2 (tvm.tensor.Tensor): right operand.
        bias (tvm.tensor.Tensor or None): added to the product when not None.
        out_dtype (str): when "float32", operands are cast with astype("float")
            before multiplication.
        layout1 (str): layout of data1.
        layout2 (str): layout of data2.
        layout_out (str): output layout; anything other than "NHDT" is handed to
            auto_out_transpose.

    Returns:
        tvm.tensor.Tensor
    """
    axes1 = list(layout1.replace('N', 'B').replace('H', 'b').replace('D', 'm').replace('T', 'k'))
    axes2 = list(layout2.replace('N', 'B').replace('H', 'b').replace('D', 'n').replace('T', 'k'))

    # Map every axis role to its extent in the corresponding operand.
    dims1 = {}
    dims2 = {}
    for pos, role in enumerate(axes1):
        dims1[role] = data1.shape[pos]
        dims2[axes2[pos]] = data2.shape[pos]

    reduce_axis = tvm.reduce_axis((0, dims1['k']), name='reduce_axis')
    widen = out_dtype == "float32"

    def _dot(B, b, i, j):
        lhs = data1[B, b,
                    i if axes1[2] == 'm' else reduce_axis,
                    reduce_axis if axes1[3] == 'k' else i]
        rhs = data2[B, b,
                    j if axes2[2] == 'n' else reduce_axis,
                    reduce_axis if axes2[3] == 'k' else j]
        if widen:
            lhs = lhs.astype("float")
            rhs = rhs.astype("float")
        return tvm.sum(lhs * rhs, axis=reduce_axis)

    res = tvm.compute((dims1['B'], dims1['b'], dims1['m'], dims2['n']), _dot)

    if bias is not None:
        res = topi.add(res, bias)
    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
def TensorcoreConv(data, weight, stride=[1, 1], pad=[0, 0, 0, 0], dilation=[1, 1], out_dtype="float32",
                   name="out", target=utils.CUDA):
    """
    Build a 2-D convolution compute over channel-last operands.

    `data` is unpacked as (batch, in_h, in_w, in_c) and `weight` as
    (out_c, k_h, k_w, _); the result has shape (batch, o_h, o_w, out_c).

    Args:
        data (tvm.tensor.Tensor): 4-D input.
        weight (tvm.tensor.Tensor): 4-D kernel.
        stride (list): [s_h, s_w].
        pad (list): [pad_top, pad_bottom, pad_left, pad_right].
        dilation (list): [d_h, d_w].
        out_dtype (str): when "float32", both operands are cast to float32
            before the multiply-accumulate; otherwise they are multiplied in
            their own dtype.
        name (str): name given to the output compute.
        target (str): accepted for interface compatibility; not inspected here.

    Returns:
        tvm.tensor.Tensor
    """
    # The list defaults are only unpacked, never mutated, so sharing them
    # between calls is harmless.
    batch, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, _ = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation

    # Effective kernel extent once dilation is applied.
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1

    has_pad = not (pad_left == 0 and pad_right == 0 and pad_top == 0 and pad_bottom == 0)
    if has_pad:
        # Valid rows are pad_top <= h < pad_top + in_h and valid columns are
        # pad_left <= w < pad_left + in_w. The upper bounds must be expressed
        # with the same offsets used to index `data`; using pad_bottom /
        # pad_right here breaks asymmetric padding.
        data_pad = tvm.compute(
            (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right, in_c),
            lambda n, h, w, i: tvm.if_then_else(
                tvm.all(h >= pad_top, h - pad_top < in_h, w >= pad_left, w - pad_left < in_w),
                data[n, h - pad_top, w - pad_left, i],
                tvm.const(0.0, "float16"),
            ),
            name="Pad",
        )
    else:
        data_pad = data

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    def _mac(n, h, w, o):
        lhs = data_pad[n, (h * s_h + rh * d_h), (w * s_w + rw * d_w), rc]
        rhs = weight[o, rh, rw, rc]
        if out_dtype == "float32":
            lhs = lhs.astype("float32")
            rhs = rhs.astype("float32")
        return tvm.sum(lhs * rhs, axis=[rc, rh, rw])

    return tvm.compute((batch, o_h, o_w, out_c), _mac, name=name)
def _compute_update(logbase, sign_decay, sign_gm, grad):
    """Return exp(sign_gm * sign_decay[0] * logbase[0]) * grad, using the `exp` helper in scope."""
    decayed = tvm.compute(sign_gm.shape, lambda *idx: sign_gm(*idx) * sign_decay[0])
    exponent = tvm.compute(decayed.shape, lambda *idx: decayed(*idx) * logbase[0])
    return topi.multiply(exp(exponent), grad)
def _compute_m_t(m, beta, grad):
    """Return the updated m: m * beta[0] + grad * (1 - beta)[0]."""
    weighted_m = tvm.compute(m.shape, lambda *idx: m(*idx) * beta[0])

    # 1 - beta, built as beta * (-1) + 1
    neg_beta = tvm.compute(beta.shape, lambda *idx: beta(*idx) * neg_one_const("float32"))
    one_minus_beta = tvm.compute(neg_beta.shape, lambda *idx: neg_beta(*idx) + one_const("float32"))

    weighted_grad = tvm.compute(grad.shape, lambda *idx: grad(*idx) * one_minus_beta[0])
    return topi.add(weighted_m, weighted_grad)
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat, dampening=0.0, weight_decay=0.0,
                nesterov=False):
    """
    sgd compute implementation.

    float16 inputs are promoted to float32 for the computation and the three
    results are cast back to float16 at the end.

    Steps:
        gradient   += parameters * weight_decay               (if weight_decay != 0.0)
        accum_t     = accum * momentum[0] + gradient
        accum_t    -= gradient * (1 - stat) * dampening       (if dampening != 0.0)
        delta       = accum_t * learning_rate[0]                                   (plain)
        delta       = (gradient + accum_t * momentum[0]) * learning_rate[0]        (nesterov)
        parameters_t = parameters - delta
        stat_t      = (1 - stat) * NUM_ZERO

    Returns:
        tuple: (parameters_t, accum_t, stat_t)
    """
    dtype = parameters.dtype
    is_fp16 = dtype == "float16"
    if is_fp16:
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # weight decay folds parameters * weight_decay into the gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        decay_term = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, decay_term)

    # (1 - stat), built as stat * (-1) + 1
    neg_stat = topi.multiply(stat, tvm.const(-1, "float32"))
    one_minus_stat = topi.add(neg_stat, tvm.const(1, "float32"))
    damp_factor = topi.multiply(one_minus_stat, dampening)

    # update accum
    momentum_term = tvm.compute(accum.shape, lambda *idx: accum(*idx) * momentum[0])
    damped_grad = topi.multiply(gradient, damp_factor)
    accum_t = topi.add(momentum_term, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, damped_grad)

    # update parameters
    if nesterov:
        grad_step = tvm.compute(gradient.shape, lambda *idx: gradient(*idx) * learning_rate[0])
        look_ahead = tvm.compute(accum_t.shape, lambda *idx: accum_t(*idx) * momentum[0])
        look_ahead_step = tvm.compute(look_ahead.shape, lambda *idx: look_ahead(*idx) * learning_rate[0])
        step = topi.add(grad_step, look_ahead_step)
    else:
        step = tvm.compute(accum_t.shape, lambda *idx: accum_t(*idx) * learning_rate[0])
    parameters_t = topi.subtract(parameters, step)

    # update stat
    stat_t = topi.multiply(one_minus_stat, tvm.const(NUM_ZERO, 'float32'))

    if is_fp16:
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
def bitwise_and(x1, x2):
    """
    Compute the element-wise bitwise AND of `x1` and `x2`.

    Both inputs are broadcast to the common shape returned by produce_shapes
    before the `&` is applied.

    Args:
        x1 (tvm.tensor.Tensor): tensor x1, only support int16,uint16.
        x2 (tvm.tensor.Tensor): tensor x2, only support int16,uint16.

    Returns:
        A tvm.tensor.Tensor as result of bitwise and.
    """
    _check_parameters(x1, x2)

    _, _, shape_max = produce_shapes(get_shape(x1), get_shape(x2))
    lhs = topi.broadcast_to(x1, shape_max)
    rhs = topi.broadcast_to(x2, shape_max)

    return tvm.compute(lhs.shape, lambda *idx: lhs(*idx) & rhs(*idx), name="and_res")
def _less_equal_compare_float32(data_x, data_y):
    """Return a tensor in data_x's dtype holding 1 where x <= y and 0 elsewhere."""
    out_dtype = data_x.dtype

    def _flag(*idx):
        return tvm.expr.Select(data_x(*idx) <= data_y(*idx),
                               dc.one_const(out_dtype),
                               dc.zero_const(out_dtype))

    return tvm.compute(data_x.shape, _flag)
def _default2zn(data):
    """
    Re-tile the last two axes (m, n) of `data` into [n1, m1, cs, cs] blocks.

    Leading (batch) axes are kept as they are. Output positions that fall
    outside the original m x n extent are filled with zero. `cs` (block size)
    and `output_name` are free names resolved from the surrounding scope.

    Raises:
        ValueError: if `data` has fewer than two dimensions.
    """
    shape = [get_const(dim) for dim in data.shape]
    dtype = data.dtype
    if len(shape) < 2:
        raise ValueError(
            "length of shape of input_data should be greater than or equal to 2, but got %d" % len(shape))

    m, n = shape[-2:]
    # ceil-divide both axes by the block size
    m1 = (m + cs - 1) // cs
    n1 = (n + cs - 1) // cs
    output_shape = list(shape[:-2]) + [n1, m1, cs, cs]

    def fcompute(*output_indices):
        # Output index layout: (*batch, n1, m1, m0, n0)
        *batch_idx, n1_idx, m1_idx, m0_idx, n0_idx = output_indices
        row = m1_idx * cs + m0_idx
        col = n1_idx * cs + n0_idx
        source_idx = list(batch_idx) + [row, col]
        return tvm.if_then_else(tvm.any(row >= m, col >= n),
                                tvm.const(0, dtype),
                                data(*source_idx))

    return tvm.compute(output_shape, fcompute, name=output_name)
def reverse_compute(input_data, axis):
    """
    reverse compute implementation.

    Builds a tensor of the same shape as `input_data` in which every axis
    listed in `axis` is read back to front.

    Args:
        input_data (tvm.tensor.Tensor): tensor to reverse.
        axis (Union[list, tuple]): axes to reverse. Negative values count from
            the last axis.

    Returns:
        tvm.tensor.Tensor

    Raises:
        IndexError: if an axis is outside [-rank, rank).
    """
    shape = input_data.shape
    rank = len(shape)

    # Normalise once, up front. Previously a negative axis flipped the index
    # sign but never received its `shape - 1` offset, yielding negative indices.
    reversed_axes = set()
    for ax in axis:
        if not -rank <= ax < rank:
            raise IndexError("axis %d is out of range for a tensor of rank %d" % (ax, rank))
        reversed_axes.add(ax + rank if ax < 0 else ax)

    def _map_index(*index):
        """Map an output index to the input index it reads from."""
        return tuple(
            shape[dim] - 1 - idx if dim in reversed_axes else idx
            for dim, idx in enumerate(index)
        )

    return tvm.compute(shape, lambda *i: input_data(*_map_index(*i)), name='output')
def Conv(data, weight, stride=[1, 1], pad=[0, 0, 0, 0], dilation=[1, 1], name="out", target=utils.CUDA):
    """
    2-D convolution over channel-first operands.

    `data` is unpacked as (batch, in_c, in_h, in_w) and `weight` as
    (out_c, in_c, k_h, k_w); the reduction runs over the weight's in_c.

    Args:
        data (tvm.tensor.Tensor): 4-D input.
        weight (tvm.tensor.Tensor): 4-D kernel.
        stride (list): [s_h, s_w].
        pad (list): [pad_top, pad_bottom, pad_left, pad_right].
        dilation (list): [d_h, d_w].
        name (str): name given to the output compute.
        target (str): only utils.CUDA is accepted.

    Returns:
        tvm.tensor.Tensor of shape (batch, out_c, o_h, o_w).

    Raises:
        RuntimeError: if target is not utils.CUDA.

    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)

    batch, _, in_h, in_w = data.shape
    out_c, in_c, k_h, k_w = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation

    # kernel extent after dilation, then the usual output-size formula
    dilated_kh = (k_h - 1) * d_h + 1
    dilated_kw = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - dilated_kh) // s_h + 1
    o_w = (in_w + pad_left + pad_right - dilated_kw) // s_w + 1

    padded = topi.nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right], 0.0)

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    def _mac(n, c, h, w):
        window = padded[n, rc, h * s_h + rh * d_h, w * s_w + rw * d_w]
        return tvm.sum(window * weight[c, rc, rh, rw], axis=[rc, rh, rw])

    out = tvm.compute((batch, out_c, o_h, o_w), _mac, name=name)
    # use for relu condition
    # out = tvm.compute(out.shape, lambda *i: tvm.max(out(*i), tvm.const(0, out.dtype)), name="relu")
    return out
def atan_grad(head, input_x):
    """
    Compute gradient of input_x in atan.

    .. math::
        dx = \\frac{1}{1 + x^2} \\cdot dy

    Args:
        head (tvm.tensor.Tensor): Gradient tensor of forward's output with the
                                  same shape and dtype as input_x.
        input_x (tvm.tensor.Tensor): Forward's input tensor support float16
                                     and float32.

    Returns:
        A tvm.tensor.Tensor as gradient of forward's input.

    Supported Platforms:
        'Ascend'
    """
    utils.elemwise_shape_check(head.shape, input_x.shape)
    utils.elemwise_dtype_check(head.dtype, input_x.dtype, utils.DtypeForDavinci.ALL_FLOAT)

    one = dc.one_const(input_x.dtype)

    return tvm.compute(
        input_x.shape,
        lambda *idx: one / (one + input_x(*idx) * input_x(*idx)) * head(*idx),
        name="out")
def HSwishGrad(y_grad, x, target=utils.CUDA):
    """
    HSwishGrad

    Element-wise result:
        0                          where x <= -3
        y_grad                     where x >= 3
        y_grad * (2 * x + 3) / 6   otherwise

    Args:
        y_grad: incoming gradient.
        x: forward input.
        target: only utils.CUDA is accepted.

    Returns:
        tvm.tensor.Tensor with the shape of x.

    Raises:
        RuntimeError: if target is not utils.CUDA.
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)

    shape = x.shape

    def _lower_and_middle(*idx):
        return tvm.if_then_else(x(*idx) <= -3, 0, y_grad(*idx) * (2 * x(*idx) + 3) / 6)

    below_upper = tvm.compute(shape, _lower_and_middle)

    def _upper(*idx):
        return tvm.if_then_else(x(*idx) >= 3, y_grad(*idx), below_upper(*idx))

    return tvm.compute(shape, _upper)
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """
    Compute ada_max.

    Update rules implemented below:
        m   <- m + (grad - m) * (1 - beta1)
        v   <- max(beta2 * v, abs(grad))
        var <- var - lr * m / ((1 - beta1_power) * (v + epsilon))

    float16 inputs are promoted to float32 for the computation and the three
    results are cast back afterwards.

    Returns:
        tuple: (var, m, v)
    """
    inp_dtype = var.dtype
    promoted = inp_dtype == 'float16'
    # cast to float32 for improved accuracy
    if promoted:
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    neg_beta1 = tvm.compute(beta1.shape, lambda *idx: beta1(*idx) * neg_one_const("float32"))
    one_minus_beta1 = tvm.compute(neg_beta1.shape, lambda *idx: neg_beta1(*idx) + one_const("float32"))
    grad_minus_m = topi.subtract(grad, m)
    m_delta = tvm.compute(grad_minus_m.shape, lambda *idx: grad_minus_m(*idx) * one_minus_beta1[0])
    m = topi.add(m, m_delta)

    # v = max(beta2 * v, abs(grad))
    scaled_v = tvm.compute(v.shape, lambda *idx: v(*idx) * beta2[0])
    v = topi.maximum(scaled_v, topi.abs(grad))

    # denominator: (1 - beta1_power) * (v + epsilon)
    v_plus_eps = tvm.compute(v.shape, lambda *idx: v(*idx) + epsilon)
    neg_power = tvm.compute(beta1_power.shape, lambda *idx: beta1_power(*idx) * neg_one_const("float32"))
    one_minus_power = tvm.compute(neg_power.shape, lambda *idx: neg_power(*idx) + one_const("float32"))
    denominator = tvm.compute(v_plus_eps.shape, lambda *idx: v_plus_eps(*idx) * one_minus_power[0])

    # numerator: lr * m
    numerator = tvm.compute(m.shape, lambda *idx: m(*idx) * lr[0])

    # var -= numerator * (1 / denominator)
    step = topi.multiply(numerator, reciprocal(denominator))
    var = topi.subtract(var, step)

    if promoted:
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
def _apply_gradient_descent_compute(var, alpha, delta):
    """Compute gradient_descent: return var - delta * alpha[0]."""
    scaled_delta = tvm.compute(delta.shape, lambda *idx: delta(*idx) * alpha[0])
    return topi.subtract(var, scaled_delta)
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    do element-wise xlogy_grad compute

    With x1, x2 and grad broadcast to shape_max:
        output_y1 = sum(x1 / (x1 + esp_min) * log(x2) * grad, rx)
        output_y2 = sum(x1 / x2 * grad, ry)
    On the mini product, divisions are replaced by multiplication with a
    reciprocal and the log is evaluated in float16.

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input, in
            the order (x1, x2, grad)
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices applied to the x1 gradient
        ry (list): the reduction indices applied to the x2 gradient

    Returns
        output_y1 (tvm.tensor.Tensor): result of xlogy_grad
        output_y2 (tvm.tensor.Tensor): result of xlogy_grad
    """
    x1, x2, grad = placeholders[0], placeholders[1], placeholders[2]

    # float16 inputs are computed in float32 and cast back at the end
    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1, "float32")
        x2 = akg.lang.cce.cast_to(x2, "float32")
        grad = akg.lang.cce.cast_to(grad, "float32")
    x1 = akg.lang.cce.broadcast(x1, shape_max)
    x2 = akg.lang.cce.broadcast(x2, shape_max)
    grad = akg.lang.cce.broadcast(grad, shape_max)

    # x1 / (x1 + tiny) acts as a "x1 is non-zero" factor
    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_shifted = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        nonzero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_shifted))
        log_x2 = tvm.compute(
            x2.shape,
            lambda *idx: (tvm.log(x2(*idx).astype("float16"))).astype("float32"),
            name="log_x2")
    else:
        nonzero_x1 = div(x1, x1_shifted)
        log_x2 = akg.lang.cce.vlog(x2)

    d_x1 = akg.lang.cce.vmul(nonzero_x1, log_x2)
    d_x1_scaled = akg.lang.cce.vmul(d_x1, grad)

    if not utils.product_is_mini():
        d_x2 = div(x1, x2)
    else:
        d_x2 = akg.lang.cce.vmul(x1, reciprocal(x2))
    d_x2_scaled = akg.lang.cce.vmul(d_x2, grad)

    output_y1 = akg.lang.cce.sum(d_x1_scaled, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(d_x2_scaled, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
def fake_quant_with_min_max_args(input_data, min_=-6, max_=6, num_bits=8, narrow_range=False):
    """
    Fake-quantize the float32 'input_data' tensor into an output tensor of the same type.

    output_data = (floor(clamped_shifted * inv_nudged_scale + 0.5f))) * scale + nudged_min
    scale = (max-min) / (quant_max-quant_min)

    nudged_min, nudged_max and scale are obtained from nudge_min_max.

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the
                             clamping range for the input_data data
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth
                                 of the quantization,between 2 and 16
        narrow_range ([bool]):
            True, quantized into the quantization range [1; 2^num_bits - 1]
            False,quantized into the quantization range [0; 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    utils.check_shape(get_shape(input_data))
    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits, narrow_range)

    # Tensors filled with the nudged bounds, built on top of an all-zero tensor.
    zeros = tvm.compute(input_data.shape, lambda *idx: tvm.const(0, dtype="float32"), name="zero_tensor")
    upper = topi.add(zeros, nudged_max)
    lower = topi.add(zeros, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Clamp the input into [nudged_min, nudged_max]
    clamped = topi.maximum(topi.minimum(input_data, upper), lower)

    # Quantize: floor((clamped - nudged_min) / scale + 0.5)
    shifted = topi.subtract(clamped, lower)
    scaled = topi.multiply(shifted, inv_nudged_scale)
    quantized = floor(topi.add(scaled, 0.5))
    quantized = akg.lang.ascend.cast_to(quantized, dtype)

    # Dequantize: quantized * scale + nudged_min
    return topi.add(topi.multiply(quantized, scale), lower)
def _init_atan2_mask(data_y_, data_x_):
    """
    Compute mask for atan2.

    Args:
        data_y_ (tvm.tensor.Tensor): The y of atan2(y, x).
        data_x_ (tvm.tensor.Tensor): The x of atan2(y, x).

    Returns:
        tuple: (x_lt_zero_y_mask, y_ge_zero) where
            y_ge_zero is 1 where y >= 0 and -1 elsewhere, and
            x_lt_zero_y_mask equals y_ge_zero where x < 0 and 0 elsewhere.
    """
    # in mini, select only support float16
    is_cast_for_mini = utils.product_is_mini() and data_y_.dtype == "float32"

    data_x, data_y = data_x_, data_y_
    if is_cast_for_mini:
        data_x = topi.cast(data_x_, "float16")
        data_y = topi.cast(data_y_, "float16")

    work_dtype = data_y.dtype
    one = dc.one_const(work_dtype)
    zero = dc.zero_const(work_dtype)
    neg_one = dc.neg_one_const(work_dtype)

    def _sign_of_y(*idx):
        return tvm.expr.Select(data_y(*idx) >= zero, one, neg_one)

    y_ge_zero = tvm.compute(data_y.shape, _sign_of_y, name="y_ge_zero")

    def _sign_where_x_negative(*idx):
        return tvm.expr.Select(data_x(*idx) < zero, y_ge_zero(*idx), zero)

    x_lt_zero_y_mask = tvm.compute(data_y.shape, _sign_where_x_negative, name="xlt0_y_mask")

    if is_cast_for_mini:
        x_lt_zero_y_mask = topi.cast(x_lt_zero_y_mask, "float32")
        y_ge_zero = topi.cast(y_ge_zero, "float32")

    return (x_lt_zero_y_mask, y_ge_zero)
def HSwishGrad(y_grad, x):
    """
    HSwishGrad

    Element-wise result:
        0                          where x <= -3
        y_grad                     where x >= 3
        y_grad * (2 * x + 3) / 6   otherwise

    Args:
        y_grad: incoming gradient.
        x: forward input.

    Returns:
        tvm.tensor.Tensor with the shape of x.
    """
    out_shape = x.shape

    def _inner(*idx):
        # handles the x <= -3 region and the sloped middle region
        return tvm.if_then_else(x(*idx) <= -3, 0, y_grad(*idx) * (2 * x(*idx) + 3) / 6)

    partial = tvm.compute(out_shape, _inner)

    def _outer(*idx):
        # x >= 3 passes the gradient through unchanged
        return tvm.if_then_else(x(*idx) >= 3, y_grad(*idx), partial(*idx))

    return tvm.compute(out_shape, _outer)
def relu_grad(head, in_data):
    """Return `head` where in_data >= 0 and zero (in head's dtype) elsewhere."""
    zero = tvm.const(0, head.dtype)

    def _gate(*idx):
        return tvm.if_then_else(in_data(*idx) >= zero, head(*idx), zero)

    return tvm.compute(head.shape, _gate, tag=tag.INJECTIVE)
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7, data8,
                                          data9, data10, data11, data12, data13, data14, data15,
                                          layout="NHWC", out_dtype="float16", target=utils.CUDA):
    """
    Fused ReluGrad followed by two reduce-grad branches sharing one intermediate.

    With scale = n * h * w taken from data5's (NHWC) shape and
        relu_g = where(data15 >= 0, data13 + data14, 0) cast to float32
        shared = scale * relu_g - data6
    the two outputs are
        out0 = data2 * data3 / scale * (shared - (data5 - data4 / scale) * data1 / data0)
        out1 = data11 * data12 / scale * (shared - data10 * (data9 - data8 / scale) / data7)
    each cast to out_dtype.

    For layout "NCHW", data5, data9, data13, data14 and data15 are transposed
    to NHWC first and both outputs are transposed back. `target` is accepted
    but not inspected here.

    Returns:
        list: [out0, out1]

    Raises:
        NotImplementedError: if layout is neither "NCHW" nor "NHWC".
    """
    channels_first = layout == 'NCHW'
    if not channels_first and layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    if channels_first:
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))

    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    # first branch coefficient: data2 * data3 / scale
    coeff0 = topi.divide(topi.multiply(data2, data3), scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    summed = topi.add(data13, data14)
    relu_g = tvm.compute(summed.shape,
                         lambda *idx: tvm.if_then_else(data15(*idx) >= zero, summed(*idx), zero),
                         tag=tag.INJECTIVE)
    relu_g = topi.cast(relu_g, inter_dtype)
    shared = topi.subtract(topi.multiply(scale, relu_g), data6)

    # first branch
    x0 = topi.cast(data5, inter_dtype)
    centered0 = topi.subtract(x0, topi.divide(data4, scale))
    correction0 = topi.divide(topi.multiply(centered0, data1), data0)
    out0 = topi.multiply(coeff0, topi.subtract(shared, correction0))
    out0 = topi.cast(out0, out_dtype)

    # second branch
    coeff1 = topi.divide(topi.multiply(data11, data12), scale)
    x1 = topi.cast(data9, inter_dtype)
    centered1 = topi.subtract(x1, topi.divide(data8, scale))
    correction1 = topi.divide(topi.multiply(data10, centered1), data7)
    out1 = topi.multiply(coeff1, topi.subtract(shared, correction1))
    out1 = topi.cast(out1, out_dtype)

    if channels_first:
        out0 = topi.transpose(out0, (0, 3, 1, 2))
        out1 = topi.transpose(out1, (0, 3, 1, 2))

    return [out0, out1]
def batch_matmul_3d(data1, data2, attrs):
    """
    batch matmul for 3-D data

    Args:
        data1 (tvm.tensor.Tensor): left operand.
        data2 (tvm.tensor.Tensor): right operand.
        attrs: sequence unpacked as (bias, out_dtype, layout1, layout2,
            layout_out). The first character of each input layout is dropped;
            the remaining 'N'/'H' become the batch axis 'b', 'T' the reduction
            axis 'k' and 'D' the 'm' (data1) or 'n' (data2) axis.

    Returns:
        tvm.tensor.Tensor
    """
    bias, out_dtype, layout1, layout2, layout_out = attrs

    layout1 = layout1[1:]
    layout2 = layout2[1:]
    axes1 = list(layout1.replace('N', 'b').replace('H', 'b').replace('D', 'm').replace('T', 'k'))
    axes2 = list(layout2.replace('N', 'b').replace('H', 'b').replace('D', 'n').replace('T', 'k'))

    # Map every axis role to its extent in the corresponding operand.
    dims1 = {}
    dims2 = {}
    for pos, role in enumerate(axes1):
        dims1[role] = data1.shape[pos]
        dims2[axes2[pos]] = data2.shape[pos]

    reduce_axis = tvm.reduce_axis((0, dims1.get('k')), name='reduce_axis')
    widen = out_dtype == "float32"

    def _dot(b, i, j):
        lhs = data1[b,
                    i if axes1[1] == 'm' else reduce_axis,
                    reduce_axis if axes1[2] == 'k' else i]
        rhs = data2[b,
                    j if axes2[1] == 'n' else reduce_axis,
                    reduce_axis if axes2[2] == 'k' else j]
        if widen:
            lhs = lhs.astype("float")
            rhs = rhs.astype("float")
        return tvm.sum(lhs * rhs, axis=reduce_axis)

    res = tvm.compute((dims1.get('b'), dims1.get('m'), dims2.get('n')), _dot)

    if bias is not None:
        res = topi.add(res, bias)
    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
def cimag(inputs, attrs):
    """
    Take element 1 along the last axis of inputs[0], dropping that axis.

    NOTE(review): presumably the last axis stores (real, imag) pairs; confirm
    against the producer of `inputs`. `attrs` is unused.
    """
    del attrs
    source = inputs[0]
    return tvm.compute(source.shape[:-1], lambda *index: source(*index, 1), name="imag")
def topi_nn_hsigmoid(x):
    """
    topi hsigmoid

    Element-wise result:
        0             where x <= -3
        1             where x >= 3
        (x + 3) / 6   otherwise

    Args:
        x: input tensor.

    Returns:
        tvm.tensor.Tensor with the shape of x.
    """
    def _hsigmoid(*idx):
        value = x(*idx)
        upper_or_linear = tvm.if_then_else(value >= 3, 1, (value + 3) / 6)
        return tvm.if_then_else(value <= -3, 0, upper_or_linear)

    return tvm.compute(x.shape, _hsigmoid)
def my_dsl(dtype, kernel_name, attrs):
    """
    Build a 1-D dynamic-shape kernel for the instruction named by `insn`.

    `insn` and `insnType` are free names resolved from the surrounding scope.
    Two placeholders A and B of symbolic length M are created; C is the result
    of applying `insn` to them (unary instructions use A only).

    Args:
        dtype (str): element type of the placeholders.
        kernel_name (str): name passed to akg.build.
        attrs: attributes passed to akg.build.

    Returns:
        The module returned by akg.build. B is part of the argument list only
        when insnType == "binary".

    Raises:
        ValueError: if `insn` is not one of the supported instructions.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    # One builder per instruction. Building lazily keeps unrelated ops out of
    # the graph. Each name maps to exactly one op; in particular "sqrt" must
    # yield topi.sqrt and not be overwritten by topi.log.
    builders = {
        "add": lambda: topi.add(A, B),
        "sub": lambda: topi.subtract(A, B),
        "mul": lambda: topi.multiply(A, B),
        "div": lambda: topi.divide(A, B),
        "max": lambda: topi.maximum(A, B),
        "min": lambda: topi.minimum(A, B),
        "abs": lambda: tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C'),
        "exp": lambda: topi.exp(A),
        "log": lambda: topi.log(A),
        "sqrt": lambda: topi.sqrt(A),
        "adds": lambda: A + tvm.const(2, dtype),
        "muls": lambda: A * tvm.const(2, dtype),
    }
    if insn not in builders:
        # Fail here with a clear message instead of hitting an unbound C below.
        raise ValueError("unsupported insn: %s" % insn)
    C = builders[insn]()

    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def resize_nearest_neighbor_grad(grad, size, align_corners=True, out_dtype=None):
    """
    Perform resize_nearest_neighbor_grad.

    `grad` is unpacked as (in_n, in_c, in_h, in_w); the output has shape
    [in_n, in_c, size[0], size[1]]. Every output position reads the nearest
    source pixel of `grad`, with source coordinates clamped to the valid range.

    Args:
        grad (tvm.tensor.Tensor): 4-D input.
        size: two-element sequence with the output height and width.
        align_corners (bool): when True the ratio is (in - 1) / (out - 1) and
            coordinates are rounded; otherwise the ratio is in / out and
            coordinates are floored.
        out_dtype (str or None): output dtype; falls back to grad.dtype.

    Returns:
        tvm.tensor.Tensor
    """
    in_n, in_c, in_h, in_w = grad.shape
    output_shape = [in_n, in_c, size[0], size[1]]

    if align_corners:
        y_ratio = (in_h - 1).astype('float') / (size[0] - 1)
        x_ratio = (in_w - 1).astype('float') / (size[1] - 1)
    else:
        y_ratio = (in_h).astype('float') / (size[0])
        x_ratio = (in_w).astype('float') / (size[1])

    result_dtype = out_dtype if out_dtype else grad.dtype

    def _to_source_coord(scaled):
        """Turn a fractional source coordinate into an int32 index."""
        if align_corners:
            return tvm.round(scaled).astype('int32')
        # Add epsilon to floor to prevent gpu rounding errors.
        epsilon = 1e-5
        return tvm.floor(scaled + epsilon).astype('int32')

    # Nearest neighbor computation
    def _nearest_neighbor_grad(*indices):
        n, c, y, x = indices
        src_y = _to_source_coord(y_ratio * y)
        src_x = _to_source_coord(x_ratio * x)
        # clamp into [0, in_h - 1] x [0, in_w - 1]
        src_y = tvm.max(tvm.min(src_y, in_h - 1), 0)
        src_x = tvm.max(tvm.min(src_x, in_w - 1), 0)
        pixel = grad(n, c, src_y, src_x).astype('float')
        return pixel.astype(result_dtype)

    return tvm.compute(output_shape, _nearest_neighbor_grad,
                       name='resize_nearest_neighbor_grad', tag=tag.INJECTIVE)
def HSigmoidGrad(y_grad, x):
    """
    HSigmoidGrad

    Element-wise result:
        0            where x <= -3 or x >= 3
        y_grad / 6   otherwise

    Args:
        y_grad: incoming gradient.
        x: forward input.

    Returns:
        tvm.tensor.Tensor with the shape of x.
    """
    def _grad(*idx):
        value = x(*idx)
        upper_or_linear = tvm.if_then_else(value >= 3, 0, y_grad(*idx) / 6)
        return tvm.if_then_else(value <= -3, 0, upper_or_linear)

    return tvm.compute(x.shape, _grad)
def topi_nn_HSwish(x):
    """
    topi HSwish

    Element-wise result:
        0                 where x <= -3
        x                 where x >= 3
        x * (x + 3) / 6   otherwise

    Args:
        x: input tensor.

    Returns:
        tvm.tensor.Tensor with the shape of x.
    """
    def _hswish(*idx):
        value = x(*idx)
        upper_or_curve = tvm.if_then_else(value >= 3, value, value * (value + 3) / 6)
        return tvm.if_then_else(value <= -3, 0, upper_or_curve)

    return tvm.compute(x.shape, _hswish)