Beispiel #1
0
def xdivy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Build the element-wise gradient computation of xdivy.

    The first three entries of ``placeholders`` are read as x1, x2 and the
    incoming gradient. For "float16" inputs the arithmetic is carried out in
    float32 and the two results are cast back to float16 at the end.

    Args:
        placeholders (Union[list, tuple]): input placeholders; indices 0, 1
            and 2 are used as x1, x2 and grad respectively.
        shape_max (Union[list, tuple]): shape every input is broadcast to.
        dtype (str): data type of the inputs.
        rx (list): reduction axes applied to the x1 gradient.
        ry (list): reduction axes applied to the x2 gradient.

    Returns:
        tuple: ``(output_y1, output_y2)``, the reduced (keepdims=True)
        gradients with respect to x1 and x2.
    """
    ops = akg.lang.ascend
    is_fp16 = dtype == "float16"

    inputs = [placeholders[0], placeholders[1], placeholders[2]]
    if is_fp16:
        # Promote all inputs first, then broadcast them.
        inputs = [ops.cast_to(tensor, "float32") for tensor in inputs]
    x1, x2, grad = [ops.broadcast(tensor, shape_max) for tensor in inputs]

    # x1 / (x1 + 1.18e-38); judging by the original name (not_zero_x1) this
    # presumably serves as a "x1 != 0" mask -- TODO confirm.
    tiny = tvm.const(1.18e-38, dtype="float32")
    x1_shifted = ops.vadds(x1, tiny)

    if product_is_mini():
        # On this product the divisions are expressed as multiply-by-reciprocal.
        x1_mask = ops.vmul(x1, reciprocal(x1_shifted))
        d_x1 = ops.vmul(x1_mask, reciprocal(x2))
    else:
        x1_mask = Divide(x1, x1_shifted, target="cce")
        d_x1 = Divide(x1_mask, x2, target="cce")

    # Gradient w.r.t. x1: (x1_mask / x2) * grad
    d_x1_weighted = ops.vmul(d_x1, grad)

    # Gradient w.r.t. x2: -x1 * (x1_mask / x2)^2 * grad
    minus_one = tvm.const(-1, dtype="float32")
    negated_x1 = ops.vmuls(x1, minus_one)
    d_x1_squared = ops.vmul(d_x1, d_x1)
    d_x2 = ops.vmul(negated_x1, d_x1_squared)
    d_x2_weighted = ops.vmul(d_x2, grad)

    # Reduce over the axes that were introduced by broadcasting.
    results = (
        ops.sum(d_x1_weighted, rx, keepdims=True),
        ops.sum(d_x2_weighted, ry, keepdims=True),
    )
    if is_fp16:
        results = tuple(ops.cast_to(tensor, "float16") for tensor in results)

    return results
Beispiel #2
0
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Build the element-wise gradient computation of xlogy.

    The first three entries of ``placeholders`` are read as x1, x2 and the
    incoming gradient. For "float16" inputs the arithmetic is carried out in
    float32 and the two results are cast back to float16 at the end.

    Args:
        placeholders (Union[list, tuple]): input placeholders; indices 0, 1
            and 2 are used as x1, x2 and grad respectively.
        shape_max (Union[list, tuple]): shape every input is broadcast to.
        dtype (str): data type of the inputs.
        rx (list): reduction axes applied to the x1 gradient.
        ry (list): reduction axes applied to the x2 gradient.

    Returns:
        tuple: ``(output_y1, output_y2)``, the reduced (keepdims=True)
        gradients with respect to x1 and x2.
    """
    ops = akg.lang.ascend
    is_fp16 = dtype == "float16"

    inputs = [placeholders[0], placeholders[1], placeholders[2]]
    if is_fp16:
        # Promote all inputs first, then broadcast them.
        inputs = [ops.cast_to(tensor, "float32") for tensor in inputs]
    x1, x2, grad = [ops.broadcast(tensor, shape_max) for tensor in inputs]

    # x1 / (x1 + 1.18e-38); judging by the original name (not_zero_x1) this
    # presumably serves as a "x1 != 0" mask -- TODO confirm.
    tiny = tvm.const(1.18e-38, dtype="float32")
    x1_shifted = ops.vadds(x1, tiny)

    if product_is_mini():
        x1_mask = ops.vmul(x1, reciprocal(x1_shifted))

        def _log_through_fp16(*idx):
            # The log is evaluated on a float16 view of x2 and widened back.
            return tvm.log(x2(*idx).astype("float16")).astype("float32")

        log_x2 = tvm.compute(x2.shape, _log_through_fp16, name="log_x2")
    else:
        x1_mask = Divide(x1, x1_shifted, target="cce")
        log_x2 = ops.vlog(x2)

    # Gradient w.r.t. x1: x1_mask * log(x2) * grad
    d_x1 = ops.vmul(x1_mask, log_x2)
    d_x1_weighted = ops.vmul(d_x1, grad)

    # Gradient w.r.t. x2: (x1 / x2) * grad
    if product_is_mini():
        d_x2 = ops.vmul(x1, reciprocal(x2))
    else:
        d_x2 = Divide(x1, x2, target="cce")
    d_x2_weighted = ops.vmul(d_x2, grad)

    # Reduce over the axes that were introduced by broadcasting.
    results = (
        ops.sum(d_x1_weighted, rx, keepdims=True),
        ops.sum(d_x2_weighted, ry, keepdims=True),
    )
    if is_fp16:
        results = tuple(ops.cast_to(tensor, "float16") for tensor in results)
    return results
Beispiel #3
0
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2,
                           epsilon):
    """
    Compute one AdaMax update step.

    Update rules implemented below:
        m   <- m + (grad - m) * (1 - beta1)
        v   <- max(beta2 * v, |grad|)
        var <- var - (lr * m) / ((1 - beta1_power) * (v + epsilon))

    ``lr``, ``beta1``, ``beta1_power`` and ``beta2`` are tensors; only their
    element at index 0 is used as the scalar factor. ``epsilon`` is a Python
    number that is wrapped into a float32 constant.

    When ``var`` is float16 all tensor inputs are promoted to float32 for the
    computation and the three results are cast back to float16.

    Returns:
        tuple: the updated ``(var, m, v)``.
    """
    inp_dtype = var.dtype
    promote = inp_dtype == 'float16'
    if promote:
        # cast to float32 for improved accuracy
        var, m, v, lr, beta1_power, beta1, beta2, grad = [
            topi.cast(tensor, 'float32')
            for tensor in (var, m, v, lr, beta1_power, beta1, beta2, grad)
        ]
    epsilon = tvm.const(epsilon, 'float32')

    # ---- m += (grad - m) * (1 - beta1) ----
    neg_beta1 = tvm.compute(
        beta1.shape, lambda *i: beta1(*i) * neg_one_const("float32"))
    one_minus_beta1 = tvm.compute(
        neg_beta1.shape, lambda *i: neg_beta1(*i) + one_const("float32"))
    grad_minus_m = topi.subtract(grad, m)
    m_increment = tvm.compute(
        grad_minus_m.shape, lambda *i: grad_minus_m(*i) * one_minus_beta1[0])
    m = topi.add(m, m_increment)

    # ---- v = max(beta2 * v, abs(grad)) ----
    decayed_v = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    v = topi.maximum(decayed_v, topi.abs(grad))

    # ---- var -= (lr * m) / ((1 - beta1_power) * (v + epsilon)) ----
    v_plus_eps = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    neg_beta1_power = tvm.compute(
        beta1_power.shape,
        lambda *i: beta1_power(*i) * neg_one_const("float32"))
    one_minus_beta1_power = tvm.compute(
        neg_beta1_power.shape,
        lambda *i: neg_beta1_power(*i) + one_const("float32"))
    denominator = tvm.compute(
        v_plus_eps.shape,
        lambda *i: v_plus_eps(*i) * one_minus_beta1_power[0])
    numerator = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # The division is expressed as numerator * reciprocal(denominator).
    step = topi.multiply(numerator, reciprocal(denominator))
    var = topi.subtract(var, step)

    if promote:
        var, m, v = [topi.cast(tensor, inp_dtype) for tensor in (var, m, v)]

    return var, m, v
Beispiel #4
0
def softsign_compute(input_features):
    """
    Compute softsign as ``x * reciprocal(|x| + SCALAR_ONE)``.

    float16 input is promoted to float32 for the computation and the result
    is cast back to float16; other dtypes are processed as they are.
    SCALAR_ONE is a module-level constant (presumably 1 -- confirm at its
    definition).
    """
    dtype = input_features.dtype
    is_fp16 = dtype == "float16"

    features = input_features
    if is_fp16:
        features = akg.lang.ascend.cast_to(features, "float32")

    # denominator: |x| + SCALAR_ONE
    denominator = akg.lang.ascend.vadds(
        akg.lang.ascend.vabs(features), SCALAR_ONE)
    res = akg.lang.ascend.vmul(features, reciprocal(denominator))

    return akg.lang.ascend.cast_to(res, "float16") if is_fp16 else res
Beispiel #5
0
def fake_quant_with_min_max_vars_per_channel_compute(input_data,
                                                     input_min,
                                                     input_max,
                                                     num_bits=8,
                                                     narrow_range=False):
    """
    Compute implementation of fake_quant_with_min_max_vars_per_channel.

    Steps:
        1. Broadcast ``input_min`` / ``input_max`` to the shape of ``input_data``.
        2. Obtain nudged_min, nudged_max and scale from nudged_min_max_compute.
        3. Clamp the input into [nudged_min, nudged_max].
        4. Quantize: floor((clamped - nudged_min) / scale + 0.5).
        5. Dequantize: quantized * scale + nudged_min.
        6. Multiply by the result of bool_both_zero_compute(min, max)
           (presumably a mask handling the min == max == 0 case -- see that
           helper for the exact semantics).

    Args:
        input_data: tensor to be fake-quantized.
        input_min: per-channel minimum, broadcast to the shape of input_data.
        input_max: per-channel maximum, broadcast to the shape of input_data.
        num_bits (int): bit width forwarded to nudged_min_max_compute.
        narrow_range (bool): forwarded to nudged_min_max_compute.

    Returns:
        The fake-quantized tensor.
    """
    dtype = input_data.dtype
    full_shape = get_shape(input_data.shape)
    min_broadcast = akg.lang.ascend.broadcast(input_min, full_shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, full_shape, dtype)

    # nudged_min_max_compute yields [nudged_min, nudged_max, scale]
    nudged = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                                    narrow_range)
    nudged_min = nudged[0]
    nudged_max = nudged[1]
    scale = nudged[2]

    # clamp the input into [nudged_min, nudged_max]
    upper_bounded = topi.minimum(input_data, nudged_max)
    clamped = topi.maximum(upper_bounded, nudged_min)

    # quantize: (clamped - nudged_min) / scale
    shifted = topi.subtract(clamped, nudged_min)
    if product_is_mini():
        # division expressed as multiply-by-reciprocal on this product
        quantized = mul(shifted, reciprocal(scale), target=utils.CCE)
    else:
        quantized = Divide(shifted, scale, target=utils.CCE)

    # round to nearest through floor(x + 0.5)
    rounded = akg.lang.ascend.floor(topi.add(quantized, dc.half_const(dtype)))
    if product_is_mini():
        # on this product the floor result goes through float16 first
        rounded = topi.cast(rounded, "float16")
    rounded = topi.cast(rounded, "float32")

    # dequantize: rounded * scale + nudged_min
    dequantized = topi.add(topi.multiply(rounded, scale), nudged_min)

    both_zero_factor = bool_both_zero_compute(min_broadcast, max_broadcast)
    return topi.multiply(dequantized, both_zero_factor)
Beispiel #6
0
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                           narrow_range):
    """
    Calculate the nudged minimum / maximum and the scale of the quantization.

    Notes:
        With quant_min = 1 if narrow_range else 0 and
        quant_max = 2^num_bits - 1, the following is computed element-wise:
                scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
                zero_point_from_min = quant_min - min_broadcast / scale
        The nudged zero point is assembled from three mutually exclusive terms
        (less_compare_float32(a, b) is expected to yield 1 where a < b and 0
        elsewhere):
                quant_min                        where zero_point_from_min < quant_min
                quant_max                        where zero_point_from_min > quant_max
                floor(zero_point_from_min + 0.5) elsewhere
        Finally:
                nudged_min = (quant_min - nudged_zero_point) * scale
                nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantized for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantized for each channel.
        num_bits (int): bit width of the quantization.
        narrow_range (bool): if True the quantization range is [1, 2^num_bits - 1],
                      otherwise it is [0, 2^num_bits - 1].

    Returns:
        list: [nudged_min, nudged_max, scale], each with the shape of min_broadcast.
    """
    dtype = min_broadcast.dtype
    shape = min_broadcast.shape
    quant_min = 1 if narrow_range else 0
    quant_max = (2 ** num_bits) - 1

    # quant_min / quant_max are materialized as full tensors because every
    # channel is processed element-wise.
    qmin = topi.full(shape, dtype, tvm.const(quant_min, dtype))
    qmax = topi.full(shape, dtype, tvm.const(quant_max, dtype))

    value_span = topi.subtract(max_broadcast, min_broadcast)
    quant_span = topi.subtract(qmax, qmin)

    # scale = value_span / quant_span and min_over_scale = min_broadcast / scale
    if product_is_mini():
        # NOTE(review): this branch uses both `mul` and `Mul`; both names are
        # kept exactly as found -- confirm that both are importable.
        scale = mul(value_span, reciprocal(quant_span), target=utils.CCE)
        min_over_scale = Mul(min_broadcast, reciprocal(scale),
                             target=utils.CCE)
    else:
        scale = Divide(value_span, quant_span, target=utils.CCE)
        min_over_scale = Divide(min_broadcast, scale, target=utils.CCE)

    # zero_point = quant_min - min_broadcast / scale
    zero_point = topi.subtract(qmin, min_over_scale)

    # indicator tensors: 1 where the condition holds, 0 elsewhere
    is_below = less_compare_float32(zero_point, qmin)
    is_above = less_compare_float32(qmax, zero_point)

    # contributions of the out-of-range cases
    below_term = topi.multiply(qmin, is_below)
    above_term = topi.multiply(qmax, is_above)

    # indicator of quant_min <= zero_point <= quant_max
    ones = topi.full(shape, dtype, dc.one_const(dtype))
    not_below = topi.subtract(ones, is_below)
    not_above = topi.subtract(ones, is_above)
    in_range = topi.multiply(not_below, not_above)

    # in-range zero points are rounded through floor(x + 0.5); out-of-range
    # entries are zeroed before rounding and therefore contribute 0
    in_range_value = topi.multiply(zero_point, in_range)
    in_range_rounded = akg.lang.ascend.floor(
        topi.add(in_range_value, dc.half_const(dtype)))
    if product_is_mini():
        # on this product the floor result goes through float16 first
        in_range_rounded = topi.cast(in_range_rounded, "float16")
    in_range_rounded = topi.cast(in_range_rounded, "float32")

    # combine the three mutually exclusive terms
    out_of_range_term = topi.add(below_term, above_term)
    nudged_zero_point = topi.add(out_of_range_term, in_range_rounded)

    # nudged bounds of the quantization
    min_offset = topi.subtract(qmin, nudged_zero_point)
    max_offset = topi.subtract(qmax, nudged_zero_point)
    nudged_min = topi.multiply(min_offset, scale)
    nudged_max = topi.multiply(max_offset, scale)

    return [nudged_min, nudged_max, scale]
Beispiel #7
0
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """
    Compute one AdagradDA update step.

    Update rules implemented below:
        grad_accum         += grad
        grad_squared_accum += grad * grad
        tmp = sign(grad_accum) * max(|grad_accum| - l1 * global_step, 0)  if l1 > 0
              grad_accum                                                  otherwise
        var = (-lr * tmp) / (l2 * global_step * lr + sqrt(grad_squared_accum))

    ``var`` is only consulted for its dtype: when it is float16 the other
    tensor inputs are promoted to float32 and the three results are cast back
    to float16. ``l1[0]`` is the scalar used for the ``l1 > 0`` decision.

    Returns:
        tuple: ``(var_out, gradient_accum, gradient_squared_accum)``.
    """
    dtype = var.dtype
    is_fp16 = dtype == "float16"

    # cast to float32 for higher precision
    if is_fp16:
        gradient_accum, gradient_squared_accum, grad, lr, l1, l2 = [
            topi.cast(tensor, "float32")
            for tensor in (gradient_accum, gradient_squared_accum, grad, lr,
                           l1, l2)
        ]
    if product_is_mini():
        # on this product global_step goes through float16 before float32
        global_step = topi.cast(global_step, "float16")
    global_step = topi.cast(global_step, "float32")

    # 1. grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2. grad_squared_accum += grad * grad
    gradient_squared_accum = topi.add(gradient_squared_accum,
                                      topi.multiply(grad, grad))

    # 3. candidate for l1 > 0: Sign(grad_accum) * max(|grad_accum| - l1*global_step, 0)
    accum_sign = Sign(gradient_accum)
    accum_abs = topi.abs(gradient_accum)
    l1_penalty = topi.multiply(global_step, l1)
    shrunk = topi.subtract(accum_abs, l1_penalty)
    shrunk_clipped = topi.maximum(shrunk, tvm.const(0, shrunk.dtype))
    thresholded = topi.multiply(accum_sign, shrunk_clipped)

    # Pick `thresholded` when l1 > 0, otherwise the plain accumulator. On the
    # mini product the select operates on float16 copies and the result is
    # widened back to float32 afterwards.
    if product_is_mini():
        cond_l1 = topi.cast(l1, "float16")
        when_true = topi.cast(thresholded, "float16")
        when_false = topi.cast(gradient_accum, "float16")
    else:
        cond_l1, when_true, when_false = l1, thresholded, gradient_accum
    selected = akg.tvm.compute(
        when_true.shape,
        lambda *i: tvm.expr.Select(cond_l1[0] > 0, when_true(*i),
                                   when_false(*i)))
    if product_is_mini():
        selected = topi.cast(selected, "float32")

    # 4. numerator = -1 * lr * selected
    numerator = topi.multiply(lr, tvm.const(-1, "float32"))
    numerator = topi.multiply(numerator, selected)

    # 5. denominator = l2 * global_step * lr + sqrt(grad_squared_accum)
    l2_term = topi.multiply(l2, global_step)
    l2_term = topi.multiply(l2_term, lr)
    accum_root = sqrt(gradient_squared_accum, target=utils.CCE)
    denominator = topi.add(l2_term, accum_root)

    # 6. var = numerator / denominator
    if product_is_mini():
        var_out = topi.multiply(numerator,
                                reciprocal(denominator, target=utils.CCE))
    else:
        var_out = topi.divide(numerator, denominator)

    if is_fp16:
        var_out, gradient_accum, gradient_squared_accum = [
            akg.lang.ascend.cast_to(tensor, "float16")
            for tensor in (var_out, gradient_accum, gradient_squared_accum)
        ]

    return var_out, gradient_accum, gradient_squared_accum
Beispiel #8
0
def reciprocal(x, target=utils.CUDA):
    """
    Forward ``x`` and ``target`` to ``math.reciprocal`` and return its result.

    ``target`` defaults to ``utils.CUDA``. ``math`` here must be a module that
    provides ``reciprocal`` (the standard-library ``math`` does not), so it is
    presumably a project module -- confirm against the file's imports.
    """
    result = math.reciprocal(x, target)
    return result