def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, layout='NHWC'):
    """Fused ReLU gradient and double batch-norm update gradient; reduces over N, H, W per channel."""
    if layout == "NCHW":
        # Transpose the NCHW inputs to NHWC so the reductions below can stay on axes (0, 1, 2).
        data_2, data_4, data_5, data_6, data_7 = [
            topi.transpose(t, axes=(0, 2, 3, 1)) for t in (data_2, data_4, data_5, data_6, data_7)]
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {}'.format(layout))

    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, c = data_7.shape
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0/(n*h*w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]
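
# A minimal NumPy reference of the same computation (hypothetical helper, NHWC
# inputs, illustrative names only); handy for checking a compiled kernel.
def _relu_grad_bn_double_update_grad_ref(d1, d2, d3, d4, d5, d6, d7):
    import numpy as np
    grad = np.where(d7 > 0, d5 + d6, 0).astype(np.float32)   # ReLU-masked gradient
    n, h, w, _ = d7.shape
    out1 = grad.sum(axis=(0, 1, 2))                           # per-channel sum
    out2 = (grad * (d2.astype(np.float32) - d1 / (n * h * w))).sum(axis=(0, 1, 2))
    out3 = (grad * (d4.astype(np.float32) - d3 / (n * h * w))).sum(axis=(0, 1, 2))
    return out1, out2, out3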
Example #2
def bitwise_and(x1, x2):
    """
    Computes the bitwise and of `x1` and `x2`.

    Args:
        x1 (tvm.tensor.Tensor): tensor x1, only support int16,uint16.
        x2 (tvm.tensor.Tensor): tensor x2, only support int16,uint16.

    Returns:
        A tvm.tensor.Tensor as result of bitwise and.
    """
    _check_parameters(x1, x2)

    shape_x = get_shape(x1)
    shape_y = get_shape(x2)
    _, _, shape_max = produce_shapes(shape_x, shape_y)

    data_x = topi.broadcast_to(x1, shape_max)
    data_y = topi.broadcast_to(x2, shape_max)

    res = tvm.compute(data_x.shape,
                      lambda *i: data_x(*i) & data_y(*i),
                      name="and_res")

    return res
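
# A minimal usage sketch, assuming the legacy tvm API used above (tvm.placeholder);
# shapes and dtypes are illustrative only.
x1 = tvm.placeholder((4, 16), name="x1", dtype="int16")
x2 = tvm.placeholder((1, 16), name="x2", dtype="int16")   # broadcast along the first axis
res = bitwise_and(x1, x2)                                 # shape (4, 16), dtype int16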
Example #3
def fake_quant_with_min_max_vars_per_channel_gradient(input_gradients,
                                                      input_data,
                                                      input_min,
                                                      input_max,
                                                      num_bits=8,
                                                      narrow_range=False):
    """
    Computes gradients of Fake-quantize on the 'input_data' tensor,

    output_backprops = input_gradients*(if input_data>=nudged_min and <=nudged_max 1 else 0)

    Args:
        input_gradients (tvm.tensor.Tensor): input gradients from previously operation
        input_data (tvm.tensor.Tensor): input of fake-quantize, only supports "float32"
        input_min (tvm.tensor.Tensor): input_min shape equals to input_max shape
            The last dimension shoud be same for shapes of min, max and shape_inputs
            only support fp32
        input_max (tvm.tensor.Tensor): only support fp32
        num_bits (int): Defaults to 8. bitwidth of the quantization,between 2 and 16
        narrow_range (bool): 
            True, quantized into the quantization range [1, 2^num_bits - 1]
            False,quantized into the quantization range [0, 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    input_gradients_shape = get_shape(input_gradients)
    input_data_shape = get_shape(input_data)
    input_min_shape = get_shape(input_min)
    input_max_shape = get_shape(input_max)

    utils.check_shape(input_gradients_shape)
    utils.check_shape(input_data_shape)
    utils.check_shape(input_min_shape)
    utils.check_shape(input_max_shape)

    utils.elemwise_shape_check(input_gradients.shape, input_data.shape)
    utils.elemwise_shape_check(input_min_shape, input_max_shape)
    if input_min_shape[0] != input_data_shape[-1]:
        raise RuntimeError(
            "The last dimension of min, max and input_data shapes must be the same"
        )

    utils.ops_dtype_check(input_gradients.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_data.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_min.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_max.dtype, utils.DtypeForDavinci.FLOAT32)

    if num_bits > 16 or num_bits < 2:
        raise RuntimeError("num_bits should be in range [2, 16]")

    input_min_broadcast = topi.broadcast_to(input_min, input_data_shape)
    input_max_broadcast = topi.broadcast_to(input_max, input_data_shape)

    res = fake_quant_with_min_max_vars_per_channel_gradient_compute(
        input_gradients, input_data, input_min_broadcast, input_max_broadcast,
        num_bits, narrow_range)
    return res
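
# Per the docstring formula, the backward pass is a pass-through mask; a NumPy
# sketch (nudged_min / nudged_max are the per-channel nudged bounds; the helper
# name is illustrative only):
def _fake_quant_grad_ref(dy, x, nudged_min, nudged_max):
    import numpy as np
    return dy * np.logical_and(x >= nudged_min, x <= nudged_max).astype(dy.dtype)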
Example #4
def fused_bn_reduce_grad(data0,
                         data1,
                         data2,
                         data3,
                         data4,
                         data5,
                         data6,
                         data7,
                         layout='NHWC',
                         out_dtype='float16',
                         target=utils.CUDA):
    """Fused batch-norm reduce-stage gradient; NCHW inputs are transposed to NHWC internally."""
    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {}'.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'
    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)

    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
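
# In symbols, with const = N*H*W, the fused kernel above computes per element:
#   out = (data3*const - data2 - data1*(data7 - data6/const)/data0) * data4*data5/const
# i.e. the usual batch-norm input-gradient shape: a per-channel scale applied to the
# gradient after subtracting the mean-gradient and correlation correction terms.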
Example #5
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta"""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = sqrt(accum_update + epsilon) * rsqrt(accum_res + epsilon) * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
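
# A NumPy sketch of the same Adadelta step (mirrors the comments above; reference
# only, not part of the kernel):
def _adadelta_ref(var, accum, accum_update, grad, lr, rho, eps):
    import numpy as np
    accum = rho * accum + (1.0 - rho) * grad * grad
    update = np.sqrt(accum_update + eps) / np.sqrt(accum + eps) * grad
    var = var - lr * update
    accum_update = rho * accum_update + (1.0 - rho) * update * update
    return var, accum, accum_update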
Example #6
def broadcast_to(x, shape, target=utils.CCE):
    """
    Broadcast an tensor to a compatible shape.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float32, float16, int8, uint8, int32
        shape (list, tuple): The shape of output tensor.

    Returns:
        An tvm.tensor.Tensor with the same type as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)
    utils.check_shape(shape)

    # check dtype
    dtype = x.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.ALL_TYPES)

    # The vector_dup instruction doesn't support int8 and uint8, so scalar int8/uint8
    # inputs are broadcast in float16 and cast back below; "auto cast" could simplify this.
    x_shape = get_shape(x)
    if len(x_shape) == 1 and x_shape[0] == 1 and dtype in ["int8", "uint8"]:
        x = Cast(x, "float16", target)

    res = topi.broadcast_to(x, shape)
    if res.dtype != dtype:
        res = Cast(res, dtype, target)
    return res
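
# A minimal usage sketch, assuming the legacy tvm API (tvm.placeholder); the
# shapes are illustrative only.
x = tvm.placeholder((1, 16), name="x", dtype="float16")
y = broadcast_to(x, (32, 16))   # y has shape (32, 16) and dtype float16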
Example #7
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
    data: length is 5
    data0: param0 beta
    data1: param1 gamma
    data2: param2 BNupdate: xi_variance
    data3: param6 BNreduce: xi_mean
    data4: param7 xi_conv2d

    layout: (N, C, H, W)

    output:
    beta + gamma * xi_variance * ( xi -  xi_mean/(N*H*W) )
    """

    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)

    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)

    add0 = topi.add(multiply2, data0)

    return add0
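
# NumPy one-liner equivalent of the fused op above (NHWC inputs, reference only):
#   out = data0 + data1 * data2 * (data4.astype('float32') - data3 / (n * h * w))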
Example #8
def matrix_set_diag_compute(input_matrix, input_diagonal, input_help):
    """matrix_set_diag compute implemention"""
    shape_input = get_shape(input_matrix)
    input_dtype = input_matrix.dtype

    if input_dtype == "int8" or input_dtype == "uint8":
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if input_dtype == "int32" and product_is_mini():
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if input_dtype == "int32" and not product_is_mini():
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    diag_tmp = topi.broadcast_to(input_diagonal, shape_input)
    help_tmp = topi.add(input_help, -1)
    help_y = topi.abs(help_tmp)

    res_vmul_x = topi.multiply(input_matrix, help_y)
    res_vmul_y = topi.multiply(diag_tmp, input_help)
    res = topi.add(res_vmul_x, res_vmul_y)

    if input_dtype == "int32" and product_is_mini():
        res = topi.cast(res, "float16")

    res = topi.cast(res, input_dtype)

    return res
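
# With input_help assumed to be 1 on the diagonal and 0 elsewhere, the compute
# above reduces to the NumPy expression (reference only):
#   res = input_matrix * (1 - input_help) + input_diagonal * input_help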