Example #1
def _elu_mini_compute(exp_res, data, shape):
    """
    do element-wise e^x - 1 compute in mini scene

    f(x) = e^x - 1,                   x <= TAYLOR_THRESHOLD or x >= 0
    f(x) = fifth taylor computer,     TAYLOR_THRESHOLD < x < 0

    Args:
        exp_res (tvm.tensor.Tensor): the tensor of e^x -1, float16
        data (tvm.tensor.Tensor): input, float16
        shape (list): the shape of input

    Returns: 
        tvm.tensor.Tensor
    """
    TAYLOR_THRESHOLD = -0.7
    input_right_border = tvm.const(0.0, "float16")
    right_border = tvm.compute(shape, lambda *i: input_right_border)

    taylor_res = _elu_taylor_compute(data)

    input_left_border = tvm.const(TAYLOR_THRESHOLD, "float16")
    left_border = tvm.compute(shape, lambda *i: input_left_border)
    exp_taylor_neg = tvm.compute(shape, lambda *i: tvm.expr.Select(
        data(*i) > left_border(*i), taylor_res(*i), exp_res(*i)), name="gt")
    exp_res = tvm.compute(shape, lambda *i: tvm.expr.Select(
        data(*i) < right_border(*i), exp_taylor_neg(*i), exp_res(*i)), name="lt")
    return exp_res
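A minimal NumPy sketch of the same piecewise rule (a hypothetical helper, not part of AKG): the kernel falls back to a fifth-order Taylor expansion of e^x - 1 only on (-0.7, 0), presumably to avoid the cancellation of exp(x) - 1 in float16 for small |x|.

import numpy as np

def expm1_mini_reference(x, threshold=-0.7):
    """Reference for _elu_mini_compute: e^x - 1, with a 5th-order Taylor
    expansion of e^x - 1 on (threshold, 0)."""
    x = np.asarray(x, dtype=np.float32)
    # 5th-order Taylor expansion of e^x - 1 around 0
    taylor = x + x**2 / 2 + x**3 / 6 + x**4 / 24 + x**5 / 120
    exact = np.expm1(x)
    use_taylor = np.logical_and(x > threshold, x < 0)
    return np.where(use_taylor, taylor, exact)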
Example #2
def _compute_mini(data_input, shape):
    """
    Use log and taylor to compute
    arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    """

    data_abs = topi.abs(data_input)
    result_ln = _compute_log(data_abs)
    result_taylor = _compute_taylor(data_abs)

    data_abs = topi.cast(data_abs, "float16")
    data_input = topi.cast(data_input, "float16")
    result_taylor = topi.cast(result_taylor, "float16")
    result_ln = topi.cast(result_ln, "float16")
    # use the Taylor expansion when |x| < 0.5, and the log formula when 0.5 <= |x| < 1
    data_res = tvm.compute(shape,
                           lambda *i : akg.tvm.expr.Select(data_abs(*i) < dc.half_const("float16"),
                                                           result_taylor(*i),
                                                           result_ln(*i)),
                           name="le")

    # arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    data_res_neg = topi.multiply(data_res, dc.neg_one_const("float16"))
    data_res = tvm.compute(shape,
                           lambda *i : akg.tvm.expr.Select(data_input(*i) < dc.zero_const("float16"),
                                                           data_res_neg(*i),
                                                           data_res(*i)),
                           name="neg")
    return data_res
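For reference, a NumPy sketch of the same split (the number of Taylor terms below is illustrative; _compute_taylor and _compute_log are not shown above): use a Taylor series for |x| < 0.5, the closed form 0.5*ln((1+|x|)/(1-|x|)) otherwise, and restore the sign at the end.

import numpy as np

def arctanh_reference(x):
    """Reference for _compute_mini: Taylor series for |x| < 0.5, log formula
    otherwise, sign restored via arctanh(-|x|) = -arctanh(|x|)."""
    x = np.asarray(x, dtype=np.float32)
    ax = np.abs(x)
    # first four odd-order Taylor terms of arctanh around 0
    taylor = ax + ax**3 / 3 + ax**5 / 5 + ax**7 / 7
    log_form = 0.5 * np.log((1 + ax) / (1 - ax))
    res = np.where(ax < 0.5, taylor, log_form)
    return np.where(x < 0, -res, res)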
Example #3
def ReLU6Grad(y_grad, x, target=utils.CUDA):
    """
    Computes Gradients of Rectified Linear 6.

    Args:
        y_grad (tvm.tensor.Tensor): Tensor of type float16, float32, gradients backpropagated to the ReLU6 op.
        x (tvm.tensor.Tensor): Tensor of type float16/float32, inputs that were passed to the ReLU6 op, or its outputs.

    Returns:
        tvm.tensor.Tensor, has same type and shape as x.
    
    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)
    shape = x.shape
    dtype = x.dtype

    zero = tvm.const(0, dtype)
    six = tvm.const(6, dtype)

    res0 = tvm.compute(shape,
                       lambda *i: tvm.if_then_else(x(*i) >= zero, x(*i), zero))
    res6 = tvm.compute(
        shape, lambda *i: tvm.if_then_else(x(*i) >= six, zero, res0(*i)))
    res = tvm.compute(
        shape, lambda *i: tvm.if_then_else(res6(*i) == zero, zero, y_grad(*i)))
    return res
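The chained if_then_else above passes the incoming gradient through only where 0 < x < 6; a one-line NumPy check (hypothetical helper, not part of AKG):

import numpy as np

def relu6_grad_reference(y_grad, x):
    """Reference for ReLU6Grad: the gradient flows only where 0 < x < 6."""
    x = np.asarray(x)
    y_grad = np.asarray(y_grad)
    return np.where((x > 0) & (x < 6), y_grad, np.zeros_like(y_grad))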
Example #4
def batch_matmul_4D(data1, data2, bias=None, out_dtype="float32", layout1="NHDT", layout2="NHDT", layout_out="NHDT"):
    layout1_dict = {}
    layout2_dict = {}
    layout1_str = layout1.replace('N', 'B').replace('H', 'b').replace('D', 'm').replace('T', 'k')  
    layout2_str = layout2.replace('N', 'B').replace('H', 'b').replace('D', 'n').replace('T', 'k')   
    layout1_list = list(layout1_str)
    layout2_list = list(layout2_str)

    for i in range(len(layout1)):
        layout1_dict[layout1_list[i]] = data1.shape[i]
        layout2_dict[layout2_list[i]] = data2.shape[i]

    reduce_axis = tvm.reduce_axis((0, layout1_dict['k']), name='reduce_axis')

    if out_dtype == "float32":
        res = tvm.compute((layout1_dict['B'], layout1_dict['b'], layout1_dict['m'], layout2_dict['n']), lambda B, b, i, j: tvm.sum(
                data1[B, b, i if layout1_list[2] == 'm' else reduce_axis, reduce_axis if layout1_list[3] == 'k' else i].astype("float") *
                data2[B, b, j if layout2_list[2] == 'n' else reduce_axis, reduce_axis if layout2_list[3] == 'k' else j].astype("float"), axis=reduce_axis))
    else:
        res = tvm.compute((layout1_dict['B'], layout1_dict['b'], layout1_dict['m'], layout2_dict['n']), lambda B, b, i, j: tvm.sum(
                data1[B, b, i if layout1_list[2] == 'm' else reduce_axis, reduce_axis if layout1_list[3] == 'k' else i] *
                data2[B, b, j if layout2_list[2] == 'n' else reduce_axis, reduce_axis if layout2_list[3] == 'k' else j], axis=reduce_axis))
    
    if bias is not None:
        res = topi.add(res, bias)

    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
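For the default layouts (layout1 = layout2 = "NHDT", i.e. data1 as (B, b, m, k) and data2 as (B, b, n, k)), the compute above reduces to a batched matmul with the second operand transposed. A hedged NumPy check for that default case only:

import numpy as np

def batch_matmul_4d_reference(data1, data2, bias=None):
    """NumPy check for batch_matmul_4D with the default "NHDT" layouts:
    data1 is (B, b, m, k), data2 is (B, b, n, k), result is (B, b, m, n)."""
    res = np.einsum("abmk,abnk->abmn", data1, data2)
    if bias is not None:
        res = res + bias        # same broadcasting as topi.add
    return res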
Example #5
def TensorcoreConv(data,
                   weight,
                   stride=[1, 1],
                   pad=[0, 0, 0, 0],
                   dilation=[1, 1],
                   out_dtype="float32",
                   name="out",
                   target=utils.CUDA):
    batch, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, _ = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1

    has_pad = not (pad_left == 0 and pad_right == 0 and pad_top == 0
                   and pad_bottom == 0)

    if has_pad:
        data_pad = tvm.compute(
            (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right,
             in_c),
            lambda n, h, w, i: tvm.if_then_else(
                tvm.all(h >= pad_top, h - pad_top < in_h, w >= pad_left,
                        w - pad_left < in_w),
                data[n, h - pad_top, w - pad_left, i],
                tvm.const(0.0, "float16"),
            ),
            name="Pad",
        )
    else:
        data_pad = data

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    if out_dtype == "float32":
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(data_pad[n, (h * s_h + rh * d_h), (
                w * s_w + rw * d_w), rc].astype("float32") * weight[
                    o, rh, rw, rc].astype("float32"),
                                       axis=[rc, rh, rw]),
            name=name)
    else:
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(data_pad[n, (h * s_h + rh * d_h), (
                w * s_w + rw * d_w), rc] * weight[o, rh, rw, rc],
                                       axis=[rc, rh, rw]),
            name=name)

    return out
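A naive NumPy reference (looping over output pixels, no tensor cores) that mirrors the shape arithmetic and the dilated receptive field above; data is NHWC, weight is (O, KH, KW, C). This is a sketch for checking results, not the implementation itself.

import numpy as np

def conv_nhwc_reference(data, weight, stride=(1, 1), pad=(0, 0, 0, 0), dilation=(1, 1)):
    """Naive NHWC reference for TensorcoreConv."""
    n, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, _ = weight.shape
    pt, pb, pl, pr = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pt + pb - k_h_d) // s_h + 1
    o_w = (in_w + pl + pr - k_w_d) // s_w + 1
    padded = np.pad(data, ((0, 0), (pt, pb), (pl, pr), (0, 0)))
    out = np.zeros((n, o_h, o_w, out_c), dtype=np.float32)
    for h in range(o_h):
        for w in range(o_w):
            # dilated receptive field: (N, KH, KW, C)
            patch = padded[:, h * s_h:h * s_h + k_h_d:d_h,
                           w * s_w:w * s_w + k_w_d:d_w, :]
            out[:, h, w, :] = np.einsum("nijc,oijc->no", patch, weight)
    return out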
Example #6
def _compute_update(logbase, sign_decay, sign_gm, grad):
    """Calculate var decay."""
    vmul_tmp = tvm.compute(sign_gm.shape,
                           lambda *indice: sign_gm(*indice) * sign_decay[0])
    vmul_tmp = tvm.compute(vmul_tmp.shape,
                           lambda *indice: vmul_tmp(*indice) * logbase[0])
    exp_tmp = exp(vmul_tmp)
    update = topi.multiply(exp_tmp, grad)
    return update
Example #7
def _compute_m_t(m, beta, grad):
    """Update m."""
    beta_tmp = tvm.compute(m.shape, lambda *indice: m(*indice) * beta[0])
    beta_na = tvm.compute(
        beta.shape, lambda *indice: beta(*indice) * neg_one_const("float32"))
    beta_na = tvm.compute(
        beta_na.shape, lambda *indice: beta_na(*indice) + one_const("float32"))
    beta_sub_tmp = tvm.compute(grad.shape,
                               lambda *indice: grad(*indice) * beta_na[0])
    m_t = topi.add(beta_tmp, beta_sub_tmp)
    return m_t
Example #8
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """sgd compute implementation"""
    dtype = parameters.dtype
    if dtype == "float16":
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # if weight_decay != 0.0, need compute grad_delta to update gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        grad_delta = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, grad_delta)

    stat_mid = topi.multiply(stat, tvm.const(-1, "float32"))
    stat_act = topi.add(stat_mid, tvm.const(1, "float32"))

    dampening_t = topi.multiply(stat_act, dampening)

    # update accum
    accum_delta = tvm.compute(accum.shape, lambda *indice: accum(*indice) * momentum[0])

    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_delta, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        parameters_delta = tvm.compute(gradient.shape, lambda *indice: gradient(*indice) * learning_rate[0])
        parameters_delta_2 = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * momentum[0])
        parameters_delta_2 = tvm.compute(parameters_delta_2.shape,
                                         lambda *indice: parameters_delta_2(*indice) * learning_rate[0])
        parameters_delta = topi.add(parameters_delta, parameters_delta_2)
        parameters_t = topi.subtract(parameters, parameters_delta)
    else:
        parameters_delta = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * learning_rate[0])
        parameters_t = topi.subtract(parameters, parameters_delta)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))


    if dtype == "float16":
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
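A minimal NumPy transcription of the update above, assuming NUM_ZERO == 0 and treating lr and momentum as scalars; the `stat` tensor is 1.0 on the first step, which is what disables dampening there.

import numpy as np

def sgd_reference(parameters, gradient, accum, stat, lr, momentum,
                  dampening=0.0, weight_decay=0.0, nesterov=False):
    """NumPy transcription of sgd_compute (assuming NUM_ZERO == 0)."""
    parameters = np.asarray(parameters, dtype=np.float32)
    gradient = np.asarray(gradient, dtype=np.float32)
    accum = np.asarray(accum, dtype=np.float32)
    stat = np.asarray(stat, dtype=np.float32)

    if weight_decay != 0.0:
        gradient = gradient + weight_decay * parameters

    stat_act = 1.0 - stat                      # 0 on the first step, 1 afterwards
    accum_t = momentum * accum + gradient
    if dampening != 0.0:
        accum_t = accum_t - dampening * stat_act * gradient

    if nesterov:
        parameters_t = parameters - lr * gradient - lr * momentum * accum_t
    else:
        parameters_t = parameters - lr * accum_t

    stat_t = np.zeros_like(stat)
    return parameters_t, accum_t, stat_t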
Example #9
def bitwise_and(x1, x2):
    """
    Computes the bitwise and of `x1` and `x2`.

    Args:
        x1 (tvm.tensor.Tensor): tensor x1, only support int16,uint16.
        x2 (tvm.tensor.Tensor): tensor x2, only support int16,uint16.

    Returns:
        A tvm.tensor.Tensor as result of bitwise and.
    """
    _check_parameters(x1, x2)

    shape_x = get_shape(x1)
    shape_y = get_shape(x2)
    _, _, shape_max = produce_shapes(shape_x, shape_y)

    data_x = topi.broadcast_to(x1, shape_max)
    data_y = topi.broadcast_to(x2, shape_max)

    res = tvm.compute(data_x.shape,
                      lambda *i: data_x(*i) & data_y(*i),
                      name="and_res")

    return res
Example #10
def _less_equal_compare_float32(data_x, data_y):
    """if x <= y, then return 1, else 0"""
    data_out = tvm.compute(
        data_x.shape, lambda *index: tvm.expr.Select(
            data_x(*index) <= data_y(*index), dc.one_const(data_x.dtype),
            dc.zero_const(data_x.dtype)))
    return data_out
Example #11
    def _default2zn(data):
        shape = [get_const(x) for x in data.shape]
        dtype = data.dtype
        if len(shape) < 2:
            raise ValueError(
                "length of shape of input_data should be greater than or equal to 2, but got %d"
                % len(shape))
        m, n = shape[-2:]
        output_shape = []
        for i in range(0, len(shape) - 2):
            output_shape.append(shape[i])
        m1 = (m + cs - 1) // cs
        n1 = (n + cs - 1) // cs
        output_shape.extend([n1, m1, cs, cs])

        def fcompute(*output_indices):
            input_indices = []
            batch_len = len(output_indices) - 4
            n1_indice = output_indices[batch_len]
            m1_indice = output_indices[batch_len + 1]
            m0_indice = output_indices[batch_len + 2]
            n0_indice = output_indices[batch_len + 3]
            m_indice = m1_indice * cs + m0_indice
            n_indice = n1_indice * cs + n0_indice
            for i in range(0, batch_len):
                input_indices.append(output_indices[i])
            input_indices.append(m_indice)
            input_indices.append(n_indice)
            res = tvm.if_then_else(tvm.any(m_indice >= m, n_indice >= n),
                                   tvm.const(0, dtype), data(*input_indices))
            return res

        output = tvm.compute(output_shape, fcompute, name=output_name)
        return output
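The same default-to-zN fractal transform can be written with pad/reshape/transpose; a NumPy sketch assuming the cube size `cs` (taken from the enclosing scope above) is 16:

import numpy as np

def default_to_zn_reference(data, cs=16):
    """NumPy reference for _default2zn: pad the trailing (m, n) axes up to
    multiples of `cs`, then tile into (..., n1, m1, cs, cs)."""
    data = np.asarray(data)
    *batch, m, n = data.shape
    m1 = (m + cs - 1) // cs
    n1 = (n + cs - 1) // cs
    pad_width = [(0, 0)] * len(batch) + [(0, m1 * cs - m), (0, n1 * cs - n)]
    padded = np.pad(data, pad_width)          # out-of-range elements become 0
    tiled = padded.reshape(*batch, m1, cs, n1, cs)
    b = len(batch)
    # (..., m1, m0, n1, n0) -> (..., n1, m1, m0, n0)
    return tiled.transpose(*range(b), b + 2, b, b + 1, b + 3)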
Example #12
def reverse_compute(input_data, axis):
    """reverse compute implementation."""
    shape = input_data.shape
    axis_flag = [1] * len(shape)
    for i in axis:
        axis_flag[i] = -1

    def _map_index(*index):
        """calculate normal index"""
        begin = [0] * len(shape)
        for i, _ in enumerate(shape):
            if i in axis:
                begin[i] = shape[i] - 1
            if i == 0:
                index_org = (index[i] * axis_flag[i] + begin[i], )
            else:
                index_org = index_org + (index[i] * axis_flag[i] + begin[i], )

        return index_org

    output = tvm.compute(shape,
                         lambda *i: input_data(*_map_index(*i)),
                         name='output')

    return output
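For reference only: the index remapping i -> shape[i] - 1 - i on each reversed dimension is exactly what np.flip does, so a hedged check is a one-liner.

import numpy as np

def reverse_reference(data, axis):
    """NumPy check for reverse_compute: flip along the requested axes."""
    return np.flip(np.asarray(data), axis=tuple(axis))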
Example #13
def Conv(data, weight, stride=[1, 1], pad=[0, 0, 0, 0], dilation=[1, 1], name="out", target=utils.CUDA):
    """
    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)
    batch, in_c, in_h, in_w = data.shape
    out_c, in_c, k_h, k_w = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1
    out_shape = (batch, out_c, o_h, o_w)

    data_pad = topi.nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right], 0.0)

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    out = tvm.compute(out_shape,
                    lambda n, c, h, w: tvm.sum(
                        data_pad[n, rc, h * s_h + rh * d_h, w * s_w + rw * d_w] * weight[c, rc, rh, rw],
                        axis=[rc, rh, rw]),
                    name=name)
    # use for relu condition
    # out = tvm.compute(out.shape, lambda *i: tvm.max(out(*i), tvm.const(0, out.dtype)), name="relu")
    return out
Example #14
def atan_grad(head, input_x):
    """
    Compute gradient of input_x in atan.

    .. math::
        dx = \\frac{1}{1 + x^2} \\cdot dy

    Args:
        head (tvm.tensor.Tensor): Gradient tensor of forward's output with the
                                  same shape and dtype as input_x.
        input_x (tvm.tensor.Tensor): Forward's input tensor; supports float16
                                     and float32.

    Returns:
        A tvm.tensor.Tensor as gradient of forward's input.
    
    Supported Platforms:
        'Ascend'
    """
    utils.elemwise_shape_check(head.shape, input_x.shape)
    utils.elemwise_dtype_check(head.dtype, input_x.dtype,
                               utils.DtypeForDavinci.ALL_FLOAT)

    dtype = input_x.dtype
    tensor_one = dc.one_const(dtype)

    def _compute(*i):
        return tensor_one / (tensor_one + input_x(*i) * input_x(*i)) * head(*i)

    out_tensor = tvm.compute(input_x.shape, _compute, name="out")

    return out_tensor
Example #15
def HSwishGrad(y_grad, x, target=utils.CUDA):
    """
    HSwishGrad
    Args:
        y_grad:
        x:

    Returns:

    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)
    shape = x.shape
    res0 = tvm.compute(shape, lambda *i: tvm.if_then_else(x(*i) <= -3, 0, y_grad(*i) * (2 * x(*i) + 3) / 6))
    res6 = tvm.compute(shape, lambda *i: tvm.if_then_else(x(*i) >= 3, y_grad(*i), res0(*i)))
    return res6
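A NumPy sketch of the piecewise gradient implemented above (hypothetical helper): 0 for x <= -3, y_grad for x >= 3, and y_grad * (2x + 3) / 6 in between.

import numpy as np

def hswish_grad_reference(y_grad, x):
    """Reference for HSwishGrad."""
    x = np.asarray(x, dtype=np.float32)
    y_grad = np.asarray(y_grad, dtype=np.float32)
    mid = y_grad * (2 * x + 3) / 6
    return np.where(x <= -3, 0.0, np.where(x >= 3, y_grad, mid))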
Example #16
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2,
                           epsilon):
    """Compute ada_max."""
    # cast to float32 for improved accuracy
    inp_dtype = var.dtype
    if inp_dtype == 'float16':
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    rhs = tvm.compute(beta1.shape,
                      lambda *i: beta1(*i) * neg_one_const("float32"))
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) + one_const("float32"))
    lhs = topi.subtract(grad, m)
    rhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) * rhs[0])
    m = topi.add(m, rhs)

    # v = max(beta2*v, abs(grad))
    lhs = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    rhs = topi.abs(grad)
    v = topi.maximum(lhs, rhs)

    # var -= lr / (1 - beta1_power) * (m / (v + epsilon))
    # lr * m / ((1 - beta1_power) * (v + epsilon))
    # v + epsilon
    rhs = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    # 1 - beta1_power
    lhs = tvm.compute(beta1_power.shape,
                      lambda *i: beta1_power(*i) * neg_one_const("float32"))
    lhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) + one_const("float32"))
    # (1 - beta1_power) * (v + epsilon)
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) * lhs[0])
    # lr * m
    lhs = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # lr * m / ((1 - beta1_power) * (v + epsilon))
    rhs = reciprocal(rhs)
    rhs = topi.multiply(lhs, rhs)
    var = topi.subtract(var, rhs)

    if inp_dtype == 'float16':
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
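The three commented update equations condense to a short NumPy transcription (treating lr, beta1, beta1_power, beta2 and epsilon as Python floats rather than 1-element tensors):

import numpy as np

def ada_max_reference(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """NumPy transcription of _apply_ada_max_compute."""
    m = m + (grad - m) * (1.0 - beta1)
    v = np.maximum(beta2 * v, np.abs(grad))
    var = var - lr * m / ((1.0 - beta1_power) * (v + epsilon))
    return var, m, v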
Example #17
def _apply_gradient_descent_compute(var, alpha, delta):
    """Compute gradient_descent"""
    # step 1: calculate delta * alpha
    var_change = tvm.compute(delta.shape,
                             lambda *indices: delta(*indices) * alpha[0])
    # step 2: calculate var - delta * alpha
    reuse_var = topi.subtract(var, var_change)
    return reuse_var
Example #18
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    do element-wise xlogy_grad compute

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): result of xlogy_grad
        output_y2 (tvm.tensor.Tensor): result of xlogy_grad
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addespmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        not_zero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_addespmin))
        log_x2 = tvm.compute(
            x2.shape,
            lambda *i: (tvm.log(x2(*i).astype("float16"))).astype("float32"),
            name="log_x2")
    else:
        not_zero_x1 = div(x1, x1_addespmin)
        log_x2 = akg.lang.cce.vlog(x2)

    partial_x1 = akg.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    partial_x2 = div(x1, x2) if not utils.product_is_mini() else \
        akg.lang.cce.vmul(x1, reciprocal(x2))
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
Example #19
def fake_quant_with_min_max_args(input_data,
                                 min_=-6,
                                 max_=6,
                                 num_bits=8,
                                 narrow_range=False):
    """
    Computes Fake-quantize the 'input_data' tensor,
    type float32 to 'output_data' tensor of same type

    output_data = (floor(clamped_shifted * inv_nudged_scale + 0.5f))) * scale
                  + nudged_min
    scale = (max-min) / (quant_max-quant_min)

    Args:
        data_x1 (tvm.tensor.Tensor): Tensor of dtype "float32"
        min ([float, int]): scalar, defaults to -6
        max ([float, int]): scalar, defaults to 6. [min; max] define the
                            clamping range for the input_data data
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth
                                 of the quantization,between 2 and 16
        narrow_range ([bool]):
            True, quantized into the quantization range [1; 2^num_bits - 1]
            False,quantized into the quantization range [0; 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits,
                                                  narrow_range)

    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    nudged_max_tensor = topi.add(zero_tensor, nudged_max)
    nudged_min_tensor = topi.add(zero_tensor, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Transform the input between nudged_max and nudged_min
    clamped_vmin = topi.minimum(input_data, nudged_max_tensor)
    clamped = topi.maximum(clamped_vmin, nudged_min_tensor)

    # Calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_tensor)
    vmul_shifted = topi.multiply(clamped_shifted, inv_nudged_scale)
    vadds_shifted = topi.add(vmul_shifted, 0.5)
    floor_vadds_shifted = floor(vadds_shifted)
    floor_cast = akg.lang.ascend.cast_to(floor_vadds_shifted, dtype)
    res_scale = topi.multiply(floor_cast, scale)
    res = topi.add(res_scale, nudged_min_tensor)

    return res
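The nudge_min_max helper is not shown above; the sketch below assumes it follows the common TensorFlow-style nudging of the clamping range, so treat it as an illustrative reference only.

import numpy as np

def fake_quant_reference(x, min_=-6.0, max_=6.0, num_bits=8, narrow_range=False):
    """Hedged NumPy sketch of fake_quant_with_min_max_args, assuming
    TensorFlow-style range nudging."""
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = 2.0 ** num_bits - 1.0
    scale = (max_ - min_) / (quant_max - quant_min)
    zero_point_from_min = quant_min - min_ / scale
    nudged_zero_point = np.clip(np.round(zero_point_from_min), quant_min, quant_max)
    nudged_min = (quant_min - nudged_zero_point) * scale
    nudged_max = (quant_max - nudged_zero_point) * scale
    clamped = np.clip(np.asarray(x, dtype=np.float32), nudged_min, nudged_max)
    return np.floor((clamped - nudged_min) / scale + 0.5) * scale + nudged_min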
Example #20
def _init_atan2_mask(data_y_, data_x_):
    """
    Compute mask for atan2.

    Args:
        data_y_ (tvm.tensor.Tensor): The y of atan2(y, x).
        data_x_ (tvm.tensor.Tensor): The x of atan2(y, x).

    Returns:
        mask (tvm.tensor.Tensor): The mask of x's and y's value.
    """
    is_cast_for_mini = utils.product_is_mini() and data_y_.dtype == "float32"

    # in mini, select only support float16
    if is_cast_for_mini:
        data_x = topi.cast(data_x_, "float16")
        data_y = topi.cast(data_y_, "float16")
    else:
        data_x = data_x_
        data_y = data_y_

    dtype_input = data_y.dtype

    tensor_one = dc.one_const(dtype_input)
    tensor_zero = dc.zero_const(dtype_input)
    tensor_neg_one = dc.neg_one_const(dtype_input)

    y_ge_zero = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_y(*i) >= tensor_zero, tensor_one, tensor_neg_one),
        name="y_ge_zero")

    x_lt_zero_y_mask = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_x(*i) < tensor_zero, y_ge_zero(*i), tensor_zero),
        name="xlt0_y_mask")

    if is_cast_for_mini:
        x_lt_zero_y_mask = topi.cast(x_lt_zero_y_mask, "float32")
        y_ge_zero = topi.cast(y_ge_zero, "float32")

    return (x_lt_zero_y_mask, y_ge_zero)
Example #21
def HSwishGrad(y_grad, x):
    """
    HSwishGrad
    Args:
        y_grad:
        x:

    Returns:

    """
    shape = x.shape

    res0 = tvm.compute(
        shape, lambda *i: tvm.if_then_else(
            x(*i) <= -3, 0,
            y_grad(*i) * (2 * x(*i) + 3) / 6))
    res6 = tvm.compute(
        shape, lambda *i: tvm.if_then_else(x(*i) >= 3, y_grad(*i), res0(*i)))
    return res6
Example #22
def relu_grad(head, in_data):
    shape = head.shape
    dtype = head.dtype

    zero = tvm.const(0, dtype)
    relugrad = tvm.compute(
        shape,
        lambda *i: tvm.if_then_else(in_data(*i) >= zero, head(*i), zero),
        tag=tag.INJECTIVE)
    return relugrad
Example #23
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7, data8,
                           data9, data10, data11, data12, data13, data14, data15, layout="NHWC",
                           out_dtype="float16", target=utils.CUDA):
    
    if layout == 'NCHW':
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    
    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    mul = topi.multiply(data2, data3)
    mul1221 = topi.divide(mul, scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    add = topi.add(data13, data14)
    addgrad = tvm.compute(add.shape, lambda *i: tvm.if_then_else(data15(*i) >= zero, add(*i), zero), tag=tag.INJECTIVE)
    addgrad = topi.cast(addgrad, inter_dtype)
    mul3283 = topi.multiply(scale, addgrad)
    sub1159 = topi.subtract(mul3283, data6)

    data5_cast = topi.cast(data5, inter_dtype)
    mul2372 = topi.divide(data4, scale)
    sub631 = topi.subtract(data5_cast, mul2372)
    mul1220 = topi.multiply(sub631, data1)
    div = topi.divide(mul1220, data0)
    sub271 = topi.subtract(sub1159, div)
    mul1218 = topi.multiply(mul1221, sub271)
    mul1218_cast = topi.cast(mul1218, out_dtype)

    mul1231 = topi.multiply(data11, data12)
    mul1230 = topi.divide(mul1231, scale)
    data9_cast = topi.cast(data9, inter_dtype)
    mul2364 = topi.divide(data8, scale)
    sub625 = topi.subtract(data9_cast, mul2364)
    mul1229 = topi.multiply(data10, sub625)

    div272 = topi.divide(mul1229, data7)
    sub272 = topi.subtract(sub1159, div272)
    mul1228 = topi.multiply(mul1230, sub272)
    mul1228_cast = topi.cast(mul1228, out_dtype)

    if layout == "NCHW":
        mul1218_cast = topi.transpose(mul1218_cast, (0, 3, 1, 2))
        mul1228_cast = topi.transpose(mul1228_cast, (0, 3, 1, 2))
    
    return [mul1218_cast, mul1228_cast]
Example #24
def batch_matmul_3d(data1, data2, attrs):
    """batch matmul for 3-D data"""
    bias, out_dtype, layout1, layout2, layout_out = attrs
    layout1_dict = {}
    layout2_dict = {}
    layout1 = layout1[1:]
    layout2 = layout2[1:]
    layout1_str = layout1.replace('N', 'b').replace(
        'H', 'b').replace('D', 'm').replace('T', 'k')
    layout2_str = layout2.replace('N', 'b').replace(
        'H', 'b').replace('D', 'n').replace('T', 'k')
    layout1_list = list(layout1_str)
    layout2_list = list(layout2_str)

    for i in range(len(layout1)):
        layout1_dict[layout1_list[i]] = data1.shape[i]
        layout2_dict[layout2_list[i]] = data2.shape[i]

    reduce_axis = tvm.reduce_axis(
        (0, layout1_dict.get('k')), name='reduce_axis')

    if out_dtype == "float32":
        res = tvm.compute(
            (layout1_dict.get('b'), layout1_dict.get('m'), layout2_dict.get('n')),
            lambda b, i, j: tvm.sum(
                data1[b, i if layout1_list[1] == 'm' else reduce_axis,
                      reduce_axis if layout1_list[2] == 'k' else i].astype("float") *
                data2[b, j if layout2_list[1] == 'n' else reduce_axis,
                      reduce_axis if layout2_list[2] == 'k' else j].astype("float"), axis=reduce_axis))
    else:
        res = tvm.compute(
            (layout1_dict.get('b'), layout1_dict.get('m'), layout2_dict.get('n')),
            lambda b, i, j: tvm.sum(
                data1[b, i if layout1_list[1] == 'm' else reduce_axis, reduce_axis if layout1_list[2] == 'k' else i] *
                data2[b, j if layout2_list[1] == 'n' else reduce_axis,
                      reduce_axis if layout2_list[2] == 'k' else j], axis=reduce_axis))
    if bias is not None:
        res = topi.add(res, bias)

    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)
    return res
Example #25
def cimag(inputs, attrs):
    del attrs
    in_tensor = inputs[0]
    out_shape = in_tensor.shape[:-1]

    def fcompute(*index):
        out_index = [x for x in index]
        out_index.append(1)
        return in_tensor(*out_index)

    return tvm.compute(out_shape, fcompute, name="imag")
Example #26
def topi_nn_hsigmoid(x):
    """
    topi hsigmoid
    Args:
        x:

    Returns:

    """
    return tvm.compute(x.shape, lambda *i: tvm.if_then_else(x(*i) <= -3, 0,
                                                            tvm.if_then_else(x(*i) >= 3, 1,
                                                                             (x(*i) + 3) / 6)))
Example #27
def my_dsl(dtype, kernel_name, attrs):
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    # `insn` and `insnType` are not parameters here; they are expected to be
    # defined at module scope by the surrounding test code.
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)

    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)

    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)

    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
        else:
            mod = akg.build(s, [A, C],
                            "cce",
                            name=kernel_name,
                            attrs=attrs,
                            polyhedral=True)
    return mod
Example #28
def resize_nearest_neighbor_grad(grad, size, align_corners=True, out_dtype=None):
    """
    Perform resize_nearest_neighbor_grad.
    """

    in_n, in_c, in_h, in_w = grad.shape
    output_shape = [in_n, in_c, size[0], size[1]]

    if align_corners:
        y_ratio = (in_h - 1).astype('float') / (size[0] - 1)
        x_ratio = (in_w - 1).astype('float') / (size[1] - 1)
    else:
        y_ratio = (in_h).astype('float') / (size[0])
        x_ratio = (in_w).astype('float') / (size[1])

    def _get_pixel(n, c, y, x):
        y = tvm.max(tvm.min(y, in_h - 1), 0)
        x = tvm.max(tvm.min(x, in_w - 1), 0)
        return grad(n, c, y, x).astype('float')

    def _get_indices(*indices):
        n, c, y, x = indices    
        return n, c, y, x

    def _cast_output(value):
        if out_dtype:
            dtype = out_dtype
        else:
            dtype = grad.dtype
        return value.astype(dtype)

    # Nearest neighbor computation
    def _nearest_neighbor_grad(*indices):
        n, c, y, x = _get_indices(*indices)

        in_y = y_ratio * y
        in_x = x_ratio * x

        if align_corners:
            yint = tvm.round(in_y).astype('int32')
            xint = tvm.round(in_x).astype('int32')
        else:
            # Add epsilon to floor to prevent gpu rounding errors.
            epsilon = 1e-5
            yint = tvm.floor(in_y + epsilon).astype('int32')
            xint = tvm.floor(in_x + epsilon).astype('int32')
        return _cast_output(_get_pixel(n, c, yint, xint))
 
    compute_func = _nearest_neighbor_grad

    return tvm.compute(output_shape, compute_func, name='resize_nearest_neighbor_grad', tag=tag.INJECTIVE)
Example #29
def HSigmoidGrad(y_grad, x):
    """
    HSigmoidGrad
    Args:
        y_grad:
        x:

    Returns:

    """
    return tvm.compute(
        x.shape, lambda *i: tvm.if_then_else(
            x(*i) <= -3, 0, tvm.if_then_else(x(*i) >= 3, 0,
                                             y_grad(*i) / 6)))
Example #30
def topi_nn_HSwish(x):
    """
    topi HSwish
    Args:
        x:

    Returns:

    """
    return tvm.compute(
        x.shape, lambda *i: tvm.if_then_else(
            x(*i) <= -3, 0,
            tvm.if_then_else(x(*i) >= 3, x(*i),
                             x(*i) * (x(*i) + 3) / 6)))
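The piecewise expression above equals x * clip((x + 3) / 6, 0, 1); a one-line NumPy check (hypothetical helper, not part of AKG):

import numpy as np

def hswish_reference(x):
    """Reference for topi_nn_HSwish: x * hard_sigmoid(x)."""
    x = np.asarray(x, dtype=np.float32)
    return x * np.clip((x + 3) / 6, 0, 1)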