Example #1
def CaffePReLU(device="llvm",
               lib_path="./",
               ndim=None,
               dtype=None,
               channel_shared=None):
    '''
    Caffe-style PReLU: build the compute definition, schedule it, and export it via Genlib.

    Args:
        device: target device string for the build (e.g. "llvm")
        lib_path: directory the generated library is written to
        ndim: number of dimensions of the input tensor
        dtype: data type of the input tensor and the slope
        channel_shared: if True, a single slope value is shared across all channels

    Returns:
        None; the compiled operator is exported through Genlib.
    '''
    shape = [tvm.var("n" + str(i)) for i in range(ndim)]
    channel = 1 if channel_shared else shape[1]
    opname = "CaffePReLU_ndim%d_%s_%s" % (
        ndim, dtype, "channelShared" if channel_shared else "channelNotShared")
    print(opname)

    in_tensor = tvm.placeholder(shape, dtype=dtype, name='in_tensor')
    slope = tvm.placeholder((channel, ), dtype=dtype, name='slope')
    if channel_shared:
        out_tensor = tvm.compute(shape, lambda *idx: tvm.if_then_else(in_tensor[idx] >= 0, in_tensor[idx],\
                                                                      in_tensor[idx] * slope[0]))
    else:
        out_tensor = tvm.compute(shape, lambda *idx: tvm.if_then_else(in_tensor[idx] >= 0, in_tensor[idx],\
                                                                      in_tensor[idx] * slope[idx[1]]))

    tensor_list = [in_tensor, slope, out_tensor]
    s = tvm.create_schedule(out_tensor.op)
    Genlib(s, tensor_list, device, opname, lib_path)
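A hypothetical invocation of the helper above (a sketch only; Genlib, the device string, and the lib_path directory are illustrative values, not taken from any particular project):

# Hypothetical usage sketch: would build the op named
# "CaffePReLU_ndim4_float32_channelNotShared" and export it via Genlib into lib_path.
CaffePReLU(device="llvm", lib_path="./build/", ndim=4, dtype="float32",
           channel_shared=False)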
Example #2
def test_if_then_else():
    cases = [[(tvm.var('cond', dtype='bool'), 'bool', 'int32'), 'int32'],
             [(True, 'int32', 'float32'), 'float32'],
             [(False, 'int32', 'int64'), 'int64'],
             [(tvm.var('cond', dtype='bool'), 'uint32', 'int32'), 'int32'],
             [(tvm.var('cond', dtype='int32'), 'uint32', 'int32'), 'int32']]
    for (cond, lhs_dtype, rhs_dtype), out_dtype in cases:
        lhs = tvm.var('lhs', dtype=lhs_dtype)
        rhs = tvm.var('rhs', dtype=rhs_dtype)
        if cond is True or cond is False:
            out = tvm.if_then_else(cond, lhs, rhs)
            out2 = tvm.if_then_else(not cond, rhs, lhs)
            out3 = tvm.if_then_else(not cond, lhs, rhs)
            assert tvm.ir_pass.Equal(out, out2) == 1
            if cond:
                assert tvm.ir_pass.Equal(out, lhs.astype(out_dtype)) == 1
                assert tvm.ir_pass.Equal(out3, rhs.astype(out_dtype)) == 1
            else:
                assert tvm.ir_pass.Equal(out, rhs.astype(out_dtype)) == 1
                assert tvm.ir_pass.Equal(out3, lhs.astype(out_dtype)) == 1
        elif cond.dtype == 'bool':
            out = tvm.if_then_else(cond, lhs, rhs)
            assert out.dtype == out_dtype
            assert out.args[1].dtype == out_dtype
            assert out.args[2].dtype == out_dtype
        elif cond.dtype != 'bool':
            check_throws(lambda: tvm.if_then_else(cond, lhs, rhs))
        else:
            raise ValueError('Unknown combinations')
Example #3
def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
    """Low level IR routing for multibox_prior operator.

    Parameters
    ----------
    data : Buffer
        Input data buffer.

    out : Buffer
        Output buffer.

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of int
        Priorbox center offsets, y and x respectively.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    ib = tvm.ir_builder.create()
    p_out = ib.buffer_ptr(out)
    in_height = data.shape[2]
    in_width = data.shape[3]
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    size_ratio_concat = sizes + ratios
    steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
    steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
    offset_h = offsets[0]
    offset_w = offsets[1]

    with ib.for_range(0, in_height, for_type="parallel", name="i") as i:
        center_h = (i + offset_h) * steps_h
        with ib.for_range(0, in_width, name="j") as j:
            center_w = (j + offset_w) * steps_w
            for k in range(num_sizes + num_ratios - 1):
                w = tvm.if_then_else(k < num_sizes,
                                     size_ratio_concat[k] * in_height / in_width / 2.0,
                                     size_ratio_concat[0] * in_height / in_width *
                                     math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                h = tvm.if_then_else(
                    k < num_sizes, size_ratio_concat[k] / 2.0,
                    size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                count = (i * in_width * (num_sizes + num_ratios - 1) +
                         j * (num_sizes + num_ratios - 1) + k) * 4
                p_out[count] = center_w - w
                p_out[count + 1] = center_h - h
                p_out[count + 2] = center_w + w
                p_out[count + 3] = center_h + h

    return ib.get()
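For context, a sketch of how such an IR routine is commonly hooked into a tensor expression with tvm.extern (an assumption modeled on the usual TOPI pattern; the wrapper name multibox_prior and its default arguments are illustrative):

# Sketch: wrap the IR builder routine as an extern op. The output shape mirrors the
# indexing used by the loop above: 4 values per (pixel, anchor) pair.
def multibox_prior(data, sizes=(1.0,), ratios=(1.0,), steps=(-1, -1), offsets=(0.5, 0.5)):
    num_anchors = len(sizes) + len(ratios) - 1
    oshape = (1, data.shape[2] * data.shape[3] * num_anchors, 4)
    return tvm.extern(oshape, [data],
                      lambda ins, outs: multibox_prior_ir(
                          ins[0], outs[0], sizes, ratios, steps, offsets),
                      dtype=data.dtype, name="multibox_prior")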
Example #4
def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
    res = tvm.if_then_else(
        (x * 4 + y) >= 466036,
        tvm.if_then_else(24512 <= ((((x * 4) + y) - 466036) % 24528),
                         (((((x * 4) + y) - 466036) % 24528) - 24512) % 16, x),
        y)

    res2 = tvm.if_then_else(
        (x * 4) >= 466036 - y,
        tvm.if_then_else(24512 <= ((((x * 4) + y) - 466036) % 24528),
                         (((((x * 4) + y) - 466036) % 24528) - 24512) % 16, x),
        y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x * 4) + y) - 4) % 24528)),
                         (((x * 4) + y) - 4) % 16, x), y)
    ck.verify(res, expected)
    ck.verify(res2, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3,
                          (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3,
                               (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10, tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)
Example #5
def within_index(b, e, s, i):
    """Return a boolean value that indicates if i is within the given index.

    Parameters
    ----------
    b : Expr
      beginning of the index

    e : Expr
      end of the index

    s : Expr
      strides of index

    i : Expr
      array position

    Returns
    -------
    selected: Expr
        bool expression that is True if the array position would be selected
        by the index and False otherwise
    """
    bc = tvm.expr.Select(s < 0, i <= e, i < b)
    ec = tvm.expr.Select(s < 0, i > b, i >= e)
    ss = tvm.if_then_else(s < 0, ((i - e) + (e % tvm.abs(s)) + 1) % tvm.abs(s),
                          (i - b) % s)
    return tvm.expr.Select(tvm.expr.Or(bc, ec), tvm.const(False), ss.equal(0))
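A plain-Python analogue of the positive-stride case makes the predicate concrete (a sketch only; the negative-stride branch above uses a separate end-based formula):

# For s > 0, position i is selected by a[b:e:s] iff it lies in [b, e) and sits on the
# stride grid anchored at b.
def within_index_py(b, e, s, i):
    return b <= i < e and (i - b) % s == 0

print([i for i in range(10) if within_index_py(2, 9, 3, i)])  # -> [2, 5, 8]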
Example #6
def conv3d_channel_batch(B,
                         N,
                         M,
                         P,
                         C,
                         K,
                         L,
                         Q,
                         O,
                         stride=1,
                         padding=0,
                         dtype="float32"):
    A = tvm.placeholder((B, N, M, P, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, Q, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    P_out = max(0, (P + padding * 2 - Q) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding, P + 2 * padding, C),
        lambda b, i, j, k, c: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, k >= padding, i < N + padding,
                    j < M + padding, k < P + padding), A[
                        b, i - padding, j - padding, k - padding, c], 0.0),
        name="Apad")
    rx, ry, rz, rc = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L), name="ry"), \
                     tvm.reduce_axis((0, Q), name="rz"), tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute((B, N_out, M_out, P_out, O),
                         lambda b, i, j, k, o: tvm.sum(
                             Apad[b, i * stride + rx, j * stride + ry, k *
                                  stride + rz, rc] * W[rx, ry, rz, rc, o],
                             axis=[rx, ry, rz, rc]),
                         name="Output")
    return Output.op, [A, W, Output]
Example #7
def zero_pad2d(inputs, padding=0):
    """Zero padding for 2d tensor

    Args:
    -----------------------------
    inputs : tvm.tensor.Tensor
        shape [batch, channel, height, width]
    padding: (optional:0) int or tuple
        expected: (h_pad_up, h_pad_down, w_pad_up, w_pad_down)
    -----------------------------

    Returns:
    -----------------------------
    tvm.tensor.Tensor
        shape [batch, channel, padded_height, padded_width]
    -----------------------------
    """
    if isinstance(padding, (int, tvm.expr.IntImm)):
        padding = (padding, padding, padding, padding)
    assert_print(isinstance(padding, tuple),
                 "type(padding)={}".format(type(padding)))
    if len(padding) == 2:
        padding = (padding[0], padding[0], padding[1], padding[1])
    assert_print(len(padding) == 4)

    padding_zero = 0.0 if "float" in inputs.dtype else 0

    batch_size, in_channel, height, width = inputs.shape
    return tvm.compute(
        (batch_size, in_channel, height + padding[0] + padding[1],
         width + padding[2] + padding[3]),
        lambda b, c, h, w: tvm.if_then_else(
            tvm.all(h >= padding[0], h < height + padding[0],
                    w >= padding[2], w < width + padding[2]),
            inputs[b, c, h - padding[0], w - padding[2]], padding_zero))
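A quick shape check for the padder above (a sketch; assert_print is assumed to behave like a plain assert with a message):

x = tvm.placeholder((1, 3, 32, 32), name="x")
y = zero_pad2d(x, padding=(1, 1, 2, 2))
# Height grows by 1 + 1 and width by 2 + 2, so y logically has shape (1, 3, 34, 36).
print(y.shape)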
Example #8
def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m, ), name="A")
    Apad = tvm.compute(
        (m + 2, ),
        lambda i: tvm.if_then_else(tvm.all(i >= 1, i <= m), A[i - 1], 0.0),
        "Apad")
    B = tvm.compute((m, ), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert (dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)

        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
Example #9
def test_copy_pad():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.compute((m + 2, l),
                    lambda i, j: tvm.if_then_else(tvm.all(i >= 1, i < m + 1),
                                                  A[i - 1, j], 1.0),
                    name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)

    def cb(src, dst, pad_before, pad_after, pad_value):
        assert tvm.ir_pass.Simplify(src.elem_offset).value == 0
        assert pad_before[0].value == 1
        assert pad_before[1].value == 0
        assert pad_after[0].value == 1
        assert pad_after[1].value == 0
        assert pad_value.value == 1.0
        return tvm.make.Evaluate(0)

    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
Example #10
def test_copy_pad_split():
    m = 4 * 3
    A = tvm.placeholder((m, ), name="A")
    Apad = tvm.compute((m + 2,), lambda i:
                       tvm.if_then_else(tvm.all(i >= 1, i <= m),
                                        A[i - 1], 0.0), "Apad")
    B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2])
    s = tvm.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=4)
    s[Apad].compute_at(s[B], xo)
    s[Apad].pragma(s[Apad].op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    def cb(src, dst, pad_before, pad_after, pad_value):
        assert(dst.elem_offset.value == 0)
        assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1)

        rpad_before = tvm.max(1 - xo * 4, 0)
        rpad_after = tvm.max(xo * 4 - 7, 0)
        assert_expr_equal(pad_before[0], rpad_before)
        assert_expr_equal(pad_after[0], rpad_after)
        assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after)
        return tvm.make.Evaluate(0)
    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
Example #11
def compute_sinc(dtype, ndim):
    A = tvm.placeholder([tvm.var() for _ in range(ndim)], name='input', dtype=dtype)
    if dtype in ['float16', 'float32', 'float64']:
        var = tvm.const(np.pi, dtype)
        B = tvm.compute([tvm.var() for _ in range(ndim)],
                        lambda *index: tvm.if_then_else(A[index] == 0, tvm.const(1, dtype),
                                                        tvm.sin(var * A[index]) / (A[index] * var)),
                                                        name='output')
    else:
        var = tvm.const(np.pi, "float64")
        B = tvm.compute([tvm.var() for _ in range(ndim)],
                        lambda *index: tvm.if_then_else(A[index] == 0, tvm.const(1, 'float64'),
                                                        tvm.sin(var * A[index].astype('float64')) /
                                                        (A[index].astype("float64") * var)),
                                                        name='output')

    s = tvm.create_schedule(B.op)
    return s, A, B
Example #12
def padding(X, ph, pw):
    assert len(X.shape) >= 2
    nh, nw = X.shape[-2:]
    return tvm.compute(
        (*X.shape[:-2], nh + ph * 2, nw + pw * 2),
        lambda *i: tvm.if_then_else(
            tvm.any(i[-2] < ph, i[-2] >= nh + ph, i[-1] < pw, i[-1] >= nw + pw
                    ), 0, X[i[:-2] + (i[-2] - ph, i[-1] - pw)]),
        name='PaddedX')
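A minimal build-and-run sketch for the pad above, checked against np.pad (assuming the pre-1.0 tvm API used throughout these examples and an LLVM-enabled build):

import numpy as np

X = tvm.placeholder((1, 3, 4, 4), name="X")
Y = padding(X, 1, 2)                     # pad height by 1 and width by 2 on each side
s = tvm.create_schedule(Y.op)
f = tvm.build(s, [X, Y], "llvm")

ctx = tvm.cpu(0)
x = tvm.nd.array(np.random.uniform(size=(1, 3, 4, 4)).astype("float32"), ctx)
y = tvm.nd.empty((1, 3, 6, 8), "float32", ctx)
f(x, y)
ref = np.pad(x.asnumpy(), ((0, 0), (0, 0), (1, 1), (2, 2)), mode="constant")
tvm.testing.assert_allclose(y.asnumpy(), ref)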
Example #13
 def _gaussian_map_sum(i, j):
     # i is row, j is col
     x, y = data[ni, 0], data[ni, 1]
     sigma = data[ni, 2]
     sigma2 = sigma * sigma
     v = tvm.if_then_else(
         tvm.all(x >= 0, x < cols, y >= 0, y < rows),
         tvm.exp(-(topi.power((x - j), 2) + topi.power(
             (y - i), 2)) / (2 * sigma2)) / (2 * pi * sigma2), 0)
     return tvm.sum(v, axis=ni)
Example #14
 def _select(*indices):
     from_val = []
     index_tuple = []
     for i in range(n):
         from_val.append(
             within_index(begin[i], end[i], strides[i], indices[i]))
         index_tuple.append(
             make_idx(begin[i], end[i], strides[i], a.shape[i], indices[i]))
     return tvm.if_then_else(tvm.all(*from_val), v(*index_tuple),
                             a(*indices))
Example #15
def conv2d_channel_batch(B,
                         N,
                         M,
                         C,
                         K,
                         L,
                         O,
                         stride=1,
                         padding=0,
                         dtype="float32"):
    A = tvm.placeholder((B, N, M, C), dtype=dtype, name="A")
    W = tvm.placeholder((K, L, C, O), dtype=dtype, name="W")
    N_out = max(0, (N + padding * 2 - K) // stride) + 1
    M_out = max(0, (M + padding * 2 - L) // stride) + 1
    Apad = tvm.compute(
        (B, N + 2 * padding, M + 2 * padding, C),
        lambda b, i, j, k: tvm.if_then_else(
            tvm.all(i >= padding, j >= padding, i < N + padding, j < M +
                    padding), A[b, i - padding, j - padding, k], 0.0),
        name="Apad")
    rx, ry = tvm.reduce_axis((0, K), name="rx"), tvm.reduce_axis((0, L),
                                                                 name="ry")
    rc = tvm.reduce_axis((0, C), name="rc")
    Output = tvm.compute(
        (B, N_out, M_out, O),
        lambda b, i, j, k: tvm.sum(Apad[b, i * stride + rx, j * stride + ry, rc
                                        ] * W[rx, ry, rc, k],
                                   axis=[rx, ry, rc]),
        name="Output")

    s = tvm.create_schedule(Output.op)
    s[Apad].compute_inline()
    CL = s.cache_write(Output, "local")

    n, h, w, c = s[Output].op.axis
    out = s[Output].fuse(h, w)
    cfg = autotvm.get_config()
    cfg.define_split("split_n", n, num_outputs=2)
    cfg.define_split("split_c", c, num_outputs=2)
    no, ni = cfg["split_n"].apply(s, Output, n)
    co, ci = cfg["split_c"].apply(s, Output, c)
    s[Output].reorder(no, out, co, ni, ci)
    s[Output].parallel(out)

    # schedule CL
    s[CL].compute_at(s[Output], co)
    ni, hi, wi, ci = s[CL].op.axis
    xi, yi, ki = s[CL].op.reduce_axis
    cfg.define_split("split_k", ki, num_outputs=2)
    ko, ki = cfg["split_k"].apply(s, CL, ki)
    s[CL].reorder(ko, xi, yi, ni, ki, ci)
    s[CL].unroll(ki)
    s[CL].vectorize(ci)

    return s, [A, W, Output]
Example #16
def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 1D convolution ncw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    Input : tvm.Tensor
        3-D with shape [batch, in_channel, inp_width]
    Filter : tvm.Tensor
        3-D with shape [in_channel, num_filter, kernel_size]
    stride : tuple of one int
        The spatial stride along width
    padding : int, tuple, or string
        int: padding size
        tuple of 2 ints: (pad_left, pad_right) for left and right padding
        string: ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        3-D with shape [batch, out_channel, out_width]
    """
    if isinstance(stride, (tuple, list)):
        stride = stride[0]
    cfg.stride = stride
    batch, inp_channels, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_size = get_const_tuple(kernel.shape)
    pad_left, pad_right = nn.get_pad_tuple1d(padding, kernel_size)
    out_width = (inp_width - 1) * stride + kernel_size - pad_left - pad_right
    pad_left = kernel_size - 1 - pad_left
    pad_right = kernel_size - 1 - pad_right
    dilated_width = stride * (inp_width - 1) + 1
    data = tvm.compute(
        (batch, inp_channels, pad_left + dilated_width + pad_right),
        lambda n, c, x: tvm.if_then_else(
            tvm.all(x >= pad_left, x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride).equal(0)), data[
                        n, c, tvm.indexdiv(x - pad_left, stride)],
            tvm.const(0., "float32")),
        name='data_pad')

    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dw = tvm.reduce_axis((0, kernel_size), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: tvm.sum(data[b, dc, w + dw].astype(out_dtype) * kernel[
            dc, c, kernel_size - 1 - dw].astype(out_dtype),
                                axis=[dc, dw]),
        tag="conv1d_transpose_ncw")

    return data_out
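As a worked example of the width arithmetic above (an illustration only, and assuming get_pad_tuple1d yields (1, 1) for an integer padding of 1): with inp_width = 8, stride = 2 and kernel_size = 4, out_width = (8 - 1) * 2 + 4 - 1 - 1 = 16; after the flip, pad_left = pad_right = 4 - 1 - 1 = 2 and dilated_width = 2 * (8 - 1) + 1 = 15, so the padded input is 2 + 15 + 2 = 19 wide, and the stride-1 convolution with the reversed kernel yields 19 - 4 + 1 = 16 outputs, matching out_width.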
Example #17
def make_relu_gradient(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: use tvm.select"""
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = tvm.placeholder(shape, dtype=dtype, name="B")
    C = tvm.compute(
        A.shape, lambda *i: tvm.if_then_else(
            A(*i) > tvm.const(0, A.dtype), B(*i), tvm.const(0, A.dtype)))

    s = tvm.create_schedule(C.op)
    f = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name=func_name)
    return f
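A short numerical check sketch for the gradient above (assuming an LLVM-enabled build; tgt_host is left as None):

import numpy as np

shape = (64,)
f = make_relu_gradient(shape, "llvm", None, "relu_grad")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(-1, 1, size=shape).astype("float32"), ctx)
b = tvm.nd.array(np.random.uniform(-1, 1, size=shape).astype("float32"), ctx)
c = tvm.nd.empty(shape, "float32", ctx)
f(a, b, c)
# dReLU/dx is 1 where the input is positive and 0 elsewhere, so the result is B masked by A > 0.
tvm.testing.assert_allclose(c.asnumpy(), np.where(a.asnumpy() > 0, b.asnumpy(), 0))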
Example #18
def test_schedule_bound_condition():
    A = tvm.placeholder((64,), name='A', dtype="float32")
    Apad = tvm.compute((66,), lambda i: tvm.if_then_else(
        tvm.all(i > 0, i < 65), A[i - 1], tvm.const(0., "float32")), name='Apad')
    Apad2 = tvm.compute((66,), lambda i: Apad[i] * 2, name='Apad2')
    s = tvm.create_schedule(Apad2.op)
    AL1 = s.cache_read(A, "local", [Apad])
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    stmt = tvm.ir_pass.Simplify(stmt)
    assert isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse)
Example #19
 def _pad(*indices):
     index_tuple = []
     above = []
     below = []
     for i in range(n):
         if equal_const_int(pad_before[i], 0) and equal_const_int(
                 pad_after[i], 0):
             index_tuple.append(indices[i])
             above.append(False)
             below.append(False)
         else:
             index_tuple.append(indices[i] - pad_before[i])
             above.append(indices[i] >= data.shape[i] + pad_before[i])
             below.append(indices[i] < pad_before[i])
     mapped_tuple = []
     for i, axis in enumerate(index_tuple):
         mapped_axis = tvm.if_then_else(below[i], -axis - mode, axis)
         mapped_axis = tvm.if_then_else(
             above[i], (2 * (data.shape[i] - 1)) - axis + mode, mapped_axis)
         mapped_tuple.append(mapped_axis)
     return data(*mapped_tuple)
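The mode flag above appears to switch between reflect-style (mode = 0) and symmetric-style (mode = 1) mirroring at the borders; a NumPy cross-check sketch of those two conventions:

import numpy as np

a = np.arange(4)
print(np.pad(a, 2, mode="reflect"))    # [2 1 0 1 2 3 2 1] -- edge value not repeated (mode = 0)
print(np.pad(a, 2, mode="symmetric"))  # [1 0 0 1 2 3 3 2] -- edge value repeated (mode = 1)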
Example #20
def _decl_winograd_kernel_transform(kernel, tile_size, G):
    """Declare a Winograd kernel transform
    This exists separately to allow for precomputation
    The precomputation will most often happen on CPU

    Parameters
    ----------
    kernel : tvm.Tensor
        The kernel to transform

    tile_size : int
        The size of the tile to use for the Winograd filter

    G : tvm.Tensor
        The Winograd kernel-transform matrix; its first dimension defines alpha

    Returns
    -------
    U : tvm.Tensor
        Transformed kernel

    """
    CO, CI, KH, KW = [get_const_int(x) for x in kernel.shape]
    # Only support 32 bit floats
    out_dtype = 'float32'

    alpha = G.shape[0]
    K = CO
    C = CI

    def upround(x, align):
        return (x + align - 1) // align * align

    ALIGN = 16
    K_round = upround(K, ALIGN)

    # Padded Kernel [K_round, C, KH, KW]
    # Pad the number of kernels to multiple of ALIGN
    padded_kernel = tvm.compute((K_round, C, KH, KW),
                                lambda k, c, h, w:
                                tvm.if_then_else(k < K,
                                                 kernel[k][c][h][w],
                                                 tvm.const(0, out_dtype)),
                                name='padded_kernel')

    # U [alpha, alpha, K_round, C]
    # Perform the kernel transform
    r_kh = tvm.reduce_axis((0, KH), 'r_kh')
    r_kw = tvm.reduce_axis((0, KW), 'r_kw')
    U = tvm.compute((alpha, alpha, K_round, C),
                    lambda eps, nu, k, c:
                    tvm.sum(padded_kernel[k][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                            axis=[r_kh, r_kw]),
                    name='U')

    return U
Example #21
def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
    res = tvm.if_then_else((x * 4 + y) >= 466036,
                           tvm.if_then_else(24512 <= ((((x*4) + y) - 466036) % 24528),
                                            (((((x*4) + y)  - 466036) % 24528) -24512) % 16,
                                            x), y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x*4) + y) - 4) % 24528)),
                         (((x*4) + y)  - 4) % 16,
                         x), y)
    ck.verify(res, expected)
    # can only simplify if condition
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)
Example #22
def make_idx(b, e, s, z, i):
    """Return the array position in the selection that corresponds to an
    array position in the full array.

    The returned value is only meaningful if within_index() returns True
    for the same set of parameters.

    Parameters
    ----------
    b : Expr
      beginning of the index

    e : Expr
      end of the index

    s : Expr
      strides of index

    z : Expr
      size of the indexed dimension

    i : Expr
      array position

    Returns
    -------
    position: Expr
        int expression that corresponds to an array position in the selection.
    """
    bc = tvm.expr.Select(s < 0, i <= e, i < b)
    ec = tvm.expr.Select(s < 0, i > b, i >= e)

    # Clamp to array size
    b = tvm.expr.Select(z < b, z - 1, b)

    ss = tvm.if_then_else(s < 0,
                          (b - i) // tvm.abs(s),
                          (i - b) // s)
    return tvm.if_then_else(tvm.expr.Or(bc, ec), 88, ss)
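A plain-Python analogue for the positive-stride case, ignoring the size clamp (a sketch; the 88 above is just an arbitrary marker returned when within_index() would be False):

def make_idx_py(b, e, s, i):
    if i < b or i >= e:
        return 88          # out-of-selection marker, mirroring the constant above
    return (i - b) // s

print([make_idx_py(2, 9, 3, i) for i in (2, 5, 8)])  # -> [0, 1, 2]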
Example #23
def compute_backward_sinc(dtype, ndim, req):
    A = tvm.placeholder([tvm.var() for _ in range(ndim)], name='A', dtype=dtype)
    B = tvm.placeholder([tvm.var() for _ in range(ndim)], name='B', dtype=dtype)
    C = tvm.placeholder([tvm.var() for _ in range(ndim)], name='C', dtype=dtype)
    var = tvm.const(np.pi, dtype)
    D = tvm.compute([tvm.var() for _ in range(ndim)],
                    lambda *index: tvm.if_then_else(B[index] == 0, tvm.const(0, dtype),
                                                    (tvm.cos(var * B[index]) / B[index] -
                                                    C[index] / B[index]) * A[index]), name='in_grad')
    in_grad_a, in_grad = assign_by_req(D, req)
    s = tvm.create_schedule(in_grad.op)
    s[D].compute_inline()
    return s, A, B, C, in_grad_a, in_grad
Example #24
def padding(X, ph, pw):
    """Pad X with 0s in 2-D

    ph, pw : height and width padding
    """
    assert len(X.shape) >= 2
    nh, nw = X.shape[-2], X.shape[-1]
    return tvm.compute(
            (*X.shape[0:-2], nh+ph*2, nw+pw*2),
            lambda *i: tvm.if_then_else(
                tvm.any(i[-2]<ph, i[-2]>=nh+ph, i[-1]<pw, i[-1]>=nw+pw),
                0, X[i[:-2]+(i[-2]-ph, i[-1]-pw)]),
            name='PaddedX')
Example #25
 def _dilate(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if not equal_const_int(strides[i], 1):
             index_tuple.append(indices[i] // strides[i])
             not_zero.append((indices[i] % strides[i]).equal(0))
         else:
             index_tuple.append(indices[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
     return data(*index_tuple)
Example #26
 def _dilate(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if not equal_const_int(strides[i], 1):
             index_tuple.append(idxdiv(indices[i], strides[i]))
             not_zero.append(idxmod(indices[i], strides[i]).equal(0))
         else:
             index_tuple.append(indices[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
     return data(*index_tuple)
Example #27
 def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
     """Transform prior anchor box to output box through location predictions.
     """
     al = anchor[anchor_base_idx]
     at = anchor[anchor_base_idx + 1]
     ar = anchor[anchor_base_idx + 2]
     ab = anchor[anchor_base_idx + 3]
     aw = ar - al
     ah = ab - at
     ax = (al + ar) / 2.0
     ay = (at + ab) / 2.0
     px = loc[loc_base_idx]
     py = loc[loc_base_idx + 1]
     pw = loc[loc_base_idx + 2]
     ph = loc[loc_base_idx + 3]
     ox = px * vx * aw + ax
     oy = py * vy * ah + ay
     ow = tvm.exp(pw * vw) * aw / 2.0
     oh = tvm.exp(ph * vh) * ah / 2.0
     return tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
            tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
            tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
            tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)
Example #28
 def _pad(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
             index_tuple.append(indices[i])
         else:
             index_tuple.append(indices[i] - pad_before[i])
             not_zero.append(indices[i] >= pad_before[i])
             not_zero.append(indices[i] < data.shape[i] + pad_before[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), pad_value)
     return data(*index_tuple)
Example #29
File: pad.py Project: bddppq/tvm
 def _pad(*indices):
     not_zero = []
     index_tuple = []
     for i in range(n):
         if equal_const_int(pad_before[i], 0) and equal_const_int(pad_after[i], 0):
             index_tuple.append(indices[i])
         else:
             index_tuple.append(indices[i] - pad_before[i])
             not_zero.append(indices[i] >= pad_before[i])
             not_zero.append(indices[i] < data.shape[i] + pad_before[i])
     if not_zero:
         not_zero = tvm.all(*not_zero)
         return tvm.if_then_else(not_zero, data(*index_tuple), pad_value)
     return data(*index_tuple)
Example #30
 def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh):
     """Transform prior anchor box to output box through location predictions.
     """
     al = anchor[anchor_base_idx]
     at = anchor[anchor_base_idx + 1]
     ar = anchor[anchor_base_idx + 2]
     ab = anchor[anchor_base_idx + 3]
     aw = ar - al
     ah = ab - at
     ax = (al + ar) / 2.0
     ay = (at + ab) / 2.0
     px = loc[loc_base_idx]
     py = loc[loc_base_idx + 1]
     pw = loc[loc_base_idx + 2]
     ph = loc[loc_base_idx + 3]
     ox = px * vx * aw + ax
     oy = py * vy * ah + ay
     ow = exp(pw * vw) * aw / 2.0
     oh = exp(ph * vh) * ah / 2.0
     return tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox - ow)), ox - ow), \
         tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy - oh)), oy - oh), \
         tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox + ow)), ox + ow), \
         tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy + oh)), oy + oh)
Example #31
    def check_if_then_else(ctx, n, dtype):
        A = tvm.placeholder((n,), name='A', dtype=dtype)
        true_value = tvm.const(1, dtype=dtype)
        false_value = tvm.const(3, dtype=dtype)
        max_lhs = tvm.const(2, dtype=dtype)
        max_rhs = tvm.if_then_else(A[0] > 0, true_value, false_value)
        C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C')
        s = tvm.create_schedule(C.op)
        s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
        fun = tvm.build(s, [A, C], target)

        a = tvm.nd.empty((n,), A.dtype, ctx)
        c = tvm.nd.empty((n,), A.dtype, ctx)
        # Only need to test compiling here
        fun(a, c)
Example #32
    def check_if_then_else(ctx, n, dtype):
        A = tvm.placeholder((n,), name='A', dtype=dtype)
        true_value = tvm.const(1, dtype=dtype)
        false_value = tvm.const(3, dtype=dtype)
        max_lhs = tvm.const(2, dtype=dtype)
        max_rhs = tvm.if_then_else(A[0] > 0, true_value, false_value)
        C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C')
        s = tvm.create_schedule(C.op)
        s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
        fun = tvm.build(s, [A, C], target)

        a = tvm.nd.empty((n,), A.dtype, ctx)
        c = tvm.nd.empty((n,), A.dtype, ctx)
        # Only need to test compiling here
        fun(a, c)
Example #33
 def check_llvm(n, offset):
     if not tvm.runtime.enabled("llvm"):
         return
     A = tvm.placeholder((n, ), name='A')
     C = tvm.compute((n,), lambda i: tvm.if_then_else(i >= offset, A[i], 0.0), name='C')
     s = tvm.create_schedule(C.op)
     # build and invoke the kernel.
     f = tvm.build(s, [A, C], "llvm")
     ctx = tvm.cpu(0)
     # launch the kernel.
     a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
     c = tvm.nd.empty((n,), A.dtype, ctx)
     f(a, c)
     c_np = a.asnumpy()
     c_np[:offset] = 0
     tvm.testing.assert_allclose(c.asnumpy(), c_np)
Example #34
def gaussian_blur2d(M, N, k, dtype="float32"):
    A = tvm.placeholder((M, N), dtype=dtype, name="A")
    pad = k // 2
    number = k * k
    Apad = tvm.compute((M + 2 * pad, N + 2 * pad),
                       lambda i, j: tvm.if_then_else(
                           tvm.all(i >= pad, i < M + pad, j >= pad, j < N + pad
                                   ), A[i - pad, j - pad], 0.0),
                       name="Apad")
    rx = tvm.reduce_axis((0, k), name="rx")
    ry = tvm.reduce_axis((0, k), name="ry")
    B = tvm.compute(
        (M, N),
        lambda i, j: tvm.sum(Apad[i + rx, j + ry] / number, axis=[rx, ry]),
        name="B")
    return B.op, [A, B]
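A quick driver sketch (assuming an LLVM-enabled build). Note that despite the name, the weights are a uniform 1/k**2, i.e. a box blur over the zero-padded image, which is what the reference below computes:

import numpy as np

op, bufs = gaussian_blur2d(8, 8, 3)
s = tvm.create_schedule(op)
f = tvm.build(s, bufs, "llvm")
ctx = tvm.cpu(0)
a_np = np.random.uniform(size=(8, 8)).astype("float32")
a = tvm.nd.array(a_np, ctx)
b = tvm.nd.empty((8, 8), "float32", ctx)
f(a, b)
ref = np.zeros((8, 8), dtype="float32")
padded = np.pad(a_np, 1, mode="constant")
for i in range(8):
    for j in range(8):
        ref[i, j] = padded[i:i + 3, j:j + 3].mean()
tvm.testing.assert_allclose(b.asnumpy(), ref, rtol=1e-5)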
Example #35
 def check_llvm(n, offset):
     if not tvm.module.enabled("llvm"):
         return
     A = tvm.placeholder((n, ), name='A')
     C = tvm.compute((n,), lambda i: tvm.if_then_else(i >= offset, A[i], 0.0), name='C')
     s = tvm.create_schedule(C.op)
     # build and invoke the kernel.
     f = tvm.build(s, [A, C], "llvm")
     ctx = tvm.cpu(0)
     # launch the kernel.
     a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
     c = tvm.nd.empty((n,), A.dtype, ctx)
     f(a, c)
     c_np = a.asnumpy()
     c_np[:offset] = 0
     tvm.testing.assert_allclose(c.asnumpy(), c_np)
Example #36
def poolingb(Image, Index, POutput):
    """
    Gradient (reverse) of 2x2 max pooling

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Index : tvm.tensor.Tensor
        Specifies where Output[i,j,k,l] comes from, following the convention of
        NumPy and PyTorch. You will need this tensor to compute the gradient.
        ------------------------------------------
        For example, if Image is of shape [1, 4, 4, 1] (batch 1 and channel 1), then the slice
        Image[0, :, :, 0] is

        [[0.7243, 0.3236, 0.0124, 0.4314],
        [0.4104, 0.3997, 0.4534, 0.1791],
        [0.0973, 0.2673, 0.6907, 0.9207],
        [0.9268, 0.6590, 0.0312, 0.2364]]

        and Index is of shape [1, 2, 2, 1] and the slice Index[0, :, :, 0] is 

        [[ 0,  6],
        [12, 11]]

        because 0 = 0 * 4 + 0 (0, 0)
                6 = 1 * 4 + 2 (1, 2)
                12= 3 * 4 + 0 (3, 0)
                11= 2 * 4 + 3 (2, 3)
        --------------------------------------------
        4-D with shape [batch_size, out_height, out_width, in_channels]
    POutput:tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, in_channels]

    Returns
    -------
    PImage: tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    """
    _, _, W, _ = Image.shape

    PImage = tvm.compute(
        Image.shape, lambda n, i, j, c: tvm.if_then_else(
            tvm.all(i == Index[n, i // 2, j // 2, c] // W, j == Index[
                n, i // 2, j // 2, c] % W), POutput[n, i // 2, j // 2, c], 0.0)
    )

    return PImage
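A NumPy sketch of a forward 2x2 pooling that would produce Index in the convention described in the docstring (an illustration only; the source position is encoded as row * W + col):

import numpy as np

def maxpool2x2_with_index(image):
    n, h, w, c = image.shape
    out = np.zeros((n, h // 2, w // 2, c), dtype=image.dtype)
    index = np.zeros((n, h // 2, w // 2, c), dtype="int32")
    for b in range(n):
        for i in range(h // 2):
            for j in range(w // 2):
                for k in range(c):
                    win = image[b, 2 * i:2 * i + 2, 2 * j:2 * j + 2, k]
                    di, dj = np.unravel_index(np.argmax(win), (2, 2))
                    out[b, i, j, k] = win[di, dj]
                    index[b, i, j, k] = (2 * i + di) * w + (2 * j + dj)
    return out, index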
Example #37
def conv2db(Image, Filter, POutput):
    """
    Backward pass of a convolution with NHWC layout

    Parameters
    ----------
    Image : tvm.tensor.Tensor
        4-D with shape [batch_size, image_height, image_width, in_channels]
    Filter: tvm.tensor.Tensor
        4-D with shape [out_channels, in_channels, kernel_height, kernel_width]
    POutput:tvm.tensor.Tensor, gradient of Output
        4-D with shape [batch_size, out_height, out_width, out_channels]

    Returns
    -------
    PImage :tvm.tensor.Tensor, gradient of Image
        4-D with shape (Image.shape)
    PFilter:tvm.tensor.Tensor, gradient of Filter
        4-D with shape (Filter.shape)
    """
    N, H, W, C = Image.shape
    K, _, Hk, Wk = Filter.shape

    rx = tvm.reduce_axis((0, H - (Hk - 1)), name='rx')
    ry = tvm.reduce_axis((0, W - (Wk - 1)), name='ry')
    rn = tvm.reduce_axis((0, N), name='rn')

    PFilter = tvm.compute(
        Filter.shape, lambda o, c, h, w: tvm.sum(Image[rn, h + rx, w + ry, c] *
                                                 POutput[rn, rx, ry, o],
                                                 axis=[rn, rx, ry]))

    rx_k = tvm.reduce_axis((0, Hk), name='rx_k')
    ry_k = tvm.reduce_axis((0, Wk), name='ry_k')
    ro = tvm.reduce_axis((0, K), name='ro')

    PImage = tvm.compute(
        Image.shape,
        lambda n, h, w, c: tvm.sum(
            Filter[ro, c, Hk - rx_k - 1, Wk - ry_k - 1] * tvm.if_then_else(
                tvm.all(h + rx_k >= Hk - 1, h + rx_k < H,
                        w + ry_k >= Wk - 1, w + ry_k < W),
                POutput[n, h + rx_k - (Hk - 1), w + ry_k - (Wk - 1), ro],
                0.0),
            axis=[rx_k, ry_k, ro]))

    return (PImage, PFilter)
Example #38
def test_copy_pad():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.compute((m + 2, l), lambda i, j:
                    tvm.if_then_else(tvm.all(i >= 1, i < m + 1),
                                     A[i - 1, j], 1.0), name='B')
    s = tvm.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    def cb(src, dst, pad_before, pad_after, pad_value):
        assert tvm.ir_pass.Simplify(src.elem_offset).value == 0
        assert pad_before[0].value == 1
        assert pad_before[1].value == 0
        assert pad_after[0].value == 1
        assert pad_after[1].value == 0
        assert pad_value.value == 1.0
        return tvm.make.Evaluate(0)
    stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)
Example #39
 def _bilinear(n, c, h, w):
     outside = tvm.any(h < 0, w < 0, h >= in_height, w >= in_width)
     val = bilinear_sample_nchw(data, (n, c, h, w), in_height - 1, in_width - 1)
     return tvm.if_then_else(outside, zero, val)
Example #40
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:

        if dilation_h != 1 or dilation_w != 1:
            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array([
            [1 / 4.0, 0, 0],
            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
            [1 / 24.0, 1 / 12.0, 1 / 6.0],
            [1 / 24.0, -1 / 12.0, 1 / 6.0],
            [0, 0, 1]], out_dtype)

        B_data = np.array([
            [4, 0, 0, 0, 0, 0],
            [0, -4, 4, -2, 2, 4],
            [-5, -4, -4, -1, -1, 0],
            [0, 1, -1, 2, -2, -5],
            [1, 1, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 1]], out_dtype)

        A_data = np.array([
            [1, 0, 0, 0],
            [1, 1, 1, 1],
            [1, -1, 1, -1],
            [1, 2, 4, 8],
            [1, -2, 4, -8],
            [0, 0, 0, 1]], out_dtype)
    elif tile_size == 2:
        G_data = np.array([
            [1, 0, 0],
            [1.0/2, 1.0/2, 1.0/2],
            [1.0/2, -1.0/2, 1.0/2],
            [0, 0, 1]], out_dtype)

        B_data = np.array([
            [1, 0, 0, 0],
            [0, 1, -1, 1],
            [-1, 1, 1, 0],
            [0, 0, 0, -1]], out_dtype)

        A_data = np.array([
            [1, 0],
            [1, 1],
            [1, -1],
            [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m-1) // m, (W + m-1) // m
    P = N * nH * nW

    ##### space definition begin #####
    tile_bna_candidates = [1, 2, 4, 8, 16]
    factors = get_factors(CO)
    cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors])
    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
    ##### space definition end #####

    if cfg.is_fallback:
        cfg['tile_bnb'].val = 4
        cfg['tile_bna'].val = 4
        while CO % cfg['tile_bna'].val != 0:
            cfg['tile_bna'].val //= 2
        cfg['yt'].val = 8
        cfg.fallback_split('tile_t1', [-1, 128])
        cfg.fallback_split('tile_t2', [-1, 128])
        cfg.fallback_split('c_unroll', [-1, 8])

    bna = cfg['tile_bna'].val
    bnb = cfg['tile_bnb'].val

    P_round = (P + bnb - 1) // bnb * bnb
    assert CO % bna == 0 and P_round % bnb == 0

    # pack input tile
    input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
         tvm.if_then_else(
             b * bnb + bb < P,
             data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
             [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
                        tvm.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                                axis=[r_kh, r_kw]), name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp:
                    tvm.sum(input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu],
                            axis=[r_a, r_b]), name='V')

    # batch gemm
    ci = tvm.reduce_axis((0, CI), name='c')
    M = tvm.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p:
                    tvm.sum(U[eps][nu][co // bna][ci][co % bna] *
                            V[eps][nu][p // bnb][ci][p % bnb], axis=ci), name='M')

    A = const_matrix(A_data, 'A')
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    Y = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
                    tvm.sum(M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
                            axis=[r_a, r_b]), name='Y')

    # unpack output
    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
                         Y[co][n * nH * nW + (h//m) * nW + w//m][h % m][w % m]
                         # The following hack term is used to make the padding in batch gemm ("M")
                         # effective, otherwise the padding will be eliminated by bound inference.
                         # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding.
                         + tvm.expr.Mul(tvm.const(0, out_dtype),
                                        M[alpha-1][alpha-1][CO-1][P_round-1]),
                         name='output', tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
    return output
Example #41
 def _bilinear(i, c, y, x):
     outside = tvm.any(y < -1.0, x < -1.0, y > height, x > width)
     y = tvm.max(y, 0.0)
     x = tvm.max(x, 0.0)
     val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
     return tvm.if_then_else(outside, 0.0, val)