import numpy as np
import tvm
import topi

# NOTE: get_all_backend is assumed to be the surrounding test suite's helper
# that returns the list of enabled target strings; it is not defined here.

def verify_transpose(in_shape, axes):
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.transpose(A, axes)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_injective(B)
        foo = tvm.build(s, [A, B], device, name="transpose")
        # Fill the input with a ramp so every element is distinct, then
        # compare against numpy's transpose as the reference.
        data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype)
        out_npy = data_npy.transpose(axes)
        data_nd = tvm.nd.array(data_npy, ctx)
        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
        foo(data_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

    for device in get_all_backend():
        check_device(device)
from tvm import te  # this variant uses the newer te-based API

# Same test written against the newer API: the placeholder comes from tvm.te
# and the injective schedule is looked up per device via topi.testing.
def verify_transpose(in_shape, axes):
    A = te.placeholder(shape=in_shape, name="A")
    B = topi.transpose(A, axes)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.testing.get_injective_schedule(device)(B)
        foo = tvm.build(s, [A, B], device, name="transpose")
        data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype)
        out_npy = data_npy.transpose(axes)
        data_nd = tvm.nd.array(data_npy, ctx)
        out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
        foo(data_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)

    for device in get_all_backend():
        check_device(device)
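# A minimal usage sketch for the verify_transpose helpers above. The shapes
# and axes below are illustrative, not taken from the original tests; running
# this assumes the imports and the get_all_backend helper noted earlier.
if __name__ == "__main__":
    verify_transpose((3, 10, 2), (1, 0, 2))   # explicit axis permutation
    verify_transpose((3, 10), None)           # axes=None reverses all axes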
def OptimalOut(input_tensor, temp_tensor, in_channel):
    """Deconvolution (2x upsampling) compute.

    Args:
        input_tensor: 4-D input tensor in NCHW layout.
        temp_tensor: 4-D weight tensor; transposed below so that its first
            axis becomes the output-channel axis.
        in_channel: number of input channels to reduce over.

    Returns:
        A 4-D tensor of shape (N, out_channel, 2*H, 2*W).
    """
    # Swap the first two axes so the weight layout is
    # (out_channel, in_channel, kh, kw).
    temp_tensor = topi.transpose(temp_tensor, axes=(1, 0, 2, 3))
    out_shape = []
    for i in range(len(input_tensor.shape)):
        if i == 0:
            out_shape.append(input_tensor.shape[i])
            continue
        if i == 1:
            out_shape.append(temp_tensor.shape[0])
            continue
        out_shape.append(2 * input_tensor.shape[i])
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    # Each 2x2 output block at (k, l) reads input pixel (k // 2, l // 2) and
    # the matching kernel tap (k % 2, l % 2), reducing over input channels.
    return tvm.compute(
        out_shape,
        lambda i, j, k, l: tvm.sum(
            input_tensor[i, rc, k // 2, l // 2].astype(input_tensor.dtype)
            * temp_tensor[j, rc, k % 2, l % 2].astype(input_tensor.dtype),
            axis=[rc]))
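# A minimal sketch of driving OptimalOut, assuming the pre-0.7 tvm API used
# above. The placeholder shapes and names are illustrative only.
def _optimal_out_demo():
    data = tvm.placeholder((1, 4, 8, 8), name="data")       # NCHW input
    weight = tvm.placeholder((4, 16, 2, 2), name="weight")  # (in_c, out_c, 2, 2)
    out = OptimalOut(data, weight, in_channel=4)            # shape (1, 16, 16, 16)
    s = tvm.create_schedule(out.op)
    # Inspect the generated loop nest before building.
    print(tvm.lower(s, [data, weight, out], simple_mode=True))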
def compute_transpose(attrs, inputs, out_info):
    """Compute definition of transpose"""
    axes = attrs.get_int_tuple("axes")
    axes = tuple(axes) if axes else None
    return topi.transpose(inputs[0], axes)
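# A minimal sketch of the axes convention handled above: an empty "axes"
# attribute is mapped to None, which topi.transpose interprets as "reverse
# all axes". X and Y are illustrative names, not part of the original code.
X = tvm.placeholder((2, 3, 4), name="X")
Y = topi.transpose(X, None)   # Y has shape (4, 3, 2)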
def transpose(tensor, axes=None, sph=None, dst_scope='buffer0'):
    # MarkScope and PragmaCopy are assumed to be helpers defined by the
    # surrounding project (they are not part of topi): they pin the result
    # to a memory scope and mark it for a copy intrinsic.
    res = topi.transpose(tensor, axes)
    MarkScope(res, dst_scope)
    PragmaCopy(res)
    return res
def make_matrix_mul(shapeA, transposeA, shapeB, transposeB,
                    tgt, tgt_host, func_name, dtype="float32"):
    """Hints:
    - treat the 4 cases of transposeA/transposeB separately
    - for the tvm schedule, use split, reorder, vectorize, parallel
    - debug the tvm schedule using tvm.lower
    """
    assert len(shapeA) == 2 and len(shapeB) == 2
    A = tvm.placeholder(shapeA, dtype=dtype, name="A")
    B = tvm.placeholder(shapeB, dtype=dtype, name="B")

    if not transposeA and not transposeB:
        in_a, out_a = shapeA
        in_b, out_b = shapeB
        k = tvm.reduce_axis((0, out_a), name='k')
        # Transpose B so both operands are read along the reduction axis.
        trans_b = topi.transpose(B)
        matmul = tvm.compute(
            (in_a, out_b),
            lambda i, j: tvm.sum(A[i, k] * trans_b[j, k], axis=k))
    elif transposeA and not transposeB:
        out_a, in_a = shapeA
        in_b, out_b = shapeB
        k = tvm.reduce_axis((0, out_a), name='k')
        trans_a = topi.transpose(A)
        trans_b = topi.transpose(B)
        matmul = tvm.compute(
            (in_a, out_b),
            lambda i, j: tvm.sum(trans_a[i, k] * trans_b[j, k], axis=k))
    elif not transposeA and transposeB:
        in_a, out_a = shapeA
        out_b, in_b = shapeB
        k = tvm.reduce_axis((0, out_a), name='k')
        # B is already stored transposed, so it can be indexed directly.
        matmul = tvm.compute(
            (in_a, out_b),
            lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k))
    else:  # transposeA and transposeB
        out_a, in_a = shapeA
        out_b, in_b = shapeB
        k = tvm.reduce_axis((0, out_a), name='k')
        trans_a = topi.transpose(A)
        matmul = tvm.compute(
            (in_a, out_b),
            lambda i, j: tvm.sum(trans_a[i, k] * B[j, k], axis=k))

    s = tvm.create_schedule(matmul.op)
    # Blocking by loop tiling
    bn = 32
    xo, yo, xi, yi = s[matmul].tile(matmul.op.axis[0], matmul.op.axis[1], bn, bn)
    k, = s[matmul].op.reduce_axis
    ko, ki = s[matmul].split(k, factor=4)
    # Hoist reduction domain outside the blocking loop
    s[matmul].reorder(xo, yo, ko, ki, xi, yi)
    # Vectorization
    s[matmul].vectorize(yi)
    s[matmul].parallel(xo)
    # logging.info(tvm.lower(s, [A, B, matmul], simple_mode=True))
    f = tvm.build(s, [A, B, matmul], tgt, target_host=tgt_host, name=func_name)
    return f
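# A minimal usage sketch, assuming an LLVM-enabled tvm build of the same
# vintage as the API above. The shapes (chosen divisible by the tile size),
# target strings, and tolerance are illustrative, not from the original.
def _matmul_demo():
    m, k, n = 512, 512, 512
    f = make_matrix_mul((m, k), False, (k, n), False, "llvm", "llvm", "matmul")
    a = np.random.uniform(size=(m, k)).astype("float32")
    b = np.random.uniform(size=(k, n)).astype("float32")
    ctx = tvm.context("llvm", 0)
    c = tvm.nd.array(np.zeros((m, n), dtype="float32"), ctx)
    f(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
    # Check the compiled kernel against numpy's reference matmul.
    np.testing.assert_allclose(c.asnumpy(), np.dot(a, b), rtol=1e-5)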