def l2norm_instance(data, eps, axis=None):
    """Perform L2 normalization on the input data.

    For axis=None, y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))

    Parameters
    ----------
    data : tvm.Tensor
        4-D with NCHW or NHWC layout

    eps : float
        epsilon value

    axis : list of int
        axis over which the normalization is applied

    Returns
    -------
    output : tvm.Tensor
        4-D output with same shape
    """
    assert len(data.shape) == 4, "only support 4-dim l2norm"
    dot_value = topi.cpp.pow(data, 2.0)
    sum_value = topi.sum(dot_value, axis=axis, keepdims=True)
    expand_sum = topi.broadcast_to(sum_value, data.shape)
    return topi.broadcast_div(data, topi.sqrt(
        tvm.compute(expand_sum.shape,
                    lambda i, j, k, l: tvm.max(expand_sum[i, j, k, l], eps),
                    tag='l2norm')))
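# Hypothetical driver for l2norm_instance above (not part of the original
# snippet); a minimal sketch assuming the pre-0.7 tvm/topi API used throughout
# these examples. The shapes, eps value, and axis choice are illustrative.
import numpy as np
import tvm
import topi

def demo_l2norm_instance():
    data = tvm.placeholder((1, 3, 8, 8), name="data")  # NCHW
    out = l2norm_instance(data, eps=1e-5, axis=[1])
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, out], "llvm")

    x = np.random.uniform(size=(1, 3, 8, 8)).astype("float32")
    x_nd = tvm.nd.array(x, tvm.cpu(0))
    y_nd = tvm.nd.array(np.empty_like(x), tvm.cpu(0))
    f(x_nd, y_nd)

    # NumPy reference: x / sqrt(max(sum(x^2, axis=1), eps))
    ref = x / np.sqrt(np.maximum((x * x).sum(axis=1, keepdims=True), 1e-5))
    np.testing.assert_allclose(y_nd.asnumpy(), ref, rtol=1e-5)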
def verify_broadcast_to_ele(in_shape, out_shape):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(B)
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B], device, name="broadcast_to")
        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
        out_npy = np.broadcast_to(data_npy, out_shape)
        data_nd = tvm.nd.array(data_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
        foo(data_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")
def verify_broadcast_to_ele(in_shape, out_shape):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_broadcast(B)
        foo = tvm.build(s, [A, B], device, name="broadcast_to")
        data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
        out_npy = np.broadcast_to(data_npy, out_shape)
        data_nd = tvm.nd.array(data_npy, ctx)
        out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
        foo(data_nd, out_nd)
        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)

    check_device("vulkan")
    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")
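# Typical invocations of the harness above; the shapes are illustrative and
# follow the broadcast rule (each input dim is 1 or equals the output dim).
verify_broadcast_to_ele((1,), (10,))
verify_broadcast_to_ele((1, 1, 5, 4), (3, 4, 5, 4))
verify_broadcast_to_ele((1, 128, 1, 32), (64, 128, 64, 32))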
def make_broadcast_to(shape, to_shape, tgt, tgt_host, func_name, dtype="float32"):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    C = topi.broadcast_to(A, to_shape)
    s = tvm.create_schedule(C.op)
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return f
def make_broadcast_to(shape, to_shape, tgt, tgt_host, func_name, dtype="float32"):
    A = te.placeholder(shape, dtype=dtype, name="A")
    C = topi.broadcast_to(A, to_shape)
    s = te.create_schedule(C.op)
    if tgt == "cuda":
        # Bind the second output axis to the GPU grid; without thread binding
        # the kernel would not be a valid CUDA schedule.
        bx, tx = s[C].split(C.op.axis[1], factor=32)
        s[C].bind(bx, te.thread_axis("blockIdx.x"))
        s[C].bind(tx, te.thread_axis("threadIdx.x"))
    # print(tvm.lower(s, [A, C], simple_mode=True))
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return f
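# Hypothetical usage of make_broadcast_to above (not from the original
# source); built for plain "llvm" so the CUDA-only thread-binding branch is
# skipped. Shapes and the function name are illustrative.
import numpy as np
import tvm

fbcast = make_broadcast_to((1, 4), (3, 4), "llvm", "llvm", "broadcast_to")
x = np.random.uniform(size=(1, 4)).astype("float32")
out = tvm.nd.array(np.empty((3, 4), dtype="float32"))
fbcast(tvm.nd.array(x), out)
np.testing.assert_allclose(out.asnumpy(), np.broadcast_to(x, (3, 4)))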
def compute_expand_like(attrs, inputs, _):
    """Compute definition of expand_like"""
    if len(inputs[0].shape) == len(inputs[1].shape):
        # If the number of dimensions is unchanged, this is just broadcasting
        return topi.broadcast_to(inputs[0], inputs[1].shape)
    exclude = attrs.get_bool("exclude")
    axis = attrs.get_int_tuple("axis")
    if exclude:
        exclude_axis = (axis,) if isinstance(axis, int) else axis
        axis = []
        for item in range(len(inputs[1].shape)):
            if item not in exclude_axis:
                axis.append(item)
        axis = tuple(axis)
    return topi.transform.expand_like(inputs[0], inputs[1], axis)
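# NumPy sketch (not from the original source) of the equal-rank fast path
# above: when input and target already have the same number of dimensions,
# expand_like degenerates to ordinary broadcasting.
import numpy as np

src = np.ones((1, 4))
like = np.zeros((3, 4))
expanded = np.broadcast_to(src, like.shape)
assert expanded.shape == like.shape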
def test_broadcast_to(in_shape, out_shape):
    global TASK
    TASK = ("bcast_to_i" + "_".join([str(ele) for ele in in_shape])
            + "o" + "_".join([str(ele) for ele in out_shape]))
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    B = topi.broadcast_to(A, out_shape)
    s = topi.cuda.schedule_broadcast(B)
    fcuda = tvm.build(s, [A, B], "cuda", name="broadcast_to")
    data_npy = np.random.uniform(size=in_shape).astype(A.dtype)
    out_npy = np.broadcast_to(data_npy, out_shape)
    data_nd = tvm.nd.array(data_npy, tvm.gpu())
    out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())
    for _ in range(2):
        fcuda(data_nd, out_nd)
    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
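# Illustrative invocations of the CUDA test above (not from the original
# source); they require a CUDA-enabled TVM build and a GPU device.
test_broadcast_to((1,), (10,))
test_broadcast_to((1, 1, 5, 4), (3, 4, 5, 4))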
def make_broadcast_to(shape, to_shape, tgt, tgt_host, func_name,
                      dtype="float32", remote=None):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    C = topi.broadcast_to(A, to_shape)
    s = tvm.create_schedule(C.op)
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    s[C].bind(C.op.axis[0], block_x)
    if len(to_shape) > 1:
        s[C].bind(C.op.axis[1], thread_x)
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    # `remote` was undefined in the original snippet and is threaded through
    # as a parameter here; _export_module is an external helper from the
    # surrounding test harness that exports the built module, optionally over
    # an RPC session.
    return _export_module(f, func_name, remote)
def demo_broadcast():
    """Check that broadcast works as expected"""
    num_classes = 10
    batch_size = 1
    img_h = 28
    img_w = 28
    img_c = 1
    f1_c = 1

    x = tvm.placeholder((batch_size, img_h, img_w, img_c), name='x')
    b = tvm.placeholder((img_c,), name='b')

    # Plus here will perform auto-broadcast
    y = x + topi.broadcast_to(b, (batch_size, 1, 1, img_c))

    npy = run_tvm(0, 1, {
        x: np.ones(get_shape(x)).astype(np.float32),
        b: np.ones(get_shape(b)).astype(np.float32)
    }, y)
    print(npy.last_data[0, :, :, 0])
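# NumPy rendering (not from the original source) of the graph above: with
# all-ones x and b, the broadcast add yields a 28x28 plane of 2.0, which is
# what the final print should show.
import numpy as np

x_np = np.ones((1, 28, 28, 1), dtype=np.float32)
b_np = np.ones((1,), dtype=np.float32)
y_np = x_np + np.broadcast_to(b_np, (1, 1, 1, 1))
assert (y_np[0, :, :, 0] == 2.0).all()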
def compute_softmax(attrs, inputs, out_info):
    """Compute definition of softmax"""
    return topi.broadcast_to(inputs[0], shape=out_info[0].shape)
def demo_conv2d():
    lrate = 0.1
    nbatches = 100  # batches to train

    num_classes = 10
    batch_size = 10
    img_h = 28
    img_w = 28
    img_c = 1

    f1_c = 4
    f2_c = 5
    f3_units = 16

    x = tvm.placeholder((batch_size, img_h, img_w, img_c), name='x')
    y = tvm.placeholder((batch_size, num_classes), name='y')

    print('Block1')
    w1 = tvm.placeholder((3, 3, img_c, f1_c), name='w1')
    b1 = tvm.placeholder((f1_c,), name='b1')
    t = topi.nn.conv2d(x, w1, 1, 0, layout='NHWC', out_dtype=tvm.float32)
    t = t + topi.broadcast_to(b1, (batch_size, 1, 1, f1_c))
    print('Block1: after-biasing shape is', get_shape(t))
    t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max', layout='NHWC')
    print('Block1: after-pooling shape is', get_shape(t))
    t = topi.nn.relu(t)
    print('Block1: after-relu shape is', get_shape(t))

    print('Block2')
    w2 = tvm.placeholder((3, 3, f1_c, f2_c), name='w2')
    b2 = tvm.placeholder((f2_c,), name='b2')
    t = topi.nn.conv2d(t, w2, 1, 0, layout='NHWC', out_dtype=tvm.float32)
    t = t + topi.broadcast_to(b2, (batch_size, 1, 1, f2_c))
    print('Block2: after-biasing shape is', get_shape(t))
    t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max', layout='NHWC')
    print('Block2: after-pooling shape is', get_shape(t))
    t = topi.nn.relu(t)
    print('Block2: after-relu shape is', get_shape(t))
    t = topi.nn.flatten(t)
    print('Block2: after-flatten shape is', get_shape(t))

    print('Block3')
    w3 = tvm.placeholder((f3_units, get_shape(t)[1]))
    b3 = tvm.placeholder((f3_units,))
    t = topi.nn.dense(t, w3, b3)
    print('Block3: after-dense shape is', get_shape(t))

    print('Block4')
    w4 = tvm.placeholder((num_classes, get_shape(t)[1]))
    b4 = tvm.placeholder((num_classes,))
    t = topi.nn.dense(t, w4, b4)
    print('Block4: after-dense shape is', get_shape(t))
    t = topi.nn.relu(t)

    p = topi.argmax(t, axis=1)
    # TODO: check the correctness of the log_softmax expression
    # TODO: figure out the difference between it and standard cross-entropy loss
    l = -topi.sum(y * topi.nn.log_softmax(t)) / batch_size
    print('Block4: loss shape is', get_shape(l))

    ones = topi.full_like(l, 1.0)
    # [dl_dw1, dl_db1, dl_dw2, dl_db2, dl_dw3, dl_db3, dl_dw4, dl_db4]
    params = [w1, b1, w2, b2, w3, b3, w4, b4]
    dl = list(tvm.ir_pass.JacobianRecursive(l, params, ones))
    assert len(params) == len(dl)
    print('dl_dw1 weight is', get_shape(params[0]))

    sdl = tvm.create_schedule([p.op for p in [x, y, l] + params + dl])
    mdl = tvm.build(sdl, [x, y, l] + params + dl)
    print('Train+Inference module', mdl)
    # sl = tvm.create_schedule([l.op])
    # ml = tvm.build(sdl, [x, y] + params + [l])
    # print('Inference module', ml)

    state = {}
    for p in params:
        state.update({
            p: tvm.nd.array(
                np.random.uniform(-1.0, 1.0, size=get_shape(p)).astype(np.float32))
        })

    grads = {}
    for p, g in zip(params, dl):
        grads.update({p: tvm.nd.empty(get_shape(g))})

    for ib in range(nbatches):
        b = range(ib * batch_size, (ib + 1) * batch_size)
        tx = tvm.nd.array(mnist_img(b))
        ty = tvm.nd.array(mnist_cls_oh(b))
        tl = tvm.nd.empty(shape=(), dtype=tvm.float32)

        print('Entering')
        mdl(*([tx, ty, tl] + list(state.values()) + list(grads.values())))
        print('Done', 'loss', tl.asnumpy())

        state2 = {}
        for p in params:
            state2.update({
                p: tvm.nd.array(state[p].asnumpy() - lrate * grads[p].asnumpy())
            })
        state = state2
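# NumPy check (not from the original source) addressing the two TODOs above:
# for one-hot labels y, -sum(y * log_softmax(t)) is exactly the standard
# cross-entropy -log(softmax(t)[target]), so the two losses coincide.
import numpy as np

logits = np.random.randn(10)
target = 3
one_hot = np.eye(10)[target]
log_softmax = logits - np.log(np.exp(logits).sum())
assert np.isclose(-(one_hot * log_softmax).sum(), -log_softmax[target])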