def forward_(x_op, name):
    # This is a 2d-only specialized implementation of topi.nn.softmax.
    if x_op.ndim == 1:
        x = topi.reshape(x_op.tvm_tensor, (1, x_op.size))
        m = 1
        n = x_op.shape[0]
    elif x_op.ndim == 2:
        x = x_op.tvm_tensor
        m, n = x_op.shape
    else:
        raise ValueError(f'Given ndim {x_op.ndim} is not supported')
    k = tvm.reduce_axis((0, n), name='k')
    max_elem = tvm.compute((m,), lambda i: tvm.max(x[i, k], axis=k),
                           name=f'{name}:max_elem')
    k = tvm.reduce_axis((0, n), name='k')
    expsum = tvm.compute(
        (m,), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k),
        name=f'{name}:expsum')
    softmax = tvm.compute(
        x.shape, lambda i, j: tvm.exp(x[i, j] - max_elem[i]) / expsum[i],
        name=f'{name}:softmax')
    if x_op.ndim == 1:
        softmax = topi.reshape(softmax, x_op.shape)
    return softmax, max_elem, expsum
def softmax(x, axis):
    assert len(x.shape) == 2
    m, n = x.shape
    ok = tvm.reduce_axis((0, n), name='ok')
    maxelem = tvm.compute((m,), lambda on: tvm.max(x[on, ok], axis=ok))
    ok = tvm.reduce_axis((0, n), name='ok')
    expsum = tvm.compute(
        (m,), lambda on: tvm.sum(tvm.exp(x[on, ok] - maxelem[on]), axis=ok))
    divelem = tvm.compute(
        x.shape, lambda on, os: tvm.exp(x[on, os] - maxelem[on]) / expsum[on])
    return divelem
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """Hint: output shape should be (1,)"""
    assert len(shape) == 2
    x = tvm.placeholder(shape, dtype=dtype, name='x')
    y = tvm.placeholder(shape, dtype=dtype, name='y')
    n, cls_num = shape
    max_iter = tvm.reduce_axis((0, cls_num), name='max_iter')
    x_max = tvm.compute((n,), lambda i: tvm.max(x[i, max_iter], axis=max_iter))
    e_x = tvm.compute(shape, lambda i, j: tvm.exp(x[i, j] - x_max[i]))
    sum_ex_iter = tvm.reduce_axis((0, cls_num), name='sum_ex_iter')
    e_x_sum = tvm.compute(
        (n,), lambda i: tvm.sum(e_x[i, sum_ex_iter], axis=sum_ex_iter))
    log_softmax = tvm.compute(shape, lambda i, j: tvm.log(e_x[i, j] / e_x_sum[i]))
    y_mul_log_softmax = tvm.compute(shape, lambda i, j: log_softmax[i, j] * y[i, j])
    sum_entropy_iter = tvm.reduce_axis((0, cls_num), name='sum_entropy_iter')
    mean_iter = tvm.reduce_axis((0, n), name='mean_iter')
    sum_entropy = tvm.compute(
        (1,),
        lambda i: tvm.sum(y_mul_log_softmax[mean_iter, sum_entropy_iter],
                          axis=[mean_iter, sum_entropy_iter]))
    scale = tvm.const(-n, dtype)
    entropy = tvm.compute((1,), lambda i: sum_entropy[i] / scale)
    s = tvm.create_schedule(entropy.op)
    f = tvm.build(s, [x, y, entropy], tgt, target_host=tgt_host, name=func_name)
    return f
def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter vars and assign them tags
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    # one line to build the function
    def check_device(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        # launch the kernel
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        np.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("cuda", "llvm")
    check_device("vulkan")
    check_device("opencl")
def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter vars and assign them tags
    px, x = s[B].split(B.op.axis[0], nparts=1)
    s[B].bind(px, tvm.thread_axis("pipeline"))

    # one line to build the function
    def check_device(device, host="llvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        # launch the kernel
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("sdaccel")
    if "AWS_PLATFORM" in os.environ:
        check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))
    check_device("aocl_sw_emu")
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """Hint: output shape should be (1,)"""
    h, w = shape
    x = tvm.placeholder(shape, dtype, name="x")
    k1 = tvm.reduce_axis((0, w), name="k1")
    x2 = tvm.compute((h,), lambda i: tvm.max(x[i, k1], axis=k1))
    x3 = tvm.compute(shape, lambda i, j: x[i, j] - x2[i])
    x4 = tvm.compute(shape, lambda i, j: tvm.exp(x3[i, j]))
    k2 = tvm.reduce_axis((0, w), name="k2")
    x5 = tvm.compute((h,), lambda i: tvm.sum(x4[i, k2], axis=k2))
    log_p = tvm.compute(shape, lambda i, j: tvm.log(x4[i, j] / x5[i]))
    q = tvm.placeholder(shape, dtype, name="q")
    k3 = tvm.reduce_axis((0, w), name="k3")
    y1 = tvm.compute((h,), lambda i: tvm.sum(q[i, k3] * log_p[i, k3], axis=k3))
    k4 = tvm.reduce_axis((0, h), name="k4")
    y2 = tvm.compute((1,), lambda i: tvm.sum(-y1[k4], axis=k4))
    y3 = tvm.compute((1,), lambda i: y2[i] / h)
    s = tvm.create_schedule(y3.op)
    f = tvm.build(s, [x, q, y3], tgt, target_host=tgt_host, name=func_name)
    return f
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: use tvm.reduce_axis, tvm.sum, tvm.max, tvm.exp.
    Hint: do not reuse the same reduction axis j.
    Hint: implement the following version for better stability:
        e_x = np.exp(x - np.max(x))
        softmax(x) = e_x / e_x.sum()
    """
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    k = tvm.reduce_axis((0, shape[1]), name="k")
    max_A = tvm.compute((shape[0],), lambda i: tvm.max(A[i, k], axis=k),
                        name="max_A")
    exp = tvm.compute(shape, lambda i, j: tvm.exp(A[i, j] - max_A[i]), name="exp")
    k1 = tvm.reduce_axis((0, shape[1]), name="k1")
    sum_exp = tvm.compute((shape[0],), lambda i: tvm.sum(exp[i, k1], axis=k1),
                          name="sum_exp")
    B = tvm.compute(shape, lambda i, j: exp[i, j] / sum_exp[i], name="B")
    s = tvm.create_schedule(B.op)
    f = tvm.build(s, [A, B], tgt, target_host=tgt_host, name=func_name)
    return f
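# A minimal usage sketch for the builder above, assuming the legacy tvm 0.x
# API used throughout this file and NumPy as np; the "llvm" target strings and
# the (8, 10) shape are illustrative choices, not part of the original code.
import numpy as np
import tvm

shape = (8, 10)
softmax_func = make_matrix_softmax(shape, "llvm", "llvm", "softmax")
ctx = tvm.cpu(0)
x_np = np.random.uniform(size=shape).astype("float32")
y_tvm = tvm.nd.array(np.zeros(shape, dtype="float32"), ctx)
softmax_func(tvm.nd.array(x_np, ctx), y_tvm)
# NumPy reference following the hinted numerically stable formulation.
e_x = np.exp(x_np - x_np.max(axis=1, keepdims=True))
ref = e_x / e_x.sum(axis=1, keepdims=True)
np.testing.assert_allclose(y_tvm.asnumpy(), ref, rtol=1e-5)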
def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter vars and assign them tags
    num_thread = 8
    bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
    s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

    # one line to build the function
    def check_device(device, host="stackvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        # launch the kernel
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        np.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("cuda", "llvm")
    check_device("vulkan")
    check_device("opencl")
def test_exp():
    # graph
    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B')
    s = tvm.create_schedule(B.op)
    # create iter vars and assign them tags
    px, x = s[B].split(B.op.axis[0], nparts=1)
    s[B].bind(px, tvm.thread_axis("pipeline"))

    # one line to build the function
    def check_device(device, host="llvm"):
        if not tvm.module.enabled(host):
            return
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        fexp = tvm.build(s, [A, B], device, host, name="myexp")
        # launch the kernel
        n = 1024
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
        fexp(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)

    check_device("sdaccel")
    if "AWS_PLATFORM" in os.environ:
        check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))
    check_device("aocl_sw_emu")
def reg_bbox(x1, y1, x2, y2, dx, dy, dw, dh):
    """Bounding box regression function."""
    bbox_w = x2 - x1 + 1.0
    bbox_h = y2 - y1 + 1.0
    ctr_x = x1 + 0.5 * (bbox_w - 1.0)
    ctr_y = y1 + 0.5 * (bbox_h - 1.0)
    pred_ctr_x = dx * bbox_w + ctr_x
    pred_ctr_y = dy * bbox_h + ctr_y
    pred_w = tvm.exp(dw) * bbox_w
    pred_h = tvm.exp(dh) * bbox_h
    pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0)
    pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0)
    pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0)
    pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0)
    return pred_x1, pred_y1, pred_x2, pred_y2
def _declaration_softmaxDiv(cfg, sum_data, max_data, in_data, axis):
    assert len(in_data.shape) == 2
    m, n = in_data.shape
    divelem = tvm.compute(
        in_data.shape,
        lambda on, ok: tvm.exp(in_data[on, ok] - max_data[on, 0]) / sum_data[on, 0],
        tag="softmaxDiv")
    return divelem
def _gaussian_map_sum(i, j):
    # i is the row index, j is the column index
    x, y = data[ni, 0], data[ni, 1]
    sigma = data[ni, 2]
    sigma2 = sigma * sigma
    v = tvm.if_then_else(
        tvm.all(x >= 0, x < cols, y >= 0, y < rows),
        tvm.exp(-(topi.power((x - j), 2) + topi.power((y - i), 2)) /
                (2 * sigma2)) / (2 * pi * sigma2),
        0)
    return tvm.sum(v, axis=ni)
def _declaration_softmaxSum(cfg, max_data, in_data, axis):
    assert len(in_data.shape) == 2
    m, n = in_data.shape
    ok = tvm.reduce_axis((0, n), name='ok')
    expsum = tvm.compute(
        (m,),
        lambda on: tvm.sum(tvm.exp(in_data[on, ok] - max_data[on, 0]), axis=ok),
        tag="softmaxSum")
    return expsum
def test_inline2():
    m = tvm.size_var('m')
    A = tvm.placeholder((m,), name='A')
    T = tvm.compute((m,), lambda i,: A[i] + 10, name='T')
    stmt = tvm.make.Evaluate(tvm.exp(T[10]) + 11 * T[100])
    stmt = tvm.ir_pass.Inline(stmt, T.op, [x.var for x in T.op.axis], T.op.body[0])

    def check(op):
        if isinstance(op, tvm.expr.Call):
            assert op.func != T.op

    tvm.ir_pass.PostOrderVisit(stmt, check)
def test_inline2():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    T = tvm.compute((m,), lambda i,: A[i] + 10, name='T')
    stmt = tvm.make.Evaluate(tvm.exp(T[10]) + 11 * T[100])
    stmt = tvm.ir_pass.Inline(stmt, T.op, [x.var for x in T.op.axis], T.op.body[0])

    def check(op):
        if isinstance(op, tvm.expr.Call):
            assert op.func != T.op

    tvm.ir_pass.PostOrderVisit(stmt, check)
def softmax_computation(X, shape, dtype="float32"):
    km = tvm.reduce_axis((0, shape[1]), name="km")
    MAX = tvm.compute((shape[0],), lambda i: tvm.max(X(i, km), axis=km), name="MAX")
    E_X = tvm.compute(shape, lambda i, j: tvm.exp(X(i, j) - MAX(i)), name="E_X")
    ks = tvm.reduce_axis((0, shape[1]), name="ks")
    SUM = tvm.compute((shape[0],), lambda i: tvm.sum(E_X(i, ks), axis=ks), name="SUM")
    SOFTMAX = tvm.compute(shape, lambda i, j: E_X(i, j) / SUM(i), name="SOFTMAX")
    return MAX, E_X, SUM, SOFTMAX
def softmax(x):
    """Perform softmax activation on the data.

    Parameters
    ----------
    x : tvm.Tensor
        2-D input data

    Returns
    -------
    output : tvm.Tensor
        2-D output with same shape
    """
    assert len(x.shape) == 2, "only support 2-dim softmax"
    m, n = x.shape
    k = tvm.reduce_axis((0, n), name='k')
    max_elem = tvm.compute((m,), lambda i: tvm.max(x[i, k], axis=k))
    k = tvm.reduce_axis((0, n), name='k')
    expsum = tvm.compute(
        (m,), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k))
    return tvm.compute(
        x.shape, lambda i, j: tvm.exp(x[i, j] - max_elem[i]) / expsum[i])
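# A hedged usage sketch for the softmax above, assuming the legacy tvm 0.x API
# and NumPy as np; the "llvm" target and the (4, 10) shape are illustrative
# choices, not taken from the original snippet.
import numpy as np
import tvm

m, n = 4, 10
x = tvm.placeholder((m, n), name='x')
y = softmax(x)
s = tvm.create_schedule(y.op)
f = tvm.build(s, [x, y], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=(m, n)).astype(x.dtype), ctx)
b = tvm.nd.array(np.zeros((m, n), dtype=y.dtype), ctx)
f(a, b)
# Reference: subtract the row-wise max first, exactly as the kernel does.
e = np.exp(a.asnumpy() - a.asnumpy().max(axis=1, keepdims=True))
np.testing.assert_allclose(b.asnumpy(), e / e.sum(axis=1, keepdims=True), rtol=1e-5)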
def exp(x):
    """Take exponential of input x.

    Parameters
    ----------
    x : tvm.Tensor
        Input argument.

    Returns
    -------
    y : tvm.Tensor
        The result.
    """
    return tvm.compute(x.shape, lambda *i: tvm.exp(x(*i)))
def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip,
                  vx, vy, vw, vh):
    """Transform prior anchor box to output box through location predictions."""
    al = anchor[anchor_base_idx]
    at = anchor[anchor_base_idx + 1]
    ar = anchor[anchor_base_idx + 2]
    ab = anchor[anchor_base_idx + 3]
    aw = ar - al
    ah = ab - at
    ax = (al + ar) / 2.0
    ay = (at + ab) / 2.0
    px = loc[loc_base_idx]
    py = loc[loc_base_idx + 1]
    pw = loc[loc_base_idx + 2]
    ph = loc[loc_base_idx + 3]
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = tvm.exp(pw * vw) * aw / 2.0
    oh = tvm.exp(ph * vh) * ah / 2.0
    return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \
           tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
           tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
           tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip,
                  vx, vy, vw, vh):
    """Transform prior anchor box to output box through location predictions."""
    al = anchor[anchor_base_idx]
    at = anchor[anchor_base_idx + 1]
    ar = anchor[anchor_base_idx + 2]
    ab = anchor[anchor_base_idx + 3]
    aw = ar - al
    ah = ab - at
    ax = (al + ar) / 2.0
    ay = (at + ab) / 2.0
    px = loc[loc_base_idx]
    py = loc[loc_base_idx + 1]
    pw = loc[loc_base_idx + 2]
    ph = loc[loc_base_idx + 3]
    ox = px * vx * aw + ax
    oy = py * vy * ah + ay
    ow = tvm.exp(pw * vw) * aw / 2.0
    oh = tvm.exp(ph * vh) * ah / 2.0
    return tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
           tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
           tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
           tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):
    assert len(shape) == 2, "only a 2d tensor is accepted for batched softmax"
    num_batch, num_class = shape
    logits = tvm.placeholder(shape, dtype, "logits")
    k = tvm.reduce_axis((0, num_class), name="k")
    max_logits = tvm.compute((num_batch,), lambda i: tvm.max(logits[i, k], axis=k))
    logits_shifted = tvm.compute(shape, lambda i, j: logits[i, j] - max_logits[i])
    exps = tvm.compute(shape, lambda *i: tvm.exp(logits_shifted(*i)))
    k = tvm.reduce_axis((0, num_class), name="k")
    exps_sum = tvm.compute((num_batch,), lambda i: tvm.sum(exps[i, k], axis=k))
    softmaxes = tvm.compute(shape, lambda i, j: exps[i, j] / exps_sum[i])
    s = tvm.create_schedule(softmaxes.op)
    f = tvm.build(s, [logits, softmaxes], tgt, tgt_host, name=func_name)
    return f
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    X = tvm.placeholder(shape, dtype=dtype, name="X")
    Y_orig = tvm.placeholder(shape, dtype=dtype, name="Y_orig")
    j1 = tvm.reduce_axis((0, shape[1]), "j1")
    j2 = tvm.reduce_axis((0, shape[1]), "j2")
    maxX = tvm.compute((shape[0],), lambda i: tvm.max(X[i, j1], axis=j1), name="maxX")
    numerator = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - maxX[i]),
                            name="numerator")
    denominator = tvm.compute((shape[0],),
                              lambda i: tvm.sum(numerator[i, j2], axis=j2),
                              name="denominator")
    m1 = tvm.reduce_axis((0, shape[0]), "m1")
    m2 = tvm.reduce_axis((0, shape[1]), "m2")
    cross_entropy_sum = tvm.compute(
        (1,),
        lambda i: tvm.sum(
            Y_orig[m1, m2] * tvm.log(numerator[m1, m2] / denominator[m1]),
            axis=[m1, m2]),
        name="cross_entropy_sum")
    negated = tvm.compute((1,), lambda i: -cross_entropy_sum[i] / shape[0],
                          name="negated")
    s = tvm.create_schedule(negated.op)
    # print(tvm.lower(s, [X, Y_orig, negated], simple_mode=True))
    return tvm.build(s, [X, Y_orig, negated], tgt, target_host=tgt_host,
                     name=func_name)
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: use tvm.reduce_axis, tvm.sum, tvm.max, tvm.exp.
    Hint: do not reuse the same reduction axis j.
    Hint: implement the following version for better stability:
        e_x = np.exp(x - np.max(x))
        softmax(x) = e_x / e_x.sum()
    """
    X = tvm.placeholder(shape, dtype=dtype, name='X')
    ky = tvm.reduce_axis((0, shape[1]), name='ky')
    MAX_X = tvm.compute((shape[0],), lambda i: tvm.max(X[i, ky], axis=[ky]))
    E_X = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - MAX_X(i)))
    ky_n = tvm.reduce_axis((0, shape[1]), name='ky_n')
    E_X_SUM = tvm.compute((shape[0],), lambda i: tvm.sum(E_X[i, ky_n], axis=[ky_n]))
    Y = tvm.compute(shape, lambda i, j: E_X[i, j] / E_X_SUM(i))
    s = tvm.create_schedule(Y.op)
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    # MAX_X
    s[MAX_X].bind(MAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MAX_X].bind(ky, tvm.thread_axis("threadIdx.x"))
    # E_X
    s[E_X].bind(E_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X].bind(E_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # E_X_SUM
    s[E_X_SUM].bind(E_X_SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X_SUM].bind(ky_n, tvm.thread_axis("threadIdx.x"))
    # SOFTMAX_X
    s[Y].bind(Y.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[Y].bind(Y.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # print(tvm.lower(s, [X, Y], simple_mode=True))
    f = tvm.build(s, [X, Y], tgt, target_host=tgt_host, name=func_name)
    return _export_module(f, func_name, remote)
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: use tvm.reduce_axis, tvm.sum, tvm.max, tvm.exp.
    Hint: do not reuse the same reduction axis j.
    Hint: implement the following version for better stability:
        e_x = np.exp(x - np.max(x))
        softmax(x) = e_x / e_x.sum()
    """
    h, w = shape
    x1 = tvm.placeholder(shape, dtype, name="x1")
    k1 = tvm.reduce_axis((0, w), name="k1")
    x2 = tvm.compute((h,), lambda i: tvm.max(x1[i, k1], axis=k1))
    x3 = tvm.compute(shape, lambda i, j: x1[i, j] - x2[i])
    x4 = tvm.compute(shape, lambda i, j: tvm.exp(x3[i, j]))
    k2 = tvm.reduce_axis((0, w), name="k2")
    x5 = tvm.compute((h,), lambda i: tvm.sum(x4[i, k2], axis=k2))
    y = tvm.compute(shape, lambda i, j: x4[i, j] / x5[i])
    s = tvm.create_schedule(y.op)
    f = tvm.build(s, [x1, y], tgt, target_host=tgt_host, name=func_name)
    return f
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """Hint: output shape should be (1,)"""
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    B = tvm.placeholder(shape, dtype=dtype, name="B")
    k = tvm.reduce_axis((0, shape[1]), name="k")
    max_A = tvm.compute((shape[0],), lambda i: tvm.max(A[i, k], axis=k), name="max_A")
    exp = tvm.compute(shape, lambda i, j: tvm.exp(A[i, j] - max_A[i]), name="exp")
    k1 = tvm.reduce_axis((0, shape[1]), name="k1")
    sum_exp = tvm.compute((shape[0],), lambda i: tvm.sum(exp[i, k1], axis=k1),
                          name="sum_exp")
    softmax = tvm.compute(shape, lambda i, j: exp[i, j] / sum_exp[i], name="softmax")
    log = tvm.compute(shape, lambda i, j: tvm.log(softmax[i, j]), name="log")
    k2 = tvm.reduce_axis((0, shape[1]), name="k2")
    sum_softmax = tvm.compute(
        (shape[0],), lambda i: tvm.sum(B[i, k2] * log[i, k2], axis=k2),
        name="sum_softmax")
    k3 = tvm.reduce_axis((0, shape[0]), name="k3")
    softmax_cross_entropy = tvm.compute(
        (1,), lambda i: tvm.sum(-1 * sum_softmax[k3] / shape[0], axis=k3))
    s = tvm.create_schedule(softmax_cross_entropy.op)
    f = tvm.build(s, [A, B, softmax_cross_entropy], tgt, target_host=tgt_host,
                  name=func_name)
    return f
def test_inline_multi_reduce():
    def argmax_comp(x, y):
        idx = tvm.select((x[1] >= y[1]), x[0], y[0])
        val = tvm.select((x[1] >= y[1]), x[1], y[1])
        return idx, val

    def argmax_init(idx_typ, val_typ):
        return tvm.const(-1, idx_typ), tvm.min_value(val_typ)

    argmax = tvm.comm_reducer(argmax_comp, argmax_init, name='argmax')
    m = tvm.var('m')
    n = tvm.var('n')
    val = tvm.placeholder((m, n), name='val', dtype='float32')
    val1 = tvm.compute((m, n), lambda i, j: val[i, j] + 1, name='val1')
    val2 = tvm.compute((m, n), lambda i, j: tvm.exp(val1[i, j]), name='val2')
    k = tvm.reduce_axis((0, n), 'k')
    T_idx, T_val = tvm.compute((m,), lambda i: argmax((k.var, val2[i, k]), axis=k),
                               name='T')
    s = tvm.create_schedule(T_idx.op)
    s[val1].compute_inline()
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
def test_inline_multi_reduce():
    def argmax_comp(x, y):
        idx = tvm.expr.Select((x[1] >= y[1]), x[0], y[0])
        val = tvm.expr.Select((x[1] >= y[1]), x[1], y[1])
        return idx, val

    def argmax_init(idx_typ, val_typ):
        return tvm.const(-1, idx_typ), tvm.min_value(val_typ)

    argmax = tvm.comm_reducer(argmax_comp, argmax_init, name='argmax')
    m = tvm.var('m')
    n = tvm.var('n')
    val = tvm.placeholder((m, n), name='val', dtype='float32')
    val1 = tvm.compute((m, n), lambda i, j: val[i, j] + 1, name='val1')
    val2 = tvm.compute((m, n), lambda i, j: tvm.exp(val1[i, j]), name='val2')
    k = tvm.reduce_axis((0, n), 'k')
    T_idx, T_val = tvm.compute((m,), lambda i: argmax((k.var, val2[i, k]), axis=k),
                               name='T')
    s = tvm.create_schedule(T_idx.op)
    s[val1].compute_inline()
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):
    X = tvm.placeholder(shape, dtype=dtype, name="X")
    j1 = tvm.reduce_axis((0, shape[1]), "j1")
    maxX = tvm.compute((shape[0],), lambda i: tvm.max(X[i, j1], axis=j1), name="maxX")
    numerator = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - maxX[i]),
                            name="numerator")
    j2 = tvm.reduce_axis((0, shape[1]), "j2")
    denominator = tvm.compute((shape[0],),
                              lambda i: tvm.sum(numerator[i, j2], axis=j2),
                              name="denominator")
    Y = tvm.compute(shape, lambda i, j: numerator[i, j] / denominator[i], name="Y")
    s = tvm.create_schedule(Y.op)
    return tvm.build(s, [X, Y], tgt, target_host=tgt_host, name=func_name)
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    assert len(shape) == 2, "only 2d tensor is accepted for batched softmax xent loss"
    num_batch, num_class = shape
    logits = tvm.placeholder(shape, dtype, "logits")
    truth = tvm.placeholder(shape, dtype, "truth")
    k = tvm.reduce_axis((0, num_class), name="k")
    max_logits = tvm.compute((num_batch,), lambda i: tvm.max(logits[i, k], axis=k))
    logits_shifted = tvm.compute(shape, lambda i, j: logits[i, j] - max_logits[i])
    exps = tvm.compute(shape, lambda *i: tvm.exp(logits_shifted(*i)))
    k = tvm.reduce_axis((0, num_class), name="k")
    exps_sum = tvm.compute((num_batch,), lambda i: tvm.sum(exps[i, k], axis=k))
    neg_pred_log = tvm.compute(
        shape, lambda i, j: tvm.log(exps_sum[i]) - logits_shifted[i, j])
    ewise_prod = tvm.compute(shape, lambda *i: truth(*i) * neg_pred_log(*i))
    i = tvm.reduce_axis((0, num_batch), name="i")
    j = tvm.reduce_axis((0, num_class), name="j")
    ce_sum = tvm.compute((1,), lambda _: tvm.sum(ewise_prod[i, j], axis=[i, j]))
    ce_mean = tvm.compute((1,), lambda _: ce_sum[0] / tvm.const(num_batch, dtype))
    s = tvm.create_schedule(ce_mean.op)
    f = tvm.build(s, [logits, truth, ce_mean], tgt, tgt_host, func_name)
    return f
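# A quick correctness sketch for the cross-entropy builder above (legacy tvm
# 0.x API assumed; the "llvm" targets, the (16, 10) shape, and the one-hot
# labels are illustrative, not part of the original snippet).
import numpy as np
import tvm

shape = (16, 10)
xent_func = make_matrix_softmax_cross_entropy(shape, "llvm", "llvm", "softmax_xent")
ctx = tvm.cpu(0)
logits_np = np.random.uniform(size=shape).astype("float32")
truth_np = np.zeros(shape, dtype="float32")
truth_np[np.arange(shape[0]), np.random.randint(shape[1], size=shape[0])] = 1.0
out = tvm.nd.array(np.zeros((1,), dtype="float32"), ctx)
xent_func(tvm.nd.array(logits_np, ctx), tvm.nd.array(truth_np, ctx), out)
# NumPy reference: mean over the batch of -sum(truth * log_softmax(logits)).
shifted = logits_np - logits_np.max(axis=1, keepdims=True)
log_softmax_ref = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
ref = -(truth_np * log_softmax_ref).sum(axis=1).mean()
np.testing.assert_allclose(out.asnumpy()[0], ref, rtol=1e-5)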
def log_softmax(x):
    """Perform log softmax activation on the data.

    Parameters
    ----------
    x : tvm.Tensor
        2-D input data

    Returns
    -------
    output : tvm.Tensor
        2-D output with same shape
    """
    assert len(x.shape) == 2, "only support 2-dim log softmax"
    m, n = x.shape
    k = tvm.reduce_axis((0, n), name='k')
    max_elem = tvm.compute((m,), lambda i: tvm.max(x[i, k], axis=k))
    k = tvm.reduce_axis((0, n), name='k')
    expsum = tvm.compute(
        (m,), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k))
    return tvm.compute(
        x.shape, lambda i, j: x[i, j] - max_elem[i] - tvm.log(expsum[i]))
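# A small NumPy sketch, added here for illustration only, of the identity the
# kernel above relies on: log_softmax(x) = x - max(x) - log(sum(exp(x - max(x)))),
# which matches the naive log(softmax(x)) while staying numerically stable.
import numpy as np

x = np.random.randn(4, 7).astype("float32")
m = x.max(axis=1, keepdims=True)
stable = x - m - np.log(np.exp(x - m).sum(axis=1, keepdims=True))
naive = np.log(np.exp(x) / np.exp(x).sum(axis=1, keepdims=True))
np.testing.assert_allclose(stable, naive, rtol=1e-5, atol=1e-6)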
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: use tvm.reduce_axis, tvm.sum, tvm.max, tvm.exp.
    Hint: do not reuse the same reduction axis j.
    Hint: implement the following version for better stability:
        e_x = np.exp(x - np.max(x))
        softmax(x) = e_x / e_x.sum()
    """
    assert len(shape) == 2
    n, cls_dim = shape
    x = tvm.placeholder(shape, dtype=dtype, name='x')
    max_iter = tvm.reduce_axis((0, cls_dim), name='max_iter')
    t_max = tvm.compute((n,), lambda i: tvm.max(x[i, max_iter], axis=max_iter),
                        name='t_max')
    e_x = tvm.compute(shape, lambda i, j: tvm.exp(x[i, j] - t_max[i]), name='e_x')
    sum_iter = tvm.reduce_axis((0, cls_dim), name='sum_iter')
    e_x_sum = tvm.compute((n,), lambda i: tvm.sum(e_x[i, sum_iter], axis=sum_iter),
                          name='e_x_sum')
    softmax = tvm.compute(shape, lambda i, j: e_x[i, j] / e_x_sum[i], name='softmax')
    s = tvm.create_schedule(softmax.op)
    f = tvm.build(s, [x, softmax], tgt, target_host=tgt_host, name=func_name)
    return f
parser.add_argument('--sim', type=str, help='the simulator to use',
                    default='S0', choices=['S0', 'S1', 'SC'])
args = parser.parse_args()

with ScheduleProcHelper(), nnpu.Environment('./nnpu_config_fp32.yaml'):
    env = nnpu.get_env()
    nnpu.set_device(env, type=args.sim)

    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    assert dtype_w in ['float32', 'float16'], \
        'when testing activation functions, a float dtype is needed'

    shape = (64, )
    a = tvm.placeholder(shape, dtype_w, 'a')
    a_buf = tvm.compute(shape, lambda *i: a(*i), 'a_buf')
    exp = tvm.compute(shape, lambda i: tvm.exp(a_buf[i]), 'exp')
    log = tvm.compute(shape, lambda i: tvm.log(a_buf[i]), 'log')
    tanh = tvm.compute(shape, lambda i: tvm.tanh(a_buf[i]), 'tanh')
    sigmoid = tvm.compute(shape, lambda i: tvm.sigmoid(a_buf[i]), 'sigmoid')

    # k = tvm.reduce_axis((0, 16), 'k0')
    # sum = tvm.compute((1, ), lambda i: tvm.sum(sigmoid[k], axis=k), 'sum')
    # nnpu.utils.MarkScope(sum)
    # softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum[0], 'softmax')
    # nnpu.utils.MarkScope(softmax)
    # softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

    s = nnpu.create_schedule([exp.op, log.op, tanh.op, sigmoid.op])
    # cache write
    exp_buf = s.cache_write(exp, env.get_scope('buffer0'))
def compute_exp(a):
    return tvm.compute(a.shape, lambda *i: tvm.exp(a(*i)))
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """Hint: output shape should be (1,)"""
    X = tvm.placeholder(shape, dtype=dtype, name='X')
    a1 = tvm.reduce_axis((0, shape[1]), name='a1')
    MAX_X = tvm.compute((shape[0],), lambda i: tvm.max(X[i, a1], axis=[a1]))
    E_X = tvm.compute(shape, lambda i, j: tvm.exp(X[i, j] - MAX_X(i)))
    a2 = tvm.reduce_axis((0, shape[1]), name='a2')
    E_X_SUM = tvm.compute((shape[0],), lambda i: tvm.sum(E_X[i, a2], axis=[a2]))
    SOFTMAX_X = tvm.compute(shape, lambda i, j: E_X[i, j] / E_X_SUM(i))
    LOG_SOFTMAX_X = tvm.compute(shape, lambda i, j: tvm.log(SOFTMAX_X[i, j]))
    X_P = tvm.placeholder(shape, dtype=dtype, name='X_P')
    MUL = tvm.compute(shape, lambda i, j: X_P[i, j] * LOG_SOFTMAX_X[i, j])
    a3 = tvm.reduce_axis((0, shape[1]), name='a3')
    SUM = tvm.compute((shape[0],), lambda i: tvm.sum(-MUL[i, a3], axis=[a3]))
    a4 = tvm.reduce_axis((0, shape[0]), name='a4')
    MEAN = tvm.compute((1,), lambda i: tvm.sum(SUM[a4] / shape[0], axis=[a4]))

    # s = tvm.create_schedule([MAX_X.op, E_X.op, E_X_SUM.op, SOFTMAX_X.op,
    #                          LOG_SOFTMAX_X.op, MUL.op, SUM.op, MEAN.op])
    s = tvm.create_schedule(MEAN.op)
    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))

    # MAX_X
    s[MAX_X].bind(MAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MAX_X].bind(a1, tvm.thread_axis("threadIdx.x"))
    # E_X
    s[E_X].bind(E_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X].bind(E_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # E_X_SUM
    s[E_X_SUM].bind(E_X_SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[E_X_SUM].bind(a2, tvm.thread_axis("threadIdx.x"))
    # SOFTMAX_X
    s[SOFTMAX_X].bind(SOFTMAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[SOFTMAX_X].bind(SOFTMAX_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # LOG_SOFTMAX_X
    s[LOG_SOFTMAX_X].bind(LOG_SOFTMAX_X.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[LOG_SOFTMAX_X].bind(LOG_SOFTMAX_X.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # MUL
    s[MUL].bind(MUL.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[MUL].bind(MUL.op.axis[1], tvm.thread_axis("threadIdx.x"))
    # SUM
    s[SUM].bind(SUM.op.axis[0], tvm.thread_axis("blockIdx.x"))
    s[SUM].bind(a3, tvm.thread_axis("threadIdx.x"))
    # MEAN
    # s[MEAN].bind(a4, tvm.thread_axis("blockIdx.x"))
    s[MEAN].bind(a4, tvm.thread_axis("threadIdx.x"))
    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))

    # block_x = tvm.thread_axis("blockIdx.x")
    # thread_x = tvm.thread_axis("threadIdx.x")
    # zo, zi = s[SUM].split(SUM.op.axis[0], 3)
    # print(tvm.lower(s, [X, X_P, MEAN], simple_mode=True))
    # s[SUM].bind(zo, block_x)
    # s[SUM].bind(zi, thread_x)

    f = tvm.build(s, [X, X_P, MEAN], tgt, target_host=tgt_host, name=func_name)
    return _export_module(f, func_name, remote)
######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call into device-specific functions.
# However, that approach only works for the CUDA target with the float type.
# Ideally, we want to write the same code for any device and any data type.
#
# TVM intrinsics provide the user a mechanism to achieve this, and this
# is the recommended way to solve the problem.
# The following code uses tvm.exp instead, which creates an intrinsic call
# :any:`tvm.exp` to do the exponential.
#
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())

######################################################################
# We can see that the code works for both CUDA and OpenCL.
# The same tvm.exp can also be used for float64 data types.
#
fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
print(fopencl.imported_modules[0].get_source())

######################################################################
def _compute_expsum(max_elem, *indices):
    eval_range = insert_reduce_index(indices, k2)
    return tvm.sum(tvm.exp(x[eval_range] - max_elem[indices]), axis=k2)
label = tvm.placeholder((N, L), name='label')
data = tvm.placeholder((N, D), name='data')
weight = tvm.placeholder((L, D + 1), name='weight')
data_expand = tvm.compute(
    (N, D + 1),
    lambda n, d: tvm.select((d < D), data[n, d], tvm.const(1, dtype=data.dtype)),
    name='data_expand')
rd = tvm.reduce_axis((0, D + 1), name='rd')
dot = tvm.compute(
    (N, L),
    lambda n, l: tvm.sum(weight[l, rd] * data_expand[n, rd], axis=rd),
    name='dot')
scale = tvm.compute(
    (N, L),
    lambda n, l: (1 / (1 + tvm.exp(-label[n, l] * dot[n, l])) - 1) * label[n, l],
    name='scale')
rn = tvm.reduce_axis((0, N), name='rn')
gradient = tvm.compute(
    (L, D + 1),
    lambda l, d: tvm.sum(scale[rn, l] * data_expand[rn, d], axis=rn),
    name='gradient')
new_weight = tvm.compute(
    (L, D + 1),
    lambda l, d: weight[l, d] - learning_rate * gradient[l, d],
    name='new_weight')
# === End computation

# Scheduling
s = tvm.create_schedule(new_weight.op)
######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call into device-specific functions.
# However, that approach only works for the CUDA target with the float type.
# Ideally, we want to write the same code for any device and any data type.
#
# TVM intrinsics provide the user a mechanism to achieve this, and this
# is the recommended way to solve the problem.
# The following code uses tvm.exp instead, which creates an intrinsic call
# :any:`tvm.exp` to do the exponential.
#
n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
print(fcuda.imported_modules[0].get_source())

######################################################################
# We can see that the code works for both CUDA and OpenCL.
# The same tvm.exp can also be used for float64 data types.
#
fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
print(fopencl.imported_modules[0].get_source())

######################################################################
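######################################################################
# A hedged sketch of the float64 point made above: the same tvm.exp intrinsic
# also lowers for a float64 tensor (legacy tvm 0.x API assumed; the variable
# names and thread split factor below are illustrative, not from the original).
#
n64 = tvm.var("n64")
A64 = tvm.placeholder((n64,), name='A64', dtype='float64')
B64 = tvm.compute(A64.shape, lambda i: tvm.exp(A64[i]), name="B64")
s64 = tvm.create_schedule(B64.op)
bx64, tx64 = s64[B64].split(B64.op.axis[0], factor=64)
s64[B64].bind(bx64, tvm.thread_axis("blockIdx.x"))
s64[B64].bind(tx64, tvm.thread_axis("threadIdx.x"))
fcuda64 = tvm.build(s64, [A64, B64], "cuda", name="myexp_f64")
print(fcuda64.imported_modules[0].get_source())

######################################################################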
def func2():
    return tvm.exp((x + y + 1) * y / 4)
def _normalize(max_elem, expsum, *indices):
    non_reduce_indices = tuple([var for (i, var) in enumerate(indices) if i != axis])
    return tvm.exp(x[indices] - max_elem[non_reduce_indices]) / expsum[non_reduce_indices]