def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """Hint: output shape should be (1,)"""
    y = tvm.placeholder(shape, dtype=dtype, name="y")
    y_ = tvm.placeholder(shape, dtype=dtype, name="y_")
    t = -topi.sum(y_ * topi.log(topi.nn.softmax(y)), axis=1)
    c = topi.sum(t, keepdims=True) / shape[0]
    s = tvm.create_schedule(c.op)
    f = tvm.build(s, [y, y_, c], tgt, target_host=tgt_host, name=func_name)
    return f
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    C = topi.sum(A, axis=0, keepdims=False)
    s = tvm.create_schedule(C.op)
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return f
def l2norm_instance(data, eps, axis=None):
    """Perform L2norm on the input data

    For axis=None, y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))

    Parameters
    ----------
    data : tvm.Tensor
        4-D with NCHW or NHWC layout

    eps : float
        epsilon value

    axis : list of int
        axis over which the normalization is applied

    Returns
    -------
    output : tvm.Tensor
        4-D output with same shape
    """
    assert len(data.shape) == 4, "only support 4-dim l2norm"
    dot_value = topi.cpp.pow(data, 2.0)
    sum_value = topi.sum(dot_value, axis=axis, keepdims=True)
    expand_sum = topi.broadcast_to(sum_value, data.shape)
    return topi.broadcast_div(data, topi.sqrt(
        tvm.compute(expand_sum.shape,
                    lambda i, j, k, l: tvm.max(expand_sum[i, j, k, l], eps),
                    tag='l2norm')))
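######################################################################
# A minimal NumPy sketch of the same l2norm_instance formula, assuming a
# hypothetical 4-D input and axis=(1, 2, 3); it is only a reference check
# for the TOPI kernel above, not the library's implementation.
import numpy as np

def l2norm_instance_ref(data, eps, axis=None):
    # y = x / sqrt(max(sum(x^2, axis), eps)), broadcast back to data.shape
    sq_sum = np.sum(data ** 2, axis=axis, keepdims=True)
    return data / np.sqrt(np.maximum(sq_sum, eps))

x_ref = np.random.uniform(-1, 1, size=(2, 3, 4, 5)).astype("float32")
y_ref = l2norm_instance_ref(x_ref, eps=1e-5, axis=(1, 2, 3))
print(y_ref.shape)  # (2, 3, 4, 5), same shape as the input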
def check_cuda(dtype, m=32, n=32):
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    a = te.placeholder((m, n), name="a", dtype=dtype)
    b = te.placeholder((m, n), name="b", dtype=dtype)
    c = a + b
    d = a * b
    e = topi.elemwise_sum([c, d])
    g = topi.sum(e)
    with tvm.target.cuda():
        sg = topi.cuda.schedule_reduce(g)
        ctx = tvm.gpu(0)
        func = tvm.build(sg, [a, b, g], 'cuda')
        a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
        b_np = np.random.uniform(size=(m, n)).astype(b.dtype)
        g_np = np.sum(np.add(a_np * b_np, a_np + b_np))
        a_nd = tvm.nd.array(a_np, ctx)
        b_nd = tvm.nd.array(b_np, ctx)
        g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
        func(a_nd, b_nd, g_nd)
        tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3)
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
    # Build the logic and compile the function
    dat_dtype = "float32"
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = "float32"
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)
        ctx = tvm.context(device, 0)
        foo = tvm.build(s, [A, B], device, name="sum")
        # Test
        in_npy = np.random.uniform(size=in_shape).astype(np.float32)
        in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError
        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
    check_device("rocm")
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name, dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    # softmax
    y = tvm.te.placeholder(shape, dtype=dtype, name="y")  # input y
    matrix_row_max = topi.max(y, axis=1, keepdims=False)
    Ex = tvm.te.compute(shape,
                        lambda i, j: tvm.te.exp(y[i][j] - matrix_row_max[i]),
                        name="exp_element")
    Ex_sum = topi.sum(Ex, axis=1, keepdims=False)
    soft_max = tvm.te.compute(shape, lambda i, j: Ex[i][j] / Ex_sum[i], name="soft_max")

    # cross_entropy
    y_real = tvm.te.placeholder(shape, dtype=dtype, name="y_real")
    j = tvm.te.reduce_axis((0, shape[1]), name="j")
    loss = tvm.te.compute(
        (shape[0], ),
        lambda i: tvm.te.sum(y_real[i][j] * tvm.te.log(soft_max[i][j]), j),
        name="loss")
    sum_loss = topi.sum(loss, axis=0, keepdims=True)
    mean_loss = tvm.te.compute((1, ), lambda *i: -1 * sum_loss(*i) / shape[0], "mean_loss")

    s = tvm.te.create_schedule(mean_loss.op)
    f = tvm.build(s, [y, y_real, mean_loss], tgt, target_host=tgt_host, name=func_name)
    # print(tvm.lower(s, [y, y_real, mean_loss], name=func_name, simple_mode=True))
    return f
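######################################################################
# A small NumPy sketch of what the softmax cross-entropy kernel above is
# expected to compute (assumed helper names, purely a reference check):
# row-wise numerically stable softmax, then the batch mean of
# -sum(y_real * log(softmax(y))), returned with shape (1,).
import numpy as np

def softmax_cross_entropy_ref(y, y_real):
    # numerically stable softmax along axis=1
    e = np.exp(y - y.max(axis=1, keepdims=True))
    sm = e / e.sum(axis=1, keepdims=True)
    # mean cross-entropy over the batch, shape (1,) to match the kernel
    return np.array([-np.mean(np.sum(y_real * np.log(sm), axis=1))])

logits_ref = np.random.randn(4, 10).astype("float32")
labels_ref = np.eye(10, dtype="float32")[np.random.randint(0, 10, size=4)]
print(softmax_cross_entropy_ref(logits_ref, labels_ref))  # shape (1,)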
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    A = te.placeholder(shape, dtype=dtype, name="A")
    C = topi.sum(A, axis=0, keepdims=False)
    s = te.create_schedule(C.op)
    if tgt == "cuda":
        bx, tx = s[C].split(C.op.axis[1], factor=32)
        s[C].bind(bx, te.thread_axis("blockIdx.x"))
        s[C].bind(tx, te.thread_axis("threadIdx.x"))
    # print(tvm.lower(s, [A, C], simple_mode=True))
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return f
def make_reduce_sum_axis_zero(shape, tgt, tgt_host, func_name, dtype="float32"):
    A = tvm.placeholder(shape, dtype=dtype, name="A")
    C = topi.sum(A, axis=0, keepdims=False)
    s = tvm.create_schedule(C.op)
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    print(C.op.axis, C)
    # s[C].bind(C.op.axis[0], block_x)
    s[C].bind(C.op.axis[0], thread_x)
    f = tvm.build(s, [A, C], tgt, target_host=tgt_host, name=func_name)
    return _export_module(f, func_name, remote)
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
        foo = tvm.build(s, [A, B], device, name="sum")
        # Test
        in_npy = np.random.normal(size=in_shape).astype(np.float32)
        if type == "sum":
            out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy.min(axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError
        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    global TASK
    # Build the logic and compile the function
    A = te.placeholder(shape=in_shape, name="A")
    if type == "sum":
        TASK = "sum_map_id%d" % test_id
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        TASK = "max_map_id%d" % test_id
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        TASK = "min_map_id%d" % test_id
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)

    with tvm.target.build_config(auto_unroll_max_step=16, auto_unroll_min_depth=0):
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")

    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    if type == "sum":
        out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy.min(axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())
    for _ in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
def check(device, dtype, m=32, n=32):
    ctx = tvm.context(device, 0)
    if not ctx.exist or not tvm.runtime.enabled(device):
        print("skip because", device, "is not enabled..")
        return
    if dtype == "float16" and not have_fp16(ctx.compute_version):
        print("Skip because gpu does not have fp16 support")
        return

    a = tvm.te.placeholder((m, n), name="a", dtype=dtype)
    b = topi.sum(a)
    with tvm.target.create(device):
        sb = tvm.te.create_schedule(b.op)
        i, _ = b.op.reduce_axis
        sb[b].bind(i, tvm.te.thread_axis("threadIdx.x"))
        func = tvm.build(sb, [a, b], device)
        a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
        b_np = np.sum(a_np)
        a_nd = tvm.nd.array(a_np, ctx)
        b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx)
        func(a_nd, b_nd)
        tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
    global TASK
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A")
    if type == "sum":
        TASK = "sum_map_id%d" % test_id
        B = topi.sum(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        TASK = "max_map_id%d" % test_id
        B = topi.max(A, axis=axis, keepdims=keepdims)
    elif type == "min":
        TASK = "min_map_id%d" % test_id
        B = topi.min(A, axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    s = topi.cuda.schedule_reduce(B)

    with tvm.build_config(auto_unroll_max_step=16, auto_unroll_min_depth=0):
        fcuda = tvm.build(s, [A, B], "cuda", name="sum")

    # Test
    in_npy = np.random.normal(size=in_shape).astype(np.float32)
    if type == "sum":
        out_npy = in_npy.sum(axis=axis, keepdims=keepdims)
    elif type == "max":
        out_npy = in_npy.max(axis=axis, keepdims=keepdims)
    elif type == "min":
        out_npy = in_npy.min(axis=axis, keepdims=keepdims)
    else:
        raise NotImplementedError
    data_tvm = tvm.nd.array(in_npy, ctx=tvm.gpu())
    out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=tvm.gpu())
    for _ in range(2):
        fcuda(data_tvm, out_tvm)
    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, m), "k")
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
s = tvm.create_schedule(B.op)

######################################################################
# and to examine the IR code in human readable format, we can do
#
print(tvm.lower(s, [A], simple_mode=True))

######################################################################
# However, for such a common operation we had to define the reduce axis ourselves,
# as well as the explicit computation with :code:`tvm.compute`. Imagine how many
# details we would need to provide for more complicated operations.
# Fortunately, we can replace those two lines with a simple :code:`topi.sum`,
# much like :code:`numpy.sum`.
#
C = topi.sum(A, axis=1)
ts = tvm.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))

######################################################################
# Numpy-style operator overloading
# --------------------------------
# We can add two tensors using :code:`topi.broadcast_add`, provided they have
# broadcast-compatible shapes. Even shorter, TOPI provides operator overloading
# for such common operations. For example,
#
x, y = 100, 10
a = tvm.placeholder((x, y, y), name="a")
b = tvm.placeholder((y, y), name="b")
c = a + b  # same as topi.broadcast_add
d = a * b  # same as topi.broadcast_mul
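######################################################################
# For intuition, the two reductions above compute the same thing as
# :code:`numpy.sum` over the last axis, and the overloaded operators follow
# NumPy broadcasting rules; a small sketch with assumed concrete sizes:
import numpy as np

n_ref, m_ref = 4, 5
A_np = np.random.rand(n_ref, m_ref).astype("float32")
print(A_np.sum(axis=1).shape)   # (4,) -- what both B and C compute

a_np = np.random.rand(100, 10, 10).astype("float32")
b_np = np.random.rand(10, 10).astype("float32")
print((a_np + b_np).shape)      # (100, 10, 10) -- same as topi.broadcast_add
print((a_np * b_np).shape)      # (100, 10, 10) -- same as topi.broadcast_mul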
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"):
    # Build the logic and compile the function
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = dtype
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "all":
        B = topi.all(A, axis=axis, keepdims=keepdims)
    elif type == "any":
        B = topi.any(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        if dtype == 'bool':
            in_npy_map = in_npy = np.random.choice([True, False], size=in_shape)
        else:
            in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
            in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)

        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "all" and dtype == 'bool':
            out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
        elif type == "any" and dtype == "bool":
            out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError

        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            out_tvm_indices = out_tvm.asnumpy()
            if keepdims:
                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                other_indices = tuple(
                    np.indices(in_shape[0:axis] + in_shape[(axis + 1):]))
                sel_indices = other_indices[0:axis] + (
                    out_tvm_indices, ) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
            elif type == "argmin":
                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
        else:
            tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    for device in get_all_backend():
        check_device(device)
def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
    # Build the logic and compile the function
    dat_dtype = "float32"
    A = tvm.placeholder(shape=in_shape, name="A", dtype=dat_dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = "float32"
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.generic.schedule_reduce(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        in_npy = np.random.uniform(size=in_shape).astype(np.float32)
        in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError

        data_tvm = tvm.nd.array(in_npy, ctx=ctx)
        out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            out_tvm_indices = out_tvm.asnumpy()
            if keepdims:
                out_tvm_indices = np.take(out_tvm_indices, indices=0, axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                other_indices = tuple(np.indices(in_shape[0:axis] + in_shape[(axis + 1):]))
                sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
            elif type == "argmin":
                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
        else:
            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)

    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan"]:
        check_device(device)
for i in range(num_timesteps):
    inp = topi.concatenate([xs[i], new_h], 1)
    g = topi.tanh(topi.matmul(inp, weights[0]) + weights[1])
    j = topi.sigmoid(topi.matmul(inp, weights[2]) + weights[3])
    f = topi.sigmoid(topi.matmul(inp, weights[4]) + weights[5])
    o = topi.sigmoid(topi.matmul(inp, weights[6]) + weights[7])
    new_s = new_s * f + g * j
    new_h = topi.tanh(new_s) * o

logits = topi.matmul(new_h, weights[8]) + weights[9]

# compute accuracy
pred = topi.nn.softmax(logits)
correct_pred = topi.equal(topi.argmax(y, 1), topi.argmax(pred, 1))
accuracy = topi.sum(correct_pred.astype('float32')) / batch_size

# Define loss and optimizer
loss = topi.sum(-topi.sum(y * topi.nn.log_softmax(logits), axis=1)) / batch_size

head = topi.full((1, ), 'float32', 1.0)
gradients = list(tvm.differentiate(topi.reshape(loss, (1, )), weights, head))
new_weights = [w - lr * g for (w, g) in zip(weights, gradients)]

# Define model
sched = tvm.create_schedule([loss.op, accuracy.op] + [x.op for x in new_weights])
parallel_schedule(sched)
train_model = tvm.build(sched, [x, y, s, h, loss, accuracy, *weights, *new_weights])
def demo_conv2d():
    lrate = 0.1
    nbatches = 100  # batches to train
    num_classes = 10
    batch_size = 10
    img_h = 28
    img_w = 28
    img_c = 1

    f1_c = 4
    f2_c = 5
    f3_units = 16

    x = tvm.placeholder((batch_size, img_h, img_w, img_c), name='x')
    y = tvm.placeholder((batch_size, num_classes), name='y')

    print('Block1')
    w1 = tvm.placeholder((3, 3, img_c, f1_c), name='w1')
    b1 = tvm.placeholder((f1_c, ), name='b1')
    t = topi.nn.conv2d(x, w1, 1, 0, layout='NHWC', out_dtype=tvm.float32)
    t = t + topi.broadcast_to(b1, (batch_size, 1, 1, f1_c))
    print('Block1: after-biasing shape is', get_shape(t))
    t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max', layout='NHWC')
    print('Block1: after-pooling shape is', get_shape(t))
    t = topi.nn.relu(t)
    print('Block1: after-relu shape is', get_shape(t))

    print('Block2')
    w2 = tvm.placeholder((3, 3, f1_c, f2_c), name='w2')
    b2 = tvm.placeholder((f2_c, ), name='b2')
    t = topi.nn.conv2d(t, w2, 1, 0, layout='NHWC', out_dtype=tvm.float32)
    t = t + topi.broadcast_to(b2, (batch_size, 1, 1, f2_c))
    print('Block2: after-biasing shape is', get_shape(t))
    t = topi.nn.pool(t, [2, 2], [2, 2], [0, 0, 0, 0], 'max', layout='NHWC')
    print('Block2: after-pooling shape is', get_shape(t))
    t = topi.nn.relu(t)
    print('Block2: after-relu shape is', get_shape(t))
    t = topi.nn.flatten(t)
    print('Block2: after-flatten shape is', get_shape(t))

    print('Block3')
    w3 = tvm.placeholder((f3_units, get_shape(t)[1]))
    b3 = tvm.placeholder((f3_units, ))
    t = topi.nn.dense(t, w3, b3)
    print('Block3: after-dense shape is', get_shape(t))

    print('Block4')
    w4 = tvm.placeholder((num_classes, get_shape(t)[1]))
    b4 = tvm.placeholder((num_classes, ))
    t = topi.nn.dense(t, w4, b4)
    print('Block4: after-dense shape is', get_shape(t))
    t = topi.nn.relu(t)

    p = topi.argmax(t, axis=1)
    # TODO: check the correctness of the log_softmax expression
    # TODO: figure out the difference between it and standard cross-entropy loss
    l = -topi.sum(y * topi.nn.log_softmax(t)) / batch_size
    print('Block4: loss shape is', get_shape(l))

    ones = topi.full_like(l, 1.0)
    # [dl_dw1, dl_db1, dl_dw2, dl_db2, dl_dw3, dl_db3, dl_dw4, dl_db4]
    params = [w1, b1, w2, b2, w3, b3, w4, b4]
    dl = list(tvm.ir_pass.JacobianRecursive(l, params, ones))
    assert len(params) == len(dl)
    print('dl_dw1 weight is', get_shape(params[0]))

    sdl = tvm.create_schedule([p.op for p in [x, y, l] + params + dl])
    mdl = tvm.build(sdl, [x, y, l] + params + dl)
    print('Train+Inference module', mdl)

    # sl = tvm.create_schedule([l.op])
    # ml = tvm.build(sdl, [x, y] + params + [l])
    # print('Inference module', ml)

    state = {}
    for p in params:
        state.update({
            p: tvm.nd.array(
                np.random.uniform(-1.0, 1.0, size=get_shape(p)).astype(np.float32))
        })

    grads = {}
    for p, g in zip(params, dl):
        grads.update({p: tvm.nd.empty(get_shape(g))})

    for ib in range(nbatches):
        b = range(ib * batch_size, (ib + 1) * batch_size)
        tx = tvm.nd.array(mnist_img(b))
        ty = tvm.nd.array(mnist_cls_oh(b))
        tl = tvm.nd.empty(shape=(), dtype=tvm.float32)

        print('Entering')
        mdl(*([tx, ty, tl] + list(state.values()) + list(grads.values())))
        print('Done', 'loss', tl.asnumpy())

        state2 = {}
        for p in params:
            state2.update({
                p: tvm.nd.array(state[p].asnumpy() - lrate * grads[p].asnumpy())
            })
        state = state2
def compute_cross_entropy(attrs, inputs, out_dtype):
    x, y = inputs
    return [-topi.sum(topi.log(x) * y) / x.shape[0]]
def compute_cross_entropy_with_logits(attrs, inputs, out_dtype):
    x, y = inputs
    return [-topi.sum(x * y) / x.shape[0]]
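######################################################################
# The two compute rules above implement -sum(log(x) * y) / N and
# -sum(x * y) / N, with N = x.shape[0]. A minimal NumPy sketch of the same
# arithmetic with made-up inputs; the two printed values coincide when the
# second input already holds log-probabilities:
import numpy as np

x_probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]], dtype="float32")  # probabilities
y_onehot = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype="float32")  # one-hot labels
print(-np.sum(np.log(x_probs) * y_onehot) / x_probs.shape[0])   # cross_entropy
log_probs = np.log(x_probs)
print(-np.sum(log_probs * y_onehot) / log_probs.shape[0])       # cross_entropy_with_logits form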
import tvm
import topi

x = tvm.te.placeholder((32, 3, 28, 28), name='x')
w1 = tvm.te.placeholder((10, 3, 3, 3), name='w1')
w2 = tvm.te.placeholder((10, 10, 3, 3), name='w2')
z1 = topi.nn.conv2d(x, w1, 1, 1, 1)
z2 = topi.nn.conv2d(z1, w2, 1, 1, 1)
y = topi.sum(z2)

# produce gradients
[dw1] = tvm.te.gradient(y, [w1])
print(type(dw1))

# produce Jacobians
[jw1, jw2] = tvm.te.gradient(z2, [w1, w2])

# produce gradients, the head adjoint for z2 is provided manually
[dw1, dw2] = tvm.te.gradient(z2, [w1, w2], topi.full_like(z2, 1.0))
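######################################################################
# A possible way to compile and run the last pair of gradients above,
# following the same create_schedule/build pattern used in the other
# snippets here (a sketch assuming an llvm target is available; the
# unscheduled gradient of conv2d may compile and run slowly):
import numpy as np

s = tvm.te.create_schedule([dw1.op, dw2.op])
f = tvm.build(s, [x, w1, w2, dw1, dw2], "llvm")

ctx = tvm.context("llvm", 0)
x_nd = tvm.nd.array(np.random.uniform(size=(32, 3, 28, 28)).astype("float32"), ctx)
w1_nd = tvm.nd.array(np.random.uniform(size=(10, 3, 3, 3)).astype("float32"), ctx)
w2_nd = tvm.nd.array(np.random.uniform(size=(10, 10, 3, 3)).astype("float32"), ctx)
dw1_nd = tvm.nd.empty((10, 3, 3, 3), "float32", ctx)
dw2_nd = tvm.nd.empty((10, 10, 3, 3), "float32", ctx)
f(x_nd, w1_nd, w2_nd, dw1_nd, dw2_nd)
print(dw1_nd.shape, dw2_nd.shape)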
from __future__ import absolute_import, print_function

import tvm
import topi
import numpy as np

if __name__ == '__main__':
    x, y = 100, 10
    a = tvm.placeholder((x, y, y), name='a')
    b = tvm.placeholder((y, y), name='b')
    c = a + b
    d = a * b
    e = topi.elemwise_sum([c, d])
    f = e / 2.0
    g = topi.sum(f)
    with tvm.target.cuda():
        sg = topi.generic.schedule_reduce(g)
        print(tvm.lower(sg, [a, b], simple_mode=True))
# https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation
import tvm
import topi
import numpy

x = tvm.te.placeholder((3, ), name='x')
w = tvm.te.placeholder((3, ), name='w')
z1 = topi.multiply(x, w)
z2 = topi.sum(z1)
z3 = topi.multiply(z2, -1)
z4 = topi.exp(z3)
z5 = topi.add(z4, 1)
z6 = topi.divide(1, z5)

[dw] = tvm.te.gradient(z6, w)
s = tvm.te.create_schedule(dw.op)
g = tvm.build(s, [x, w, dw])

# The default tensor type in tvm
dtype = "float32"
target = 'llvm'
ctx = tvm.context(target, 0)

# Random generated tensor for testing
x1 = tvm.nd.array(numpy.array([1, 3, 2]).astype(dtype), ctx)
w1 = tvm.nd.array(numpy.array([2, 1, -2]).astype(dtype), ctx)
dw1 = tvm.nd.empty(shape=(3, ), dtype='float32', ctx=ctx)

g(x1, w1, dw1)
print("ret=", dw1)
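######################################################################
# Sanity check for the result above: z6 is sigmoid(dot(x, w)), so the
# expected gradient is sigmoid(s) * (1 - sigmoid(s)) * x with s = dot(x, w).
# A NumPy sketch of that closed form, for comparison only:
import numpy as np

x_np = np.array([1, 3, 2], dtype="float32")
w_np = np.array([2, 1, -2], dtype="float32")
s_val = np.dot(x_np, w_np)              # 1.0
sig = 1.0 / (1.0 + np.exp(-s_val))      # sigmoid(1) ~= 0.731
print(sig * (1.0 - sig) * x_np)         # ~= [0.197, 0.590, 0.393], should match dw1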