def test_module_quantize_args():
    hcl.init()

    def algorithm(A, B):
        @hcl.def_([A.shape, B.shape, ()])
        def add(A, B, x):
            hcl.return_(A[x] + B[x])

        return hcl.compute(A.shape, lambda x: add(A, B, x), "C")

    A = hcl.placeholder((10,), dtype=hcl.UInt(2))
    B = hcl.placeholder((10,))
    s = hcl.create_scheme([A, B], algorithm)
    s.downsize([algorithm.add.A], hcl.UInt(2))
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s)

    a = np.random.randint(100, size=(10,))
    b = np.random.randint(100, size=(10,))
    c = np.zeros(10)
    _A = hcl.asarray(a, hcl.UInt(2))
    _B = hcl.asarray(b)
    _C = hcl.asarray(c)
    f(_A, _B, _C)
    _A = _A.asnumpy()
    _B = _B.asnumpy()
    _C = _C.asnumpy()
    # the downsized argument A is UInt(2), so its values wrap modulo 4
    for i in range(0, 10):
        assert _C[i] == a[i] % 4 + b[i]
def test_uint_imm_ops():
    A = hcl.placeholder((10, 10), "A", dtype=hcl.UInt(1))

    def kernel(A):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], 0), "B")

    s = hcl.create_scheme(A, kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "(unsigned int)0U)" in code
def test_binary_ops():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.Int(20))
    B = hcl.placeholder((8, 8), "B", dtype=hcl.Fixed(16, 12))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], B[y][x]),
                           "C", dtype=hcl.Int(8))

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "(ap_fixed<32, 20>)B" in code
def test_imm_ops():
    A = hcl.placeholder((10, 10), "A")

    def kernel(A):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x] + A[y+2][x+2], 0),
                           "B")

    s = hcl.create_scheme(A, kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "((ap_int<33>)0)" in code
    assert "((ap_int<33>)(((ap_int<33>)A" in code
def test_uint_int():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.Fixed(20, 12))
    B = hcl.placeholder((8, 8), "B", dtype=hcl.UFixed(16, 12))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], B[y][x]),
                           "C", dtype=hcl.Int(8))

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "ap_ufixed<20, 8>)A" in code
def build_bnn_inf_opt(batch_size=batch_size, target=target):
    hcl_ph = []
    input_image = hcl.placeholder((batch_size, 1, 16, 16),
                                  "input_image", qtype_bit)
    for name in params:
        dtype = qtype_bit if ("conv" in name or "w_" in name) else qtype_float
        hcl_ph.append(hcl.placeholder(params[name].shape, name, dtype=dtype))

    # build the network
    scheme = hcl.create_scheme([input_image] + hcl_ph, build_bnn)
    s = hcl.create_schedule_from_scheme(scheme)

    def plot_dataflow_graph():
        import matplotlib.pyplot as plt
        import networkx as nx
        graph, op = s.dataflow_graph(plot=True)
        nx.draw(graph, with_labels=True)
        plt.savefig("bnn.png")

    # compute optimization
    layer_names = build_bnn.__dict__.keys()
    for layer in layer_names:
        s_layer = getattr(build_bnn, layer)
        if "bn" in layer:  # fuse conv
            s_conv = getattr(build_bnn, "conv" + layer[-1])
            s[s_conv].compute_at(s[s_layer], s_layer.axis[3])
            if layer == "bn1":
                s[s_layer].pipeline(s_layer.axis[3])  # will be refreshed
            else:
                s[s_conv].pipeline(s_conv.axis[4])
        elif "pool" in layer:
            s[s_layer].pipeline(s_layer.axis[2])
        elif "fc" in layer:
            s[s_layer].pipeline(s_layer.axis[1])
        elif "flatten" in layer:
            s[s_layer].pipeline(s_layer.axis[1])
        elif "dense_relu" in layer:
            s_fc = getattr(build_bnn, "fc1")
            s[s_fc].compute_at(s[s_layer], s_layer.axis[1])
            s[s_fc].pipeline(s_fc.axis[2])

    if isinstance(target, hcl.platform):
        s.to([input_image] + hcl_ph, target.xcel)
        s.to(build_bnn.fc2, target.host)
        target.config(compile="vivado_hls", mode="csyn")

    # memory optimization
    s.partition(input_image, hcl.Partition.Block, dim=1, factor=8)
    for ph in reversed(hcl_ph):
        if ph.name in ["b_fc2", "fc2"]:
            s.partition(ph, hcl.Partition.Complete, dim=1)
        else:
            s.partition(ph, hcl.Partition.Block, dim=1, factor=8)

    return hcl.build(s, target=target)
def test():
    hcl.init()
    A = hcl.placeholder((8, 8), "A")

    # `foo` is assumed to be defined elsewhere in this file (e.g., a
    # registered extern function); this test only exercises code generation.
    def kernel(A):
        return hcl.compute((8, 8), lambda y, x: foo(A[y, x] + A[y, x]), "C")

    s = hcl.create_scheme([A], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, "vhls")
    print(f)
def test3():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.UInt(2))
    B = hcl.placeholder((8, 8), "B", dtype=hcl.UInt(2))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y, x][0], 0), "C")

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, "vhls")
    print(f)
def test1():
    A = hcl.placeholder((8, 8), "A")
    B = hcl.placeholder((8, 8), "B", dtype=hcl.Fixed(16, 12))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], B[y][x]),
                           "C")

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, target="vhls")
    print(f)
def test2():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.UInt(1))

    def kernel(A):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], 0), "B")

    s = hcl.create_scheme([A], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, target="vhls")
    with open("select_test.cpp", "w") as outfile:
        outfile.write(f)
def build_lenet_inf(batch_size=batch_size, target=None):
    # set up input/output placeholders
    input_image = hcl.placeholder((batch_size, 1, 28, 28), "input_image")
    weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1", qtype1)
    weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2", qtype1)
    weight_fc1 = hcl.placeholder((500, 800), "weight_fc1", qtype1)
    weight_fc2 = hcl.placeholder((10, 500), "weight_fc2", qtype1)
    lenet = hcl.placeholder((batch_size, 10), "lenet")

    # create a quantization scheme
    scheme = hcl.create_scheme([
        input_image, weight_conv1, weight_conv2,
        weight_fc1, weight_fc2, lenet
    ], build_lenet)
    # quantize the three activation layers
    scheme.quantize(
        [build_lenet.tanh1, build_lenet.tanh2, build_lenet.tanh3], qtype2)
    s = hcl.create_schedule_from_scheme(scheme)
    return hcl.build(s, target=target)
def build_bnn_inf(batch_size=batch_size, target=target):
    hcl_ph = []
    input_image = hcl.placeholder((batch_size, 1, 16, 16),
                                  "input_image", qtype_bit)
    for name in params:
        dtype = qtype_bit if ("conv" in name or "w_" in name) else qtype_float
        hcl_ph.append(hcl.placeholder(params[name].shape, name, dtype=dtype))

    # build the network
    scheme = hcl.create_scheme([input_image] + hcl_ph, build_bnn)
    s = hcl.create_schedule_from_scheme(scheme)
    # if isinstance(target, hcl.platform):
    #     s.to([input_image] + hcl_ph, target.xcel)
    #     s.to(build_bnn.fc2, target.host)
    #     target.config(compile="vivado_hls", mode="csyn")
    return hcl.build(s, target=target)
def test_resize():
    def algorithm(A):
        return hcl.compute(A.shape, lambda x: A[x] + 1, "B")

    A = hcl.placeholder((10,), dtype=hcl.UInt(32))
    scheme = hcl.create_scheme([A], algorithm)
    scheme.downsize(algorithm.B, hcl.UInt(2))
    s = hcl.create_schedule_from_scheme(scheme)
    f = hcl.build(s)

    a = np.random.randint(100, size=(10,))
    _A = hcl.asarray(a, dtype=hcl.UInt(32))
    _B = hcl.asarray(np.zeros(10), dtype=hcl.UInt(2))
    f(_A, _B)
    _A = _A.asnumpy()
    _B = _B.asnumpy()
    # B is downsized to UInt(2), so the result wraps modulo 4
    for i in range(10):
        assert _B[i] == (a[i] + 1) % 4
def build_ultranet_hls(batch_size=batch_size, target=None):
    # set up input/output placeholders
    input_image = hcl.placeholder((batch_size, 3, 160, 320),
                                  dtype=input_dtype, name="input_image")
    weight_conv1 = hcl.placeholder((16, 3, 3, 3), dtype=weight_dtype,
                                   name="weight_conv1")  # 3 in, 16 out
    a_batchnorm1 = hcl.placeholder((16,), dtype=bn_a_dtype, name="a_batchnorm1")
    b_batchnorm1 = hcl.placeholder((16,), dtype=bn_b_dtype, name="b_batchnorm1")
    weight_conv2 = hcl.placeholder((32, 16, 3, 3), dtype=weight_dtype,
                                   name="weight_conv2")  # 16 in, 32 out
    a_batchnorm2 = hcl.placeholder((32,), dtype=bn_a_dtype, name="a_batchnorm2")
    b_batchnorm2 = hcl.placeholder((32,), dtype=bn_b_dtype, name="b_batchnorm2")
    weight_conv3 = hcl.placeholder((64, 32, 3, 3), dtype=weight_dtype,
                                   name="weight_conv3")  # 32 in, 64 out
    a_batchnorm3 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm3")
    b_batchnorm3 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm3")
    weight_conv4 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv4")  # 64 in, 64 out
    a_batchnorm4 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm4")
    b_batchnorm4 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm4")
    weight_conv5 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv5")  # 64 in, 64 out
    a_batchnorm5 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm5")
    b_batchnorm5 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm5")
    weight_conv6 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv6")  # 64 in, 64 out
    a_batchnorm6 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm6")
    b_batchnorm6 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm6")
    weight_conv7 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv7")  # 64 in, 64 out
    a_batchnorm7 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm7")
    b_batchnorm7 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm7")
    weight_conv8 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv8")  # 64 in, 64 out
    a_batchnorm8 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm8")
    b_batchnorm8 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm8")

    sm = hcl.create_scheme([
        input_image,
        weight_conv1, a_batchnorm1, b_batchnorm1,
        weight_conv2, a_batchnorm2, b_batchnorm2,
        weight_conv3, a_batchnorm3, b_batchnorm3,
        weight_conv4, a_batchnorm4, b_batchnorm4,
        weight_conv5, a_batchnorm5, b_batchnorm5,
        weight_conv6, a_batchnorm6, b_batchnorm6,
        weight_conv7, a_batchnorm7, b_batchnorm7,
        weight_conv8, a_batchnorm8, b_batchnorm8
    ], ultranet)

    # quantize activations
    sm.quantize(ultranet.conv1, conv_dtype)
    sm.quantize(ultranet.relu1, act_dtype)
    sm.quantize(ultranet.conv2, conv_dtype)
    sm.quantize(ultranet.relu2, act_dtype)
    sm.quantize(ultranet.conv3, conv_dtype)
    sm.quantize(ultranet.relu3, act_dtype)
    sm.quantize(ultranet.conv4, conv_dtype)
    sm.quantize(ultranet.relu4, act_dtype)
    sm.quantize(ultranet.conv5, conv_dtype)
    sm.quantize(ultranet.relu5, act_dtype)
    sm.quantize(ultranet.conv6, conv_dtype)
    sm.quantize(ultranet.relu6, act_dtype)
    sm.quantize(ultranet.conv7, conv_dtype)
    sm.quantize(ultranet.relu7, act_dtype)
    sm.quantize(ultranet.conv8, conv_dtype)
    sm.quantize(ultranet.relu8, act_dtype)

    s = hcl.create_schedule_from_scheme(sm, "main")

    # create line buffers and window buffers for the conv layers
    for i in range(1, 1 + 8):
        conv_pad = getattr(ultranet, 'conv' + str(i) + '_pad')
        conv = getattr(ultranet, 'conv' + str(i))
        LB = s.reuse_at(conv_pad._op, s[conv], conv.axis[2],
                        f"conv{i}_line_buffer")
        WB = s.reuse_at(LB, s[conv], conv.axis[3],
                        f"conv{i}_window_buffer")

    # conv3 = ultranet.conv3
    # xo, yo, xi, yi = s[conv3].tile(conv3.axis[2], conv3.axis[3], 4, 4)
    # s[conv3].reorder(yo, xo, yi, xi)
    # print(hcl.lower(s))

    if opt:
        # merge conv + bn + relu operators
        for i in range(1, 1 + 8):
            pad = getattr(ultranet, 'conv' + str(i) + '_pad')
            conv = getattr(ultranet, 'conv' + str(i))
            bn = getattr(ultranet, 'batch_norm' + str(i))
            relu = getattr(ultranet, 'relu' + str(i))
            # Can't merge pad with conv, a limitation of HCL.
            # s[pad].compute_at(s[conv], conv.axis[3])
            s[bn].compute_at(s[relu], relu.axis[3])
        res = ultranet.result
        relu8 = ultranet.relu8
        s[relu8].compute_at(s[res], res.axis[3])

        # pipeline all layers
        for i in range(1, 1 + 8):
            pad = getattr(ultranet, 'conv' + str(i) + '_pad')
            conv = getattr(ultranet, 'conv' + str(i))
            bn_relu = getattr(ultranet, 'relu' + str(i))
            s[pad].pipeline(pad.axis[3])
            # s[conv].pipeline(conv.axis[4])
            s[conv].pipeline(conv.axis[3])
            s[bn_relu].pipeline(bn_relu.axis[3])
            if i <= 4:
                pool_pad = getattr(ultranet, 'pool' + str(i) + '_pad')
                pool = getattr(ultranet, 'pool' + str(i))
                s[pool_pad].pipeline(pool_pad.axis[3])
                s[pool].pipeline(pool.axis[3])
        s[ultranet.result].pipeline(ultranet.result.axis[3])

    # partition weight buffers
    if partition:
        # weights need to be partitioned in dims 2, 3, and 4;
        # for now HeteroCL doesn't support multi-dimensional partition
        s.partition(weight_conv1, dim=2)
        s.partition(weight_conv2, dim=2)
        s.partition(weight_conv3, dim=2)
        s.partition(weight_conv4, dim=2)
        s.partition(weight_conv5, dim=2)
        s.partition(weight_conv6, dim=2)
        s.partition(weight_conv7, dim=2)
        s.partition(weight_conv8, dim=2)

    # fifo across layers
    if stream:
        '''
        Note: the padding layers' pipelining has to precede the other
        layers' because of a bug in HeteroCL: when an ifThenElse statement
        reads/writes the same buffer in both branches, HeteroCL thinks the
        buffer is accessed twice, which prevents pipelining. For now, ?:
        works but if..else.. doesn't, because the latter has two load/store
        nodes. (A side-by-side sketch follows this function.)
        '''
        s.to(ultranet.conv1_pad, s[ultranet.conv1], fifo_depth=128)
        s.to(ultranet.conv2_pad, s[ultranet.conv2], fifo_depth=128)
        s.to(ultranet.conv3_pad, s[ultranet.conv3], fifo_depth=128)
        s.to(ultranet.conv4_pad, s[ultranet.conv4], fifo_depth=128)
        s.to(ultranet.conv5_pad, s[ultranet.conv5], fifo_depth=128)
        s.to(ultranet.conv6_pad, s[ultranet.conv6], fifo_depth=128)
        s.to(ultranet.conv7_pad, s[ultranet.conv7], fifo_depth=128)
        s.to(ultranet.conv8_pad, s[ultranet.conv8], fifo_depth=128)
        s.to(ultranet.conv1, s[ultranet.relu1], fifo_depth=128)
        s.to(ultranet.relu1, s[ultranet.pool1_pad], fifo_depth=128)
        s.to(ultranet.pool1_pad, s[ultranet.pool1], fifo_depth=128)
        s.to(ultranet.pool1, s[ultranet.conv2_pad], fifo_depth=128)
        s.to(ultranet.conv2, s[ultranet.relu2], fifo_depth=128)
        s.to(ultranet.relu2, s[ultranet.pool2_pad], fifo_depth=128)
        s.to(ultranet.pool2_pad, s[ultranet.pool2], fifo_depth=128)
        s.to(ultranet.pool2, s[ultranet.conv3_pad], fifo_depth=128)
        s.to(ultranet.conv3, s[ultranet.relu3], fifo_depth=128)
        s.to(ultranet.relu3, s[ultranet.pool3_pad], fifo_depth=128)
        s.to(ultranet.pool3_pad, s[ultranet.pool3], fifo_depth=128)
        s.to(ultranet.pool3, s[ultranet.conv4_pad], fifo_depth=128)
        s.to(ultranet.conv4, s[ultranet.relu4], fifo_depth=128)
        s.to(ultranet.relu4, s[ultranet.pool4_pad], fifo_depth=128)
        s.to(ultranet.pool4_pad, s[ultranet.pool4], fifo_depth=128)
        s.to(ultranet.pool4, s[ultranet.conv5_pad], fifo_depth=128)
        s.to(ultranet.conv5, s[ultranet.relu5], fifo_depth=128)
        s.to(ultranet.relu5, s[ultranet.conv6_pad], fifo_depth=128)
        s.to(ultranet.conv6, s[ultranet.relu6], fifo_depth=128)
        s.to(ultranet.relu6, s[ultranet.conv7_pad], fifo_depth=128)
        s.to(ultranet.conv7, s[ultranet.relu7], fifo_depth=128)
        s.to(ultranet.relu7, s[ultranet.conv8_pad], fifo_depth=128)
        s.to(ultranet.conv8, s[ultranet.result], fifo_depth=128)

    return hcl.build(s, name="main", target=target)
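# The note in build_ultranet_hls about ``?:`` versus ``if..else..`` is easiest
# to see side by side. The sketch below is illustrative only (a hypothetical
# kernel, not part of the UltraNet build): ``hcl.select`` lowers to a single
# conditional store per element, while writing the same buffer from both
# branches of ``hcl.if_``/``hcl.else_`` produces two store nodes on one
# buffer, which the note above says makes HeteroCL refuse to pipeline.
def select_vs_ifelse_sketch():
    hcl.init()
    A = hcl.placeholder((8, 8), "A")

    def kernel_select(A):
        # ternary style: one store per element, pipelines cleanly
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], 0), "B")

    def kernel_branch(A):
        B = hcl.compute((8, 8), lambda y, x: 0, "B")

        def update(y, x):
            # both branches store to B: two load/store nodes on one buffer
            with hcl.if_(x < 4):
                B[y][x] = A[y][x]
            with hcl.else_():
                B[y][x] = 0

        hcl.mutate((8, 8), lambda y, x: update(y, x), "U")
        return B

    # only the select-style kernel is scheduled here; kernel_branch shows
    # the shape of the code that trips the double-access check
    s = hcl.create_schedule([A], kernel_select)
    s[kernel_select.B].pipeline(kernel_select.B.axis[1])
    print(hcl.build(s, target="vhls"))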
def top(target=None):
    # Algorithm definition (§1)
    def knn(test_image, train_images):
        # Imperative programming and bit operations (§2)
        def popcount(num):
            out = hcl.scalar(0, "out")
            with hcl.for_(0, train_images.type.bits) as i:
                # Bit selection operation
                out.v += num[i]
            return out.v

        # This function updates the candidates, i.e., `knn_mat`. Here we
        # mutate through the shape of tensor `dist`. For each `dist` value,
        # if it is smaller than the maximum candidate, we replace it.
        def update_knn(dist, knn_mat, i, j):
            max_id = hcl.scalar(0, "max_id")
            with hcl.for_(0, 3) as k:
                with hcl.if_(knn_mat[i][k] > knn_mat[i][max_id.v]):
                    max_id.v = k
            with hcl.if_(dist[i][j] < knn_mat[i][max_id.v]):
                knn_mat[i][max_id.v] = dist[i][j]

        # Main algorithm (§3)
        # First step: XOR (§3.1)
        diff = hcl.compute(train_images.shape,
                           lambda x, y: train_images[x][y] ^ test_image,
                           "diff")
        # Second step: popcount (§3.2)
        dist = hcl.compute(diff.shape, lambda x, y: popcount(diff[x][y]),
                           "dist")
        # Third step: initialize the candidates (§3.3)
        knn_mat_buf = hcl.compute((10, 4), lambda x, y: 50, "knn_mat_buf")
        # Fourth step: update the candidates (§3.4)
        hcl.mutate(dist.shape,
                   lambda x, y: update_knn(dist, knn_mat_buf, x, y),
                   "knn_update")
        knn_mat = hcl.compute((10, 3), lambda x, y: knn_mat_buf[x][y],
                              "knn_mat")
        # Final step: return the candidates (§3.5)
        return knn_mat

    # Inputs/Outputs definition (§4)
    # Scalars (§4.1)
    test_image = hcl.placeholder((), "test_image")
    # Tensors (§4.2)
    train_images = hcl.placeholder(data_size, "train_images")

    # Data type customization (§5.1)
    scheme = hcl.create_scheme([test_image, train_images], knn)
    scheme.downsize([knn.dist, knn.dist.out, knn.knn_mat_buf, knn.knn_mat],
                    dtype_knnmat)

    # Compute customization (§5.2)
    s = hcl.create_schedule_from_scheme(scheme)
    diff = knn.diff
    dist = knn.dist
    knn_mat_buf = knn.knn_mat_buf
    knn_update = knn.knn_update

    # Merge loop nests
    s[diff].compute_at(s[dist], dist.axis[1])
    s[dist].compute_at(s[knn_update], knn_update.axis[1])

    # Reorder loops to expose more parallelism
    s[knn_update].reorder(knn_update.axis[1], knn_update.axis[0])

    # Parallel initialization of the knn matrix
    s[knn_mat_buf].parallel(knn_mat_buf.axis[0])

    # Parallelize the outer loop and pipeline the inner loop
    s[knn_update].parallel(knn_update.axis[0])
    s[knn_update].pipeline(knn_update.axis[1])

    # Parallelize the innermost loop over the 49 pixels
    s[dist].parallel(dist.axis[2])

    # At the end, we build the whole offloaded function.
    return hcl.build(s, target=target)
def top(target=None):
    def smith_waterman(seqA, seqB, consA, consB):
        def similarity_score(a, b):
            return hcl.select(a == b, 1, penalty)

        def find_max(A, len_):
            max_ = hcl.local(A[0], "max")
            act_ = hcl.local(0, "act")
            with hcl.for_(0, len_) as i:
                with hcl.if_(A[i] > max_[0]):
                    max_[0] = A[i]
                    act_[0] = i
            return max_[0], act_[0]

        matrix_max = hcl.local(0, "matrix_max")
        i_max = hcl.local(0, "i_max")
        j_max = hcl.local(0, "j_max")

        matrix = hcl.compute((lenA + 1, lenB + 1), lambda x, y: 0, "matrix")
        action = hcl.compute(matrix.shape, lambda x, y: 3, "action")

        def populate_matrix(i, j):
            trace_back = hcl.compute((4,), lambda x: 0, "trace_back")
            with hcl.if_(hcl.and_(i != 0, j != 0)):
                trace_back[0] = matrix[i - 1, j - 1] + \
                    similarity_score(seqA[i - 1], seqB[j - 1])
                trace_back[1] = matrix[i - 1, j] + penalty
                trace_back[2] = matrix[i, j - 1] + penalty
                trace_back[3] = 0
                matrix[i, j], action[i, j] = find_max(trace_back, 4)
                with hcl.if_(matrix[i, j] > matrix_max[0]):
                    matrix_max[0] = matrix[i, j]
                    i_max[0] = i
                    j_max[0] = j

        P = hcl.mutate((lenA + 1, lenB + 1),
                       lambda i, j: populate_matrix(i, j))

        def align(curr_i, curr_j, next_i, next_j):
            outA = hcl.local(0, "a")
            outB = hcl.local(0, "b")
            with hcl.if_(next_i[0] == curr_i[0]):
                outA[0] = 0
            with hcl.else_():
                outA[0] = seqA[curr_i[0] - 1]
            with hcl.if_(next_j[0] == curr_j[0]):
                outB[0] = 0
            with hcl.else_():
                outB[0] = seqB[curr_j[0] - 1]
            return outA[0], outB[0]

        def get_next(action, i, j):
            act_ = hcl.local(action[i][j], "act")
            next_i = hcl.local(0, "next_i")
            next_j = hcl.local(0, "next_j")
            with hcl.if_(act_[0] == 0):
                next_i[0] = i - 1
                next_j[0] = j - 1
            with hcl.elif_(act_[0] == 1):
                next_i[0] = i - 1
                next_j[0] = j
            with hcl.elif_(act_[0] == 2):
                next_i[0] = i
                next_j[0] = j - 1
            with hcl.else_():
                next_i[0] = i
                next_j[0] = j
            return next_i[0], next_j[0]

        with hcl.Stage("T"):
            curr_i = hcl.local(i_max[0], "curr_i")
            curr_j = hcl.local(j_max[0], "curr_j")
            next_i = hcl.local(0, "next_i")
            next_j = hcl.local(0, "next_j")
            next_i[0], next_j[0] = get_next(action, curr_i[0], curr_j[0])
            tick = hcl.local(0, "tick")
            with hcl.while_(
                    hcl.or_(curr_i[0] != next_i[0], curr_j[0] != next_j[0])):
                consA[tick[0]], consB[tick[0]] = \
                    align(curr_i, curr_j, next_i, next_j)
                curr_i[0], curr_j[0] = next_i[0], next_j[0]
                next_i[0], next_j[0] = get_next(action, curr_i[0], curr_j[0])
                tick[0] += 1

    def batch_sw(seqAs, seqBs, outAs, outBs):
        hcl.mutate(
            (num,),
            lambda t: smith_waterman(seqAs[t], seqBs[t], outAs[t], outBs[t]),
            "B")

    seqAs = hcl.placeholder((num, lenA), "seqAs", dtype)
    seqBs = hcl.placeholder((num, lenB), "seqBs", dtype)
    outAs = hcl.placeholder((num, lenA + lenB), "outAs", dtype)
    outBs = hcl.placeholder((num, lenA + lenB), "outBs", dtype)
    # seqAs = hcl.placeholder((num, lenA), "seqAs")
    # seqBs = hcl.placeholder((num, lenB), "seqBs")
    # outAs = hcl.placeholder((num, lenA + lenB), "outAs")
    # outBs = hcl.placeholder((num, lenA + lenB), "outBs")

    scheme = hcl.create_scheme([seqAs, seqBs, outAs, outBs], batch_sw)
    scheme.downsize([batch_sw.B.matrix, batch_sw.B.action], mtype)
    s = hcl.create_schedule_from_scheme(scheme)
    o, p = s[batch_sw.B].split(batch_sw.B.axis[0], factor=32)
    s[batch_sw.B].pipeline(o)
    # s[batch_sw.B].parallel(p)
    s[batch_sw.B].unroll(p)
    return hcl.build(s, target=target)
# We can also apply data type customization to our defined modules. There are
# two ways to do that. First, you can specify the data types directly in the
# module decorator (a sketch of this follows the example below). Second, you
# can use the ``quantize`` and ``downsize`` APIs. Let's show how we can
# downsize the first example.

A = hcl.placeholder((10,), dtype=hcl.UInt(4))
B = hcl.placeholder((10,), dtype=hcl.UInt(4))
C = hcl.placeholder((10,), dtype=hcl.UInt(4))
D = hcl.placeholder((10,), dtype=hcl.UInt(4))
s = hcl.create_scheme([A, B, C, D], maximum)
# Downsize the input arguments and also the return value
s.downsize([maximum.find_max.A, maximum.find_max.B, maximum.find_max],
           hcl.UInt(4))
# We also need to downsize the intermediate results
s.downsize([maximum.max_1, maximum.max_2], hcl.UInt(4))
s = hcl.create_schedule_from_scheme(s)
f = hcl.build(s)

##############################################################################
# Let's run it.
hcl_A = hcl.asarray(a, hcl.UInt(4))
hcl_B = hcl.asarray(b, hcl.UInt(4))
hcl_C = hcl.asarray(c, hcl.UInt(4))
hcl_D = hcl.asarray(d, hcl.UInt(4))
hcl_O = hcl.asarray(o)
f(hcl_A, hcl_B, hcl_C, hcl_D, hcl_O)
print("Downsized output tensor:")
print(hcl_O)
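##############################################################################
# The comments above mention two ways to set a module's data types; the
# example uses the second (``downsize``). Below is a minimal sketch of the
# first way, with the types declared in the decorator itself. The body of
# ``maximum_typed`` is reconstructed from the stage names used above, and the
# ``dtypes``/``ret_dtype`` keyword names are assumptions about the
# ``hcl.def_`` signature rather than something this example confirms.

def maximum_typed(A, B, C, D):
    # assumed keyword arguments: dtypes for the module arguments,
    # ret_dtype for the return value
    @hcl.def_([A.shape, B.shape, ()],
              dtypes=[hcl.UInt(4), hcl.UInt(4), hcl.Int(32)],
              ret_dtype=hcl.UInt(4))
    def find_max(A, B, x):
        with hcl.if_(A[x] > B[x]):
            hcl.return_(A[x])
        with hcl.else_():
            hcl.return_(B[x])

    # intermediate results typed at declaration instead of via downsize()
    max_1 = hcl.compute(A.shape, lambda x: find_max(A, B, x), "max_1",
                        dtype=hcl.UInt(4))
    max_2 = hcl.compute(A.shape, lambda x: find_max(C, D, x), "max_2",
                        dtype=hcl.UInt(4))
    return hcl.compute(A.shape, lambda x: find_max(max_1, max_2, x), "max_o",
                       dtype=hcl.UInt(4))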
# -----------------------------------
# create fixed-point scheme
# -----------------------------------
from uptune import autotune, feedback

take_log, type_log = list(), list()
for index in range(len(name_pool)):
    primitive = eval('build_resnet.' + name_pool[index])
    taken = autotune(1, (0, 1))
    fraction = autotune(18, (0, 16))
    bitwidth = 32
    datatype = hcl.Fixed(bitwidth, fraction)
    take_log.append(taken)
    type_log.append(datatype)
    if taken:
        scheme.quantize(primitive, datatype)

s = hcl.create_schedule_from_scheme(scheme)
f = hcl.build(s, target="llvm")

# ---------------------------------
# evaluation of run through synthesis
# ---------------------------------
# load validation data from imagenet
# jitter_param = 0.4
# lighting_param = 0.1
# num_gpur = 1
# mean_rgb = [123.68, 116.779, 103.939]
# std_rgb = [58.393, 57.12, 57.375]
# ctx = [mx.cpu(0)]
#
# val_data = mx.io.ImageRecordIter(
#     path_imgrec = '/work/zhang-x2/common/datasets/imagenet-mxnet/val.rec',
def build_ultranet_inf(batch_size=batch_size, target=None):
    # set up input/output placeholders
    input_image = hcl.placeholder((batch_size, 3, 160, 320),
                                  dtype=input_dtype, name="input_image")
    weight_conv1 = hcl.placeholder((16, 3, 3, 3), dtype=weight_dtype,
                                   name="weight_conv1")  # 3 in, 16 out
    a_batchnorm1 = hcl.placeholder((16,), dtype=bn_a_dtype, name="a_batchnorm1")
    b_batchnorm1 = hcl.placeholder((16,), dtype=bn_b_dtype, name="b_batchnorm1")
    weight_conv2 = hcl.placeholder((32, 16, 3, 3), dtype=weight_dtype,
                                   name="weight_conv2")  # 16 in, 32 out
    a_batchnorm2 = hcl.placeholder((32,), dtype=bn_a_dtype, name="a_batchnorm2")
    b_batchnorm2 = hcl.placeholder((32,), dtype=bn_b_dtype, name="b_batchnorm2")
    weight_conv3 = hcl.placeholder((64, 32, 3, 3), dtype=weight_dtype,
                                   name="weight_conv3")  # 32 in, 64 out
    a_batchnorm3 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm3")
    b_batchnorm3 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm3")
    weight_conv4 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv4")  # 64 in, 64 out
    a_batchnorm4 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm4")
    b_batchnorm4 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm4")
    weight_conv5 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv5")  # 64 in, 64 out
    a_batchnorm5 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm5")
    b_batchnorm5 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm5")
    weight_conv6 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv6")  # 64 in, 64 out
    a_batchnorm6 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm6")
    b_batchnorm6 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm6")
    weight_conv7 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv7")  # 64 in, 64 out
    a_batchnorm7 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm7")
    b_batchnorm7 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm7")
    weight_conv8 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv8")  # 64 in, 64 out
    a_batchnorm8 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm8")
    b_batchnorm8 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm8")

    sm = hcl.create_scheme([
        input_image,
        weight_conv1, a_batchnorm1, b_batchnorm1,
        weight_conv2, a_batchnorm2, b_batchnorm2,
        weight_conv3, a_batchnorm3, b_batchnorm3,
        weight_conv4, a_batchnorm4, b_batchnorm4,
        weight_conv5, a_batchnorm5, b_batchnorm5,
        weight_conv6, a_batchnorm6, b_batchnorm6,
        weight_conv7, a_batchnorm7, b_batchnorm7,
        weight_conv8, a_batchnorm8, b_batchnorm8
    ], ultranet)

    # quantize activations
    sm.quantize(ultranet.conv1, conv_dtype)
    sm.quantize(ultranet.relu1, act_dtype)
    sm.quantize(ultranet.conv2, conv_dtype)
    sm.quantize(ultranet.relu2, act_dtype)
    sm.quantize(ultranet.conv3, conv_dtype)
    sm.quantize(ultranet.relu3, act_dtype)
    sm.quantize(ultranet.conv4, conv_dtype)
    sm.quantize(ultranet.relu4, act_dtype)
    sm.quantize(ultranet.conv5, conv_dtype)
    sm.quantize(ultranet.relu5, act_dtype)
    sm.quantize(ultranet.conv6, conv_dtype)
    sm.quantize(ultranet.relu6, act_dtype)
    sm.quantize(ultranet.conv7, conv_dtype)
    sm.quantize(ultranet.relu7, act_dtype)
    sm.quantize(ultranet.conv8, conv_dtype)
    sm.quantize(ultranet.relu8, act_dtype)

    s = hcl.create_schedule_from_scheme(sm, "main")
    return hcl.build(s, target=target)