def test_module_quantize_args():
    hcl.init()

    def algorithm(A, B):
        @hcl.def_([A.shape, B.shape, ()])
        def add(A, B, x):
            hcl.return_(A[x] + B[x])

        return hcl.compute(A.shape, lambda x: add(A, B, x), "C")

    A = hcl.placeholder((10,), dtype=hcl.UInt(2))
    B = hcl.placeholder((10,))

    s = hcl.create_scheme([A, B], algorithm)
    s.downsize([algorithm.add.A], hcl.UInt(2))
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s)

    a = np.random.randint(100, size=(10,))
    b = np.random.randint(100, size=(10,))
    c = np.zeros(10)
    _A = hcl.asarray(a, hcl.UInt(2))
    _B = hcl.asarray(b)
    _C = hcl.asarray(c)

    f(_A, _B, _C)

    _A = _A.asnumpy()
    _B = _B.asnumpy()
    _C = _C.asnumpy()
    # A is downsized to UInt(2), so its values wrap around modulo 4
    for i in range(0, 10):
        assert _C[i] == a[i] % 4 + b[i]
def test_uint_imm_ops():
    A = hcl.placeholder((10, 10), "A", dtype=hcl.UInt(1))

    def kernel(A):
        return hcl.compute((8, 8), lambda y, x: hcl.select(x < 4, A[y][x], 0), "B")

    s = hcl.create_scheme(A, kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "(unsigned int)0U)" in code
def test_binary_ops():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.Int(20))
    B = hcl.placeholder((8, 8), "B", dtype=hcl.Fixed(16, 12))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], B[y][x]),
                           "C", dtype=hcl.Int(8))

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "(ap_fixed<32, 20>)B" in code
def test_imm_ops():
    A = hcl.placeholder((10, 10), "A")

    def kernel(A):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x] + A[y+2][x+2], 0),
                           "B")

    s = hcl.create_scheme(A, kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "((ap_int<33>)0)" in code
    assert "((ap_int<33>)(((ap_int<33>)A" in code
def test_uint_int():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.Fixed(20, 12))
    B = hcl.placeholder((8, 8), "B", dtype=hcl.UFixed(16, 12))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], B[y][x]),
                           "C", dtype=hcl.Int(8))

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    code = hcl.build(s, target="vhls")
    assert "ap_ufixed<20, 8>)A" in code
def build_bnn_inf_opt(batch_size=batch_size, target=target):
    hcl_ph = []
    input_image = hcl.placeholder((batch_size, 1, 16, 16), "input_image",
                                  qtype_bit)
    for name in params:
        dtype = qtype_bit if ("conv" in name or "w_" in name) else qtype_float
        hcl_ph.append(hcl.placeholder(params[name].shape, name, dtype=dtype))

    # build the network
    scheme = hcl.create_scheme([input_image] + hcl_ph, build_bnn)
    s = hcl.create_schedule_from_scheme(scheme)

    def plot_dataflow_graph():
        import matplotlib.pyplot as plt
        import networkx as nx
        graph, op = s.dataflow_graph(plot=True)
        nx.draw(graph, with_labels=True)
        plt.savefig("bnn.png")

    # compute optimization
    layer_names = build_bnn.__dict__.keys()
    for layer in layer_names:
        s_layer = getattr(build_bnn, layer)
        if "bn" in layer:  # fuse conv
            s_conv = getattr(build_bnn, "conv" + layer[-1])
            s[s_conv].compute_at(s[s_layer], s_layer.axis[3])
            if layer == "bn1":
                s[s_layer].pipeline(s_layer.axis[3])  # will be refreshed
            else:
                s[s_conv].pipeline(s_conv.axis[4])
        elif "pool" in layer:
            s[s_layer].pipeline(s_layer.axis[2])
        elif "fc" in layer:
            s[s_layer].pipeline(s_layer.axis[1])
        elif "flatten" in layer:
            s[s_layer].pipeline(s_layer.axis[1])
        elif "dense_relu" in layer:
            s_fc = getattr(build_bnn, "fc1")
            s[s_fc].compute_at(s[s_layer], s_layer.axis[1])
            s[s_fc].pipeline(s_fc.axis[2])

    if isinstance(target, hcl.platform):
        s.to([input_image] + hcl_ph, target.xcel)
        s.to(build_bnn.fc2, target.host)
        target.config(compile="vivado_hls", mode="csyn")

    # memory optimization
    s.partition(input_image, hcl.Partition.Block, dim=1, factor=8)
    for ph in reversed(hcl_ph):
        if ph.name in ["b_fc2", "fc2"]:
            s.partition(ph, hcl.Partition.Complete, dim=1)
        else:
            s.partition(ph, hcl.Partition.Block, dim=1, factor=8)

    return hcl.build(s, target=target)
def test():
    hcl.init()
    A = hcl.placeholder((8, 8), "A")

    def kernel(A):
        return hcl.compute((8, 8), lambda y, x: foo(A[y, x] + A[y, x]), "C")

    s = hcl.create_scheme([A], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, "vhls")
    print(f)
def test1():
    A = hcl.placeholder((8, 8), "A")
    B = hcl.placeholder((8, 8), "B", dtype=hcl.Fixed(16, 12))

    def kernel(A, B):
        return hcl.compute((8, 8),
                           lambda y, x: hcl.select(x < 4, A[y][x], B[y][x]),
                           "C")

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, target="vhls")
    print(f)
def test2():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.UInt(1))

    def kernel(A):
        return hcl.compute((8, 8), lambda y, x: hcl.select(x < 4, A[y][x], 0), "B")

    s = hcl.create_scheme([A], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, target="vhls")
    with open("select_test.cpp", "w") as outfile:
        outfile.write(f)
def test3():
    A = hcl.placeholder((8, 8), "A", dtype=hcl.UInt(2))
    B = hcl.placeholder((8, 8), "B", dtype=hcl.UInt(2))

    def kernel(A, B):
        return hcl.compute((8, 8), lambda y, x: hcl.select(x < 4, A[y, x][0], 0), "C")

    s = hcl.create_scheme([A, B], kernel)
    s = hcl.create_schedule_from_scheme(s)
    f = hcl.build(s, "vhls")
    print(f)
def build_lenet_inf(batch_size=batch_size, target=None):
    # set up input/output placeholders
    input_image = hcl.placeholder((batch_size, 1, 28, 28), "input_image")
    weight_conv1 = hcl.placeholder((20, 1, 5, 5), "weight_conv1", qtype1)
    weight_conv2 = hcl.placeholder((50, 20, 5, 5), "weight_conv2", qtype1)
    weight_fc1 = hcl.placeholder((500, 800), "weight_fc1", qtype1)
    weight_fc2 = hcl.placeholder((10, 500), "weight_fc2", qtype1)
    lenet = hcl.placeholder((batch_size, 10), "lenet")

    # create a quantization scheme
    scheme = hcl.create_scheme([
        input_image, weight_conv1, weight_conv2,
        weight_fc1, weight_fc2, lenet
    ], build_lenet)
    # quantize the three activation layers
    scheme.quantize(
        [build_lenet.tanh1, build_lenet.tanh2, build_lenet.tanh3], qtype2)
    s = hcl.create_schedule_from_scheme(scheme)
    return hcl.build(s, target=target)
def build_bnn_inf(batch_size=batch_size, target=target):
    hcl_ph = []
    input_image = hcl.placeholder((batch_size, 1, 16, 16), "input_image",
                                  qtype_bit)
    for name in params:
        dtype = qtype_bit if ("conv" in name or "w_" in name) else qtype_float
        hcl_ph.append(hcl.placeholder(params[name].shape, name, dtype=dtype))

    # build the network
    scheme = hcl.create_scheme([input_image] + hcl_ph, build_bnn)
    s = hcl.create_schedule_from_scheme(scheme)
    # if isinstance(target, hcl.platform):
    #     s.to([input_image] + hcl_ph, target.xcel)
    #     s.to(build_bnn.fc2, target.host)
    #     target.config(compile="vivado_hls", mode="csyn")
    return hcl.build(s, target=target)
def test_resize():
    def algorithm(A):
        return hcl.compute(A.shape, lambda x: A[x] + 1, "B")

    A = hcl.placeholder((10,), dtype=hcl.UInt(32))

    scheme = hcl.create_scheme([A], algorithm)
    scheme.downsize(algorithm.B, hcl.UInt(2))
    s = hcl.create_schedule_from_scheme(scheme)
    f = hcl.build(s)

    a = np.random.randint(100, size=(10,))
    _A = hcl.asarray(a, dtype=hcl.UInt(32))
    _B = hcl.asarray(np.zeros(10), dtype=hcl.UInt(2))

    f(_A, _B)

    _A = _A.asnumpy()
    _B = _B.asnumpy()
    # B is downsized to UInt(2), so the results wrap around modulo 4
    for i in range(10):
        assert _B[i] == (a[i] + 1) % 4
def build_ultranet_hls(batch_size=batch_size, target=None):
    # set up input/output placeholders
    input_image = hcl.placeholder((batch_size, 3, 160, 320),
                                  dtype=input_dtype, name="input_image")
    weight_conv1 = hcl.placeholder((16, 3, 3, 3), dtype=weight_dtype,
                                   name="weight_conv1")  # 3 in, 16 out
    a_batchnorm1 = hcl.placeholder((16,), dtype=bn_a_dtype, name="a_batchnorm1")
    b_batchnorm1 = hcl.placeholder((16,), dtype=bn_b_dtype, name="b_batchnorm1")
    weight_conv2 = hcl.placeholder((32, 16, 3, 3), dtype=weight_dtype,
                                   name="weight_conv2")  # 16 in, 32 out
    a_batchnorm2 = hcl.placeholder((32,), dtype=bn_a_dtype, name="a_batchnorm2")
    b_batchnorm2 = hcl.placeholder((32,), dtype=bn_b_dtype, name="b_batchnorm2")
    weight_conv3 = hcl.placeholder((64, 32, 3, 3), dtype=weight_dtype,
                                   name="weight_conv3")  # 32 in, 64 out
    a_batchnorm3 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm3")
    b_batchnorm3 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm3")
    weight_conv4 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv4")  # 64 in, 64 out
    a_batchnorm4 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm4")
    b_batchnorm4 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm4")
    weight_conv5 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv5")  # 64 in, 64 out
    a_batchnorm5 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm5")
    b_batchnorm5 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm5")
    weight_conv6 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv6")  # 64 in, 64 out
    a_batchnorm6 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm6")
    b_batchnorm6 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm6")
    weight_conv7 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv7")  # 64 in, 64 out
    a_batchnorm7 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm7")
    b_batchnorm7 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm7")
    weight_conv8 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv8")  # 64 in, 64 out
    a_batchnorm8 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm8")
    b_batchnorm8 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm8")

    sm = hcl.create_scheme([
        input_image, weight_conv1, a_batchnorm1, b_batchnorm1, weight_conv2,
        a_batchnorm2, b_batchnorm2, weight_conv3, a_batchnorm3, b_batchnorm3,
        weight_conv4, a_batchnorm4, b_batchnorm4, weight_conv5, a_batchnorm5,
        b_batchnorm5, weight_conv6, a_batchnorm6, b_batchnorm6, weight_conv7,
        a_batchnorm7, b_batchnorm7, weight_conv8, a_batchnorm8, b_batchnorm8
    ], ultranet)

    # quantize activations
    sm.quantize(ultranet.conv1, conv_dtype)
    sm.quantize(ultranet.relu1, act_dtype)
    sm.quantize(ultranet.conv2, conv_dtype)
    sm.quantize(ultranet.relu2, act_dtype)
    sm.quantize(ultranet.conv3, conv_dtype)
    sm.quantize(ultranet.relu3, act_dtype)
    sm.quantize(ultranet.conv4, conv_dtype)
    sm.quantize(ultranet.relu4, act_dtype)
    sm.quantize(ultranet.conv5, conv_dtype)
    sm.quantize(ultranet.relu5, act_dtype)
    sm.quantize(ultranet.conv6, conv_dtype)
    sm.quantize(ultranet.relu6, act_dtype)
    sm.quantize(ultranet.conv7, conv_dtype)
    sm.quantize(ultranet.relu7, act_dtype)
    sm.quantize(ultranet.conv8, conv_dtype)
    sm.quantize(ultranet.relu8, act_dtype)

    s = hcl.create_schedule_from_scheme(sm, "main")

    # create line-buffer and window-buffer for conv layers
    for i in range(1, 1 + 8):
        conv_pad = getattr(ultranet, 'conv' + str(i) + '_pad')
        conv = getattr(ultranet, 'conv' + str(i))
        LB = s.reuse_at(conv_pad._op, s[conv], conv.axis[2],
                        f"conv{i}_line_buffer")
        WB = s.reuse_at(LB, s[conv], conv.axis[3], f"conv{i}_window_buffer")

    # conv3 = ultranet.conv3
    # xo, yo, xi, yi = s[conv3].tile(conv3.axis[2], conv3.axis[3], 4, 4)
    # s[conv3].reorder(yo, xo, yi, xi)
    # print(hcl.lower(s))

    if opt:
        # merge conv + bn + relu operators
        for i in range(1, 1 + 8):
            pad = getattr(ultranet, 'conv' + str(i) + '_pad')
            conv = getattr(ultranet, 'conv' + str(i))
            bn = getattr(ultranet, 'batch_norm' + str(i))
            relu = getattr(ultranet, 'relu' + str(i))
            # Can't merge pad with conv, a limitation of HCL.
            # s[pad].compute_at(s[conv], conv.axis[3])
            s[bn].compute_at(s[relu], relu.axis[3])
        res = ultranet.result
        relu8 = ultranet.relu8
        s[relu8].compute_at(s[res], res.axis[3])

        # pipeline all layers
        for i in range(1, 1 + 8):
            pad = getattr(ultranet, 'conv' + str(i) + '_pad')
            conv = getattr(ultranet, 'conv' + str(i))
            bn_relu = getattr(ultranet, 'relu' + str(i))
            s[pad].pipeline(pad.axis[3])
            # s[conv].pipeline(conv.axis[4])
            s[conv].pipeline(conv.axis[3])
            s[bn_relu].pipeline(bn_relu.axis[3])
            if i <= 4:
                pool_pad = getattr(ultranet, 'pool' + str(i) + '_pad')
                pool = getattr(ultranet, 'pool' + str(i))
                s[pool_pad].pipeline(pool_pad.axis[3])
                s[pool].pipeline(pool.axis[3])
        s[ultranet.result].pipeline(ultranet.result.axis[3])

    # partition weight buffers
    if partition:
        # weights need to be partitioned in dim 2, 3, 4;
        # for now HeteroCL doesn't support multi-dimensional partition
        s.partition(weight_conv1, dim=2)
        s.partition(weight_conv2, dim=2)
        s.partition(weight_conv3, dim=2)
        s.partition(weight_conv4, dim=2)
        s.partition(weight_conv5, dim=2)
        s.partition(weight_conv6, dim=2)
        s.partition(weight_conv7, dim=2)
        s.partition(weight_conv8, dim=2)

    # fifo across layers
    if stream:
        '''
        Note: the padding layers' pipelining has to precede the other
        layers' because of a bug in HeteroCL: when an ifThenElse statement
        has both branches read/write the same buffer, HeteroCL thinks the
        buffer is accessed twice, which prevents pipelining. For now, ?:
        works, but if..else.. doesn't, because the latter has two
        load/store nodes.
        '''
        s.to(ultranet.conv1_pad, s[ultranet.conv1], fifo_depth=128)
        s.to(ultranet.conv2_pad, s[ultranet.conv2], fifo_depth=128)
        s.to(ultranet.conv3_pad, s[ultranet.conv3], fifo_depth=128)
        s.to(ultranet.conv4_pad, s[ultranet.conv4], fifo_depth=128)
        s.to(ultranet.conv5_pad, s[ultranet.conv5], fifo_depth=128)
        s.to(ultranet.conv6_pad, s[ultranet.conv6], fifo_depth=128)
        s.to(ultranet.conv7_pad, s[ultranet.conv7], fifo_depth=128)
        s.to(ultranet.conv8_pad, s[ultranet.conv8], fifo_depth=128)

        s.to(ultranet.conv1, s[ultranet.relu1], fifo_depth=128)
        s.to(ultranet.relu1, s[ultranet.pool1_pad], fifo_depth=128)
        s.to(ultranet.pool1_pad, s[ultranet.pool1], fifo_depth=128)
        s.to(ultranet.pool1, s[ultranet.conv2_pad], fifo_depth=128)
        s.to(ultranet.conv2, s[ultranet.relu2], fifo_depth=128)
        s.to(ultranet.relu2, s[ultranet.pool2_pad], fifo_depth=128)
        s.to(ultranet.pool2_pad, s[ultranet.pool2], fifo_depth=128)
        s.to(ultranet.pool2, s[ultranet.conv3_pad], fifo_depth=128)
        s.to(ultranet.conv3, s[ultranet.relu3], fifo_depth=128)
        s.to(ultranet.relu3, s[ultranet.pool3_pad], fifo_depth=128)
        s.to(ultranet.pool3_pad, s[ultranet.pool3], fifo_depth=128)
        s.to(ultranet.pool3, s[ultranet.conv4_pad], fifo_depth=128)
        s.to(ultranet.conv4, s[ultranet.relu4], fifo_depth=128)
        s.to(ultranet.relu4, s[ultranet.pool4_pad], fifo_depth=128)
        s.to(ultranet.pool4_pad, s[ultranet.pool4], fifo_depth=128)
        s.to(ultranet.pool4, s[ultranet.conv5_pad], fifo_depth=128)
        s.to(ultranet.conv5, s[ultranet.relu5], fifo_depth=128)
        s.to(ultranet.relu5, s[ultranet.conv6_pad], fifo_depth=128)
        s.to(ultranet.conv6, s[ultranet.relu6], fifo_depth=128)
        s.to(ultranet.relu6, s[ultranet.conv7_pad], fifo_depth=128)
        s.to(ultranet.conv7, s[ultranet.relu7], fifo_depth=128)
        s.to(ultranet.relu7, s[ultranet.conv8_pad], fifo_depth=128)
        s.to(ultranet.conv8, s[ultranet.result], fifo_depth=128)

    return hcl.build(s, name="main", target=target)
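# The note inside `if stream:` says pipelining works with the ?: form but not
# with if..else.. on the same buffer. A minimal, hypothetical sketch of the
# working form (standalone; `_select_vs_ifelse_sketch` is not part of the
# ultranet build, and the exact IR behavior is version-dependent):
def _select_vs_ifelse_sketch():
    hcl.init()
    A = hcl.placeholder((8,), "A")

    def kernel(A):
        # hcl.select lowers to a single conditional expression (?:), so
        # stage B has only one store node and its loop can be pipelined
        return hcl.compute(A.shape, lambda x: hcl.select(x < 4, A[x], 0), "B")

    s = hcl.create_schedule([A], kernel)
    s[kernel.B].pipeline(kernel.B.axis[0])
    # the imperative hcl.if_/hcl.else_ equivalent would store into B from
    # two branches (two load/store nodes) and trip the pipelining bug above
    return hcl.build(s, target="vhls")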
def top(target=None):
    # Algorithm definition (§1)
    def knn(test_image, train_images):

        # Imperative programming and bit operations (§2)
        def popcount(num):
            out = hcl.scalar(0, "out")
            with hcl.for_(0, train_images.type.bits) as i:
                # Bit selection operation
                out.v += num[i]
            return out.v

        # This function updates the candidates, i.e., `knn_mat`. Here we
        # mutate through the shape of tensor `dist`. For each `dist` value,
        # if it is smaller than the maximum candidate, we replace it.
        def update_knn(dist, knn_mat, i, j):
            max_id = hcl.scalar(0, "max_id")
            with hcl.for_(0, 3) as k:
                with hcl.if_(knn_mat[i][k] > knn_mat[i][max_id.v]):
                    max_id.v = k
            with hcl.if_(dist[i][j] < knn_mat[i][max_id.v]):
                knn_mat[i][max_id.v] = dist[i][j]

        # Main algorithm (§3)
        # First step: XOR (§3.1)
        diff = hcl.compute(train_images.shape,
                           lambda x, y: train_images[x][y] ^ test_image,
                           "diff")
        # Second step: popcount (§3.2)
        dist = hcl.compute(diff.shape, lambda x, y: popcount(diff[x][y]),
                           "dist")
        # Third step: initialize the candidates (§3.3)
        knn_mat_buf = hcl.compute((10, 4), lambda x, y: 50, "knn_mat_buf")
        # Fourth step: update the candidates (§3.4)
        hcl.mutate(dist.shape,
                   lambda x, y: update_knn(dist, knn_mat_buf, x, y),
                   "knn_update")
        knn_mat = hcl.compute((10, 3), lambda x, y: knn_mat_buf[x][y],
                              "knn_mat")
        # Final step: return the candidates (§3.5)
        return knn_mat

    # Inputs/Outputs definition (§4)
    # Scalars (§4.1)
    test_image = hcl.placeholder((), "test_image")
    # Tensors (§4.2)
    train_images = hcl.placeholder(data_size, "train_images")

    # Data type customization (§5.1)
    scheme = hcl.create_scheme([test_image, train_images], knn)
    scheme.downsize([knn.dist, knn.dist.out, knn.knn_mat_buf, knn.knn_mat],
                    dtype_knnmat)

    # Compute customization (§5.2)
    s = hcl.create_schedule_from_scheme(scheme)
    diff = knn.diff
    dist = knn.dist
    knn_mat_buf = knn.knn_mat_buf
    knn_update = knn.knn_update

    # Merge loop nests
    s[diff].compute_at(s[dist], dist.axis[1])
    s[dist].compute_at(s[knn_update], knn_update.axis[1])
    # Reorder loops to expose more parallelism
    s[knn_update].reorder(knn_update.axis[1], knn_update.axis[0])
    # Parallelize the initialization of the knn mat
    s[knn_mat_buf].parallel(knn_mat_buf.axis[0])
    # Parallelize the outer loop and pipeline the inner loop
    s[knn_update].parallel(knn_update.axis[0])
    s[knn_update].pipeline(knn_update.axis[1])
    # Parallelize the innermost loop over the 49 pixels
    s[dist].parallel(dist.axis[2])

    # At the end, we build the whole offloaded function.
    return hcl.build(s, target=target)
def top(target=None):
    def smith_waterman(seqA, seqB, consA, consB):

        def similarity_score(a, b):
            return hcl.select(a == b, 1, penalty)

        def find_max(A, len_):
            max_ = hcl.local(A[0], "max")
            act_ = hcl.local(0, "act")
            with hcl.for_(0, len_) as i:
                with hcl.if_(A[i] > max_[0]):
                    max_[0] = A[i]
                    act_[0] = i
            return max_[0], act_[0]

        matrix_max = hcl.local(0, "matrix_max")
        i_max = hcl.local(0, "i_max")
        j_max = hcl.local(0, "j_max")

        matrix = hcl.compute((lenA + 1, lenB + 1), lambda x, y: 0, "matrix")
        action = hcl.compute(matrix.shape, lambda x, y: 3, "action")

        def populate_matrix(i, j):
            trace_back = hcl.compute((4,), lambda x: 0, "trace_back")
            with hcl.if_(hcl.and_(i != 0, j != 0)):
                trace_back[0] = matrix[i - 1, j - 1] + \
                    similarity_score(seqA[i - 1], seqB[j - 1])
                trace_back[1] = matrix[i - 1, j] + penalty
                trace_back[2] = matrix[i, j - 1] + penalty
                trace_back[3] = 0
                matrix[i, j], action[i, j] = find_max(trace_back, 4)
                with hcl.if_(matrix[i, j] > matrix_max[0]):
                    matrix_max[0] = matrix[i, j]
                    i_max[0] = i
                    j_max[0] = j

        P = hcl.mutate((lenA + 1, lenB + 1),
                       lambda i, j: populate_matrix(i, j))

        def align(curr_i, curr_j, next_i, next_j):
            outA = hcl.local(0, "a")
            outB = hcl.local(0, "b")
            with hcl.if_(next_i[0] == curr_i[0]):
                outA[0] = 0
            with hcl.else_():
                outA[0] = seqA[curr_i[0] - 1]
            with hcl.if_(next_j[0] == curr_j[0]):
                outB[0] = 0
            with hcl.else_():
                outB[0] = seqB[curr_j[0] - 1]
            return outA[0], outB[0]

        def get_next(action, i, j):
            act_ = hcl.local(action[i][j], "act")
            next_i = hcl.local(0, "next_i")
            next_j = hcl.local(0, "next_j")
            with hcl.if_(act_[0] == 0):
                next_i[0] = i - 1
                next_j[0] = j - 1
            with hcl.elif_(act_[0] == 1):
                next_i[0] = i - 1
                next_j[0] = j
            with hcl.elif_(act_[0] == 2):
                next_i[0] = i
                next_j[0] = j - 1
            with hcl.else_():
                next_i[0] = i
                next_j[0] = j
            return next_i[0], next_j[0]

        with hcl.Stage("T"):
            curr_i = hcl.local(i_max[0], "curr_i")
            curr_j = hcl.local(j_max[0], "curr_j")
            next_i = hcl.local(0, "next_i")
            next_j = hcl.local(0, "next_j")
            next_i[0], next_j[0] = get_next(action, curr_i[0], curr_j[0])
            tick = hcl.local(0, "tick")
            with hcl.while_(
                    hcl.or_(curr_i[0] != next_i[0], curr_j[0] != next_j[0])):
                consA[tick[0]], consB[tick[0]] = \
                    align(curr_i, curr_j, next_i, next_j)
                curr_i[0], curr_j[0] = next_i[0], next_j[0]
                next_i[0], next_j[0] = get_next(action, curr_i[0], curr_j[0])
                tick[0] += 1

    def batch_sw(seqAs, seqBs, outAs, outBs):
        hcl.mutate(
            (num,),
            lambda t: smith_waterman(seqAs[t], seqBs[t], outAs[t], outBs[t]),
            "B")

    seqAs = hcl.placeholder((num, lenA), "seqAs", dtype)
    seqBs = hcl.placeholder((num, lenB), "seqBs", dtype)
    outAs = hcl.placeholder((num, lenA + lenB), "outAs", dtype)
    outBs = hcl.placeholder((num, lenA + lenB), "outBs", dtype)
    # seqAs = hcl.placeholder((num, lenA), "seqAs")
    # seqBs = hcl.placeholder((num, lenB), "seqBs")
    # outAs = hcl.placeholder((num, lenA+lenB), "outAs")
    # outBs = hcl.placeholder((num, lenA+lenB), "outBs")

    scheme = hcl.create_scheme([seqAs, seqBs, outAs, outBs], batch_sw)
    scheme.downsize([batch_sw.B.matrix, batch_sw.B.action], mtype)
    s = hcl.create_schedule_from_scheme(scheme)
    o, p = s[batch_sw.B].split(batch_sw.B.axis[0], factor=32)
    s[batch_sw.B].pipeline(o)
    # s[batch_sw.B].parallel(p)
    s[batch_sw.B].unroll(p)
    return hcl.build(s, target=target)
assert np.array_equal(hcl_D.asnumpy(), m)

##############################################################################
# Data Type Customization for Modules
# -----------------------------------
# We can also apply data type customization to our defined modules. There are
# two ways to do that. First, you can specify the data types directly in the
# module decorator. Second, you can use the ``quantize`` and ``downsize``
# APIs. Let's show how we can downsize the first example.

A = hcl.placeholder((10,), dtype=hcl.UInt(4))
B = hcl.placeholder((10,), dtype=hcl.UInt(4))
C = hcl.placeholder((10,), dtype=hcl.UInt(4))
D = hcl.placeholder((10,), dtype=hcl.UInt(4))
s = hcl.create_scheme([A, B, C, D], maximum)
# Downsize the input arguments and also the return value
s.downsize([maximum.find_max.A, maximum.find_max.B, maximum.find_max],
           hcl.UInt(4))
# We also need to downsize the intermediate results
s.downsize([maximum.max_1, maximum.max_2], hcl.UInt(4))
s = hcl.create_schedule_from_scheme(s)
f = hcl.build(s)

##############################################################################
# Let's run it.

hcl_A = hcl.asarray(a, hcl.UInt(4))
hcl_B = hcl.asarray(b, hcl.UInt(4))
hcl_C = hcl.asarray(c, hcl.UInt(4))
hcl_D = hcl.asarray(d, hcl.UInt(4))
hcl_O = hcl.asarray(o)
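##############################################################################
# The first approach mentioned above -- giving the types in the module
# decorator itself -- would look roughly like the following sketch. This is
# an illustration only: the module ``add4`` is hypothetical and not part of
# the example above, and it assumes ``hcl.def_`` accepts ``dtypes`` and
# ``ret_dtype`` keyword arguments.

def algo(E, F):
    @hcl.def_([E.shape, F.shape, ()],
              dtypes=[hcl.UInt(4), hcl.UInt(4), hcl.Int(32)],
              ret_dtype=hcl.UInt(4))
    def add4(E, F, x):
        hcl.return_(E[x] + F[x])
    # the argument and return types are fixed by the decorator,
    # so no separate downsize call is needed
    return hcl.compute(E.shape, lambda x: add4(E, F, x), "G",
                       dtype=hcl.UInt(4))

E = hcl.placeholder((10,), dtype=hcl.UInt(4))
F = hcl.placeholder((10,), dtype=hcl.UInt(4))
fg = hcl.build(hcl.create_schedule([E, F], algo))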
**Author**: Yi-Hsiang Lai (seanlatias@github)

HeteroCL supports multiple back ends. Currently, we provide both CPU and
FPGA flows, and we will be extending to other back ends, including ASICs and
PIMs (processing in memory). To target a different back end, simply set the
``target`` argument of the ``hcl.build`` API. In this tutorial, we
demonstrate how to target different back ends in HeteroCL. The same program
and schedule are used throughout the entire tutorial.
"""
import heterocl as hcl
import numpy as np

A = hcl.placeholder((10, 10), "A")

def kernel(A):
    return hcl.compute((8, 8), lambda y, x: A[y][x] + A[y+2][x+2], "B")

s = hcl.create_scheme(A, kernel)
s.downsize(kernel.B, hcl.UInt(4))
s = hcl.create_schedule_from_scheme(s)
s.partition(A)
s[kernel.B].pipeline(kernel.B.axis[1])

##############################################################################
# CPU
# ---
# CPU is the default back end of a HeteroCL program. To be explicit, set the
# ``target`` to ``llvm``. Note that some customization primitives are ignored
# by the CPU back end. For instance, ``partition`` and ``pipeline`` have no
# effect; we can use ``parallel`` instead.

f = hcl.build(s)  # equivalent to hcl.build(s, target="llvm")

##############################################################################
# We can execute the returned function as we demonstrated in other tutorials.
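##############################################################################
# For example, a minimal run on random inputs (a sketch; the output buffer
# uses ``UInt(4)`` because ``B`` was downsized above):

hcl_A = hcl.asarray(np.random.randint(0, 10, size=(10, 10)))
hcl_B = hcl.asarray(np.zeros((8, 8)), dtype=hcl.UInt(4))
f(hcl_A, hcl_B)
print(hcl_B.asnumpy())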
params = arg_params.copy()
params.update(aux_params)
for name in names:
    val = params[name].asnumpy()
    ph = hcl.placeholder(val.shape, name)
    holders.append(ph)
    values.append(hcl.asarray(val, dtype=hcl.Float()))

# build the function
input_image = hcl.placeholder((batch_size, 3, 224, 224), "input_image")
resnet = hcl.placeholder((batch_size, 1000), "resnet")

# create scheme and build
arg_list = [input_image, resnet] + holders
scheme = hcl.create_scheme(arg_list, build_resnet)

# -----------------------------------
# create fixed-point scheme
# -----------------------------------
from uptune import autotune, feedback

take_log, type_log = list(), list()
for index in range(len(name_pool)):
    primitive = eval('build_resnet.' + name_pool[index])
    taken = autotune(1, (0, 1))
    fraction = autotune(18, (0, 16))
    bitwidth = 32
    datatype = hcl.Fixed(bitwidth, fraction)
    take_log.append(taken)
    type_log.append(datatype)
    if taken:
"dist") knn_mat = hcl.compute((10, 3), lambda x, y: 50, "knn_mat") hcl.mutate(dist.shape, lambda x, y: update_knn(dist, knn_mat, x, y), "knn_update") hcl.mutate((10, 3), lambda x, y: sort_knn(knn_mat, x, y), "sort") knn_new = hcl.compute(knn_mat.shape, lambda x, y: knn_mat[x][y], "copy") knn_pred = hcl.compute((10,), lambda x: knn_vote(knn_mat, x), "vote") return knn_pred test_image = hcl.placeholder(test_size, "test_image", dtype_image) train_images = hcl.placeholder(data_size, "train_images", dtype_image) scheme = hcl.create_scheme([test_image, train_images], knn) scheme.downsize([knn.dist, knn.dist.out, knn.knn_mat], dtype_knnmat) s = hcl.create_schedule_from_scheme(scheme) diff = knn.diff dist = knn.dist vote = knn.copy knn_update = knn.knn_update s.to([test_images, train_images], target.xcel) s.to(vote, target.host) # merge loop nests s[diff].compute_at(s[dist], dist.axis[1]) s[dist].compute_at(s[knn_update], knn_update.axis[1])
def build_ultranet_inf(batch_size=batch_size, target=None):
    # set up input/output placeholders
    input_image = hcl.placeholder((batch_size, 3, 160, 320),
                                  dtype=input_dtype, name="input_image")
    weight_conv1 = hcl.placeholder((16, 3, 3, 3), dtype=weight_dtype,
                                   name="weight_conv1")  # 3 in, 16 out
    a_batchnorm1 = hcl.placeholder((16,), dtype=bn_a_dtype, name="a_batchnorm1")
    b_batchnorm1 = hcl.placeholder((16,), dtype=bn_b_dtype, name="b_batchnorm1")
    weight_conv2 = hcl.placeholder((32, 16, 3, 3), dtype=weight_dtype,
                                   name="weight_conv2")  # 16 in, 32 out
    a_batchnorm2 = hcl.placeholder((32,), dtype=bn_a_dtype, name="a_batchnorm2")
    b_batchnorm2 = hcl.placeholder((32,), dtype=bn_b_dtype, name="b_batchnorm2")
    weight_conv3 = hcl.placeholder((64, 32, 3, 3), dtype=weight_dtype,
                                   name="weight_conv3")  # 32 in, 64 out
    a_batchnorm3 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm3")
    b_batchnorm3 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm3")
    weight_conv4 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv4")  # 64 in, 64 out
    a_batchnorm4 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm4")
    b_batchnorm4 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm4")
    weight_conv5 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv5")  # 64 in, 64 out
    a_batchnorm5 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm5")
    b_batchnorm5 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm5")
    weight_conv6 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv6")  # 64 in, 64 out
    a_batchnorm6 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm6")
    b_batchnorm6 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm6")
    weight_conv7 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv7")  # 64 in, 64 out
    a_batchnorm7 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm7")
    b_batchnorm7 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm7")
    weight_conv8 = hcl.placeholder((64, 64, 3, 3), dtype=weight_dtype,
                                   name="weight_conv8")  # 64 in, 64 out
    a_batchnorm8 = hcl.placeholder((64,), dtype=bn_a_dtype, name="a_batchnorm8")
    b_batchnorm8 = hcl.placeholder((64,), dtype=bn_b_dtype, name="b_batchnorm8")

    sm = hcl.create_scheme(
        [input_image, weight_conv1, a_batchnorm1, b_batchnorm1,
         weight_conv2, a_batchnorm2, b_batchnorm2,
         weight_conv3, a_batchnorm3, b_batchnorm3,
         weight_conv4, a_batchnorm4, b_batchnorm4,
         weight_conv5, a_batchnorm5, b_batchnorm5,
         weight_conv6, a_batchnorm6, b_batchnorm6,
         weight_conv7, a_batchnorm7, b_batchnorm7,
         weight_conv8, a_batchnorm8, b_batchnorm8],
        ultranet
    )

    # quantize activations
    sm.quantize(ultranet.conv1, conv_dtype)
    sm.quantize(ultranet.relu1, act_dtype)
    sm.quantize(ultranet.conv2, conv_dtype)
    sm.quantize(ultranet.relu2, act_dtype)
    sm.quantize(ultranet.conv3, conv_dtype)
    sm.quantize(ultranet.relu3, act_dtype)
    sm.quantize(ultranet.conv4, conv_dtype)
    sm.quantize(ultranet.relu4, act_dtype)
    sm.quantize(ultranet.conv5, conv_dtype)
    sm.quantize(ultranet.relu5, act_dtype)
    sm.quantize(ultranet.conv6, conv_dtype)
    sm.quantize(ultranet.relu6, act_dtype)
    sm.quantize(ultranet.conv7, conv_dtype)
    sm.quantize(ultranet.relu7, act_dtype)
    sm.quantize(ultranet.conv8, conv_dtype)
    sm.quantize(ultranet.relu8, act_dtype)

    s = hcl.create_schedule_from_scheme(sm, "main")
    return hcl.build(s, target=target)
print("Loaded {} images".format(num_images)) params = np.load("data/bnn-sdsoc.params.npz") # declare hcl placeholders hcl_array = [] hcl_ph = [] input_image = hcl.placeholder((batch_size, 1, 16, 16), "input_image", qtype_int) for name in params: dtype = qtype_bit if ("conv" in name or "w_" in name) else qtype_float hcl_array.append(hcl.asarray(params[name], dtype=dtype)) hcl_ph.append(hcl.placeholder(params[name].shape, name, dtype=dtype)) hcl_out = hcl.asarray(np.zeros((batch_size, 10)).astype(np.float), dtype=qtype_float) # build the network scheme = hcl.create_scheme([input_image] + hcl_ph, build_bnn) s = hcl.create_schedule_from_scheme(scheme) f = hcl.build(s, target=target) correct_sum = 0 for i in range(num_images // batch_size): np_image = images[i * batch_size:(i + 1) * batch_size] hcl_image = hcl.asarray(np_image, dtype=qtype_int) f(hcl_image, *hcl_array, hcl_out) prediction = np.argmax(hcl_out.asnumpy(), axis=1) correct_sum += np.sum( np.equal(prediction, labels[i * batch_size:(i + 1) * batch_size])) if (i + 1) % 10 == 0: print("Done {} batches.".format(i + 1)) print("Testing accuracy: {}".format(correct_sum / float(num_images)))
import os, sys
tempdir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, tempdir + "/heterocl")
import heterocl as hcl

hcl.init()

A = hcl.placeholder((10,))
B = hcl.placeholder((10,))

def quantization(A):
    return hcl.compute(A.shape, lambda x: hcl.tanh(A[x]), "B")

sm = hcl.create_scheme([A], quantization)
sm_B = quantization.B