def pipeline_resource01(xs, ys):
    """Store the halved sum of each adjacent pair of ``xs`` into ``ys``.

    For ``idx`` in 0..3, ``ys[idx]`` receives ``(xs[idx] + xs[idx + 1]) >> 1``,
    so ``xs`` is read at indices 0..4 and ``ys`` is written at indices 0..3.
    The loop is placed under ``rule(scheduling='pipeline')``.
    """
    with rule(scheduling='pipeline'):
        for idx in range(4):
            # Two reads of the same list in one iteration.
            left = xs[idx]
            right = xs[idx + 1]
            pair_sum = left + right
            ys[idx] = pair_sum >> 1
def pipeline_resource02(xs, ys):
    """Copy ``xs[0..3]`` into ``ys`` and also store each value doubled one slot ahead.

    Every iteration writes ``ys[idx]`` and then ``ys[idx + 1]``; the next
    iteration overwrites the slot the previous one filled with the shifted
    value, so ``ys`` is written at indices 0..4.
    The loop is placed under ``rule(scheduling='pipeline')``.
    """
    with rule(scheduling='pipeline'):
        for idx in range(4):
            value = xs[idx]
            # Two writes to the same list in one iteration, in this order.
            ys[idx] = value
            ys[idx + 1] = value << 1
def _sha256(msg, _h, w):
    """Fold one 16-word block into the eight chaining words ``_h`` in place.

    ``msg`` supplies the 16 input words and ``w`` is a 64-entry scratch list
    that is overwritten with the expanded schedule. The helper ``rotr`` and
    the constant table ``_k`` are taken from the enclosing module.
    """
    # The first 16 schedule words are a straight copy of the block.
    with rule(unroll='full'):
        for t in range(16):
            w[t] = msg[t]

    # Extend the schedule to 64 words.
    for t in range(16, 64):
        prev15 = w[t - 15]
        small0 = rotr(prev15, 7) ^ rotr(prev15, 18) ^ (prev15 >> 3)
        prev2 = w[t - 2]
        small1 = rotr(prev2, 17) ^ rotr(prev2, 19) ^ (prev2 >> 10)
        w[t] = (w[t - 16] + small0 + w[t - 7] + small1) & 0xFFFFFFFF

    # Working variables start out as the current chaining value.
    a = _h[0]
    b = _h[1]
    c = _h[2]
    d = _h[3]
    e = _h[4]
    f = _h[5]
    g = _h[6]
    h = _h[7]

    for t in range(64):
        big0 = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)
        majority = (a & b) ^ (a & c) ^ (b & c)
        temp2 = big0 + majority
        big1 = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)
        choose = (e & f) ^ ((~e) & g)
        temp1 = h + big1 + choose + _k[t] + w[t]

        # Rotate the working variables; the order matters because each
        # assignment consumes the value the next one is about to replace.
        h = g
        g = f
        f = e
        e = (d + temp1) & 0xFFFFFFFF
        d = c
        c = b
        b = a
        a = (temp1 + temp2) & 0xFFFFFFFF

    # Add the round results back into the chaining value, modulo 2**32.
    _lst = [a, b, c, d, e, f, g, h]
    with rule(unroll='full'):
        for n in range(8):
            _h[n] = (_h[n] + _lst[n]) & 0xFFFFFFFF
def test(p04):
    """Write 2, 3 and 4 to the inputs of ``p04`` and expect 4, 9 and 16 back."""
    # The three input writes are issued under the 'parallel' scheduling rule.
    with rule(scheduling='parallel'):
        p04.i1.wr(2)
        p04.i2.wr(3)
        p04.i3.wr(4)

    clkfence()

    # Each output is read and checked before the next one is read.
    first = p04.o1()
    assert first == 4
    second = p04.o2()
    assert second == 9
    third = p04.o3()
    assert third == 16
def main(self):
    """Square the values read from ``i1``..``i3`` onto ``o1``..``o3``.

    Repeats for as long as ``is_worker_running()`` is true. All three reads
    and all three writes of one round sit under
    ``rule(scheduling='parallel')``.
    """
    while is_worker_running():
        with rule(scheduling='parallel'):
            # Read every input first, then drive every output.
            in1 = self.i1.rd()
            in2 = self.i2.rd()
            in3 = self.i3.rd()
            self.o1.wr(in1 * in1)
            self.o2.wr(in2 * in2)
            self.o3.wr(in3 * in3)
def sha256(msg, h):
    """Hash the 16-word block ``msg`` plus one fixed padding block into ``h``.

    ``h`` is first loaded from the module-level initial value ``_h`` and then
    updated in place by two ``_sha256`` calls: one for ``msg`` and one for a
    padding block whose first word is 0x80000000 and whose last word is
    0x200.
    """
    # Seed the caller's state from the module-level initial hash words.
    with rule(unroll='full'):
        for idx in range(len(_h)):
            h[idx] = _h[idx]

    # One scratch schedule is shared by both compression calls.
    schedule = [None] * 64
    _sha256(msg, h, schedule)

    padding = [0] * 16
    padding[0] = 0x80000000
    padding[15] = 0x200
    _sha256(padding, h, schedule)
def test(m):
    """Stream one padded message into ``m`` and print the digest it returns.

    The message is 16 words of 0x61616161. The block count is written to
    ``m.data_in`` first, then every full 16-word block, then a final block
    holding any remaining message words, the 0x80000000 end marker and a
    length word in slot 15. One value is then read from ``m.data_out``.

    The final block is built from ``msg`` with the marker placed directly
    after the remaining words. For the 16-word message used here nothing
    remains, so the marker lands in slot 0.
    """
    msg = [0x61616161] * 16 # type: List[bit32]
    lst = [0] * 16 # type: List[bit32]
    blen = len(msg)
    # Bytes of message plus 5 bytes of trailer, rounded up to 64-byte blocks.
    blocks = ((blen * 4 + 5) + 63) // 64
    print("blocks", blocks)
    start_i = 0
    m.data_in.wr(blocks)

    # Send every block except the last one.
    for i in range(blocks - 1):
        print('index:', i)
        with rule(unroll='full'):
            for j in range(16):
                lst[j] = msg[start_i]
                start_i += 1
        rv512: bit512 = bit32x16_bit512(lst)
        print('rv512', rv512)
        m.data_in.wr(rv512)
    print("$time")

    # Build the final block from a cleared buffer.
    with rule(unroll='full'):
        for i in range(16):
            lst[i] = 0
    # Capture the count before the copy loop advances start_i, so the end
    # marker can be placed right after the copied words.
    remaining = blen - start_i
    for i in range(remaining):
        lst[i] = msg[start_i]
        start_i += 1
    lst[remaining] = 0x80000000
    # NOTE(review): equals the message bit length (512) only for this
    # 16-word message; confirm before reusing with other lengths.
    lst[15] = (blocks << 8)
    v512_last: bit512 = bit32x16_bit512(lst)
    print('lastblock', v512_last)
    m.data_in.wr(v512_last)

    v256: bit256 = m.data_out.rd()
    print('sha256', v256)
def w(self, i_q, o_q):
    """Forward every value read from ``i_q`` to ``o_q`` twice.

    Runs under ``rule(scheduling='pipeline')`` for as long as
    ``is_worker_running()`` is true.
    """
    with rule(scheduling='pipeline'):
        while is_worker_running():
            item = i_q.rd()
            # One read produces two consecutive writes of the same value.
            o_q.wr(item)
            o_q.wr(item)
def process_sha256(self): work = [0] * 64 # type: List[bit32] _h = [0] * 8 # type: List[bit32] __h = [0] * 8 # type: List[bit32] while is_worker_running(): update = True for i in range(8): _h[i] = h[i] block_len512: bit512 = self.data_in.rd() block_len32 = block_len512 count = 0 #print(block_len512) #print(block_len32) while count < block_len32: #print(count, block_len32) count += 1 #print("--=========") d512 = self.data_in.rd() #print("start d512 %5t", d512, "$time") shift_n = 480 for i in unroll(range(16)): work[i] = (d512 >> shift_n) & 0xFFFFFFFF shift_n -= 32 for i in range(16, 64): wi_15 = work[i - 15] s0 = rotr(wi_15, 7) ^ rotr(wi_15, 18) ^ (wi_15 >> 3) wi_2 = work[i - 2] s1 = rotr(wi_2, 17) ^ rotr(wi_2, 19) ^ (wi_2 >> 10) wi_16 = work[i - 16] wi_7 = work[i - 7] work[i] = (wi_16 + s0 + wi_7 + s1) & 0xFFFFFFFF with rule(unroll='full'): for i in range(8): __h[i] = _h[i] for i in range(64): s0 = rotr(__h[0], 2) ^ rotr(__h[0], 13) ^ rotr(__h[0], 22) maj = (__h[0] & __h[1]) ^ (__h[0] & __h[2]) ^ (__h[1] & __h[2]) t2 = s0 + maj s1 = rotr(__h[4], 6) ^ rotr(__h[4], 11) ^ rotr(__h[4], 25) ch = (__h[4] & __h[5]) ^ ((~__h[4]) & __h[6]) t1 = __h[7] + s1 + ch + k[i] + work[i] __h[7] = __h[6] __h[6] = __h[5] __h[5] = __h[4] __h[4] = (__h[3] + t1) & 0xFFFFFFFF __h[3] = __h[2] __h[2] = __h[1] __h[1] = __h[0] __h[0] = (t1 + t2) & 0xFFFFFFFF with rule(unroll='full'): for i in range(8): _h[i] = (_h[i] + __h[i]) & 0xFFFFFFFF # print("turn %5t", count, "$time") rv256: bit256 = 0 with rule(unroll='full'): for i in range(8): rv256 <<= 32 rv256 |= _h[i] #print("rv256 %5t", rv256, "$time") self.data_out.wr(rv256)
def mips_main(self):
    """Interpret the program in ``imem`` and report the results.

    Eight values are read through ``self.din()`` into the start of a 64-entry
    data memory. Instructions are then fetched from ``imem`` and executed
    until ``pc`` becomes 0. Afterwards the executed-instruction count is
    passed to ``self.result`` and every data-memory word to ``self.dout``.

    ``imem`` and the opcode/function constants (``R``, ``ADDU``, ``J``, ...)
    are names defined outside this block.
    """
    inputs = [0] * 8
    for i in range(len(inputs)):
        inputs[i] = self.din()
    dmem = [0] * 64
    # Preload the first eight data-memory words with the inputs.
    for i in range(8):
        dmem[i] = inputs[i]
    hilo: int64 = 0
    Hi = 0
    Lo = 0
    n_inst = 0
    reg = [0] * 32
    # NOTE(review): presumably the initial stack pointer (register 29) - confirm.
    reg[29] = 0x7fffeffc
    pc = 0x00400000
    self.run()
    while is_worker_running():
        with rule(scheduling='pipeline'):
            # pc == 0 ends execution; the error branches below also set it.
            while pc != 0:
                iaddr = self.IADDR(pc)
                ins = imem[iaddr]
                # pc is advanced before execution, so jumps and branches
                # below see the address of the following instruction.
                pc = pc + 4
                # Bits 31..26 select the instruction class.
                op = (ins >> 26) & 0x3f
                #print(op)
                if op == R:
                    # Register format: funct = bits 5..0, shamt = 10..6,
                    # rd = 15..11, rt = 20..16, rs = 25..21.
                    funct = ins & 0x3f
                    shamt = (ins >> 6) & 0x1f
                    rd = (ins >> 11) & 0x1f
                    rt = (ins >> 16) & 0x1f
                    rs = (ins >> 21) & 0x1f
                    if funct == ADDU:
                        reg[rd] = reg[rs] + reg[rt]
                    elif funct == SUBU:
                        reg[rd] = reg[rs] - reg[rt]
                    elif funct == MULT:
                        # The product is split into its low and high 32 bits.
                        hilo = reg[rs] * reg[rt]
                        Lo = hilo & 0x00000000ffffffff
                        Hi = (hilo >> 32) & 0xffffffff
                    elif funct == MULTU:
                        # NOTE(review): identical to the MULT branch in this code.
                        hilo = reg[rs] * reg[rt]
                        Lo = hilo & 0x00000000ffffffff
                        Hi = (hilo >> 32) & 0xffffffff
                    elif funct == MFHI:
                        reg[rd] = Hi
                    elif funct == MFLO:
                        reg[rd] = Lo
                    elif funct == AND:
                        reg[rd] = reg[rs] & reg[rt]
                    elif funct == OR:
                        reg[rd] = reg[rs] | reg[rt]
                    elif funct == XOR:
                        reg[rd] = reg[rs] ^ reg[rt]
                    elif funct == SLL:
                        reg[rd] = reg[rt] << shamt
                    elif funct == SRL:
                        reg[rd] = reg[rt] >> shamt
                    elif funct == SLLV:
                        reg[rd] = reg[rt] << reg[rs]
                    elif funct == SRLV:
                        reg[rd] = reg[rt] >> reg[rs]
                    elif funct == SLT:
                        reg[rd] = reg[rs] < reg[rt]
                    elif funct == SLTU:
                        # NOTE(review): same comparison as the SLT branch in this code.
                        reg[rd] = reg[rs] < reg[rt]
                    elif funct == JR:
                        pc = reg[rs]
                    else:
                        pc = 0 # error
                elif op == J:
                    # Jump target is the low 26 bits, scaled to a byte address.
                    tgtadr = ins & 0x3ffffff
                    pc = tgtadr << 2
                elif op == JAL:
                    tgtadr = ins & 0x3ffffff
                    # Save the already-advanced pc in register 31 before jumping.
                    reg[31] = pc
                    pc = tgtadr << 2
                else: # if op == ...
                    # Immediate format: the low 16 bits are taken as-is, with
                    # no sign extension in this code.
                    address = ins & 0xffff
                    rt = (ins >> 16) & 0x1f
                    rs = (ins >> 21) & 0x1f
                    if op == ADDIU:
                        reg[rt] = reg[rs] + address
                    elif op == ANDI:
                        reg[rt] = reg[rs] & address
                    elif op == ORI:
                        reg[rt] = reg[rs] | address
                    elif op == XORI:
                        reg[rt] = reg[rs] ^ address
                    elif op == LW:
                        reg[rt] = dmem[self.DADDR(reg[rs] + address)]
                    elif op == SW:
                        dmem[self.DADDR(reg[rs] + address)] = reg[rt]
                    elif op == LUI:
                        reg[rt] = address << 16
                    elif op == BEQ:
                        # pc - 4 undoes the advance above, so the offset is
                        # applied to the branch instruction's own address.
                        if reg[rs] == reg[rt]:
                            pc = pc - 4 + (address << 2)
                    elif op == BNE:
                        if reg[rs] != reg[rt]:
                            pc = pc - 4 + (address << 2)
                    elif op == BGEZ:
                        if reg[rs] >= 0:
                            pc = pc - 4 + (address << 2)
                    elif op == SLTI:
                        reg[rt] = reg[rs] < address
                    elif op == SLTIU:
                        # NOTE(review): same comparison as the SLTI branch in this code.
                        reg[rt] = reg[rs] < address
                    else:
                        pc = 0 # error
                # Register 0 is forced back to zero after every instruction.
                reg[0] = 0
                n_inst = n_inst + 1
        #if pc == 0:
        # Report the instruction count, then dump the whole data memory.
        self.result(n_inst)
        for i in range(len(dmem)):
            self.dout(dmem[i])
        self.run()
def ChenIDct(x: list, y: list):
    '''
    ChenIDCT() implements the Chen inverse dct. Note that there are two
    vectors, x=input and y=output, both 64 entries long; they must be
    defined (and storage allocated) before this routine is called.

    The transform runs in two passes over a 64-entry scratch list: first down
    the eight columns of x, then in place along the eight rows of the scratch
    list. The result is then scaled down by 16 into y. Always returns 0.

    The coefficients c1d4, c1d8, c3d8, c1d16, c3d16, c5d16 and c7d16 are
    names defined outside this function.
    '''
    def LS(r, s):
        return r << s

    def RS(r, s):
        return r >> s
    # Caution with rounding...

    # Scale a coefficient product back down by shifting right 9 bits.
    def MSCALE(expr):
        return RS(expr, 9)

    tmp = [None] * 64
    # Loop over columns
    with rule(scheduling='pipeline'):
        for i in range(8):
            # Column i is every eighth entry of x; each input is pre-scaled by 4.
            b0 = LS(x[i + 0], 2)
            a0 = LS(x[i + 8], 2)
            b2 = LS(x[i + 16], 2)
            a1 = LS(x[i + 24], 2)
            b1 = LS(x[i + 32], 2)
            a2 = LS(x[i + 40], 2)
            b3 = LS(x[i + 48], 2)
            a3 = LS(x[i + 56], 2)
            # Split into even mode b0 = x0 b1 = x4 b2 = x2 b3 = x6.
            # And the odd terms a0 = x1 a1 = x3 a2 = x5 a3 = x7.
            c0 = MSCALE((c7d16 * a0) - (c1d16 * a3))
            c1 = MSCALE((c3d16 * a2) - (c5d16 * a1))
            c2 = MSCALE((c3d16 * a1) + (c5d16 * a2))
            c3 = MSCALE((c1d16 * a0) + (c7d16 * a3))
            # First Butterfly on even terms.
            # The a* names are reused from here on; the odd inputs they held
            # have already been consumed into c0..c3.
            a0 = MSCALE(c1d4 * (b0 + b1))
            a1 = MSCALE(c1d4 * (b0 - b1))
            a2 = MSCALE((c3d8 * b2) - (c1d8 * b3))
            a3 = MSCALE((c1d8 * b2) + (c3d8 * b3))
            b0 = a0 + a3
            b1 = a1 + a2
            b2 = a1 - a2
            b3 = a0 - a3
            # Second Butterfly
            a0 = c0 + c1
            a1 = c0 - c1
            a2 = c3 - c2
            a3 = c3 + c2
            c0 = a0
            c1 = MSCALE(c1d4 * (a2 - a1))
            c2 = MSCALE(c1d4 * (a2 + a1))
            c3 = a3
            # Combine even (b*) and odd (c*) halves into the column of tmp.
            tmp[i + 0] = b0 + c3
            tmp[i + 8] = b1 + c2
            tmp[i + 16] = b2 + c1
            tmp[i + 24] = b3 + c0
            tmp[i + 32] = b3 - c0
            tmp[i + 40] = b2 - c1
            tmp[i + 48] = b1 - c2
            tmp[i + 56] = b0 - c3
    # Loop over rows
    for i in range(8):
        # Row i occupies eight consecutive entries starting at i * 8.
        idx = LS(i, 3)
        b0 = tmp[idx + 0]
        a0 = tmp[idx + 1]
        b2 = tmp[idx + 2]
        a1 = tmp[idx + 3]
        b1 = tmp[idx + 4]
        a2 = tmp[idx + 5]
        b3 = tmp[idx + 6]
        a3 = tmp[idx + 7]
        # Split into even mode b0 = x0 b1 = x4 b2 = x2 b3 = x6.
        # And the odd terms a0 = x1 a1 = x3 a2 = x5 a3 = x7.
        c0 = MSCALE((c7d16 * a0) - (c1d16 * a3))
        c1 = MSCALE((c3d16 * a2) - (c5d16 * a1))
        c2 = MSCALE((c3d16 * a1) + (c5d16 * a2))
        c3 = MSCALE((c1d16 * a0) + (c7d16 * a3))
        # First Butterfly on even terms.
        a0 = MSCALE(c1d4 * (b0 + b1))
        a1 = MSCALE(c1d4 * (b0 - b1))
        a2 = MSCALE((c3d8 * b2) - (c1d8 * b3))
        a3 = MSCALE((c1d8 * b2) + (c3d8 * b3))
        # Calculate last set of b's
        b0 = a0 + a3
        b1 = a1 + a2
        b2 = a1 - a2
        b3 = a0 - a3
        # Second Butterfly
        a0 = c0 + c1
        a1 = c0 - c1
        a2 = c3 - c2
        a3 = c3 + c2
        c0 = a0
        c1 = MSCALE(c1d4 * (a2 - a1))
        c2 = MSCALE(c1d4 * (a2 + a1))
        c3 = a3
        # Recomputed here; same value as at the top of this iteration.
        idx = LS(i, 3)
        tmp[idx + 0] = b0 + c3
        tmp[idx + 1] = b1 + c2
        tmp[idx + 2] = b2 + c1
        tmp[idx + 3] = b3 + c0
        tmp[idx + 4] = b3 - c0
        tmp[idx + 5] = b2 - c1
        tmp[idx + 6] = b1 - c2
        tmp[idx + 7] = b0 - c3
    # Retrieve correct accuracy. We have additional factor
    # of 16 that must be removed.
    for i in range(64):
        v = tmp[i]
        # Bias by 8 away from zero before shifting right by 4.
        if v < 0:
            z = (v - 8) >> 4
        else:
            z = (v + 8) >> 4
        y[i] = z
    return 0