def SCH_polymulNxN_mod3(N,C1,C2,rf,rg,rh) :
    """Print the strip-based "sch3" assembly text for an NxN product.

    The text is written to stdout in four labelled sections (sch3_0,
    sch3_1, sch3_10 and sch3_20).

    N, C1, C2 -- published as module globals and registered through
                 alloc_save_no as strings.
    rf, rg    -- operand texts moved into r12 and r14 respectively.
    rh        -- only remembered in the module global r_h here.

    NOTE(review): the loop nesting was reconstructed from a source whose
    line breaks were lost; confirm that each end_strip_* call closes the
    outer (per-thread) loop rather than the inner one.
    """
    global V, NV, r_f, r_g, r_h, r_N
    r_f, r_g, r_h = rf, rg, rh
    module_scope = globals()
    module_scope["C1"] = C1
    module_scope["C2"] = C2
    module_scope["N"] = N
    #assert (N>16)
    for label, value in (("N", N), ("C1", C1), ("C2", C2)) :
        alloc_save_no(label, str(value))
    # Disabled in this variant (f and g stay in r12/r14 instead):
    # print_str(rh,"h","save h")
    # print_str(rh,"hh","save hh")
    # print_str(rf,"f","save f")
    # print_str(rg,"g","save g")

    sixteenth = N // 16
    eighth = N // 8
    quarter = N // 4

    print("sch3_0: // increasing thread length")
    print(" mov %s, #0" % (ac(0,0)))
    print(" mov r12, %s" % rf)
    print(" mov r14, %s" % rg)
    print(" ldr r11, =0x03030303")
    print("#ifndef __thumb2__")
    print_str("r11", "3", "save #0x03030303")
    print(" ldr r11, =0x0f0f0f0f")
    print_str("r11", "F", "save #0x0F0F0F0F")
    print_ldr("r11", "3", "reload #0x03030303")
    print("#endif")

    print("sch3_1: // later blocks")
    for thread in range(0, sixteenth) : # thread count
        start_strip_top (thread)
        for step in range(1, 4*thread+1) :
            continue_strip_top (thread,step)
        end_strip_top (thread)

    print("sch3_10: // decreasing thread length")
    for thread in range(sixteenth, eighth-1) :
        start_strip_bot (thread)
        for step in range(quarter-2, 4*thread-quarter+3, -1) :
            continue_strip_bot (thread,step)
        end_strip_bot (thread)

    print("sch3_20: // mv hh back to h")
    last = eighth - 1
    for k in range(1, 5) :
        print(" mov %s, #0" % (ac(last,k)))
    column = quarter
    for k, offset in ((1, N-12), (2, N-8), (3, N-4)) :
        print(" ldr %s, [r14, #%d]" % (ar(last,column,k), offset))
    end_strip_bot(last)
def conv_3x64_7x32_store_end (i) : # store 4 accumulators at end of thread
    """Print the end-of-thread store sequence for thread i.

    Registers r6-r8 are saved through print_str to the slots acc_r(i,4..6);
    then text is printed that reduces r2-r5 with two br_32x2 invocations
    and stores r2 and r4 through the "hh" cursor, which is reloaded first
    and saved back afterwards.
    """
    global V
    upper_limbs = range(4,7)
    targets = [read_V(acc_r(i,j)) for j in upper_limbs]
    # Same text as a two-item print statement: items joined by one space.
    print("%s %s" % (" // store r6-r8 to", targets))
    for j in upper_limbs :
        print_str("r"+str(j+2), acc_r(i,j), "limb %d" % (j))
    print(" // compress and store r2-r5")
    for reg, slot, note in (
        ("r6","hh","reload cursor"),
        ("r7","q","load q"),
        ("r8","q32","load round(-2^32/q)"),
    ) :
        print_ldr(reg, slot, note)
    for text in (
        " br_32x2 r2, r3, r7, r8, r9",
        " br_32x2 r4, r5, r7, r8, r9",
        " str r2, [r6], #4",
        " str r4, [r6], #4",
    ) :
        print(text)
    print_str("r6","hh","store cursor")
def j2ds_prologue():
    """Print the assembler directives, entry label and prologue of jump2divsteps.

    After the push, q is loaded into the register named by the module-level
    rq, and the 32-bit value q32inv is assembled in r1 from two 16-bit
    halves (movw/movt) before being saved under the name "q32".

    NOTE(review): the constant is built in r1 but saved through qi;
    presumably qi names r1 -- verify against its definition.
    """
    global V, NV
    for text in (
        " .p2align 2,,3 ",
        " .syntax unified",
        " .text",
        "// void jump2divsteps (int minusdelta, int *M, int f, int g);",
        " .global jump2divsteps",
        " .type jump2divsteps, %function",
        "jump2divsteps:",
        " push {r1,r4-r11,lr}",
    ) :
        print(text)
    print_ldr(rq, "q", "load q")
    # divmod floors like // and %, so both halves match the separate forms.
    quotient, low_half = divmod(q32inv, 65536)
    print(" movw r1, #%d" % (low_half))
    print(" movt r1, #%d" % (65536 + quotient))
    print_str(qi, "q32", "save q32inv")
def SCH_prologue_sh(N, rf, rg, rh):
    """Print the file header and function prologue of gf_polymul_<N>x<N>sh.

    Emits the red-asm.h include, whatever SCH_polymulNxNsh_defs() prints,
    the assembler directives, the entry label and the register push, then
    builds a constant derived from -q32inv in r14 and saves it as "q32".

    rf, rg and rh are accepted but not used here.

    NOTE(review): MOVT takes a 16-bit immediate; confirm that
    65536 - (q32inv // 65536) stays below 65536 for the q32inv in use.
    """
    global V, NV
    dims = (N, N)
    print('#include "red-asm.h"')
    SCH_polymulNxNsh_defs()
    print("")
    print("// void gf_polymul_%dx%dsh (int *h, int *f, int *g);" % dims)
    for text in (" .p2align 2,,3 ", " .syntax unified", " .text") :
        print(text)
    print(" .global gf_polymul_%dx%dsh" % dims)
    print(" .type gf_polymul_%dx%dsh, %%function" % dims)
    print("gf_polymul_%dx%dsh:" % dims)
    print(" push {r4-r11,lr}")
    low_half = (-q32inv) % 65536
    high_half = 65536 - (q32inv // 65536)
    print(" movw r14, #%d" % (low_half))
    print(" movt r14, #%d" % (high_half))
    print_str("r14", "q32", "save q32inv")
def conv_3x64_7x32_store_end_i (i) : # store all 7 accumulators
    """Print the store sequence that reduces r2-r8 and writes them out.

    q and q32 are loaded into r9/r10, r2-r7 are reduced pairwise with
    br_32x2 and r8 with br_32 followed by uxth, and r2, r4, r6, r8 are
    stored through the "hh" cursor held in r11, which is saved back at
    the end.  The argument i is not used in the emitted text.
    """
    global V
    print_ldr("r9","q","load q")
    print_ldr("r10","q32","load round(-2^32/q)")
    for text in (
        " // compress and store r2-r8",
        " br_32x2 r2, r3, r9, r10, r11",
        " br_32x2 r4, r5, r9, r10, r11",
        " br_32x2 r6, r7, r9, r10, r11",
        " br_32 r8, r9, r10, r11",
        " uxth r8, r8",
    ) :
        print(text)
    print_ldr("r11","hh","reload cursor")
    for text in (
        " str r2, [r11], #4",
        " str r4, [r11], #4",
        " str r6, [r11], #4",
        " str r8, [r11], #4",
    ) :
        print(text)
    print_str("r11","hh","store cursor")
def j4ds_2(NAME1, NAME2):
    """Print the assembly text of a routine named NAME1 that calls NAME2.

    The emitted routine pushes r1-r11 and lr, loads q and the q32inv
    constant, loads f0/g0, branches to NAME2, saves the result in "d",
    loads f1/g1 from the pointers saved on the stack, calls M2x2x2_2,
    stores r4-r7 and four zero words relative to r0, and returns with
    "d" reloaded into r0.
    """
    global V, NV

    def emit(*lines):
        # Print each already-formatted line of assembly text in order.
        for text in lines:
            print(text)

    emit(" .p2align 2,,3 ",
         " .syntax unified",
         " .text",
         "// void %s (int minusdelta, int *M, int *f, int *g);" % (NAME1),
         " .global %s" % (NAME1),
         " .type %s, %%function" % (NAME1),
         "%s:" % (NAME1),
         " push {r1-r11,lr}")
    print_ldr("r12", "q", "load q")
    quotient, low_half = divmod(q32inv, 65536)
    emit(" movw r1, #%d" % (low_half),
         " movt r1, #%d" % (65536 + quotient))
    print_str("r1", "q32", "save q32inv")
    emit(" ldr r2, [r2] // f0",
         " ldr r3, [r3] // g0",
         " bl %s" % (NAME2))
    print_str("r0", "d", "store minusdelta")
    emit(" ldr r8, [sp, #4]",
         " ldr r9, [sp, #8]",
         " ldr r8, [r8, #4] // f1",
         " ldr r9, [r9, #4] // g1",
         " ldr r0, [sp]",
         " // compute [[x r4, x r5], [r6, r7]] * [r8, r9] + [r2, r3]",
         " bl M2x2x2_2",
         " str r4, [r0, #20]",
         " str r5, [r0, #28]",
         " str r6, [r0, #36]",
         " str r7, [r0, #44]",
         " mov r1, #0",
         " str r1, [r0, #16]",
         " str r1, [r0, #24]",
         " str r1, [r0, #32]",
         " str r1, [r0, #40]")
    print_ldr("r0", "d", "reload minusdelta for return value")
    emit(" pop {r1-r11,pc}")
def j4ds(NAME1, NAME2):
    """Print the assembly text of a routine named NAME1 built on NAME2.

    Six names (M1, M2, fg, M, pf, pg) are registered through alloc_save_no
    as sp-relative locations.  The emitted routine pushes r1-r11 and lr,
    reserves 64 bytes of stack, calls jump2divsteps_sub, then M2x2x2_2,
    then NAME2, then M2x2x2_2 again and M2x2x2s twice, releases 76 bytes
    of stack and returns with "d" reloaded into r0.
    """
    global V, NV

    def emit(*lines):
        # Print each already-formatted line of assembly text in order.
        for text in lines:
            print(text)

    for slot, place in (
        ("M1", "sp+0"),
        ("M2", "sp+24"),
        ("fg", "sp+48"),
        ("M", "sp+64"),
        ("pf", "sp+68"),
        ("pg", "sp+72"),
    ):
        alloc_save_no(slot, place)

    emit(" .p2align 2,,3 ",
         " .syntax unified",
         " .text",
         "// void %s (int minusdelta, int *M, int *f, int *g);" % (NAME1),
         " .global %s" % (NAME1),
         " .type %s, %%function" % (NAME1),
         "%s:" % (NAME1),
         " push {r1-r11,lr}")
    print_ldr("r12", "q", "load q")
    quotient, low_half = divmod(q32inv, 65536)
    emit(" movw r1, #%d" % (low_half),
         " movt r1, #%d" % (65536 + quotient))
    print_str("r1", "q32", "save q32inv")
    emit(" sub sp, sp, #64 // allocate 2x6+2x2 ints",
         " ldr r2, [r2] // f0",
         " ldr r3, [r3] // g0",
         " bl jump2divsteps_sub")
    print_str("r0", "d", "store minusdelta")
    emit(" stm sp, {r2-r7} // matrix 1 is at sp")
    print_ldr("r8", "pf[0]", "load f pointer")
    print_ldr("r9", "pg[0]", "load g pointer")
    emit(" ldr r8, [r8, #4] // f1",
         " ldr r9, [r9, #4] // g1")
    print_ldr("r0", "fg", "load intermediate f,g pointer")
    emit(" // compute [[x r4, x r5], [r6, r7]] * [r8, r9] + [r2, r3]",
         " bl M2x2x2_2")
    print_ldr("r3", "fg[8]", "reload lower half of new g")
    print_ldr("r0", "d", "reload minusdelta")
    emit(" bl %s" % (NAME2))
    print_str("r0", "d", "store minusdelta")
    print_ldr("r14", "M2", "load matrix 2")
    emit(" stm r14, {r2-r7}")
    print_ldr("r8", "fg[4]", "reload top half of new f")
    print_ldr("r9", "fg[12]", "reload top half of new g")
    print_ldr("r0", "M[0]", "reload matrix pointer")
    emit(" bl M2x2x2_2",
         " // remains to multiply M1 by M2")
    print_ldr("r8", "M1[8]", "reload u1")
    print_ldr("r9", "M1[16]", "reload r1")
    emit(" add r0, r0, #16",
         " bl M2x2x2s")
    print_ldr("r8", "M1[12]", "reload v1")
    print_ldr("r9", "M1[20]", "reload s1")
    emit(" add r0, r0, #8",
         " bl M2x2x2s",
         " add sp, sp, #76 // deallocate temp storage + pop r1-3")
    print_ldr("r0", "d", "reload minusdelta for return value")
    emit(" pop {r4-r11,pc}")
def SCH_polymulNxN_mod3(N,C1,C2,rf,rg,rh) :
    """Print the block-based "sch3" assembly text for an NxN product.

    The text is written to stdout in four labelled sections (sch3_0,
    sch3_1, sch3_10 and sch3_20).

    N       -- size parameter; must be greater than 8 (asserted).
    C1, C2  -- registered through alloc_save_no; min(C1, C2) is the period
               that decides when the accumulators are reduced.
    rf, rg, rh -- register names saved under "f", "g" and "h"/"hh".

    NOTE(review): the if/else nesting was reconstructed from a source
    whose line breaks were lost.  In particular the two reduce_mod3_lazy
    calls on limbs 2 and 3 are placed after the if/else (run on every
    thread), mirroring block (0,0) -- confirm against the original layout.
    """
    global V, NV, r_f, r_g, r_h, r_N
    r_f, r_g, r_h = rf, rg, rh
    assert (N>8)
    for label, value in (("N", N), ("C1", C1), ("C2", C2)) :
        alloc_save_no(label, str(value))
    for reg, slot, note in (
        (rh,"h","save h"),
        (rh,"hh","save hh"),
        (rf,"f","save f"),
        (rg,"g","save g"),
    ) :
        print_str(reg, slot, note)

    def lazy(thread, limbs) :
        # Apply reduce_mod3_lazy to the listed accumulator limbs, in order.
        for k in limbs :
            reduce_mod3_lazy(acc_r(thread,k),"r12")

    def strong(reducer, thread, limbs) :
        # Apply a three-argument reducer to the listed limbs, in order.
        for k in limbs :
            reducer(acc_r(thread,k),"r12","r3")

    print("sch3_0: // increasing thread length")
    for text in (
        " // block (0,0)",
        " ldr r5, [r1, #4]",
        " ldr r4, [r1]",
        " ldr r7, [r2, #4]",
        " ldr r6, [r2]",
        " mov r3, 0x03030303",
    ) :
        print(text)
    print(" umlal %s, %s, r4, r6" % (acc_r(0,0),acc_r(0,1)))
    print(" umull %s, %s, r5, r7" % (acc_r(0,2),acc_r(0,3)))
    print(" umlal %s, %s, r4, r7" % (acc_r(0,1),acc_r(0,2)))
    print(" umlal %s, %s, r5, r6" % (acc_r(0,1),acc_r(0,2)))
    strong(reduce_mod3_32, 0, (0,1))
    lazy(0, (2,3))
    print(" strd %s, %s, [r0], 8" % (acc_r(0,0),acc_r(0,1)))

    print("sch3_1: // later blocks")
    C = min(C1,C2)
    eighth = N // 8
    quarter = N // 4
    for t in range(1, eighth) : # t is thread count
        add_block_initial (0,t)
        for j in range(1,t+1) :
            add_block (j,t-j)
            if (j % C == C-1) :
                lazy(t, (0,1,2,3))
        finisher = reduce_mod3_32 if ((t+1) % C == 0) else reduce_mod3_full
        strong(finisher, t, (0,1))
        lazy(t, (2,3))
        print(" strd %s, %s, [r0], #8" % (acc_r(t,0),acc_r(t,1)))

    print("sch3_10: // decreasing thread length")
    for t in range(eighth, quarter-1) :
        first = t-eighth+1
        add_block_initial (first, eighth-1)
        for j in range(first+1, eighth) :
            add_block (j,t-j)
            if ((j-first) % C == C-1) :
                lazy(t, (0,1,2,3))
        finisher = reduce_mod3_32 if ((quarter-t-1) % C == 0) else reduce_mod3_full
        strong(finisher, t, (0,1))
        lazy(t, (2,3))
        print(" strd %s, %s, [r0], #8" % (acc_r(t,0),acc_r(t,1)))

    print("sch3_20: // mv hh back to h")
    # t deliberately keeps the value left by the last loop iteration above.
    strong(reduce_mod3_32, t, (2,3))
    print(" strd %s, %s, [r0], #8" % (acc_r(t,2),acc_r(t,3)))
# g = r2, r3 r = r8, r9 s = r10, r11 # scratch registers = r12, r14 print ''' // bitslice functions .p2align 2,,3 .syntax unified .text .global bs3_jump32divsteps .type bs3_jump32divsteps, %function //normal usage is 'vmov.f32 s0, #31.0' before calling //int bs3_jump32divsteps(int delta, int *f, int *g, int *M, float rep+1); bs3_jump32divsteps:''' #print " ldr r12, [sp]" print " push {r4-r11,lr}" #print_str("r12","Temp","save temporary data pointer") print_str("r3","M","save result matrix ptr") print_str("r0","D","save delta") print " ldr %s, [r1]" % f0 print " ldr %s, [r1, #4]" % f1 print " ldr %s, [r2, #4]" % g1 print " ldr %s, [r2]" % g0 for i in [f0,f1,g0,g1] : print " rbit %s, %s" % (i,i) print " mov %s, #(1<<31)" % u0 print " mov %s, #(1<<31)" % s0 for i in [u1,v0,v1,r0,r1,s1] : print " mov %s, #0" % (i) print " vmov.f32 %s, #1.0 // float 1.0 #112" % (v("1")) #print " vmov.f32 %s, #31.0 // float 31.0 #63" % (v("ct")) print " vcvt.f32.s32 %s, %s // convert to float" %(v("D"),v("D")) print "bs3_jump32divsteps_0: // first half"
def SCH_polymulNxNsh(N,C,rf,rg,rh) :
    """Print the "sch2p19" assembly text for an NxN product.

    Registers N, C, the sp-relative buffers "ff" and "gg" and the module
    level q through alloc_save_no, saves the three argument registers,
    prints two calls to convert_2p19, and then prints one thread per
    output position in two phases (sch2p19_0 with growing threads,
    sch2p19_10 with shrinking threads).  Within a thread the blocks are
    emitted in groups of C, each group followed by
    conv_3x64_7x32_body and an accumulate/store step.

    NOTE(review): the loop and if/else nesting was reconstructed from a
    source whose line breaks were lost -- confirm against the original.
    """
    global V, NV, r_f, r_g, r_h, r_N
    r_f, r_g, r_h = rf, rg, rh
    alloc_save_no("N",str(N))
    alloc_save_no("C",str(C))
    alloc_save_no("ff","sp+0")
    alloc_save_no("gg","sp+%d" % (2*N))
    alloc_save_no("q",str(q))
    for reg, slot, note in (
        (rh,"h","save h"),
        (rh,"hh","save hh"),
        (rf,"f","save f"),
        (rg,"g","save g"),
    ) :
        print_str(reg, slot, note)
    print(" sub sp, sp, #%d // space for ff,gg" % (4*N))
    print_ldr("r0","ff","load ff")
    print_ldr("r12","N","load N")
    print(" bl convert_2p19")
    print_ldr("r0","gg","load gg")
    print_ldr("r1","g","load g")
    #print " mov r0, r1"
    print_ldr("r12","N","load N")
    print(" bl convert_2p19")

    quarter = N // 4
    half = N // 2

    def accumulate(is_first_group, thread) :
        # The first group of a thread uses the _i form of the accumulate step.
        if is_first_group :
            conv_3x64_7x32_acc_i(thread)
        else :
            conv_3x64_7x32_acc(thread)

    print("sch2p19_0: // increasing thread length")
    for i in range(0, quarter) : # i is thread count
        print(" add r1, sp, #%d // load gg+2*%d" % (2*N+8*i,i))
        print_ldr("r0","ff","load ff")
        # complete groups of C blocks
        for j in range(0,i-C+1,C) :
            add_block_initial (j,i-j)
            for k in range(1,C) :
                add_block (j+k,i-j-k)
            conv_3x64_7x32_body(i)
            accumulate(j == 0, i)
            conv_3x64_7x32_store(i)
        # final, possibly shorter, group
        j = (i // C) * C
        add_block_initial (j,i-j)
        for k in range(j+1,i+1) :
            add_block (k, i-k)
        conv_3x64_7x32_body(i)
        if (j == 0 and i == 0) :
            print(" asr r8, r11, #6")
        else :
            accumulate(j == 0, i)
        conv_3x64_7x32_store_end(i)

    print("sch2p19_10: // decreasing thread length")
    for i in range(quarter, half-1) :
        start = i-quarter+1
        print(" add r1, sp, #%d // load gg+2*%d" % (4*N-8,quarter-1))
        print(" add r0, sp, #%d // load ff+2*%d" % (8*i-N*2+8,start))
        # complete groups of C blocks
        for j in range(start,quarter-C,C) :
            add_block_initial (j,i-j)
            for k in range(1,C) :
                add_block (j+k,i-j-k)
            conv_3x64_7x32_body(i)
            accumulate(j == start, i)
            conv_3x64_7x32_store(i)
        # final, possibly shorter, group
        j = ((half - i - 2) // C) * C + start
        add_block_initial (j,i-j)
        for k in range(j+1,quarter) :
            add_block (k, i-k)
        conv_3x64_7x32_body(i)
        accumulate(j == start, i)
        if (i == half-2) :
            conv_3x64_7x32_store_end_i(i)
        else :
            conv_3x64_7x32_store_end(i)

    print("sch2p19_20: // mv hh back to h")
    print_ldr("r0","h","reload h")
    print_ldr("r1","f","reload f")
    print_ldr("r2","g","reload g")
    print("")
def conv_3x64_7x32_store (i) :
    """Print the text that saves r2-r8 to the seven slots acc_r(i,0..6).

    A comment line listing read_V of each slot is printed first, followed
    by one print_str call per limb.
    """
    global V
    limbs = range(7)
    targets = [read_V(acc_r(i,j)) for j in limbs]
    # Same text as a two-item print statement: items joined by one space.
    print("%s %s" % (" // store r2-r8 to", targets))
    for j in limbs :
        print_str("r"+str(j+2), acc_r(i,j), "limb %d" % (j))
def KA_polymulNxN(N):
    """Write the header for, and print the assembly text of, bs3_mul<N>.

    Side effects:
      * creates/overwrites "bitslice3_mul<N>.h" in the current directory
        with a one-line extern declaration of bs3_mul<N>;
      * prints to stdout a table KA_<N> of KA_terms(N, N0) words for
        N0 = N, N/2, ... down to B, followed by the routine itself
        (copy-in, expansion, multiplication, collection, copy-back).

    Relies on the module-level B, KA_terms, alloc_save_no, read_NV,
    print_ldr, print_str, mul32_mod3, add_to_mod3_d, add_to_mod3_dx and
    sub_from_mod3_d.

    NOTE(review): the block nesting was reconstructed from a source whose
    line breaks were lost.  The extent of the ``if (B == 32):`` body is a
    guess (the loop-counter ``subs``/``bne`` pair is placed after it);
    the output is the same either way when B == 32.
    """
    # KA_head
    print("// N=%d requires %d storage" % (N, KA_terms(N, B)))
    HN = "bitslice3_mul%d" % (N)
    RN = "bs3_mul%d" % (N)
    # The context manager closes the header file even if the write fails.
    with open(HN + ".h", "w+") as aux:
        aux.write("extern void " + RN + " (int32_t *h, int32_t *f, int32_t *g);\n")
    print(" .p2align 2,,3")
    print(" .syntax unified")
    print(" .text")
    N0 = N
    print("KA_%d:" % N)
    while (N0 >= B):
        print(" .word %d" % KA_terms(N, N0))
        N0 //= 2  # explicit floor division keeps N0 an integer
    print(" .p2align 2,,3 ")
    print(" .syntax unified")
    print(" .text")
    #
    print("// void %s (int32_t *h, int32_t *f, int32_t *g);" % (RN))
    print(" .global %s" % (RN))
    print(" .type %s, %%function" % (RN))
    print("%s:" % (RN))
    #
    M = (KA_terms(N, B))
    alloc_save_no("ff", "sp+0")
    alloc_save_no("gg", "sp+%d" % (M // 4))
    alloc_save_no("hh", "sp+%d" % (M // 2))
    alloc_save_no("M4", str(M // 4))
    #
    print(" push {r4-r11,lr}")
    if (read_NV() > 16):
        print(" vpush {s16-s31}")
    print(" sub sp, sp, #%d // subtract M" % (M))
    print(" // ff=[sp], gg=[sp,#%d], hh=[sp,#%d]" % (M // 4, M // 2))
    print_str("r0", "h", "save h")
    print_ldr("r3", "ff", "load ff pointer")
    print_ldr("r0", "gg", "load gg pointer")
    #
    if (N > 128):
        print(" mov r14, #%d" % (N))
    print("KA%d_mv_loop: // r0 = gg, r1 = f, r2 = g, r3 = ff" % N)
    if (N == 64):
        print(" ldm r1!, {r4-r7}")
        print(" ldm r2!, {r8-r11}")
        print(" stm r3!, {r4-r7}")
        print(" stm r0!, {r8-r11}")
    else:  # N >= 128
        print(" ldm r1!, {r4-r11}")
        print(" stm r3!, {r4-r11}")
        print(" ldm r2!, {r4-r11}")
        print(" stm r0!, {r4-r11}")
    if (N > 128):
        print(" subs r14, #128")
        print(" bne KA%d_mv_loop" % N)
    #
    print("KA%d_exp: // ff @ sp, gg @ sp + M/4, M/4 @ r12" % N)
    print_ldr("r0", "ff", "load ff")
    print_ldr("r1", "gg", "load gg")
    print(" ldr r3, =KA_%d" % N)
    print_str("r3", "ov", "save list of multiplication sizes pointer")
    print(" mov r2, #%d // N0/8 = r2 = N/8" % (N // 8))
    print("KA%d_exp_loop1: // loop on N0(/8)" % N)
    print(" cmp r2, #%d // while (N0>B)" % (B // 8))
    print(" beq KA%d_exp_end1" % N)
    # main assembly routine for KA_exp
    print("KA%d_exp_adds:" % N)
    print("/*")
    print(" for (j=0; j<N1; j+=N0) {")
    print(" for (k=0; k<N0/2; k+=32) {")
    print(" add3(ff+(j+k+N1)/4,ff+(2*j+k)/4,ff+(2*j+k+N0/2)/4);")
    print(" add3(gg+(j+k+N1)/4,gg+(2*j+k)/4,gg+(2*j+k+N0/2)/4);")
    print(" }")
    print("*/")
    print(" ldr r4, [r3], #4 // load N1=KA_terms(N,N0)")
    print(" add r5, r0, r4, LSR #2 // r5 = ff + N1/4")
    print(" add r6, r1, r4, LSR #2 // r6 = gg + N1/4")
    print(" add r0, r0, r2 // r0 = ff + N0/8")
    print(" add r1, r1, r2 // r1 = gg + N0/8")
    print(" rsb r2, r2, #0 // r2 = -N0/8")
    print(" mov r12, r2")
    print("KA%d_exp_adds1:" % N)
    print(" ldr r8, [r0, r2]")
    print(" ldr r10, [r0], #4")
    print(" ldr r9, [r0, r2]")
    print(" ldr r11, [r0], #4")
    add_to_mod3_d("r8", "r9", "r10", "r11")
    print(" strd r8, r9, [r5], #8")
    print(" ldr r8, [r1, r2]")
    print(" ldr r10, [r1], #4")
    print(" ldr r9, [r1, r2]")
    print(" ldr r11, [r1], #4")
    add_to_mod3_d("r8", "r9", "r10", "r11")
    print(" strd r8, r9, [r6], #8")
    print(" subs r4, r4, #64 // total of N1/64 pairs")
    print(" beq KA%d_exp_end" % N)
    print(" adds r12, r12, #8 // from N0/8 each time 8")
    print(" ittt eq // divisible by N0/2?")
    print(" subeq r0, r0, r2 // then add N0/8!")
    print(" subeq r1, r1, r2 // then add N0/8!")
    print(" moveq r12, r2 // reload with N0/8")
    print(" b KA%d_exp_adds1" % N)
    print("KA%d_exp_end:" % N)
    print(" rsb r2, r2, #0 // back to + N0/8")
    print_ldr("r0", "ff", "reload ff")
    print_ldr("r1", "gg", "reload gg")
    print("")
    print(" lsr r2, #1 // N0 /= 2")
    print(" b KA%d_exp_loop1 // loop" % (N))
    print("KA%d_exp_end1:" % N)
    print("")
    print("KA%d_mul:" % N)
    N1 = KA_terms(N, B)
    #
    # Save the advanced list pointer before r3 is overwritten with N1;
    # the collection phase reloads it from "ov".
    print_str("r3", "ov", "save N1 list pointer")
    print(" ldr r3, [r3] // r3 = N1")
    print_ldr("r2", "hh", "load r2 = hh")
    print("KA%d_muls1:" % (N))
    if (B == 32):
        print(" ldr r4, [r0], #4")
        print(" ldr r5, [r0], #4")
        print(" ldr r6, [r1], #4")
        print(" ldr r7, [r1], #4")
        mul32_mod3("r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", 0, "")
        print(" stm r2!, {r8-r11}")
    print(" subs r3, #32")
    print(" bne KA%d_muls1" % (N))
    #
    # now hh = r2, everything else is disposable
    print("KA%d_collect:" % (N))
    print_ldr("r2", "hh", "reload hh")
    N0 = B
    while (N0 < N):
        N1 = KA_terms(N, 2 * N0)
        # hh has not left r2, ov has not left r3
        print("KA%d_col_%d_add: // KA collection" % (N, N0))
        print_ldr("r3", "ov", "reload N1 list")  # probably in each loop?
        print(" ldr r14, [r3, #-4]! // N1")
        print_str("r3", "ov", "save N1 list")
        print(" add r12, r2, r14, LSR #1 // points into array")
        print(" mov r1, r2 // copy of hh")
        print(" mov r11, #%d // N0" % (N0))
        print("KA%d_col_%d_add1: // beginning of KA collect" % (N, N0))
        print(" ldrd r4, r5, [r1, #%d]" % (N0 // 4))
        print(" ldrd r6, r7, [r1, #%d]" % (N0 // 2))
        sub_from_mod3_d("r4", "r5", "r6", "r7")
        print(" ldrd r6, r7, [r1, #%d]" % (3 * N0 // 4))
        #print " mov r8, r4"
        #print " mov r9, r5"
        #add_to_mod3_d("r8","r9","r6","r7")
        add_to_mod3_dx("r8", "r9", "r4", "r5", "r6", "r7")
        print(" ldrd r6, r7, [r1]")
        sub_from_mod3_d("r4", "r5", "r6", "r7")
        print(" ldrd r6, r7, [r12, #%d]" % (N0 // 4))
        sub_from_mod3_d("r6", "r7", "r8", "r9")
        print(" ldrd r8, r9, [r12], #8 // shift r12")
        print(" strd r6, r7, [r1, #%d]" % (N0 // 2))
        add_to_mod3_d("r4", "r5", "r8", "r9")
        print(" strd r4, r5, [r1, #%d]" % (N0 // 4))
        print(" add r1, r1, #8")
        print(" subs r14, r14, #64")
        print(" beq KA%d_col_%d_end" % (N, N0))
        print(" subs r11, r11, #32")
        print(" ittt eq // no, then next set")
        print(" addeq r1, r1, #%d" % (3 * N0 // 4))
        print(" addeq r12, r12, #%d" % (N0 // 4))
        print(" moveq r11, #%d // N0" % (N0))
        print(" b KA%d_col_%d_add1" % (N, N0))
        print("KA%d_col_%d_end:" % (N, N0))
        #
        N0 *= 2
    # hh should still be at #2
    print("KA%d_mv_back: // hh still =r2" % N)
    print_ldr("r0", "h", "reload h")
    if (N > 64):
        print(" mov r14, #%d" % (N))
    print("KA%d_mv_back_loop:" % N)
    print(" ldm r2!, {r4-r11} // 4 pairs = 128 trits")
    print(" stm r0!, {r4-r11}")
    if (N > 64):
        print(" subs r14, #64")
        print(" bne KA%d_mv_back_loop" % N)
    #
    print("KA%d_end:" % N)
    print(" add sp, sp, #%d" % (M))
    if (read_NV() > 16):
        print(" vpop {s16-s31}")
    print(" pop {r4-r11,lr}")
    print(" bx lr")
    print("")
#endif''' bs3_header1("bs3_mul64_negc") print " ldr r4, [r1]" print " ldr r5, [r1, #4]" print " ldr r6, [r2]" print " ldr r7, [r2, #4]" print " ldr r8, [r1, #8]" print " ldr r9, [r1, #12]" print " ldr r10, [r2, #8]" print " ldr r11, [r2, #12]" add_to_mod3_d("r4", "r5", "r8", "r9") add_to_mod3_d("r6", "r7", "r10", "r11") mul32_mod3("r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", 0, "") print_str("r8", "0", "save c01l0") print_str("r9", "1", "save c01l1") print_str("r10", "2", "save c01h0") print_str("r11", "3", "save c01h1") print " // c01 = (a0+a1)(b0+b1) in scratch 0-3" print " ldr r4, [r1, #8]" print " ldr r5, [r1, #12]" print " ldr r6, [r2, #8]" print " ldr r7, [r2, #12]" mul32_mod3("r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", 0, "") print_str("r8", "4", "save c1l0") print_str("r9", "5", "save c1l1") print_str("r10", "6", "save c1h0") print_str("r11", "7", "save c1h1") print " // c1 = a1 b1 in scratch 4-7"