コード例 #1
0
def SCH_polymulNxN_mod3(N,C1,C2,rf,rg,rh) :
    global V, NV, r_f, r_g, r_h, r_N
    r_f = rf; r_g = rg; r_h = rh
    globals()["C1"]=C1
    globals()["C2"]=C2
    globals()["N"]=N

    #assert (N>16)
    alloc_save_no("N",str(N))
    alloc_save_no("C1",str(C1))
    alloc_save_no("C2",str(C2))

    # print_str(rh,"h","save h")
    # print_str(rh,"hh","save hh")
    # print_str(rf,"f","save f")
    # print_str(rg,"g","save g")
    
    print "sch3_0:			// increasing thread length"
    print "	mov	%s, #0" % (ac(0,0))
    print "	mov	r12, %s" % rf
    print "	mov	r14, %s" % rg
    print "	ldr	r11, =0x03030303"

    print "#ifndef __thumb2__"
    print_str("r11", "3", "save #0x03030303")
    print "	ldr	r11, =0x0f0f0f0f"
    print_str("r11", "F", "save #0x0F0F0F0F")
    print_ldr("r11", "3", "reload #0x03030303")
    print "#endif"
    
    print "sch3_1:			// later blocks"
    for i in range(0,N/16) : # i is thread count
        start_strip_top (i)
        for j in range(1, 4*i+1) :
            continue_strip_top (i,j)
        end_strip_top (i) 
            
    print "sch3_10:			// decreasing thread length"
    for i in range(N/16, N/8-1) :
        start_strip_bot (i)
        for j in range(N/4-2, 4*i-N/4+3, -1) :
            continue_strip_bot (i,j)
        end_strip_bot (i) 
        
        
    print "sch3_20:			// mv hh back to h"
    i = N/8 - 1
    print "	mov	%s, #0" % (ac(i,1))
    print "	mov	%s, #0" % (ac(i,2))
    print "	mov	%s, #0" % (ac(i,3))
    print "	mov	%s, #0" % (ac(i,4))
    j = N / 4
    print "	ldr	%s, [r14, #%d]" % (ar(i,j,1), N-12)
    print "	ldr	%s, [r14, #%d]" % (ar(i,j,2), N-8)
    print "	ldr	%s, [r14, #%d]" % (ar(i,j,3), N-4)
    end_strip_bot(i)
コード例 #2
0
def reduce_mod3_lazy (X, scr, r03) :
    print "#ifdef __thumb2__"	
    print "	and	%s, %s, #0xF0F0F0F0	// top 4b < 16" % (scr, X)
    print "	and	%s, %s, #0x0F0F0F0F	// bot 4b < 16" % (X, X)
    print "	add	%s, %s, %s, LSR #4	// range < 31" % (X, X, scr)
    print "#else"
    print_ldr(r03, "F", "reload #0x0F0F0F0F")
    print "	bic	%s, %s, %s	// top 4b < 16" % (scr, X, r03)
    print "	and	%s, %s, %s	// bot 4b < 16" % (X, X, r03)
    print "	add	%s, %s, %s, LSR #4	// range < 31" % (X, X, scr)
    print_ldr(r03, "3", "reload #0x03030303")
    print "#endif"
コード例 #3
0
def j2ds_prologue():
    global V, NV
    print "	.p2align	2,,3	"
    print "	.syntax		unified"
    print "	.text"
    print "// void jump2divsteps (int minusdelta, int *M, int f, int g);"
    print "	.global jump2divsteps"
    print "	.type	jump2divsteps, %function"
    print "jump2divsteps:"
    print "	push	{r1,r4-r11,lr}"
    print_ldr(rq, "q", "load q")
    print "	movw	r1, #%d" % (q32inv % 65536)
    print "	movt	r1, #%d" % (65536 + q32inv // 65536)
    print_str(qi, "q32", "save q32inv")
コード例 #4
0
def j4ds_2(NAME1, NAME2):
    global V, NV
    print "	.p2align	2,,3	"
    print "	.syntax		unified"
    print "	.text"
    print "// void %s (int minusdelta, int *M, int *f, int *g);" % (NAME1)
    print "	.global %s" % (NAME1)
    print "	.type	%s, %%function" % (NAME1)
    print "%s:" % (NAME1)
    print "	push	{r1-r11,lr}"
    print_ldr("r12", "q", "load q")
    print "	movw	r1, #%d" % (q32inv % 65536)
    print "	movt	r1, #%d" % (65536 + q32inv // 65536)
    print_str("r1", "q32", "save q32inv")
    print "	ldr	r2, [r2]	// f0"
    print "	ldr	r3, [r3]	// g0"
    print "	bl	%s" % (NAME2)
    print_str("r0", "d", "store minusdelta")
    print "	ldr	r8, [sp, #4]"
    print "	ldr	r9, [sp, #8]"
    print "	ldr	r8, [r8, #4]	// f1"
    print "	ldr	r9, [r9, #4]	// g1"
    print "	ldr	r0, [sp]"
    print "	// compute [[x r4, x r5], [r6, r7]] * [r8, r9] + [r2, r3]"
    print "	bl	M2x2x2_2"
    print "	str	r4, [r0, #20]"
    print "	str	r5, [r0, #28]"
    print "	str	r6, [r0, #36]"
    print "	str	r7, [r0, #44]"
    print "	mov	r1, #0"
    print "	str	r1, [r0, #16]"
    print "	str	r1, [r0, #24]"
    print "	str	r1, [r0, #32]"
    print "	str	r1, [r0, #40]"
    print_ldr("r0", "d", "reload minusdelta for return value")
    print "	pop	{r1-r11,pc}"
コード例 #5
0
def conv_3x64_7x32_acc_i (i) :
    global V
    print "	// accumulate r2-r4 to", [read_V(acc_r(i,j)) for j in range(3)]
    print_ldr("r10", acc_r(i,0), "limb 0")
    print_ldr("r12", acc_r(i,1), "limb 1")
    print_ldr("r14", acc_r(i,2), "limb 2")
    print "	add	r2, r2, r10"
    print "	add	r3, r3, r12"
    print "	add	r4, r4, r14"
    print "	asr	r8, r11, #6"
コード例 #6
0
def conv_3x64_7x32_store_end (i) : # store 4 accumulators at end of thread 
    global V
    print "	// store r6-r8 to", [read_V(acc_r(i,j)) for j in range(4,7)]
    for j in range(4,7) :
        print_str("r"+str(j+2), acc_r(i,j), "limb %d" % (j))
    print "	// compress and store r2-r5"
    print_ldr("r6","hh","reload cursor")
    print_ldr("r7","q","load q")
    print_ldr("r8","q32","load round(-2^32/q)")
    print "	br_32x2	r2, r3, r7, r8, r9"
    print "	br_32x2	r4, r5, r7, r8, r9"
    print "	str	r2, [r6], #4"
    print "	str	r4, [r6], #4"
    print_str("r6","hh","store cursor")
コード例 #7
0
def conv_3x64_7x32_store_end_i (i) : # store all 7 accumulators     
    global V
    print_ldr("r9","q","load q")
    print_ldr("r10","q32","load round(-2^32/q)")
    print "	// compress and store r2-r8"
    print "	br_32x2	r2, r3, r9, r10, r11"
    print "	br_32x2	r4, r5, r9, r10, r11"
    print "	br_32x2	r6, r7, r9, r10, r11"
    print "	br_32	r8, r9, r10, r11"
    print "	uxth	r8, r8"
    print_ldr("r11","hh","reload cursor")
    print "	str	r2, [r11], #4"
    print "	str	r4, [r11], #4"
    print "	str	r6, [r11], #4"
    print "	str	r8, [r11], #4"
    print_str("r11","hh","store cursor")
コード例 #8
0
ファイル: bitslice3_j32d.py プロジェクト: boyin/poly-m4
# Tail of the bs3_jump32divsteps generator: loop counter update, conditional
# shift, clean-up and function epilogue.
# Subtract v("1") from the counter v("ct"), compare it with 0.0 and copy the
# FP status flags into APSR so the conditional instructions below can use them.
print "	vsub.f32	%s, %s, %s" % (v("ct"),v("ct"),v("1"))
print "	vcmp.f32	%s, #0.0" % (v("ct"))
print "	vmrs	APSR_nzcv, FPSCR	// move c flag"
# The IT block covers exactly the four lsrcs instructions printed by the loop.
print "	itttt	cs	// u = xu, v = xv if ct >= 0"

for i in [u0,u1,v0,v1] :
    print "	lsrcs	%s, %s, #1" % (i,i)

#print_ldr("r12","Temp","reload output matrix ptr")
#print "	stm	r12!, {r0-r11}	// store results"
#print_str("r12","Temp","save output matrix ptr")

# Branch back to label ..._0 while carry is set; otherwise fall through to
# the clean-up code.
print "	bcs	bs3_jump32divsteps_0"
print "bs3_jump32divsteps_2:	// clean up"
# Bit-reverse each of the twelve working registers before storing them.
for i in [f0,f1,g0,g1,u0,u1,v0,v1,r0,r1,s0,s1] :
    print "	rbit	%s, %s" % (i,i)
print_ldr(X0,"M","reload output ptr for results")
# Python 2 print statement: the parenthesised expression is the format string
# and the % operator is applied before printing.  The 1 + 11 + 1 placeholders
# match the 13 arguments (X0 plus twelve registers).
print("	stm	%s,{"+"%s,"*11+"%s}") % (X0,f0,f1,g0,g1,u0,u1,v0,v1,r0,r1,s0,s1)
print "	vcvt.s32.f32	%s, %s	// back to int" % (v("D"),v("D"))
print_ldr("r0","D","restore delta")
# Epilogue: one variant restores lr and returns with bx, the other pops
# straight into pc.
print '''
#ifndef __thumb__
	pop	{r4-r11,lr}
	bx	lr
#else
	pop	{r4-r11,pc}
#endif'''



コード例 #9
0
def SCH_polymulNxNsh(N,C,rf,rg,rh) :
    global V, NV, r_f, r_g, r_h, r_N
    r_f = rf; r_g = rg; r_h = rh

    alloc_save_no("N",str(N))
    alloc_save_no("C",str(C))
    alloc_save_no("ff","sp+0")
    alloc_save_no("gg","sp+%d" % (2*N))
    alloc_save_no("q",str(q))

    print_str(rh,"h","save h")
    print_str(rh,"hh","save hh")
    print_str(rf,"f","save f")
    print_str(rg,"g","save g")
    print "	sub	sp, sp, #%d	// space for ff,gg" % (4*N)
    
    print_ldr("r0","ff","load ff")
    print_ldr("r12","N","load N")
    print "	bl	convert_2p19"
    print_ldr("r0","gg","load gg")
    print_ldr("r1","g","load g")
    #print "	mov	r0, r1"
    print_ldr("r12","N","load N")
    print "	bl	convert_2p19"
    
    print "sch2p19_0:			// increasing thread length"
    for i in range(0,N/4) : # i is thread count
        print "	add	r1, sp, #%d	// load gg+2*%d" % (2*N+8*i,i)
        print_ldr("r0","ff","load ff")
        for j in range(0,i-C+1,C) :
            add_block_initial (j,i-j)
            for k in range(1,C) :
            	add_block (j+k,i-j-k)
            conv_3x64_7x32_body(i)
            if (j == 0) :
                conv_3x64_7x32_acc_i(i)
            else :
                conv_3x64_7x32_acc(i)
            conv_3x64_7x32_store(i)
        j = (i // C) * C
        add_block_initial (j,i-j)
        for k in range(j+1,i+1) :
            add_block (k, i-k)
        conv_3x64_7x32_body(i)
        if (j == 0) :
            if (i == 0) :
                print "	asr	r8, r11, #6"
            else :
                conv_3x64_7x32_acc_i(i)
        else :
            conv_3x64_7x32_acc(i)
        conv_3x64_7x32_store_end(i)
    print "sch2p19_10:			// decreasing thread length"
    for i in range(N/4, N/2-1) :
        print "	add	r1, sp, #%d	// load gg+2*%d" % (4*N-8,N/4-1)
        print "	add	r0, sp, #%d	// load ff+2*%d" % (8*i-N*2+8,i-N/4+1)
        for j in range(i-N/4+1,N/4-C,C) :
            add_block_initial (j,i-j)
            for k in range(1,C) :
            	add_block (j+k,i-j-k)
            conv_3x64_7x32_body(i)
            if (j == i-N/4+1) :
                conv_3x64_7x32_acc_i(i)
            else :
                conv_3x64_7x32_acc(i)
            conv_3x64_7x32_store(i)
        j = ((N/2 - i - 2) // C) * C + i-N/4+1
        add_block_initial (j,i-j)
        for k in range(j+1,N/4) :
            add_block (k, i-k)
        conv_3x64_7x32_body(i)
        if (j == i-N/4+1) :
            conv_3x64_7x32_acc_i(i)
        else :
            conv_3x64_7x32_acc(i)
        if (i == N/2 -2) :
            conv_3x64_7x32_store_end_i(i)
        else :
            conv_3x64_7x32_store_end(i)
    print "sch2p19_20:			// mv hh back to h"
    print_ldr("r0","h","reload h")
    print_ldr("r1","f","reload f")
    print_ldr("r2","g","reload g")
    print
コード例 #10
0
def conv_3x64_7x32_acc (i) :
    global V
    print "	// accumulate to", [read_V(acc_r(i,j)) for j in range(7)]
    print_ldr("r10", acc_r(i,0), "limb 0")
    print_ldr("r12", acc_r(i,1), "limb 1")
    print_ldr("r14", acc_r(i,2), "limb 2")
    print_ldr("r9", acc_r(i,3), "limb 3")
    print "	add	r2, r2, r10"
    print "	add	r3, r3, r12"
    print "	add	r4, r4, r14"
    print "	add	r5, r5, r9"
    print_ldr("r10", acc_r(i,4), "limb 4")
    print_ldr("r12", acc_r(i,5), "limb 5")
    print_ldr("r8", acc_r(i,6), "limb 6")
    print "	add	r6, r6, r10"
    print "	add	r7, r7, r12"
    print "	add	r8, r8, r11, asr #6"
コード例 #11
0
ファイル: polymul_NxN-bs3.py プロジェクト: boyin/poly-m4
def KA_polymulNxN(N):
    # KA_head
    print("// N=%d requires %d storage" % (N, KA_terms(N, B)))
    HN = "bitslice3_mul%d" % (N)
    RN = "bs3_mul%d" % (N)
    aux = open(HN + ".h", "w+")
    aux.write("extern void " + RN + " (int32_t *h, int32_t *f, int32_t *g);\n")
    aux.close()

    print "	.p2align	2,,3"
    print "	.syntax		unified"
    print "	.text"
    N0 = N
    print "KA_%d:" % N
    while (N0 >= B):
        print "	.word	%d" % KA_terms(N, N0)
        N0 /= 2
    print "	.p2align	2,,3	"
    print "	.syntax		unified"
    print "	.text"
    #
    print "// void %s (int32_t *h, int32_t *f, int32_t *g);" % (RN)
    print "	.global %s" % (RN)
    print "	.type	%s, %%function" % (RN)
    print "%s:" % (RN)
    #
    M = (KA_terms(N, B))
    alloc_save_no("ff", "sp+0")
    alloc_save_no("gg", "sp+%d" % (M // 4))
    alloc_save_no("hh", "sp+%d" % (M // 2))
    alloc_save_no("M4", str(M // 4))
    #
    print "	push	{r4-r11,lr}"
    if (read_NV() > 16):
        print "	vpush	{s16-s31}"
    print "	sub	sp, sp, #%d	// subtract M" % (M)
    print "		// ff=[sp], gg=[sp,#%d], hh=[sp,#%d]" % (M // 4, M // 2)
    print_str("r0", "h", "save h")
    print_ldr("r3", "ff", "load ff pointer")
    print_ldr("r0", "gg", "load gg pointer")
    #
    if (N > 128):
        print "	mov	r14, #%d" % (N)
    print "KA%d_mv_loop:	// r0 = gg, r1 = f, r2 = g, r3 = ff" % N
    if (N == 64):
        print "	ldm	r1!, {r4-r7}"
        print "	ldm	r2!, {r8-r11}"
        print "	stm	r3!, {r4-r7}"
        print "	stm	r0!, {r8-r11}"
    else:  # N >= 128
        print "	ldm	r1!, {r4-r11}"
        print "	stm	r3!, {r4-r11}"
        print "	ldm	r2!, {r4-r11}"
        print "	stm	r0!, {r4-r11}"
        if (N > 128):
            print "	subs	r14, #128"
            print "	bne	KA%d_mv_loop" % N
    #
    print "KA%d_exp:	// ff @ sp, gg @ sp + M/4, M/4 @ r12" % N
    print_ldr("r0", "ff", "load ff")
    print_ldr("r1", "gg", "load gg")
    print "	ldr	r3, =KA_%d" % N
    print_str("r3", "ov", "save list of multiplication sizes pointer")
    print "	mov	r2, #%d		// N0/8 = r2 = N/8" % (N // 8)
    print "KA%d_exp_loop1:		// loop on N0(/8)" % N
    print "	cmp	r2, #%d		// while (N0>B)" % (B // 8)
    print "	beq	KA%d_exp_end1" % N
    # main assembly routine for KA_exp
    print "KA%d_exp_adds:" % N
    print "/*"
    print "  for (j=0; j<N1; j+=N0) {"
    print "    for (k=0; k<N0/2; k+=32) {"
    print "     add3(ff+(j+k+N1)/4,ff+(2*j+k)/4,ff+(2*j+k+N0/2)/4);"
    print "     add3(gg+(j+k+N1)/4,gg+(2*j+k)/4,gg+(2*j+k+N0/2)/4);"
    print "    }"
    print "*/"
    print "	ldr	r4, [r3], #4		// load N1=KA_terms(N,N0)"
    print "	add	r5, r0, r4, LSR #2	// r5 = ff + N1/4"
    print "	add	r6, r1, r4, LSR #2	// r6 = gg + N1/4"
    print "	add	r0, r0, r2		// r0 = ff + N0/8"
    print "	add	r1, r1, r2		// r1 = gg + N0/8"
    print "	rsb	r2, r2, #0		// r2 = -N0/8"
    print "	mov	r12, r2"
    print "KA%d_exp_adds1:" % N
    print "	ldr	r8, [r0, r2]"
    print "	ldr	r10, [r0], #4"
    print "	ldr	r9, [r0, r2]"
    print "	ldr	r11, [r0], #4"
    add_to_mod3_d("r8", "r9", "r10", "r11")
    print "	strd	r8, r9, [r5], #8"
    print "	ldr	r8, [r1, r2]"
    print "	ldr	r10, [r1], #4"
    print "	ldr	r9, [r1, r2]"
    print "	ldr	r11, [r1], #4"
    add_to_mod3_d("r8", "r9", "r10", "r11")
    print "	strd	r8, r9, [r6], #8"
    print "	subs	r4, r4, #64	// total of N1/64 pairs"
    print "	beq	KA%d_exp_end" % N
    print "	adds	r12, r12, #8	// from N0/8 each time 8"
    print "	ittt	eq		// divisible by N0/2?"
    print "	subeq	r0, r0, r2	// then add N0/8!"
    print "	subeq	r1, r1, r2	// then add N0/8!"
    print "	moveq	r12, r2		// reload with N0/8"
    print "	b	KA%d_exp_adds1" % N
    print "KA%d_exp_end:" % N
    print "	rsb	r2, r2, #0	// back to + N0/8"
    print_ldr("r0", "ff", "reload ff")
    print_ldr("r1", "gg", "reload gg")
    print
    print "	lsr	r2, #1 		// N0 /= 2"
    print "	b	KA%d_exp_loop1	// loop" % (N)
    print "KA%d_exp_end1:" % N
    print
    print "KA%d_mul:" % N
    N1 = KA_terms(N, B)
    #
    print_str("r3", "ov", "save N1 list pointer")
    print "	ldr	r3, [r3]	// r3 = N1"
    print_ldr("r2", "hh", "load r2 = hh")
    print "KA%d_muls1:" % (N)
    if (B == 32):
        print "	ldr	r4, [r0], #4"
        print "	ldr	r5, [r0], #4"
        print "	ldr	r6, [r1], #4"
        print "	ldr	r7, [r1], #4"
        mul32_mod3("r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
                   "r14", 0, "")
        print "	stm	r2!, {r8-r11}"
    print "	subs	r3, #32"
    print "	bne	KA%d_muls1" % (N)
    #
    # now hh = r2, everything else is disposable
    print "KA%d_collect:" % (N)
    print_ldr("r2", "hh", "reload hh")
    N0 = B
    while (N0 < N):
        N1 = KA_terms(N, 2 * N0)
        # hh has not left r2, ov has not left r3
        print "KA%d_col_%d_add:			// KA collection" % (N, N0)
        print_ldr("r3", "ov", "reload N1 list")  # probably in each loop?
        print "	ldr	r14, [r3, #-4]!	// N1"
        print_str("r3", "ov", "save N1 list")
        print "	add	r12, r2, r14, LSR #1	// points into array"
        print "	mov	r1, r2		// copy of hh"
        print "	mov	r11, #%d	// N0" % (N0)
        print "KA%d_col_%d_add1:	// beginning of KA collect" % (N, N0)
        print "	ldrd	r4, r5, [r1, #%d]" % (N0 // 4)
        print "	ldrd	r6, r7, [r1, #%d]" % (N0 // 2)
        sub_from_mod3_d("r4", "r5", "r6", "r7")
        print "	ldrd	r6, r7, [r1, #%d]" % (3 * N0 // 4)
        #print "	mov	r8, r4"
        #print "	mov	r9, r5"
        #add_to_mod3_d("r8","r9","r6","r7")
        add_to_mod3_dx("r8", "r9", "r4", "r5", "r6", "r7")
        print "	ldrd	r6, r7, [r1]"
        sub_from_mod3_d("r4", "r5", "r6", "r7")
        print "	ldrd	r6, r7, [r12, #%d]" % (N0 // 4)
        sub_from_mod3_d("r6", "r7", "r8", "r9")
        print "	ldrd	r8, r9, [r12], #8	// shift r12"
        print "	strd	r6, r7, [r1, #%d]" % (N0 // 2)
        add_to_mod3_d("r4", "r5", "r8", "r9")
        print "	strd	r4, r5, [r1, #%d]" % (N0 // 4)
        print "	add	r1, r1, #8"
        print "	subs	r14, r14, #64"
        print "	beq	KA%d_col_%d_end" % (N, N0)
        print "	subs	r11, r11, #32"
        print "	ittt	eq	// no, then next set"
        print "	addeq	r1, r1, #%d" % (3 * N0 // 4)
        print "	addeq	r12, r12, #%d" % (N0 // 4)
        print "	moveq	r11, #%d	// N0" % (N0)
        print "	b	KA%d_col_%d_add1" % (N, N0)
        print "KA%d_col_%d_end:" % (N, N0)
        #
        N0 *= 2
    # hh should still be at #2
    print "KA%d_mv_back:			// hh still =r2" % N
    print_ldr("r0", "h", "reload h")
    if (N > 64):
        print "	mov	r14, #%d" % (N)
    print "KA%d_mv_back_loop:" % N
    print "	ldm	r2!, {r4-r11}	// 4 pairs = 128 trits"
    print "	stm	r0!, {r4-r11}"
    if (N > 64):
        print "	subs	r14, #64"
        print "	bne	KA%d_mv_back_loop" % N
    #
    print "KA%d_end:" % N
    print "	add	sp, sp, #%d" % (M)
    if (read_NV() > 16):
        print "	vpop	{s16-s31}"
    print "	pop	{r4-r11,lr}"
    print "	bx	lr"
    print ""
コード例 #12
0
# Fragment of a 64-bit multiplication generator built from 32-bit products.
# The register loads preceding this first line are outside this excerpt.
print "	ldr	r7, [r2, #12]"
# Multiply and park the four result words r8-r11 in scratch slots "4".."7"
# (c1 = a1 b1, per the comment emitted below).
mul32_mod3("r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", 0,
           "")
print_str("r8", "4", "save c1l0")
print_str("r9", "5", "save c1l1")
print_str("r10", "6", "save c1h0")
print_str("r11", "7", "save c1h1")
print "	// c1 = a1 b1 in scratch 4-7"
# Load words 1 and 0 of each operand; the post-indexed loads also advance
# r1 and r2 by 16 bytes.
print "	ldr	r5, [r1, #4]"
print "	ldr	r4, [r1], #16"
print "	ldr	r7, [r2, #4]"
print "	ldr	r6, [r2], #16"
# Same multiplication helper, this time with the name "bs3_mul64_negc0" as
# its last argument (meaning defined by mul32_mod3, not visible here).
mul32_mod3("r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r14", 0,
           "bs3_mul64_negc0")
print "	// c0 = a0 b0 in r8-11, now compute (c0-c1 R)(1-R) + c01 R"
# Bring c1 back from the scratch slots into r4-r7.
print_ldr("r4", "4", "load c1l0")
print_ldr("r5", "5", "load c1l1")
print_ldr("r6", "6", "load c1h0")
print_ldr("r7", "7", "load c1h1")
add_to_mod3_d("r8", "r9", "r6", "r7")
sub_from_mod3_d("r10", "r11", "r4", "r5")
print "	// (c0-c1 R) in r8-11"
add_sub_mod3_d("r10", "r11", "r8", "r9", "r6", "r7")
print "	// (c0-c1 R)(1-R) in r10,r11,r8,r9"
# Fold in c01 from scratch slots "0".."3" (stored before this excerpt).
print_ldr("r4", "0", "load c01l0")
print_ldr("r5", "1", "load c01l1")
print_ldr("r6", "2", "load c01h0")
print_ldr("r7", "3", "load c01h1")
add_to_mod3_d("r8", "r9", "r4", "r5")
sub_from_mod3_d("r10", "r11", "r6", "r7")
# First result word; the remaining stores lie beyond this excerpt.
print "	str	r10, [r0], #4"
コード例 #13
0
def j4ds(NAME1, NAME2):
    global V, NV
    alloc_save_no("M1", "sp+0")
    alloc_save_no("M2", "sp+24")
    alloc_save_no("fg", "sp+48")
    alloc_save_no("M", "sp+64")
    alloc_save_no("pf", "sp+68")
    alloc_save_no("pg", "sp+72")
    print "	.p2align	2,,3	"
    print "	.syntax		unified"
    print "	.text"
    print "// void %s (int minusdelta, int *M, int *f, int *g);" % (NAME1)
    print "	.global %s" % (NAME1)
    print "	.type	%s, %%function" % (NAME1)
    print "%s:" % (NAME1)
    print "	push	{r1-r11,lr}"
    print_ldr("r12", "q", "load q")
    print "	movw	r1, #%d" % (q32inv % 65536)
    print "	movt	r1, #%d" % (65536 + q32inv // 65536)
    print_str("r1", "q32", "save q32inv")
    print "	sub	sp, sp, #64	// allocate 2x6+2x2 ints"
    print "	ldr	r2, [r2]	// f0"
    print "	ldr	r3, [r3]	// g0"
    print "	bl	jump2divsteps_sub"
    print_str("r0", "d", "store minusdelta")
    print "	stm	sp, {r2-r7}	// matrix 1 is at sp"
    print_ldr("r8", "pf[0]", "load f pointer")
    print_ldr("r9", "pg[0]", "load g pointer")
    print "	ldr	r8, [r8, #4]	// f1"
    print "	ldr	r9, [r9, #4]	// g1"
    print_ldr("r0", "fg", "load intermediate f,g pointer")
    print "	// compute [[x r4, x r5], [r6, r7]] * [r8, r9] + [r2, r3]"
    print "	bl	M2x2x2_2"
    print_ldr("r3", "fg[8]", "reload lower half of new g")
    print_ldr("r0", "d", "reload minusdelta")
    print "	bl	%s" % (NAME2)
    print_str("r0", "d", "store minusdelta")
    print_ldr("r14", "M2", "load matrix 2")
    print "	stm	r14, {r2-r7}"
    print_ldr("r8", "fg[4]", "reload top half of new f")
    print_ldr("r9", "fg[12]", "reload top half of new g")
    print_ldr("r0", "M[0]", "reload matrix pointer")
    print "	bl	M2x2x2_2"
    print "	// remains to multiply M1 by M2"
    print_ldr("r8", "M1[8]", "reload u1")
    print_ldr("r9", "M1[16]", "reload r1")
    print "	add	r0, r0, #16"
    print "	bl	M2x2x2s"
    print_ldr("r8", "M1[12]", "reload v1")
    print_ldr("r9", "M1[20]", "reload s1")
    print "	add	r0, r0, #8"
    print "	bl	M2x2x2s"
    print "	add	sp, sp, #76	// deallocate temp storage + pop r1-3"
    print_ldr("r0", "d", "reload minusdelta for return value")
    print "	pop	{r4-r11,pc}"