def testB(self):
    """Useless-instruction removal must keep only arg/branch/return plumbing.

    The `mov`/`add` chain feeds nothing that is live, so after liveness +
    cleanup only poparg/pusharg/ret/blt opcodes may survive.
    """
    code = io.StringIO(r"""
.fun main NORMAL [S32] = [S32 A64]
.bbl %start
    poparg argc:S32
    poparg argv:A64
    mov b:S32 1
    add a:S32 b 1
    add x:S32 a 1
    blt argc 2 if_1_true
    bra if_1_end
.bbl if_1_true
    pusharg 1:S32
    ret
.bbl if_1_end
    pusharg 0:S32
    ret
""")
    unit = serialize.UnitParseFromAsm(code)
    fun = unit.fun_syms["main"]
    optimize.FunCfgInit(fun, unit)
    liveness.FunComputeLivenessInfo(fun)
    liveness.FunRemoveUselessInstructions(fun)
    # print ("@@@@\n", "\n".join(serialize.FunRenderToAsm(fun)))
    surviving_opcodes = {o.POPARG, o.PUSHARG, o.RET, o.BLT}
    for bbl in fun.bbls:
        for ins in bbl.inss:
            self.assertTrue(ins.opcode in surviving_opcodes, f"bad ins {ins}")
def FunOptBasic(fun: ir.Fun, opt_stats: Dict[str, int], allow_conv_conversion: bool):
    """Run one round of machine-independent optimizations on `fun`.

    `opt_stats` accumulates per-pass transformation counts. The caller
    (e.g. UnitOptBasic) shares ONE dict across all functions of a unit,
    so every stat must be accumulated with `+=`.

    `allow_conv_conversion` is forwarded to constant folding to permit
    folding through `conv` instructions.
    """
    opt_stats["canonicalized"] += canonicalize.FunCanonicalize(fun)
    opt_stats["strength_red"] += lowering.FunStrengthReduction(fun)

    reaching_defs.FunComputeReachingDefs(fun)
    reaching_defs.FunCheckReachingDefs(fun)
    # BUGFIX: was `=`, which clobbered counts accumulated from previously
    # optimized functions in the shared opt_stats dict.
    opt_stats["reg_prop"] += reaching_defs.FunPropagateRegs(fun)
    opt_stats["const_prop"] += reaching_defs.FunPropagateConsts(fun)
    opt_stats["const_fold"] += reaching_defs.FunConstantFold(
        fun, allow_conv_conversion)

    # propagation/folding may expose new canonicalization opportunities
    opt_stats["canonicalized"] += canonicalize.FunCanonicalize(fun)
    opt_stats["strength_red"] += lowering.FunStrengthReduction(fun)
    opt_stats["ls_st_simplify"] += reaching_defs.FunLoadStoreSimplify(fun)
    opt_stats["move_elim"] += lowering.FunMoveElimination(fun)

    liveness.FunComputeLivenessInfo(fun)
    # BUGFIX: was `=` as well (see "reg_prop" above).
    opt_stats["useless"] += liveness.FunRemoveUselessInstructions(fun)

    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    opt_stats["dropped_regs"] += reg_stats.FunDropUnreferencedRegs(fun)
    opt_stats["separated_regs"] += reg_stats.FunSeparateLocalRegUsage(fun)
def testD(self):
    """Live ranges of a straight-line bbl with pre-allocated cpu regs.

    Expects one LiveRange per register (ordered by sort), including the
    BEFORE_BBL ranges for the incoming cpu-pinned values.
    """
    code = io.StringIO(r"""
.fun arm_syscall_write SIGNATURE [S32] = [S32 A32 U32]

.fun putchar NORMAL [] = [U32]

.fun writeln NORMAL [] = [A32 U32]
# live_out: ['r0', 'r1']
.reg S32 [$r0_S32 dummy]
.reg U32 [$r0_U32 $r1_U32 $r2_U32 len]
.reg A32 [$r0_A32 $r1_A32 buf]
.bbl start
    mov buf $r0_A32@r0                # 0
    mov len $r1_U32@r1                # 1
    mov $r2_U32@r2 len                # 2
    mov $r1_A32@r1 buf                # 3
    mov $r0_S32@r0 1                  # 4
    syscall arm_syscall_write 4:U32   # 5
    mov dummy $r0_S32@r0              # 6
    mov $r0_U32@r0 10                 # 7
    bsr putchar                       # 8
    ret                               # 9
""")
    cpu_regs = {name: ir.CpuReg(name, no)
                for no, name in enumerate(["r0", "r1", "r2"])}
    unit = serialize.UnitParseFromAsm(code, cpu_regs=cpu_regs)

    # declare the cpu-level calling convention of the callees
    syscall_fun = unit.fun_syms["arm_syscall_write"]
    syscall_fun.cpu_live_out = {cpu_regs["r0"]}
    syscall_fun.cpu_live_in = {cpu_regs["r0"], cpu_regs["r1"], cpu_regs["r2"]}
    unit.fun_syms["putchar"].cpu_live_in = {cpu_regs["r0"]}

    fun = unit.fun_syms["writeln"]
    cfg.FunSplitBbls(fun)
    cfg.FunInitCFG(fun)
    cfg.FunRemoveUnconditionalBranches(fun)
    cfg.FunRemoveEmptyBbls(fun)
    liveness.FunComputeLivenessInfo(fun)
    bbl = fun.bbls[0]
    ranges = liveness.BblGetLiveRanges(bbl, fun, bbl.live_out, False)
    ranges.sort()
    print("TestD")
    for lr in ranges:
        print(lr)
    syms = fun.reg_syms
    self.assertEqual(ranges, [
        liveness.LiveRange(liveness.BEFORE_BBL, 0, syms["$r0_A32"], 1),
        liveness.LiveRange(liveness.BEFORE_BBL, 1, syms["$r1_U32"], 1),
        liveness.LiveRange(0, 3, syms["buf"], 1),
        liveness.LiveRange(1, 2, syms["len"], 1),
        liveness.LiveRange(2, 5, syms["$r2_U32"], 0),
        liveness.LiveRange(3, 5, syms["$r1_A32"], 0),
        liveness.LiveRange(4, 5, syms["$r0_S32"], 0),
        liveness.LiveRange(5, 6, syms["$r0_S32"], 1),
        liveness.LiveRange(6, liveness.NO_USE, syms["dummy"], 0),
        liveness.LiveRange(7, 8, syms["$r0_U32"], 0),
    ])
def testBaseRegPropagation2(self):
    """Base-register propagation: every lea should be folded into its ld/st."""
    code = io.StringIO(r"""
.fun foo NORMAL [] = []
.reg S32 [x]
.reg U32 [y]
.reg A32 [a counter]
.bbl start
    poparg counter
    poparg y
    lea a counter 666
    ld x = a 0
    mul x = x 777
    st a 334 = x
    lea a counter y
    ld x = a 0
    mul x = x 777
    st a 0 = x
    lea a counter y
    ld x = a 0
    mul x = x 777
    st a 0 = x
    mov a counter
    ld x = a 0
    mul x = x 777
    st a 334 = x
    ret
""")
    unit = serialize.UnitParseFromAsm(code, False)
    fun = unit.fun_syms["foo"]
    bbl = fun.bbls[0]
    cfg.FunInitCFG(fun)
    liveness.FunComputeLivenessInfo(fun)
    reaching_defs.FunComputeReachingDefs(fun)
    reaching_defs.FunPropagateConsts(fun)
    reaching_defs.FunLoadStoreSimplify(fun)
    liveness.FunRemoveUselessInstructions(fun)
    print("\n".join(serialize.FunRenderToAsm(fun)))
    # all ld/st were re-written; no lea (or mov into the base reg) may remain
    allowed = {"ret", "mul", "poparg", "ld", "st"}
    for ins in bbl.inss:
        self.assertIn(ins.opcode.name, allowed)
def UnitOptBasic(unit: ir.Unit, dump_reg_stats) -> Dict[str, int]:
    """Optimize every NORMAL function in `unit` and return accumulated pass stats.

    When `dump_reg_stats` is truthy, recompute and print per-function
    register statistics after optimization.
    """
    opt_stats: Dict[str, int] = collections.defaultdict(int)
    for fun in unit.funs:
        if fun.kind is not o.FUN_KIND.NORMAL:
            continue
        FunOptBasic(fun, opt_stats, allow_conv_conversion=True)
        if not dump_reg_stats:
            continue
        # reg stats require fresh liveness info
        reg_stats.FunComputeRegStatsExceptLAC(fun)
        liveness.FunComputeLivenessInfo(fun)
        reg_stats.FunComputeRegStatsLAC(fun)
        rs = reg_stats.FunCalculateRegStats(fun)
        print(f"# {fun.name:30} RegStats: {rs}")
    return opt_stats
def testC(self):
    """A mov whose result is pushed in a later bbl must survive cleanup."""
    code = io.StringIO(r"""
.fun main NORMAL [S32] = []
.bbl %start
    mov %out:S32 3
    bra next
.bbl next
    pusharg %out
    ret
""")
    unit = serialize.UnitParseFromAsm(code)
    fun = unit.fun_syms["main"]
    optimize.FunCfgInit(fun, unit)
    liveness.FunComputeLivenessInfo(fun)
    # print ("@@@@\n", "\n".join(serialize.FunRenderToAsm(fun)))
    liveness.FunRemoveUselessInstructions(fun)
    # print ("@@@@\n", "\n".join(serialize.FunRenderToAsm(fun)))
    first, second = fun.bbls[0], fun.bbls[1]
    self.assertEqual(1, len(first.inss))
    self.assertEqual(2, len(second.inss))
def testE(self):
    """Live ranges for float regs with cpu-pinned inputs and outputs.

    Only prints the computed ranges (no assertions yet).
    """
    code = io.StringIO(r"""
.fun test NORMAL [F32 F32 F32 F32] = [F32 F32]
.reg F32 [a b add sub mul div $s0_F32 $s1_F32 $s2_F32 $s3_F32]
.bbl start
    mov a $s0_F32@s0
    mov b $s1_F32@s1
    add add a b
    sub sub a b
    mul mul a b
    div div a b
    mov $s3_F32@s3 div
    mov $s2_F32@s2 mul
    mov $s1_F32@s1 sub
    mov $s0_F32@s0 add
    ret
""")
    # BUGFIX: "s3" was created as ir.CpuReg("s3", 2), duplicating s2's
    # register number (copy-paste typo); each CpuReg needs a distinct index.
    cpu_regs = {
        "s0": ir.CpuReg("s0", 0),
        "s1": ir.CpuReg("s1", 1),
        "s2": ir.CpuReg("s2", 2),
        "s3": ir.CpuReg("s3", 3),
    }
    unit = serialize.UnitParseFromAsm(code, cpu_regs=cpu_regs)
    fun = unit.fun_syms["test"]
    fun.cpu_live_out = {
        cpu_regs["s0"], cpu_regs["s1"], cpu_regs["s2"], cpu_regs["s3"]
    }
    fun.cpu_live_in = {cpu_regs["s0"], cpu_regs["s1"]}
    cfg.FunSplitBblsAtTerminators(fun)
    cfg.FunInitCFG(fun)
    cfg.FunRemoveUnconditionalBranches(fun)
    cfg.FunRemoveEmptyBbls(fun)
    liveness.FunComputeLivenessInfo(fun)
    ranges = liveness.BblGetLiveRanges(fun.bbls[0], fun, fun.bbls[0].live_out)
    ranges.sort()
    print("TestE")
    for lr in ranges:
        print(lr)
def PhaseGlobalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """
    This phase introduces CpuRegs for globals and for situations where we have
    no choice which register to use, e.g. function parameters and results
    ("pre-allocated" regs).

    After this function has been run all globals will have a valid cpu_reg and
    we have to be careful to not introduce new globals subsequently.
    If not enough cpu_regs are available for all globals, some of them will be
    spilled. We err on the side of spilling more: the biggest danger is to
    over-allocate and then lack registers for intra-bbl register allocation.

    The whole global allocator is terrible and the decision which globals
    to spill is extremely simplistic at this time.

    We separate global from local register allocation so that we can use a
    straightforward linear scan allocator for the locals. This allocator
    assumes that each register is defined exactly once and hence does not
    work for globals.
    """
    # set to a file-like object to enable the "@@" diagnostics below
    debug = None
    if fout:
        print("#" * 60, file=fout)
        print(f"# GlobalRegAlloc {fun.name}", file=fout)
        print("#" * 60, file=fout)
    # print ("@@@@@@\n", "\n".join(serialize.FunRenderToAsm(fun)))

    # refresh reg usage info (LAC = live across call)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # NOTE(review): this comment predates the switch to
    # REG_KIND_TO_CPU_REG_FAMILY — presumably all non-float regs map to one
    # GPR family (the S64 remark refers to the old REG_KIND_MAP_ARM).
    local_reg_stats = reg_stats.FunComputeBblRegUsageStats(fun,
                                                           regs.REG_KIND_TO_CPU_REG_FAMILY)
    # we have introduced some cpu regs in previous phases - do not treat them as globals
    global_reg_stats = _FunGlobalRegStats(fun, regs.REG_KIND_TO_CPU_REG_FAMILY)
    DumpRegStats(fun, local_reg_stats, fout)

    # bitmask of GPR cpu regs already claimed in earlier phases
    pre_allocated_mask_gpr = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind == regs.CpuRegKind.GPR:
            pre_allocated_mask_gpr |= 1 << reg.cpu_reg.no

    # Handle GPR regs: stats are keyed by (kind, lac?) pairs
    needed_gpr = RegsNeeded(len(global_reg_stats[(regs.CpuRegKind.GPR, True)]),
                            len(global_reg_stats[(regs.CpuRegKind.GPR, False)]),
                            local_reg_stats.get((regs.CpuRegKind.GPR, True), 0),
                            local_reg_stats.get((regs.CpuRegKind.GPR, False), 0))
    if debug:
        print(f"@@ GPR NEEDED {needed_gpr.global_lac} {needed_gpr.global_not_lac} "
              f"{needed_gpr.local_lac} {needed_gpr.local_not_lac}", file=debug)
    # earmark register pools (bitmasks) for global lac/not-lac regs
    gpr_global_lac, gpr_global_not_lac = _GetRegPoolsForGlobals(
        needed_gpr, regs.GPR_REGS_MASK & regs.GPR_LAC_REGS_MASK,
        regs.GPR_REGS_MASK & ~regs.GPR_LAC_REGS_MASK, pre_allocated_mask_gpr)
    if debug:
        print(f"@@ GPR POOL {gpr_global_lac:x} {gpr_global_not_lac:x}", file=debug)

    # assign earmarked regs to globals; whatever does not fit gets spilled
    to_be_spilled: List[ir.Reg] = []
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, True)], gpr_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, False)],
        gpr_global_not_lac & ~regs.GPR_LAC_REGS_MASK,
        gpr_global_not_lac & regs.GPR_LAC_REGS_MASK)

    # Handle Float regs (same scheme as GPR above)
    pre_allocated_mask_flt = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind == regs.CpuRegKind.FLT:
            pre_allocated_mask_flt |= 1 << reg.cpu_reg.no
    needed_flt = RegsNeeded(len(global_reg_stats[(regs.CpuRegKind.FLT, True)]),
                            len(global_reg_stats[(regs.CpuRegKind.FLT, False)]),
                            local_reg_stats.get((regs.CpuRegKind.FLT, True), 0),
                            local_reg_stats.get((regs.CpuRegKind.FLT, False), 0))
    if debug:
        print(f"@@ FLT NEEDED {needed_flt.global_lac} {needed_flt.global_not_lac} "
              f"{needed_flt.local_lac} {needed_flt.local_not_lac}", file=debug)
    flt_global_lac, flt_global_not_lac = _GetRegPoolsForGlobals(
        needed_flt, regs.FLT_REGS_MASK & regs.FLT_LAC_REGS_MASK,
        regs.FLT_REGS_MASK & ~regs.FLT_LAC_REGS_MASK, pre_allocated_mask_flt)
    if debug:
        print(f"@@ FLT POOL {flt_global_lac:x} {flt_global_not_lac:x}", file=debug)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.FLT, True)], flt_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.FLT, False)],
        flt_global_not_lac & ~regs.FLT_LAC_REGS_MASK,
        flt_global_not_lac & regs.FLT_LAC_REGS_MASK)

    reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spilled, prefix="$gspill")

    # Recompute Everything (TODO: make this more selective to reduce work)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    reg_stats.FunSeparateLocalRegUsage(fun)
def testBaseRegPropagation1(self):
    """Base-register propagation for .mem/.stk bases: lea.mem/lea.stk fold away."""
    code = io.StringIO(r"""
.mem COUNTER 4 RW
.data 4 [0]

.fun foo NORMAL [] = []
.stk array 4 4000
.reg S32 [x]
.reg U32 [y]
.reg A32 [counter]
.bbl start
    lea.mem counter = COUNTER 0
    ld x = counter 0
    add x = x 1
    st counter 0 = x
    lea.mem counter = COUNTER 100
    ld x = counter 100
    add x = x 1
    st counter 300 = x
    mov y 666
    lea.mem counter = COUNTER 0
    ld x = counter y
    add x = x 1
    st counter y = x
    lea.stk counter = array 0
    ld x = counter 0
    add x = x 1
    st counter 0 = x
    lea.stk counter = array 100
    ld x = counter 100
    add x = x 1
    st counter 300 = x
    mov y 666
    lea.stk counter = array 0
    ld x = counter y
    add x = x 1
    st counter y = x
    ret
""")
    unit = serialize.UnitParseFromAsm(code, False)
    fun = unit.fun_syms["foo"]
    bbl = fun.bbls[0]
    cfg.FunInitCFG(fun)
    liveness.FunComputeLivenessInfo(fun)
    reaching_defs.FunComputeReachingDefs(fun)
    reaching_defs.FunPropagateConsts(fun)
    # reaching_defs.FunConstantFold(fun, True)
    reaching_defs.FunLoadStoreSimplify(fun)
    liveness.FunRemoveUselessInstructions(fun)
    print("\n".join(serialize.FunRenderToAsm(fun)))
    # all ld/st were re-written into their .mem/.stk flavors
    allowed = {"ret", "add", "ld.mem", "st.mem", "ld.stk", "st.stk"}
    for ins in bbl.inss:
        self.assertIn(ins.opcode.name, allowed)
def testD(self):
    """Check BblGetLiveRanges output including the synthesized use-ranges.

    BUGFIX: the final comparison loop called `zip()` with NO arguments;
    `zip()` yields nothing, so the element-wise assertions never executed
    and the test silently verified only the lengths. Fixed to
    `zip(ranges, expected)`.
    """
    code = io.StringIO(r"""
.fun arm_syscall_write SIGNATURE [S32] = [S32 A32 U32]

.fun putchar NORMAL [] = [U32]

.fun writeln NORMAL [] = [A32 U32]
# live_out: ['r0', 'r1']
.reg S32 [$r0_S32 dummy]
.reg U32 [$r0_U32 $r1_U32 $r2_U32 len]
.reg A32 [$r0_A32 $r1_A32 buf]
.bbl start
    mov buf $r0_A32@r0                # 0
    mov len $r1_U32@r1                # 1
    mov $r2_U32@r2 len                # 2
    mov $r1_A32@r1 buf                # 3
    mov $r0_S32@r0 1                  # 4
    syscall arm_syscall_write 4:U32   # 5
    mov dummy $r0_S32@r0              # 6
    mov $r0_U32@r0 10                 # 7
    bsr putchar                       # 8
    ret                               # 9
""")
    cpu_regs = {
        "r0": ir.CpuReg("r0", 0),
        "r1": ir.CpuReg("r1", 1),
        "r2": ir.CpuReg("r2", 2)
    }
    unit = serialize.UnitParseFromAsm(code, cpu_regs=cpu_regs)
    # declare the cpu-level calling convention of the callees
    fun = unit.fun_syms["arm_syscall_write"]
    fun.cpu_live_out = {cpu_regs["r0"]}
    fun.cpu_live_in = {cpu_regs["r0"], cpu_regs["r1"], cpu_regs["r2"]}
    fun = unit.fun_syms["putchar"]
    fun.cpu_live_in = {cpu_regs["r0"]}
    fun = unit.fun_syms["writeln"]
    cfg.FunSplitBblsAtTerminators(fun)
    cfg.FunInitCFG(fun)
    cfg.FunRemoveUnconditionalBranches(fun)
    cfg.FunRemoveEmptyBbls(fun)
    liveness.FunComputeLivenessInfo(fun)
    ranges = liveness.BblGetLiveRanges(fun.bbls[0], fun, fun.bbls[0].live_out)
    ranges.sort()
    print("TestD")
    for lr in ranges:
        print(lr)
    # named ranges that also appear in the `uses` lists below
    lr_r0 = liveness.LiveRange(liveness.BEFORE_BBL, 0, fun.reg_syms["$r0_A32"], 1)
    lr_r1 = liveness.LiveRange(liveness.BEFORE_BBL, 1, fun.reg_syms["$r1_U32"], 1)
    lr_buf = liveness.LiveRange(0, 3, fun.reg_syms["buf"], 1)
    lr_len = liveness.LiveRange(1, 2, fun.reg_syms["len"], 1)
    lr_r0_2 = liveness.LiveRange(5, 6, fun.reg_syms["$r0_S32"], 1)
    expected = [
        lr_r0,
        lr_r1,
        liveness.LiveRange(0, 0, reg=ir.REG_INVALID, num_uses=1, uses=[lr_r0]),
        lr_buf,
        liveness.LiveRange(1, 1, reg=ir.REG_INVALID, num_uses=1, uses=[lr_r1]),
        lr_len,
        liveness.LiveRange(2, 2, reg=ir.REG_INVALID, num_uses=1, uses=[lr_len]),
        liveness.LiveRange(2, 5, fun.reg_syms["$r2_U32"], 0),
        liveness.LiveRange(3, 3, reg=ir.REG_INVALID, num_uses=1, uses=[lr_buf]),
        liveness.LiveRange(3, 5, fun.reg_syms["$r1_A32"], 0),
        liveness.LiveRange(4, 5, fun.reg_syms["$r0_S32"], 0),
        lr_r0_2,
        liveness.LiveRange(6, 6, reg=ir.REG_INVALID, num_uses=1, uses=[lr_r0_2]),
        liveness.LiveRange(6, liveness.NO_USE, fun.reg_syms["dummy"], 0),
        liveness.LiveRange(7, 8, fun.reg_syms["$r0_U32"], 0),
    ]
    # self.assertSequenceEqual(ranges, expected)
    # this does not work because of the uses field
    self.assertEqual(len(ranges), len(expected))
    # BUGFIX: was `for a, b in zip():` — an empty iterator, so nothing ran
    for a, b in zip(ranges, expected):
        self.assertEqual(a, b)
def PhaseGlobalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """
    This phase introduces CpuRegs for globals and for situations where we have
    no choice which register to use, e.g. function parameters and results
    ("pre-allocated" regs).

    After this function has been run all globals will have a valid cpu_reg and
    we have to be careful to not introduce new globals subsequently.
    If not enough cpu_regs are available for all globals, some of them will be
    spilled.

    The whole global allocator is terrible and the decision which globals
    to spill is extremely simplistic at this time.

    We separate global from local register allocation so that we can use a
    straightforward linear scan allocator for the locals. This allocator
    assumes that each register is defined exactly once and hence does not
    work for globals.
    """
    if fout:
        print("#" * 60, file=fout)
        print(f"# GlobalRegAlloc {fun.name}", file=fout)
        print("#" * 60, file=fout)

    # replace pusharg/poparg with moves into/out of pre-allocated regs
    regs.FunPushargConversion(fun)
    regs.FunPopargConversion(fun)

    # refresh reg usage info (LAC = live across call)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # Note: REG_KIND_MAP_ARM maps all non-float registers to S32
    local_reg_stats = reg_stats.FunComputeBblRegUsageStats(fun, REG_KIND_MAP_ARM)
    # we have introduced some cpu regs in previous phases - do not treat them as globals
    global_reg_stats = _FunGlobalRegStats(fun, REG_KIND_MAP_ARM)
    DumpRegStats(fun, local_reg_stats, fout)
    pre_allocated: Set[ir.CpuReg] = {
        reg.cpu_reg for reg in fun.regs if reg.HasCpuReg()
    }

    # Handle GPR regs (stats keyed by (kind, lac?) pairs)
    needed_gpr = RegsNeeded(
        len(global_reg_stats[(o.DK.S32, True)]),
        len(global_reg_stats[(o.DK.S32, False)]),
        local_reg_stats.get((o.DK.S32, True), 0),
        # TODO: avoid fudge factor
        1 + local_reg_stats.get((o.DK.S32, False), 0))
    gpr_global_lac, gpr_global_not_lac = _GetRegPoolsForGlobals(
        needed_gpr, regs.GPR_CALLEE_SAVE_REGS.copy(),
        regs.GPR_NOT_LAC_REGS.copy(), pre_allocated)

    to_be_spilled: List[ir.Reg] = []
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.S32, True)], gpr_global_lac)
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.S32, False)], gpr_global_not_lac)

    # Handle Float regs: one F64 counts as two F32 slots
    needed_flt = RegsNeeded(
        len(global_reg_stats[(o.DK.F32, True)]) +
        2 * len(global_reg_stats[(o.DK.F64, True)]),
        # BUGFIX: the not-lac count read the (F64, True) bucket, i.e. the
        # lac globals were counted twice and the not-lac F64 globals never.
        len(global_reg_stats[(o.DK.F32, False)]) +
        2 * len(global_reg_stats[(o.DK.F64, False)]),
        local_reg_stats.get((o.DK.F32, True), 0) +
        2 * local_reg_stats.get((o.DK.F64, True), 0),
        # TODO: avoid fudge factor
        2 + local_reg_stats.get((o.DK.F32, False), 0) +
        2 * local_reg_stats.get((o.DK.F64, False), 0))
    flt_global_lac, flt_global_not_lac = _GetRegPoolsForGlobals(
        needed_flt, regs.FLT_CALLEE_SAVE_REGS.copy(),
        regs.FLT_PARAMETER_REGS.copy(), pre_allocated)

    # allocate F64 globals first so the (wider) regs come from a clean pool
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.F64, True)] + global_reg_stats[(o.DK.F32, True)],
        flt_global_lac)
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.F64, False)] + global_reg_stats[(o.DK.F32, False)],
        flt_global_not_lac)

    reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spilled)

    # Recompute Everything (TODO: make this more selective)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    reg_stats.FunSeparateLocalRegUsage(fun)
def PhaseLegalization(fun: ir.Fun, unit: ir.Unit, _opt_stats: Dict[str, int], fout):
    """
    Does a lot of the heavy lifting so that the instruction selector can
    remain simple and table driven:
    * lift almost all regs to 32bit width
    * rewrite Ins that cannot be expanded
    * rewrite immediates that cannot be expanded except stack offsets which
      are dealt with in another pass

    TODO: missing is a function to change the calling signature (sentence
    unfinished in the original).
    """
    # pin the cpu regs implied by the function's signature
    fun.cpu_live_in = regs.PushPopInterface.GetCpuRegsForInSignature(
        fun.input_types)
    fun.cpu_live_out = regs.PushPopInterface.GetCpuRegsForOutSignature(
        fun.output_types)
    if fun.kind is not o.FUN_KIND.NORMAL:
        return

    # Getting rid of the pusharg/poparg now relieves us from having to pay
    # attention to the invariant that pushargs/popargs must be adjacent.
    lowering.FunPushargConversion(fun, regs.PushPopInterface)
    lowering.FunPopargConversion(fun, regs.PushPopInterface)

    # We did not bother with this addressing mode
    # TODO: we likely can avoid this by adding more cases to isel_tab.py
    lowering.FunEliminateStkLoadStoreWithRegOffset(fun, base_kind=o.DK.A64,
                                                   offset_kind=o.DK.S32)

    # TODO: switch this to a WithRegOffset flavor
    lowering.FunEliminateMemLoadStore(fun, base_kind=o.DK.A64,
                                      offset_kind=o.DK.S32)

    lowering.FunEliminateCopySign(fun)
    # TODO: support a few special cases in the isel, e.g. cmpXX a 0, 1, x, y
    lowering.FunEliminateCmp(fun)

    canonicalize.FunCanonicalize(fun)
    # TODO: add a cfg linearization pass to improve control flow
    optimize.FunCfgExit(
        fun, unit)  # note: this may affect immediates as it flips branches

    # Handle most overflowing immediates.
    # This excludes immediates related to stack offsets which have not been determined yet
    _FunRewriteOutOfBoundsImmediates(fun, unit)

    # mul/div/rem need special treatment
    _FunRewriteDivRem(fun)

    _FunRewriteIntoAABForm(fun, unit)

    # Recompute Everything (TODO: make this more selective to reduce work)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    reg_stats.FunSeparateLocalRegUsage(
        fun
    )  # this has special hacks to avoid undoing _FunRewriteIntoAABForm()
    # DumpRegStats(fun, local_reg_stats)
    # if fun.name == "fibonacci": DumpFun("end of legal", fun)
    # if fun.name == "write_s": exit(1)
    sanity.FunCheck(fun, None)
def PhaseGlobalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """
    This phase introduces CpuRegs for globals and for situations where we have
    no choice which register to use, e.g. function parameters and results
    ("pre-allocated" regs).

    After this function has been run all globals will have a valid cpu_reg and
    we have to be careful to not introduce new globals subsequently.
    If not enough cpu_regs are available for all globals, some of them will be
    spilled. We err on the side of spilling more: the biggest danger is to
    over-allocate and then lack registers for intra-bbl register allocation.

    The whole global allocator is terrible and the decision which globals
    to spill is extremely simplistic at this time.

    We separate global from local register allocation so that we can use a
    straightforward linear scan allocator for the locals. This allocator
    assumes that each register is defined exactly once and hence does not
    work for globals.
    """
    if fout:
        print("#" * 60, file=fout)
        print(f"# GlobalRegAlloc {fun.name}", file=fout)
        print("#" * 60, file=fout)

    # pusharg/poparg conversion (moves into/out of the regs used for
    # argument/result passing) now happens in an earlier phase
    # regs.FunPushargConversion(fun)
    # regs.FunPopargConversion(fun)

    # refresh reg usage info (LAC = live across call)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # NOTE(review): stale comment — said "REG_KIND_MAP_ARM maps all non-float
    # to registers to S32" but the code uses REG_KIND_TO_CPU_KIND
    local_reg_stats = reg_stats.FunComputeBblRegUsageStats(
        fun, REG_KIND_TO_CPU_KIND)
    # we have introduced some cpu regs in previous phases - do not treat them as globals
    global_reg_stats = _FunGlobalRegStats(fun, REG_KIND_TO_CPU_KIND)
    DumpRegStats(fun, local_reg_stats, fout)

    # Handle GPR regs
    # bitmask of GPR cpu regs already claimed in earlier phases
    pre_allocated_mask_gpr = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind == regs.CpuRegKind.GPR:
            pre_allocated_mask_gpr |= regs.A32RegToAllocMask(reg.cpu_reg)

    # compute the number of regs needed if we indeed had unlimited regs
    needed_gpr = RegsNeeded(
        len(global_reg_stats[(regs.CpuRegKind.GPR, True)]),
        len(global_reg_stats[(regs.CpuRegKind.GPR, False)]),
        local_reg_stats.get((regs.CpuRegKind.GPR, True), 0),
        local_reg_stats.get((regs.CpuRegKind.GPR, False), 0))
    # earmark some regs for globals
    gpr_global_lac, gpr_global_not_lac = _GetRegPoolsForGlobals(
        needed_gpr, regs.GPR_REGS_MASK & regs.GPR_LAC_REGS_MASK,
        regs.GPR_REGS_MASK & ~regs.GPR_LAC_REGS_MASK, pre_allocated_mask_gpr)
    # assign the earmarked regs to some globals and spill the rest
    to_be_spilled: List[ir.Reg] = []
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, True)], gpr_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, False)],
        gpr_global_not_lac & ~regs.GPR_LAC_REGS_MASK,
        gpr_global_not_lac & regs.GPR_LAC_REGS_MASK)

    # Handle Float regs (everything that is not a GPR counts as float here;
    # a DBL occupies two FLT slots)
    pre_allocated_mask_flt = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind != regs.CpuRegKind.GPR:
            pre_allocated_mask_flt |= regs.A32RegToAllocMask(reg.cpu_reg)
    # repeat the same process as we did for GPR regs
    needed_flt = RegsNeeded(
        len(global_reg_stats[(regs.CpuRegKind.FLT, True)]) +
        2 * len(global_reg_stats[(regs.CpuRegKind.DBL, True)]),
        len(global_reg_stats[(regs.CpuRegKind.FLT, False)]) +
        2 * len(global_reg_stats[(regs.CpuRegKind.DBL, False)]),
        local_reg_stats.get(
            (regs.CpuRegKind.FLT, True), 0) + 2 * local_reg_stats.get(
                (regs.CpuRegKind.DBL, True), 0),
        local_reg_stats.get(
            (regs.CpuRegKind.FLT, False), 0) + 2 * local_reg_stats.get(
                (regs.CpuRegKind.DBL, False), 0))
    flt_global_lac, flt_global_not_lac = _GetRegPoolsForGlobals(
        needed_flt, regs.FLT_REGS_MASK & regs.FLT_LAC_REGS_MASK,
        regs.FLT_REGS_MASK & ~regs.FLT_LAC_REGS_MASK, pre_allocated_mask_flt)
    # allocate DBL globals before FLT globals
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.DBL, True)] +
        global_reg_stats[(regs.CpuRegKind.FLT, True)], flt_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.DBL, False)] +
        global_reg_stats[(regs.CpuRegKind.FLT, False)],
        flt_global_not_lac & ~regs.FLT_LAC_REGS_MASK,
        flt_global_not_lac & regs.FLT_LAC_REGS_MASK)

    reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spilled, prefix="$gspill")

    # Recompute Everything (TODO: make this more selective)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # establish per bbl SSA form by splitting liveranges
    reg_stats.FunSeparateLocalRegUsage(fun)