def FunOptBasic(fun: ir.Fun, opt_stats: Dict[str, int], allow_conv_conversion: bool):
    """Run the basic (per-function) optimization pipeline on `fun`.

    `opt_stats` is a shared per-unit accumulator (see UnitOptBasic which passes
    the same dict for every function), so every counter must be updated with
    `+=`.  `allow_conv_conversion` is forwarded to constant folding to control
    whether conv instructions may be folded.

    Fix: "reg_prop" and "useless" previously used plain `=`, overwriting the
    counts accumulated for earlier functions in the unit; they now accumulate
    like all the other counters.
    """
    opt_stats["canonicalized"] += canonicalize.FunCanonicalize(fun)
    opt_stats["strength_red"] += lowering.FunStrengthReduction(fun)
    # Reaching defs must be (re)computed before the propagation/folding passes below.
    reaching_defs.FunComputeReachingDefs(fun)
    reaching_defs.FunCheckReachingDefs(fun)
    opt_stats["reg_prop"] += reaching_defs.FunPropagateRegs(fun)
    opt_stats["const_prop"] += reaching_defs.FunPropagateConsts(fun)
    opt_stats["const_fold"] += reaching_defs.FunConstantFold(
        fun, allow_conv_conversion)
    # Propagation/folding may expose new canonicalization/strength-reduction
    # opportunities, so run both again.
    opt_stats["canonicalized"] += canonicalize.FunCanonicalize(fun)
    opt_stats["strength_red"] += lowering.FunStrengthReduction(fun)
    opt_stats["ls_st_simplify"] += reaching_defs.FunLoadStoreSimplify(fun)
    opt_stats["move_elim"] += lowering.FunMoveElimination(fun)
    # Liveness is required by the useless-instruction elimination.
    liveness.FunComputeLivenessInfo(fun)
    opt_stats["useless"] += liveness.FunRemoveUselessInstructions(fun)
    # Refresh register stats before dropping/splitting registers.
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    opt_stats["dropped_regs"] += reg_stats.FunDropUnreferencedRegs(fun)
    opt_stats["separated_regs"] += reg_stats.FunSeparateLocalRegUsage(fun)
def UnitOptBasic(unit: ir.Unit, dump_reg_stats) -> Dict[str, int]:
    """Run the basic optimization pipeline over every NORMAL function in `unit`.

    Returns the accumulated optimization statistics for the whole unit.
    If `dump_reg_stats` is truthy, also prints per-function register stats.
    """
    stats: Dict[str, int] = collections.defaultdict(int)
    for fun in unit.funs:
        if fun.kind is not o.FUN_KIND.NORMAL:
            continue
        FunOptBasic(fun, stats, allow_conv_conversion=True)
        if not dump_reg_stats:
            continue
        # Recompute the register stats so the dump reflects the optimized code.
        reg_stats.FunComputeRegStatsExceptLAC(fun)
        liveness.FunComputeLivenessInfo(fun)
        reg_stats.FunComputeRegStatsLAC(fun)
        summary = reg_stats.FunCalculateRegStats(fun)
        print(f"# {fun.name:30} RegStats: {summary}")
    return stats
def UnitOpt(unit: ir.Unit, dump_reg_stats) -> Dict[str, int]:
    """Run the full optimization pipeline over every NORMAL function in `unit`.

    Returns the accumulated optimization statistics. If `dump_reg_stats` is
    truthy, additionally prints per-function register stats including the
    per-bbl local lac/not-lac usage counts.
    """
    stats: Dict[str, int] = collections.defaultdict(int)
    for fun in unit.funs:
        if fun.kind is not o.FUN_KIND.NORMAL:
            continue
        FunOpt(fun, stats)
        if not dump_reg_stats:
            continue
        usage = reg_stats.FunComputeBblRegUsageStats(fun, REG_KIND_MAP_TYPICAL)
        # Split the per-bbl usage counts by the lac (live-across-call) flag.
        loc_lac = 0
        loc_not_lac = 0
        for (_kind, lac), count in usage.items():
            if lac:
                loc_lac += count
            else:
                loc_not_lac += count
        # NOTE(review): unlike UnitOptBasic, liveness is not recomputed before
        # FunComputeRegStatsLAC here — confirm this is intentional.
        reg_stats.FunComputeRegStatsExceptLAC(fun)
        reg_stats.FunComputeRegStatsLAC(fun)
        summary = reg_stats.FunCalculateRegStats(fun)
        print(
            f"# {fun.name:30} RegStats: {summary} {loc_lac:2}/{loc_not_lac:2}")
    return stats
def PhaseGlobalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """ This phase introduces CpuReg for globals and situations where we have no
    choice which register to use, e.g. function parameters and results
    ("pre-allocated" regs).

    After this function has been run all globals will have a valid cpu_reg and
    we have to be careful to not introduce new globals subsequently.

    If not enough cpu_regs are available for all globals, some of them will be spilled.
    We err on the side of spilling more, the biggest danger is to over-allocate
    and then lack registers for intra-bbl register allocation.

    The whole global allocator is terrible: the decision which globals
    to spill is extremely simplistic at this time.

    We separate global from local register allocation so that we can use a simple
    straight forward linear scan allocator for the locals. This allocator assumes that
    each register is defined exactly once and hence does not work for globals.
    """
    # Set to a file object to enable the "@@ ..." diagnostic prints below.
    debug = None
    if fout:
        print("#" * 60, file=fout)
        print(f"# GlobalRegAlloc {fun.name}", file=fout)
        print("#" * 60, file=fout)

    # print ("@@@@@@\n", "\n".join(serialize.FunRenderToAsm(fun)))
    # Refresh register/liveness stats so the usage numbers below are accurate.
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)

    # Note: the reg-kind map collapses registers into CPU register families
    # (GPR vs FLT) — TODO confirm: an older comment here referred to
    # REG_KIND_MAP_ARM mapping all non-float registers to S64.
    local_reg_stats = reg_stats.FunComputeBblRegUsageStats(fun,
                                                           regs.REG_KIND_TO_CPU_REG_FAMILY)
    # we have introduced some cpu regs in previous phases - do not treat them as globals
    global_reg_stats = _FunGlobalRegStats(fun, regs.REG_KIND_TO_CPU_REG_FAMILY)
    DumpRegStats(fun, local_reg_stats, fout)

    # Bitmask of GPR cpu regs already claimed by pre-allocated registers.
    pre_allocated_mask_gpr = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind == regs.CpuRegKind.GPR:
            pre_allocated_mask_gpr |= 1 << reg.cpu_reg.no

    # Handle GPR regs
    needed_gpr = RegsNeeded(len(global_reg_stats[(regs.CpuRegKind.GPR, True)]),
                            len(global_reg_stats[(regs.CpuRegKind.GPR, False)]),
                            local_reg_stats.get((regs.CpuRegKind.GPR, True), 0),
                            local_reg_stats.get((regs.CpuRegKind.GPR, False), 0))
    if debug:
        print(f"@@ GPR NEEDED {needed_gpr.global_lac} {needed_gpr.global_not_lac} "
              f"{needed_gpr.local_lac} {needed_gpr.local_not_lac}", file=debug)
    # Earmark register pools (as bitmasks) for lac and not-lac globals.
    gpr_global_lac, gpr_global_not_lac = _GetRegPoolsForGlobals(
        needed_gpr, regs.GPR_REGS_MASK & regs.GPR_LAC_REGS_MASK,
        regs.GPR_REGS_MASK & ~regs.GPR_LAC_REGS_MASK,
        pre_allocated_mask_gpr)
    if debug:
        print(f"@@ GPR POOL {gpr_global_lac:x} {gpr_global_not_lac:x}", file=debug)

    # Assign the earmarked cpu regs to globals; those that do not fit are
    # collected for spilling.
    to_be_spilled: List[ir.Reg] = []
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, True)], gpr_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, False)],
        gpr_global_not_lac & ~regs.GPR_LAC_REGS_MASK,
        gpr_global_not_lac & regs.GPR_LAC_REGS_MASK)

    # Handle Float regs — same process as for the GPRs above.
    pre_allocated_mask_flt = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind == regs.CpuRegKind.FLT:
            pre_allocated_mask_flt |= 1 << reg.cpu_reg.no

    needed_flt = RegsNeeded(len(global_reg_stats[(regs.CpuRegKind.FLT, True)]),
                            len(global_reg_stats[(regs.CpuRegKind.FLT, False)]),
                            local_reg_stats.get((regs.CpuRegKind.FLT, True), 0),
                            local_reg_stats.get((regs.CpuRegKind.FLT, False), 0))
    if debug:
        print(f"@@ FLT NEEDED {needed_flt.global_lac} {needed_flt.global_not_lac} "
              f"{needed_flt.local_lac} {needed_flt.local_not_lac}", file=debug)
    flt_global_lac, flt_global_not_lac = _GetRegPoolsForGlobals(
        needed_flt, regs.FLT_REGS_MASK & regs.FLT_LAC_REGS_MASK,
        regs.FLT_REGS_MASK & ~regs.FLT_LAC_REGS_MASK,
        pre_allocated_mask_flt)
    if debug:
        print(f"@@ FLT POOL {flt_global_lac:x} {flt_global_not_lac:x}", file=debug)

    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.FLT, True)], flt_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.FLT, False)],
        flt_global_not_lac & ~regs.FLT_LAC_REGS_MASK,
        flt_global_not_lac & regs.FLT_LAC_REGS_MASK)

    reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spilled, prefix="$gspill")

    # Recompute Everything (TODO: make this more selective to reduce work)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    reg_stats.FunSeparateLocalRegUsage(fun)
def PhaseGlobalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """ This phase introduces CpuReg for globals and situations where we have no
    choice which register to use, e.g. function parameters and results
    ("pre-allocated" regs).

    After this function has been run all globals will have a valid cpu_reg and
    we have to be careful to not introduce new globals subsequently.

    If not enough cpu_regs are available for all globals, some of them will be spilled.

    The whole global allocator is terrible: the decision which globals
    to spill is extremely simplistic at this time.

    We separate global from local register allocation so that we can use a simple
    straight forward linear scan allocator for the locals. This allocator assumes that
    each register is defined exactly once and hence does not work for globals.
    """
    if fout:
        print("#" * 60, file=fout)
        print(f"# GlobalRegAlloc {fun.name}", file=fout)
        print("#" * 60, file=fout)

    # Replace pusharg/poparg instructions with moves to/from pre-allocated regs.
    regs.FunPushargConversion(fun)
    regs.FunPopargConversion(fun)

    # Refresh register/liveness stats so the usage numbers below are accurate.
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # Note: REG_KIND_MAP_ARM maps all non-float registers to S32
    local_reg_stats = reg_stats.FunComputeBblRegUsageStats(
        fun, REG_KIND_MAP_ARM)
    # we have introduced some cpu regs in previous phases - do not treat them as globals
    global_reg_stats = _FunGlobalRegStats(fun, REG_KIND_MAP_ARM)
    DumpRegStats(fun, local_reg_stats, fout)
    # Cpu regs already claimed by pre-allocated registers.
    pre_allocated: Set[ir.CpuReg] = {
        reg.cpu_reg for reg in fun.regs if reg.HasCpuReg()
    }

    # Handle GPR regs
    needed_gpr = RegsNeeded(
        len(global_reg_stats[(o.DK.S32, True)]),
        len(global_reg_stats[(o.DK.S32, False)]),
        local_reg_stats.get((o.DK.S32, True), 0),
        # TODO: avoid fudge factor
        1 + local_reg_stats.get((o.DK.S32, False), 0))
    gpr_global_lac, gpr_global_not_lac = _GetRegPoolsForGlobals(
        needed_gpr, regs.GPR_CALLEE_SAVE_REGS.copy(),
        regs.GPR_NOT_LAC_REGS.copy(), pre_allocated)
    # Assign the earmarked cpu regs to globals; those that do not fit are
    # collected for spilling.
    to_be_spilled: List[ir.Reg] = []
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.S32, True)], gpr_global_lac)
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.S32, False)], gpr_global_not_lac)

    # Handle Float regs. F64 (double) regs occupy two F32 slots, hence the
    # factor of two on the F64 counts.
    needed_flt = RegsNeeded(
        len(global_reg_stats[(o.DK.F32, True)]) +
        2 * len(global_reg_stats[(o.DK.F64, True)]),
        # Fix: the not-lac count previously summed in the *lac* F64 bucket
        # (copy-paste slip); it now uses the not-lac (False) bucket, matching
        # the lac expression above and the GPR handling.
        len(global_reg_stats[(o.DK.F32, False)]) +
        2 * len(global_reg_stats[(o.DK.F64, False)]),
        local_reg_stats.get((o.DK.F32, True), 0) + 2 * local_reg_stats.get(
            (o.DK.F64, True), 0),
        # TODO: avoid fudge factor
        2 + local_reg_stats.get(
            (o.DK.F32, False), 0) + 2 * local_reg_stats.get(
            (o.DK.F64, False), 0))
    flt_global_lac, flt_global_not_lac = _GetRegPoolsForGlobals(
        needed_flt, regs.FLT_CALLEE_SAVE_REGS.copy(),
        regs.FLT_PARAMETER_REGS.copy(), pre_allocated)

    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.F64, True)] +
        global_reg_stats[(o.DK.F32, True)], flt_global_lac)
    to_be_spilled += _AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(o.DK.F64, False)] +
        global_reg_stats[(o.DK.F32, False)], flt_global_not_lac)

    reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spilled)

    # Recompute Everything (TODO: make this more selective)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    reg_stats.FunSeparateLocalRegUsage(fun)
def PhaseLegalization(fun: ir.Fun, unit: ir.Unit, _opt_stats: Dict[str, int], fout):
    """
    Does a lot of the heavy lifting so that the instruction selector can remain
    simple and table driven.
    * lift almost all regs to 32bit width
    * rewrite Ins that cannot be expanded
    * rewrite immediates that cannot be expanded except stack offsets which are dealt with in
      another pass

    TODO: missing is a function to change calling signature so that
    """
    # Record which cpu regs are live on entry/exit per the function signature.
    fun.cpu_live_in = regs.PushPopInterface.GetCpuRegsForInSignature(
        fun.input_types)
    fun.cpu_live_out = regs.PushPopInterface.GetCpuRegsForOutSignature(
        fun.output_types)
    if fun.kind is not o.FUN_KIND.NORMAL:
        return

    # Getting rid of the pusharg/poparg now relieves us from having to pay attention to the
    # invariant that pushargs/popargs must be adjacent.
    lowering.FunPushargConversion(fun, regs.PushPopInterface)
    lowering.FunPopargConversion(fun, regs.PushPopInterface)

    # We did not bother with this addressing mode
    # TODO: we likely can avoid this by adding more cases to isel_tab.py
    lowering.FunEliminateStkLoadStoreWithRegOffset(fun, base_kind=o.DK.A64,
                                                   offset_kind=o.DK.S32)

    # TODO: switch this to a WithRegOffset flavor
    lowering.FunEliminateMemLoadStore(fun, base_kind=o.DK.A64,
                                      offset_kind=o.DK.S32)

    lowering.FunEliminateCopySign(fun)
    # TODO: support a few special cases in the isel, e.g. cmpXX a 0, 1, x, y
    lowering.FunEliminateCmp(fun)

    canonicalize.FunCanonicalize(fun)
    # TODO: add a cfg linearization pass to improve control flow
    optimize.FunCfgExit(
        fun, unit)  # note: this may affect immediates as it flips branches

    # Handle most overflowing immediates.
    # This excludes immediates related to stack offsets which have not been determined yet
    _FunRewriteOutOfBoundsImmediates(fun, unit)

    # mul/div/rem need special treatment
    _FunRewriteDivRem(fun)

    _FunRewriteIntoAABForm(fun, unit)

    # Recompute Everything (TODO: make this more selective to reduce work)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    reg_stats.FunSeparateLocalRegUsage(
        fun)  # this has special hacks to avoid undoing _FunRewriteIntoAABForm()
    # DumpRegStats(fun, local_reg_stats)
    # if fun.name == "fibonacci": DumpFun("end of legal", fun)
    # if fun.name == "write_s": exit(1)
    sanity.FunCheck(fun, None)
def PhaseGlobalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """ This phase introduces CpuReg for globals and situations where we have no
    choice which register to use, e.g. function parameters and results
    ("pre-allocated" regs).

    After this function has been run all globals will have a valid cpu_reg and
    we have to be careful to not introduce new globals subsequently.

    If not enough cpu_regs are available for all globals, some of them will be spilled.
    We err on the side of spilling more, the biggest danger is to over-allocate
    and then lack registers for intra-bbl register allocation.

    The whole global allocator is terrible: the decision which globals
    to spill is extremely simplistic at this time.

    We separate global from local register allocation so that we can use a simple
    straight forward linear scan allocator for the locals. This allocator assumes that
    each register is defined exactly once and hence does not work for globals.
    """
    if fout:
        print("#" * 60, file=fout)
        print(f"# GlobalRegAlloc {fun.name}", file=fout)
        print("#" * 60, file=fout)

    # Replace pusharg/poparg instructions with moves.
    # The moves will use pre-allocated regs (the ones used for argument/result passing).
    # regs.FunPushargConversion(fun)
    # regs.FunPopargConversion(fun)

    # Refresh register/liveness stats so the usage numbers below are accurate.
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # Note: the reg-kind map collapses registers into CPU register kinds
    # (GPR/FLT/DBL) — TODO confirm: an older comment here referred to
    # REG_KIND_MAP_ARM mapping all non-float registers to S32.
    local_reg_stats = reg_stats.FunComputeBblRegUsageStats(
        fun, REG_KIND_TO_CPU_KIND)
    # we have introduced some cpu regs in previous phases - do not treat them as globals
    global_reg_stats = _FunGlobalRegStats(fun, REG_KIND_TO_CPU_KIND)
    DumpRegStats(fun, local_reg_stats, fout)

    # Handle GPR regs
    # Bitmask of GPR cpu regs already claimed by pre-allocated registers.
    pre_allocated_mask_gpr = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind == regs.CpuRegKind.GPR:
            pre_allocated_mask_gpr |= regs.A32RegToAllocMask(reg.cpu_reg)

    # compute the number of regs needed if we had indeed unlimited regs
    needed_gpr = RegsNeeded(
        len(global_reg_stats[(regs.CpuRegKind.GPR, True)]),
        len(global_reg_stats[(regs.CpuRegKind.GPR, False)]),
        local_reg_stats.get((regs.CpuRegKind.GPR, True), 0),
        local_reg_stats.get((regs.CpuRegKind.GPR, False), 0))
    # earmark some regs for globals
    gpr_global_lac, gpr_global_not_lac = _GetRegPoolsForGlobals(
        needed_gpr, regs.GPR_REGS_MASK & regs.GPR_LAC_REGS_MASK,
        regs.GPR_REGS_MASK & ~regs.GPR_LAC_REGS_MASK, pre_allocated_mask_gpr)
    # assign the earmarked regs to some globals and spill the rest
    to_be_spilled: List[ir.Reg] = []
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, True)], gpr_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.GPR, False)],
        gpr_global_not_lac & ~regs.GPR_LAC_REGS_MASK,
        gpr_global_not_lac & regs.GPR_LAC_REGS_MASK)

    # Handle Float regs
    pre_allocated_mask_flt = 0
    for reg in fun.regs:
        if reg.HasCpuReg() and reg.cpu_reg.kind != regs.CpuRegKind.GPR:
            pre_allocated_mask_flt |= regs.A32RegToAllocMask(reg.cpu_reg)

    # repeat the same process as we did for GPR regs
    # (DBL regs occupy two FLT slots, hence the factor of two on the DBL counts)
    needed_flt = RegsNeeded(
        len(global_reg_stats[(regs.CpuRegKind.FLT, True)]) +
        2 * len(global_reg_stats[(regs.CpuRegKind.DBL, True)]),
        len(global_reg_stats[(regs.CpuRegKind.FLT, False)]) +
        2 * len(global_reg_stats[(regs.CpuRegKind.DBL, False)]),
        local_reg_stats.get(
            (regs.CpuRegKind.FLT, True), 0) + 2 * local_reg_stats.get(
            (regs.CpuRegKind.DBL, True), 0),
        local_reg_stats.get(
            (regs.CpuRegKind.FLT, False), 0) + 2 * local_reg_stats.get(
            (regs.CpuRegKind.DBL, False), 0))
    flt_global_lac, flt_global_not_lac = _GetRegPoolsForGlobals(
        needed_flt, regs.FLT_REGS_MASK & regs.FLT_LAC_REGS_MASK,
        regs.FLT_REGS_MASK & ~regs.FLT_LAC_REGS_MASK, pre_allocated_mask_flt)

    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.DBL, True)] +
        global_reg_stats[(regs.CpuRegKind.FLT, True)], flt_global_lac, 0)
    to_be_spilled += regs.AssignCpuRegOrMarkForSpilling(
        global_reg_stats[(regs.CpuRegKind.DBL, False)] +
        global_reg_stats[(regs.CpuRegKind.FLT, False)],
        flt_global_not_lac & ~regs.FLT_LAC_REGS_MASK,
        flt_global_not_lac & regs.FLT_LAC_REGS_MASK)

    reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spilled, prefix="$gspill")

    # Recompute Everything (TODO: make this more selective)
    reg_stats.FunComputeRegStatsExceptLAC(fun)
    reg_stats.FunDropUnreferencedRegs(fun)
    liveness.FunComputeLivenessInfo(fun)
    reg_stats.FunComputeRegStatsLAC(fun)
    # establish per bbl SSA form by splitting liveranges
    reg_stats.FunSeparateLocalRegUsage(fun)