def _InsEliminateCopySign(ins: ir.Ins, fun: ir.Fun) -> Optional[List[ir.Ins]]:
    """Expand a `copysign` instruction into integer bit operations.

    For `z = copysign a b` the emitted sequence is:

        aa = bitcast(a) & 0x7f...f    (everything except the sign bit of a)
        bb = bitcast(b) & 0x80...0    (only the sign bit of b)
        z  = bitcast(aa | bb)

    Returns the replacement instructions, or None if `ins` is not a
    COPYSIGN.
    """
    if ins.opcode is not o.COPYSIGN:
        return None
    operands = ins.operands
    dst = operands[0]
    # An F32 destination is processed with 32 bit integers, every other
    # kind with 64 bit integers.
    if dst.kind == o.DK.F32:
        int_kind, sign_bit = o.DK.U32, 1 << 31
    else:
        int_kind, sign_bit = o.DK.U64, 1 << 63
    magnitude_bits = sign_bit - 1

    magnitude = fun.GetScratchReg(int_kind, "elim_copysign1", False)
    result = [
        ir.Ins(o.BITCAST, [magnitude, operands[1]]),
        ir.Ins(o.AND,
               [magnitude, magnitude, ir.Const(int_kind, magnitude_bits)]),
    ]
    sign = fun.GetScratchReg(int_kind, "elim_copysign2", False)
    result += [
        ir.Ins(o.BITCAST, [sign, operands[2]]),
        ir.Ins(o.AND, [sign, sign, ir.Const(int_kind, sign_bit)]),
        # merge magnitude and sign, then move the bits back to the float reg
        ir.Ins(o.OR, [magnitude, magnitude, sign]),
        ir.Ins(o.BITCAST, [dst, magnitude]),
    ]
    return result
def _InsEliminateRem(ins: ir.Ins, fun: ir.Fun) -> Optional[List[ir.Ins]]:
    """Replace a REM instruction by a div/mul/sub sequence.

    `z = a % b` is rewritten as

        q = a // b      (followed by a TRUNC of q for float kinds)
        p = q * b
        z = a - p

    Returns the replacement instructions, or None if `ins` is not a REM.

    TODO: double check that this works out for corner-cases
    """
    if ins.opcode is not o.REM:
        return None
    dst, dividend, divisor = ins.operands[0], ins.operands[1], ins.operands[2]
    kind = dst.kind

    quotient = fun.GetScratchReg(kind, "elim_rem1", True)
    replacement = [ir.Ins(o.DIV, [quotient, dividend, divisor])]
    # NOTE: this implementation for floating mod may have precision issues.
    if kind.flavor() is o.DK_FLAVOR_F:
        truncated = fun.GetScratchReg(kind, "elim_rem3", True)
        replacement.append(ir.Ins(o.TRUNC, [truncated, quotient]))
        quotient = truncated
    product = fun.GetScratchReg(kind, "elim_rem2", True)
    replacement.append(ir.Ins(o.MUL, [product, quotient, divisor]))
    replacement.append(ir.Ins(o.SUB, [dst, dividend, product]))
    return replacement
def _InsEliminateMemLoadStore(ins: ir.Ins, fun: ir.Fun, base_kind: o.DK,
                              offset_kind: o.DK) -> Optional[List[ir.Ins]]:
    """Turn `st.mem`/`ld.mem` into a `lea.mem` followed by a plain `st`/`ld`.

    This rewrite is usually applied as a prep step by some backends to get
    rid of Mem operands. It allows the register allocator to see the scratch
    register but it obscures the fact that a ld/st is from a static location.

    A constant offset is folded into the `lea.mem` and the ld/st then uses a
    zero offset; a non-constant offset stays with the ld/st.

    Note: this function may add local registers, which does not affect
    liveness or use-def chains.

    Returns `[lea, ins]` (with `ins` re-initialized in place), or None if
    `ins` is neither ST_MEM nor LD_MEM.
    """
    opc = ins.opcode
    if opc is not o.ST_MEM and opc is not o.LD_MEM:
        return None
    ops = ins.operands
    is_store = opc is o.ST_MEM
    if is_store:
        mem, access_offset = ops[0], ops[1]
    else:
        mem, access_offset = ops[1], ops[2]

    lea_offset = ir.Const(offset_kind, 0)
    if isinstance(access_offset, ir.Const):
        # constant offsets move to the lea, the access keeps the zero
        access_offset, lea_offset = lea_offset, access_offset
    base = fun.GetScratchReg(base_kind, "base", False)
    # TODO: should the Zero Offset stay with the ld op?
    lea = ir.Ins(o.LEA_MEM, [base, mem, lea_offset])
    if is_store:
        ins.Init(o.ST, [base, access_offset, ops[2]])
    else:
        ins.Init(o.LD, [ops[0], base, access_offset])
    return [lea, ins]
def PhaseLegalization(fun: ir.Fun, unit: ir.Unit, _opt_stats: Dict[str, int], fout):
    """Legalize `fun` so that instruction selection can stay table driven.

    Does a lot of the heavy lifting:
    * lift almost all regs to 32bit width
    * rewrite Ins that cannot be expanded
    * rewrite immediates that cannot be expanded, except stack offsets
      which are dealt with in another pass

    Functions whose kind is not NORMAL only get the register widening and
    the cpu_live_in/cpu_live_out assignment.

    TODO: missing is a function to change calling signature
    (the original note ends here).
    """
    # (narrow kind, widened kind), applied in exactly this order
    widenings = (
        (o.DK.U8, o.DK.U32),
        (o.DK.S8, o.DK.S32),
        (o.DK.S16, o.DK.S32),
        (o.DK.U16, o.DK.U32),
    )
    for narrow_kind, wide_kind in widenings:
        lowering.FunRegWidthWidening(fun, narrow_kind, wide_kind)

    fun.cpu_live_in = regs.GetCpuRegsForSignature(fun.input_types)
    fun.cpu_live_out = regs.GetCpuRegsForSignature(fun.output_types)
    if fun.kind is not o.FUN_KIND.NORMAL:
        return

    # ARM has no mod instruction
    lowering.FunEliminateRem(fun)

    # ARM has no support for these addressing modes
    lowering.FunEliminateStkLoadStoreWithRegOffset(
        fun, base_kind=o.DK.A32, offset_kind=o.DK.S32)

    # No floating point immediates
    for flt_kind in (o.DK.F32, o.DK.F64):
        lowering.FunMoveImmediatesToMemory(fun, unit, flt_kind)
    # also handles ld_mem from two transformations above
    lowering.FunEliminateMemLoadStore(
        fun, base_kind=o.DK.A32, offset_kind=o.DK.S32)

    canonicalize.FunCanonicalize(fun)
    # TODO: add a cfg linearization pass to improve control flow
    # note: this may affect immediates as it flips branches
    optimize.FunCfgExit(fun, unit)

    # Handle most overflowing immediates.
    # This excludes immediates related to stack offsets which have not
    # been determined yet
    lowering.FunEliminateImmediateStores(fun)  # handles st_stk immediates
    _FunRewriteOutOfBoundsImmediates(fun)

    # hack: some of the code expansion templates need a scratch reg
    # we do not want to reserve registers for this globally, so instead
    # we inject some nop instructions that reserve a register that we
    # use as a scratch for the instruction immediately following the nop
    isel_tab.FunAddNop1ForCodeSel(fun)
    sanity.FunCheck(fun, None)
def _GetRegOrConstOperand(fun: ir.Fun, last_kind: o.DK, ok: o.OP_KIND, tc: o.TC,
                          token: str, regs_cpu: Dict[str, ir.Reg]) -> Any:
    """Parse `token` into a register or a constant operand.

    Args:
        fun: function whose registers are looked up / extended.
        last_kind: kind of the previous operand, used for the type
            constraint check and for SAME_AS_PREV constants.
        ok: operand kind; REG_OR_CONST is resolved via
            `parse.IsLikelyConst(token)`.
        tc: type constraint for this operand.
        token: textual operand. Registers have the form
            `name[:kind][@cpu_reg]` where the cpu_reg `STK` selects
            `ir.StackSlot(0)`. Constants have the form `value[:kind]`.
        regs_cpu: cpu register lookup table keyed by name.

    Returns:
        An `ir.Reg` for register operands, otherwise the parsed constant.

    Raises:
        AssertionError: for unknown cpu registers, violated type
            constraints, conflicting cpu register assignments, negative
            UINT constants, or constants whose type cannot be deduced.
    """
    if ok == o.OP_KIND.REG_OR_CONST:
        ok = o.OP_KIND.CONST if parse.IsLikelyConst(token) else o.OP_KIND.REG

    if ok is o.OP_KIND.REG:
        cpu_reg = None
        pos = token.find("@")
        if pos > 0:
            cpu_reg_name = token[pos + 1:]
            token = token[:pos]
            if cpu_reg_name == "STK":
                cpu_reg = ir.StackSlot(0)
            else:
                cpu_reg = regs_cpu.get(cpu_reg_name)
                # `token` was already truncated above, so the message must
                # use the saved name rather than re-slicing `token`.
                assert cpu_reg is not None, (
                    f"unknown cpu_reg {cpu_reg_name} known regs {regs_cpu.keys()}")
        pos = token.find(":")
        if pos < 0:
            # no kind suffix: the register must already exist
            reg = fun.GetReg(token)
        else:
            kind = token[pos + 1:]
            reg_name = token[:pos]
            reg = ir.Reg(reg_name, o.SHORT_STR_TO_RK.get(kind))
            fun.AddReg(reg)
        assert o.CheckTypeConstraint(last_kind, tc, reg.kind)
        if cpu_reg:
            if reg.cpu_reg:
                assert reg.cpu_reg == cpu_reg
            else:
                reg.cpu_reg = cpu_reg
        return reg
    else:
        pos = token.find(":")
        if pos >= 0:
            # explicit kind suffix wins over the type constraint
            kind = token[pos + 1:]
            value_str = token[:pos]
            const = ir.ParseConst(value_str, o.SHORT_STR_TO_RK.get(kind))
            return const
        elif tc == o.TC.SAME_AS_PREV:
            const = ir.ParseConst(token, last_kind)
            return const
        elif tc == o.TC.OFFSET:
            const = ir.ParseOffsetConst(token)
            return const
        elif tc == o.TC.UINT:
            assert token[0] != "-"
            const = ir.ParseOffsetConst(token)
            return const
        else:
            assert False, f"cannot deduce type for const {token} [{tc}]"
def PhaseLegalization(fun: ir.Fun, unit: ir.Unit, _opt_stats: Dict[str, int], fout):
    """Legalize `fun` so that instruction selection can stay table driven.

    Does a lot of the heavy lifting:
    * lift almost all regs to 32bit width
    * rewrite Ins that cannot be expanded
    * rewrite immediates that cannot be expanded, except stack offsets
      which are dealt with in another pass

    Functions whose kind is not NORMAL only get the register widening and
    the cpu_live_in/cpu_live_out assignment.

    TODO: missing is a function to change calling signature
    (the original note ends here).
    """
    # (narrow kind, widened kind), applied in exactly this order
    widenings = (
        (o.DK.U8, o.DK.U32),
        (o.DK.S8, o.DK.S32),
        (o.DK.S16, o.DK.S32),
        (o.DK.U16, o.DK.U32),
    )
    for narrow_kind, wide_kind in widenings:
        lowering.FunRegWidthWidening(fun, narrow_kind, wide_kind)

    push_pop = regs.PushPopInterface
    fun.cpu_live_in = push_pop.GetCpuRegsForInSignature(fun.input_types)
    fun.cpu_live_out = push_pop.GetCpuRegsForOutSignature(fun.output_types)
    if fun.kind is not o.FUN_KIND.NORMAL:
        return

    # Getting rid of the pusharg/poparg now relieves us from having to pay
    # attention to the invariant that pushargs/popargs must be adjacent.
    lowering.FunPushargConversion(fun, regs.PushPopInterface)
    lowering.FunPopargConversion(fun, regs.PushPopInterface)

    # ARM has no mod instruction
    lowering.FunEliminateRem(fun)

    # A64 has no support for these addressing modes
    lowering.FunEliminateStkLoadStoreWithRegOffset(
        fun, base_kind=o.DK.A64, offset_kind=o.DK.S32)

    # we cannot load/store directly from mem so expand the instruction to
    # simpler sequences
    lowering.FunEliminateMemLoadStore(
        fun, base_kind=o.DK.A64, offset_kind=o.DK.S32)

    canonicalize.FunCanonicalize(fun)
    # TODO: add a cfg linearization pass to improve control flow
    # note: this may affect immediates as it flips branches
    optimize.FunCfgExit(fun, unit)

    # Handle most overflowing immediates.
    # This excludes immediates related to stack offsets which have not
    # been determined yet
    _FunRewriteOutOfBoundsImmediates(fun, unit)

    sanity.FunCheck(fun, None)
def _InsLimitShiftAmounts(ins: ir.Ins, fun: ir.Fun, width: int) -> Optional[List[ir.Ins]]:
    """Force the shift amount of `width`-bit SHL/SHR into the range [0, width).

    Only SHL and SHR instructions whose destination kind has a bitwidth of
    `width` are considered:
    * constant amount already in range: nothing to do
    * constant amount out of range: replaced in place by `amount % width`
    * non-constant amount: masked with `width - 1` into a scratch register
      ahead of the shift (this matches `% width` only when `width` is a
      power of two — presumably always the case; confirm with callers)

    Returns:
        The replacement instruction list, or None if `ins` is left alone.
    """
    opc = ins.opcode
    ops = ins.operands
    if (opc is not o.SHL and opc is not o.SHR) or ops[0].kind.bitwidth() != width:
        return None
    amount = ops[2]
    if isinstance(amount, ir.Const):
        if 0 <= amount.value < width:
            return None
        ops[2] = ir.Const(amount.kind, amount.value % width)
        # The declared result is a list of instructions; returning the bare
        # `ins` here would break consumers that iterate over the result.
        return [ins]
    tmp = fun.GetScratchReg(amount.kind, "shift", False)
    mask = ir.Ins(o.AND, [tmp, amount, ir.Const(amount.kind, width - 1)])
    ins.Init(opc, [ops[0], ops[1], tmp])
    return [mask, ins]
def FunRemoveEmptyBbls(fun: ir.Fun) -> int:
    """Drop bbls without instructions and re-route their predecessors.

    An empty bbl whose first out-edge points back to itself is kept, since
    it represents an infinite loop. Every other empty bbl must have exactly
    one successor; all of its predecessors are patched to target that
    successor instead.

    Returns the number of bbls that were removed.
    """
    keep = []
    for bbl in fun.bbls:
        # non-empty bbls and empty self-loops (infinite loop) survive
        if bbl.inss or bbl.edge_out[0] == bbl:
            keep.append(bbl)
            continue

        del fun.bbl_syms[bbl.name]
        assert len(bbl.edge_out) == 1, bbl
        succ = bbl.edge_out[0]
        bbl.DelEdgeOut(succ)
        # The edge lists are updated destructively below, so work on a
        # snapshot of the predecessor names - this also guarantees that
        # every predecessor is handled only once.
        for pred_name in {pred.name for pred in bbl.edge_in}:
            pred = fun.bbl_syms[pred_name]
            if pred.inss:
                # patch ins/jtb
                InsMaybePatchNewSuccessor(pred.inss[-1], bbl, succ)
            # patch edge
            pred.ReplaceEdgeOut(bbl, succ)

    num_discarded = len(fun.bbls) - len(keep)
    fun.bbls = keep
    return num_discarded
def InsEliminateCmp(ins: ir.Ins, bbl: ir.Bbl, fun: ir.Fun):
    """Replace `cmpXX a, b, c, x, y` by moves and a conditional branch.

    Canonicalization ensures that a != c. The generated code is:

        mov z b
        bXX x, y, rest      (beq for cmpeq, blt otherwise)
        mov z c
      .bbl rest
        mov a z

    `bbl` is split twice around `ins`; afterwards `bbl` itself holds the
    instructions following `ins`.

    TODO: This is very coarse
    """
    assert ins.opcode.kind is o.OPC_KIND.CMP
    bbl_skip = cfg.BblSplit(ins, bbl, fun, bbl.name + "_spilt")
    bbl_prev = cfg.BblSplit(ins, bbl_skip, fun, bbl.name + "_spilt")
    # expected layout after the two splits: bbl_prev -> bbl_skip -> bbl
    assert not bbl_skip.inss
    assert bbl_prev.inss[-1] is ins
    assert bbl_prev.edge_out == [bbl_skip]
    assert bbl_skip.edge_in == [bbl_prev]
    assert bbl_skip.edge_out == [bbl]
    assert bbl.edge_in == [bbl_skip]

    operands = ins.operands
    tmp = fun.GetScratchReg(operands[0].kind, "cmp", False)
    # remove the cmp itself; it is the last instruction of bbl_prev
    bbl_prev.inss.pop()
    if ins.opcode == o.CMPEQ:
        bra_opcode = o.BEQ
    else:
        bra_opcode = o.BLT
    bbl_prev.inss.extend([
        ir.Ins(o.MOV, [tmp, operands[1]]),
        ir.Ins(bra_opcode, [operands[3], operands[4], bbl]),
    ])
    bbl_skip.inss.append(ir.Ins(o.MOV, [tmp, operands[2]]))
    bbl.inss.insert(0, ir.Ins(o.MOV, [operands[0], tmp]))
    # the new conditional branch adds a direct edge bbl_prev -> bbl
    bbl_prev.edge_out.append(bbl)
    bbl.edge_in.append(bbl_prev)
def FunSeparateLocalRegUsage(fun: ir.Fun) -> int:
    """Split live ranges of (bbl) local regs by renaming re-definitions.

    This works in coordination with the live range computation AND the
    local register allocator which assigns one cpu register to each
    live range.

    A defined register is left alone if
    * this instruction is its first definition (`reg.def_ins is ins`)
    * the reg is global
    * the reg is part of a two address "situation" (for x64)
    * the reg has been assigned a cpu_reg
    Otherwise a new scratch register takes its place in the defining
    instruction and in the rest of the bbl.

    Returns the number of definitions that were renamed.
    """
    num_renamed = 0
    for bbl in fun.bbls:
        for pos, ins in enumerate(bbl.inss):
            num_defs = ins.opcode.def_ops_count()
            for n, reg in enumerate(ins.operands[:num_defs]):
                assert isinstance(reg, ir.Reg)
                if reg.def_ins is ins:
                    continue
                if ir.REG_FLAG.GLOBAL in reg.flags:
                    continue
                is_two_address = ir.REG_FLAG.TWO_ADDRESS in reg.flags
                if (is_two_address and len(ins.operands) >= 2 and
                        ins.operands[0] == ins.operands[1]):
                    continue
                if reg.cpu_reg is not None:
                    continue

                # "$<something>_<purpose>" names are reduced to <purpose>
                purpose = reg.name
                if purpose.startswith("$"):
                    purpose = purpose[purpose.find("_") + 1:]
                replacement = fun.GetScratchReg(reg.kind, purpose, False)
                if is_two_address:
                    replacement.flags |= ir.REG_FLAG.TWO_ADDRESS
                ins.operands[n] = replacement
                # later uses within this bbl must see the new name
                _BblRenameReg(bbl, pos + 1, reg, replacement)
                num_renamed += 1
    return num_renamed
def PhaseFinalizeStackAndLocalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout): """Finalizing the stack implies performing all transformations that could increase register usage. """ # print("@@@@@@\n", "\n".join(serialize.FunRenderToAsm(fun)), file=fout) # hack: some of the code expansion templates need a scratch reg # we do not want to reserve registers for this globally, so instead # we inject some nop instructions that reserve a register that we # use as a scratch for the instruction immediately following the nop # # This still has a potential bug: if the next instruction has one of its # inputs spilled, it will like use the scratch reg provided by the nop1 # which will cause incorrect code. # TODO: add a checker so we at least detect this # Alternatives: reserve reg (maybe only for functions that need it) # TODO: make sure that nop1 regs never get spilled isel_tab.FunAddNop1ForCodeSel(fun) regs.FunLocalRegAlloc(fun) fun.FinalizeStackSlots() # if fun.name == "fibonacci": DumpFun("after local alloc", fun) # DumpFun("after local alloc", fun) # cleanup _FunMoveEliminationCpu(fun)
def _GetOperand(unit: ir.Unit, fun: ir.Fun, ok: o.OP_KIND, v: Any) -> Any:
    """Convert the raw parsed operand `v` into the object required by `ok`.

    List-like operand kinds (`o.OKS_LIST`) expect a list or a double-quoted
    string, all other kinds expect a plain string.

    Raises:
        AssertionError: if `v` has the wrong shape or names an unknown kind
            or a malformed identifier.
        ir.ParseError: if `ok` is not handled here.
    """
    if ok in o.OKS_LIST:
        assert isinstance(v, list) or v[0] == v[-1] == '"', f"operand {ok}: [{v}]"
    else:
        assert isinstance(v, str), f"bad operand {v} of type [{ok}]"

    if ok is o.OP_KIND.TYPE_LIST:
        kinds = []
        for kind_name in v:
            kind = o.SHORT_STR_TO_RK.get(kind_name)
            assert kind is not None, f"bad kind name [{kind_name}]"
            kinds.append(kind)
        return kinds
    if ok is o.OP_KIND.FUN:
        return unit.GetFunOrAddForwardDeclaration(v)
    if ok is o.OP_KIND.BBL:
        return fun.GetBblOrAddForwardDeclaration(v)
    if ok is o.OP_KIND.BBL_TAB:
        return ExtractBblTable(fun, v)
    if ok is o.OP_KIND.MEM:
        return unit.GetMem(v)
    if ok is o.OP_KIND.STK:
        return fun.GetStk(v)
    if ok is o.OP_KIND.FUN_KIND:
        return o.SHORT_STR_TO_FK[v]
    if ok is o.OP_KIND.DATA_KIND:
        data_kind = o.SHORT_STR_TO_RK.get(v)
        assert data_kind is not None, f"bad kind name [{v}]"
        return data_kind
    if ok is o.OP_KIND.NAME:
        assert parse.RE_IDENTIFIER.match(v), f"bad identifier [{v}]"
        return v
    if ok is o.OP_KIND.NAME_LIST:
        for x in v:
            assert parse.RE_IDENTIFIER.match(x), f"bad identifier [{x}]"
        return v
    if ok is o.OP_KIND.MEM_KIND:
        return o.SHORT_STR_TO_MK[v]
    if ok is o.OP_KIND.VALUE:
        # values are passed through untouched
        return v
    if ok is o.OP_KIND.BYTES:
        return ExtractBytes(v)
    if ok is o.OP_KIND.JTB:
        return fun.GetJbl(v)
    raise ir.ParseError(f"cannot read op type: {ok}")
def ExtractBblTable(fun: ir.Fun, lst: List) -> Dict[int, ir.Bbl]:
    """Build a `number -> bbl` table from a flat `[num, name, num, name, ...]` list.

    Bbls are looked up in `fun` by name; unknown names are forward declared.
    The list must have an even number of entries.
    """
    assert len(lst) % 2 == 0
    table = {}
    # even positions hold the numbers, odd positions the bbl names
    for num_str, bbl_name in zip(lst[0::2], lst[1::2]):
        table[int(num_str)] = fun.GetBblOrAddForwardDeclaration(bbl_name)
    return table
def FunSpillRegs(fun: ir.Fun, offset_kind: o.DK, regs: List[ir.Reg]) -> int:
    """Spill every register in `regs` to its own stack slot.

    For each register a stack object `$spill_<reg name>` is added to `fun`,
    with size and alignment equal to the register's byte width. All
    instructions of `fun` are then rewritten via `InsSpillRegs`.

    Returns the result of `ir.FunGenericRewrite`.
    """
    reg_to_stk = {}
    for reg in regs:
        byte_width = ir.OffsetConst(reg.kind.bitwidth() // 8)
        slot = ir.Stk(f"$spill_{reg.name}", byte_width, byte_width)
        reg_to_stk[reg] = slot
        fun.AddStk(slot)
    zero = ir.Const(offset_kind, 0)
    return ir.FunGenericRewrite(fun, InsSpillRegs, zero_const=zero,
                                reg_to_stk=reg_to_stk)
def PhaseFinalizeStackAndLocalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """Allocate local registers, then finalize the stack slots of `fun`.

    Finalizing the stack implies performing all transformations that
    could increase register usage.

    `_opt_stats` and `fout` are not used by this function.
    """
    regs.FunLocalRegAlloc(fun)
    fun.FinalizeStackSlots()
    # cleanup
    FunMoveEliminationCpu(fun)
def _InsAddNop1ForCodeSel(ins: ir.Ins, fun: ir.Fun) -> Optional[List[ir.Ins]]:
    """Put a `nop1` that reserves a scratch register in front of `ins`.

    A scratch register is reserved for
    * SWITCH: to compute the jmp address into (kind C32)
    * CONV from a float kind to an int kind (kind F32)
    * CONV from an int kind to F64 (kind F32)
    For the conversions we know the intermediate result cannot be wider
    than 32bit for this CPU.

    Returns `[nop1, ins]` for the cases above and `[ins]` otherwise.
    """
    opc = ins.opcode
    if opc is o.SWITCH:
        # needs scratch to compute the jmp address into
        scratch = fun.GetScratchReg(o.DK.C32, "switch", False)
        return [ir.Ins(o.NOP1, [scratch]), ins]

    if opc is o.CONV:
        operands = ins.operands
        # both directions need a scratch for the intermediate flt result
        if (o.RegIsInt(operands[0].kind) and
                operands[1].kind.flavor() == o.DK_FLAVOR_F):
            purpose = "ftoi"
        elif o.RegIsInt(operands[1].kind) and operands[0].kind is o.DK.F64:
            purpose = "itof"
        else:
            purpose = None
        if purpose is not None:
            scratch = fun.GetScratchReg(o.DK.F32, purpose, False)
            return [ir.Ins(o.NOP1, [scratch]), ins]

    return [ins]
def FunRemoveUnreachableBbls(fun: ir.Fun) -> int:
    """Remove all bbls that cannot be reached from the first bbl of `fun`.

    Reachability follows `edge_out`. Removed bbls are also taken out of the
    `edge_in` lists of their successors, and `fun.bbl_syms` is rebuilt from
    the surviving bbls.

    Returns the number of discarded bbls.
    """
    visited = set()
    worklist = [fun.bbls[0]]
    while worklist:
        bbl = worklist.pop()
        if bbl.name not in visited:
            visited.add(bbl.name)
            worklist.extend(bbl.edge_out)

    num_discarded = len(fun.bbls) - len(visited)
    survivors = []
    for bbl in fun.bbls:
        if bbl.name in visited:
            survivors.append(bbl)
        else:
            # detach the dead bbl from its successors
            for succ in bbl.edge_out:
                succ.edge_in.remove(bbl)
    fun.bbls = survivors
    fun.bbl_syms = {bbl.name: bbl for bbl in survivors}
    return num_discarded
def _InsRewriteFltImmediates(ins: ir.Ins, fun: ir.Fun, unit: ir.Unit) -> Optional[List[ir.Ins]]:
    """Replace floating point constant operands by loads from const memory.

    Every constant operand with float flavor is placed in `unit` (via
    `FindOrAddConstMem`) and loaded into a fresh scratch register which then
    takes the constant's place in `ins`.

    Returns the loads followed by `ins`, or None if nothing was rewritten.
    """
    loads = []
    operands = ins.operands
    for n, op in enumerate(operands):
        if not isinstance(op, ir.Const):
            continue
        if op.kind.flavor() is not o.DK_FLAVOR_F:
            continue
        mem = unit.FindOrAddConstMem(op)
        tmp = fun.GetScratchReg(op.kind, "flt_const", True)
        loads.append(ir.Ins(o.LD_MEM, [tmp, mem, _ZERO_OFFSET]))
        operands[n] = tmp
    if not loads:
        return None
    loads.append(ins)
    return loads
def InsEliminateImmediateViaMem(ins: ir.Ins, pos: int, fun: ir.Fun, unit: ir.Unit,
                                addr_kind: o.DK, offset_kind: o.DK) -> List[ir.Ins]:
    """Rewrite the immediate operand `pos` of `ins` as a load from memory.

    This is useful if the target architecture does not support an immediate
    for that instruction, or the immediate is too large.
    This optimization is run rather late and may already see machine
    registers.

    The operand of `ins` is replaced in place by a scratch register.

    Returns:
        The two instructions (lea.mem, ld) that materialize the constant;
        `ins` itself is not part of the result.
    """
    # support of PUSHARG would require additional work because they need to
    # stay consecutive
    assert ins.opcode is not o.PUSHARG
    const = ins.operands[pos]
    mem = unit.FindOrAddConstMem(const)
    addr_reg = fun.GetScratchReg(addr_kind, "mem_const_addr", True)
    value_reg = fun.GetScratchReg(const.kind, "mem_const", True)
    ins.operands[pos] = value_reg
    return [
        ir.Ins(o.LEA_MEM, [addr_reg, mem, ir.Const(offset_kind, 0)]),
        ir.Ins(o.LD, [value_reg, addr_reg, ir.Const(offset_kind, 0)]),
    ]
def _InsRewriteDivRemShifts(ins: ir.Ins, fun: ir.Fun) -> Optional[List[ir.Ins]]:
    """Pin operands of non-float DIV/REM and of register shifts to cpu regs.

    * non-float DIV/REM: the operands are moved into rax and rcx, a DIV on
      (rdx, rax, rcx) is emitted and the result is copied from rax (DIV) or
      rdx (REM).
    * SHL/SHR with a register shift amount: the amount is moved into rcx
      and, if `_SHIFT_MASK` has a (truthy) entry for the destination kind,
      masked with it.

    Returns the replacement instructions or None if `ins` is not affected.
    """
    opc = ins.opcode
    ops = ins.operands
    if (opc is o.DIV or opc is o.REM) and ops[0].kind.flavor() != o.DK_FLAVOR_F:
        # note: we could leave it to the register allocator to pick a CpuReg
        # for ops[2] but then we would somehow have to ensure that the reg
        # is NOT rdx. By forcing rcx for ops[2] we sidestep the issue
        kind = ops[0].kind
        rax = fun.FindOrAddCpuReg(regs.CPU_REGS_MAP["rax"], kind)
        rcx = fun.FindOrAddCpuReg(regs.CPU_REGS_MAP["rcx"], kind)
        rdx = fun.FindOrAddCpuReg(regs.CPU_REGS_MAP["rdx"], kind)
        # quotient is taken from rax, remainder from rdx
        result_reg = rax if opc is o.DIV else rdx
        return [
            ir.Ins(o.MOV, [rax, ops[1]]),
            ir.Ins(o.MOV, [rcx, ops[2]]),
            # note the notion of src/dst regs is murky here
            ir.Ins(o.DIV, [rdx, rax, rcx]),
            ir.Ins(o.MOV, [ops[0], result_reg]),
        ]

    if opc in {o.SHR, o.SHL} and isinstance(ops[2], ir.Reg):
        rcx = fun.FindOrAddCpuReg(regs.CPU_REGS_MAP["rcx"], ops[0].kind)
        replacement = [ir.Ins(o.MOV, [rcx, ops[2]])]
        ops[2] = rcx
        mask = _SHIFT_MASK.get(ops[0].kind)
        if mask:
            replacement.append(ir.Ins(o.AND, [rcx, rcx, mask]))
        replacement.append(ins)
        return replacement

    return None
def BblSpillRegs(bbl: ir.Bbl, fun: ir.Fun, regs: List[ir.Reg], offset_kind: o.DK, prefix) -> int:
    """Spill every register in `regs` to its own stack slot within `bbl`.

    For each register a stack object `<prefix>_<reg name>` is added to
    `fun`, with size and alignment equal to the register's byte width.
    The instructions of `bbl` are then rewritten via `InsSpillRegs`.

    Args:
        bbl: the basic block whose instructions are rewritten.
        fun: the function owning `bbl`; receives the new stack objects.
        regs: registers to spill.
        offset_kind: kind of the zero offset constant used by the ld/st.
        prefix: name prefix for the new stack objects.

    Returns:
        The result of `ir.BblGenericRewrite` (presumably the number of
        rewritten instructions — confirm against its definition).
    """
    reg_to_stk: Dict[ir.Reg, ir.Stk] = {}
    for reg in regs:
        size = reg.kind.bitwidth() // 8
        stk = ir.Stk(f"{prefix}_{reg.name}", size, size)
        reg_to_stk[reg] = stk
        fun.AddStk(stk)
    # Propagate the rewrite result: the signature promises an int, so the
    # function must not fall off the end and return None.
    return ir.BblGenericRewrite(bbl, fun, InsSpillRegs,
                                zero_const=ir.Const(offset_kind, 0),
                                reg_to_stk=reg_to_stk)
def InsSpillRegs(ins: ir.Ins, fun: ir.Fun, zero_const, reg_to_stk) -> Optional[List[ir.Ins]]:
    """Replace spilled registers in `ins` by scratch regs plus ld/st.

    Operands are visited from last to first. A spilled register used as a
    source is loaded from its stack slot before `ins`; a spilled register
    that is defined is stored to its stack slot after `ins`.

    Args:
        ins: instruction to rewrite (its operands are modified in place).
        fun: function providing the scratch registers.
        zero_const: offset operand for the generated ld.stk/st.stk.
        reg_to_stk: maps each spilled register to its stack slot.

    Returns:
        loads + [ins] + stores, or None if no operand is spilled.
    """
    loads: List[ir.Ins] = []
    stores: List[ir.Ins] = []
    operands = ins.operands
    num_defs = ins.opcode.def_ops_count()
    for n in range(len(operands) - 1, -1, -1):
        op = operands[n]
        if not isinstance(op, ir.Reg):
            continue
        stk = reg_to_stk.get(op)
        if stk is None:
            continue
        if n < num_defs:
            # definition: write the value back after the instruction
            scratch = fun.GetScratchReg(op.kind, "stspill", False)
            stores.append(ir.Ins(o.ST_STK, [stk, zero_const, scratch]))
        else:
            # use: fetch the value ahead of the instruction
            scratch = fun.GetScratchReg(op.kind, "ldspill", False)
            loads.append(ir.Ins(o.LD_STK, [scratch, stk, zero_const]))
        operands[n] = scratch
    if not loads and not stores:
        return None
    return loads + [ins] + stores
def _InsMoveImmediatesToMemory(ins: ir.Ins, fun: ir.Fun, unit: ir.Unit,
                               kind: o.DK) -> Optional[List[ir.Ins]]:
    """Replace constant operands of the given `kind` by loads from memory.

    Each matching constant is placed in `unit` (via `FindOrAddConstMem`)
    and loaded with `ld.mem` into a fresh scratch register which then takes
    the constant's place in `ins`.

    Returns the loads followed by `ins`, or None if nothing was rewritten.
    """
    loads = []
    operands = ins.operands
    for n, op in enumerate(operands):
        if not (isinstance(op, ir.Const) and op.kind is kind):
            continue
        mem = unit.FindOrAddConstMem(op)
        tmp = fun.GetScratchReg(kind, "mem_const", True)
        # TODO: pass the offset kind as a parameter
        loads.append(ir.Ins(o.LD_MEM, [tmp, mem, ir.Const(o.DK.U32, 0)]))
        operands[n] = tmp
    if not loads:
        return None
    loads.append(ins)
    return loads
def _InsEliminateStkLoadStoreWithRegOffset(
        ins: ir.Ins, fun: ir.Fun, base_kind: o.DK,
        offset_kind: o.DK) -> Optional[List[ir.Ins]]:
    """Split st.stk/ld.stk/lea.stk with a register offset into two steps.

    This rewrite is usually applied as a prep step by some backends to get
    rid of Stk operands. It allows the register allocator to see the scratch
    register but it obscures the fact that a memory access is a stack access.

    Note, a stack address already implies a `sp+offset` addressing mode and
    RISC ISAs do not usually support a `sp+offset+reg` addressing mode.

    The stack address (with offset zero) is first computed into a scratch
    register via `lea.stk`; `ins` is then re-initialized in place as a plain
    st/ld/lea using that register as base and the original register offset.

    Returns `[lea, ins]` or None if `ins` is not affected.
    """
    opc = ins.opcode
    ops = ins.operands

    def base_of(stk):
        """Allocate the base scratch reg and the lea.stk that fills it."""
        base_reg = fun.GetScratchReg(base_kind, "base", False)
        lea_ins = ir.Ins(o.LEA_STK, [base_reg, stk, ir.Const(offset_kind, 0)])
        return base_reg, lea_ins

    if opc is o.ST_STK and isinstance(ops[1], ir.Reg):
        base, lea = base_of(ops[0])
        ins.Init(o.ST, [base, ops[1], ops[2]])
    elif opc is o.LD_STK and isinstance(ops[2], ir.Reg):
        base, lea = base_of(ops[1])
        ins.Init(o.LD, [ops[0], base, ops[2]])
    elif opc is o.LEA_STK and isinstance(ops[2], ir.Reg):
        # TODO: maybe reverse the order so that we can tell that ops[0]
        # holds a stack location
        base, lea = base_of(ops[1])
        ins.Init(o.LEA, [ops[0], base, ops[2]])
    else:
        return None
    return [lea, ins]
def PhaseFinalizeStackAndLocalRegAlloc(fun: ir.Fun, _opt_stats: Dict[str, int], fout):
    """Finalize the stack, rewrite stack offsets and allocate local regs.

    Finalizing the stack implies performing all transformations that
    could increase register usage.

    The stack slots are finalized twice: once before the stack aware
    immediate rewriter runs and once more after local register allocation
    (the STACK_FINALIZED flag is cleared in between).

    `_opt_stats` and `fout` are not used by this function.
    """
    # Disabled code path: spill every reg that has no cpu reg assigned.
    # NOTE(review): the extent of this block was reconstructed from
    # whitespace-mangled source - confirm that the FinalizeStackSlots()
    # call right below is indeed outside of it.
    if False:
        to_be_spillled = [reg for reg in fun.regs if not reg.HasCpuReg()]
        to_be_spillled.sort()
        reg_alloc.FunSpillRegs(fun, o.DK.U32, to_be_spillled)

    fun.FinalizeStackSlots()
    # Special flavor out-of-bound immediate rewriter that is stack aware
    # In rare cases this could introduce the need for another gpr reg
    _FunRewriteOutOfBoundsOffsetsStk(fun)
    # Assign regs to local var
    regs.FunLocalRegAlloc(fun)
    # clear the flag so that the stack slots can be finalized again
    fun.flags &= ~ir.FUN_FLAG.STACK_FINALIZED
    fun.FinalizeStackSlots()
    # cleanup
    FunMoveEliminationCpu(fun)
def _InsEliminateImmediateStores(ins: ir.Ins, fun: ir.Fun) -> Optional[List[ir.Ins]]:
    """Move the constant value of a store into a scratch register first.

    RISC architectures typically do not allow immediates to be stored
    directly. Applies to ST_MEM, ST and ST_STK whose stored value
    (operand 2) is a constant.

    Returns `[mov, ins]` or None if `ins` is not affected.

    TODO: maybe allow zero immediates
    """
    operands = ins.operands
    if (ins.opcode not in {o.ST_MEM, o.ST, o.ST_STK} or
            not isinstance(operands[2], ir.Const)):
        return None
    value = operands[2]
    tmp = fun.GetScratchReg(value.kind, "st_imm", False)
    operands[2] = tmp
    return [ir.Ins(o.MOV, [tmp, value]), ins]
def FunSeparateLocalRegUsage(fun: ir.Fun) -> int:
    """Give every re-definition of a local register a fresh register.

    A defined register is left alone if this instruction is its recorded
    definition (`reg.def_ins is ins`), if it is global, or if it already
    has a cpu_reg. Otherwise a new scratch register takes its place in the
    defining instruction and in the rest of the bbl.

    Returns the number of definitions that were renamed.
    """
    num_renamed = 0
    for bbl in fun.bbls:
        for pos, ins in enumerate(bbl.inss):
            num_defs = ins.opcode.def_ops_count()
            for n, reg in enumerate(ins.operands[:num_defs]):
                assert isinstance(reg, ir.Reg)
                if reg.def_ins is ins:
                    continue
                if ir.REG_FLAG.GLOBAL in reg.flags:
                    continue
                if reg.cpu_reg is not None:
                    continue
                # "$<something>_<purpose>" names are reduced to <purpose>
                purpose = reg.name
                if purpose.startswith("$"):
                    purpose = purpose[purpose.find("_") + 1:]
                replacement = fun.GetScratchReg(reg.kind, purpose, False)
                ins.operands[n] = replacement
                # later uses within this bbl must see the new name
                _BblRenameReg(bbl, pos + 1, reg, replacement)
                num_renamed += 1
    return num_renamed
def _InsRewriteIntoAABForm(ins: ir.Ins, fun: ir.Fun) -> Optional[List[ir.Ins]]:
    """Bring `ins` into the two address form `a = a op b` where required.

    * already `a = a op b`: only flag the register as TWO_ADDRESS
    * `a = b op a` with a commutative opcode: swap the two sources
    * otherwise: compute into a scratch register and copy the result

    Returns the replacement instructions, or None if `ins` needs no
    rewrite (including the case where only the flag was set).
    """
    ops = ins.operands
    if not NeedsAABFromRewrite(ins):
        return None

    dst = ops[0]
    if dst == ops[1]:
        dst.flags |= ir.REG_FLAG.TWO_ADDRESS
        return None

    if dst == ops[2] and o.OA.COMMUTATIVE in ins.opcode.attributes:
        ir.InsSwapOps(ins, 1, 2)
        ops[0].flags |= ir.REG_FLAG.TWO_ADDRESS
        return [ins]

    tmp = fun.GetScratchReg(ins.operands[0].kind, "aab", False)
    tmp.flags |= ir.REG_FLAG.TWO_ADDRESS
    copy_in = ir.Ins(o.MOV, [tmp, ops[1]])
    compute = ir.Ins(ins.opcode, [tmp, tmp, ops[2]])
    copy_out = ir.Ins(o.MOV, [ops[0], tmp])
    return [copy_in, compute, copy_out]
def FunAddUnconditionalBranches(fun: ir.Fun):
    """Re-insert necessary unconditional branches

    sort of inverse of FunRemoveUnconditionalBranches

    Walks the bbls in layout order. Wherever the bbl that physically
    follows is not the intended fallthrough successor, a `bra` is appended
    (single successor) or a new bbl holding just a `bra` is inserted
    (conditional branch). Finally `fun.bbls` is replaced by the new layout
    and the CFG_NOT_LINEAR flag is cleared.
    """
    bbls = []
    for n, bbl in enumerate(fun.bbls):
        bbls.append(bbl)
        # last instruction never falls through: nothing to fix
        if bbl.inss and not bbl.inss[-1].opcode.has_fallthrough():
            continue
        if len(bbl.edge_out) == 1:
            assert len(fun.bbls) > n
            succ = bbl.edge_out[0]
            # the single successor is not the next bbl in the layout (or
            # there is no next bbl): make the jump explicit
            if n + 1 == len(fun.bbls) or fun.bbls[n + 1] != succ:
                bbl.inss.append(ir.Ins(o.BRA, [succ]))
            continue

        assert len(bbl.edge_out) == 2
        cond_bra = bbl.inss[-1]
        assert cond_bra.opcode.kind is o.OPC_KIND.COND_BRA, (
            f"not a cond bra: {cond_bra} bbl: {bbl}")
        target = cond_bra.operands[2]
        # `other` is the out-edge that is not the branch target
        other = bbl.edge_out[0] if target == bbl.edge_out[1] else bbl.edge_out[1]
        # NOTE(review): assumes a bbl ending in a cond bra is never the last
        # bbl of the function - otherwise this index is out of range.
        succ = fun.bbls[n + 1]
        if succ in bbl.edge_out:
            # target == other can happen if the cond_bra is pointless
            if target == succ and target != other:
                # the branch target directly follows: flip the branch
                InsFlipCondBra(cond_bra, target, other)
            continue
        else:
            # neither successor follows directly: route the fallthrough
            # through a new bbl that only contains a bra to `other`
            bbl_bra = ir.Bbl(NewDerivedBblName(bbl.name, "bra", fun))
            bbl_bra.inss.append(ir.Ins(o.BRA, [other]))
            fun.bbl_syms[bbl_bra.name] = bbl_bra
            # forward fallthrough to new bbl
            if bbl.inss:
                InsMaybePatchNewSuccessor(bbl.inss[-1], other, bbl_bra)
            bbl.ReplaceEdgeOut(other, bbl_bra)
            bbl_bra.AddEdgeOut(other)
            bbls.append(bbl_bra)
    fun.bbls = bbls
    fun.flags &= ~ir.FUN_FLAG.CFG_NOT_LINEAR
def FunSplitBblsAtTerminators(fun: ir.Fun):
    """Split bbls after terminator instructions and remove dead code after 'ret'.

    Every bbl is cut into the sub ranges reported by `_BblFindSubRanges`.
    The range starting at index 0 stays in the original bbl, every further
    range goes into a newly created bbl with a derived name. `fun.bbls` is
    replaced by the resulting sequence.

    All bbls known to `fun` must be defined (not merely forward declared).
    """
    for known_bbl in fun.bbl_syms.values():
        assert not known_bbl.forward_declared, f"bbl referenced but not defined {known_bbl}"

    new_layout = []
    for bbl in fun.bbls:
        _BblRemoveUnreachableIns(bbl)
        ranges = _BblFindSubRanges(bbl)
        # keep a handle on the full instruction list, bbl.inss is re-assigned
        # below when the first range is processed
        all_inss = bbl.inss
        for start, end in ranges:
            if start == 0:
                piece = bbl
            else:
                piece = ir.Bbl(NewDerivedBblName(bbl.name, "_", fun))
                fun.bbl_syms[bbl.name] = bbl
            piece.inss = all_inss[start:end]
            new_layout.append(piece)
            fun.bbl_syms[piece.name] = piece
    fun.bbls = new_layout