def vsqrtps(self, cdg, insn):
    """
    VSQRTPS xmm1, xmm2/m128
    VSQRTPS ymm1, ymm2/m256

    Lift a packed single-precision square root to the matching
    _mm_sqrt_ps / _mm256_sqrt_ps intrinsic call.
    """
    op_size = XMM_SIZE if is_xmm_reg(insn.Op1) else YMM_SIZE

    # op2 -- m128/m256
    if is_mem_op(insn.Op2):
        r_reg = cdg.load_operand(1)

    # op2 -- xmm2/ymm2
    else:
        assert is_avx_reg(insn.Op2)
        r_reg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1/ymm1
    d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)

    #
    # intrinsics:
    #   __m128 _mm_sqrt_ps (__m128 a)
    #   __m256 _mm256_sqrt_ps (__m256 a)
    #
    # NOTE: the 128-bit form has no size infix in its name, hence the
    # empty bit_str for XMM operands.
    #
    bit_size = bytes2bits(op_size)
    bit_str = str(bit_size) if op_size == YMM_SIZE else ""
    intrinsic_name = "_mm%s_sqrt_ps" % bit_str

    avx_intrinsic = AVXIntrinsic(cdg, intrinsic_name)
    avx_intrinsic.add_argument_reg(r_reg, "__m%u" % bit_size)
    avx_intrinsic.set_return_reg(d_reg, "__m%u" % bit_size)
    avx_intrinsic.emit()

    # clear upper 128 bits of ymm1 (VEX.128 ops zero the upper lanes)
    if op_size == XMM_SIZE:
        clear_upper(cdg, d_reg)

    return ida_hexrays.MERR_OK
def _vmov(self, cdg, insn, data_size):
    """
    Templated handler for dword/qword mov instructions.

    data_size is the scalar operand width in bytes (4 or 8). Handles both
    the 'xmm1, r/m' and 'r/m, xmm1' operand forms. Returns MERR_OK.
    """

    # op form: xmm1, rXX/mXX
    if is_xmm_reg(insn.Op1):

        # op2 -- m32/m64
        if is_mem_op(insn.Op2):
            l_reg = cdg.load_operand(1)

        # op2 -- r32/r64
        else:
            l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)

        # wrap the source micro-reg as a micro-operand of the specified size
        l_mop = ida_hexrays.mop_t(l_reg, data_size)

        # op1 -- xmm1
        d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)
        d_mop = ida_hexrays.mop_t(d_reg, XMM_SIZE)

        # emit the microcode for this insn (zero-extend into the full xmm)
        cdg.emit(ida_hexrays.m_xdu, l_mop, NO_MOP, d_mop)

        # clear upper 128 bits of ymm1
        clear_upper(cdg, d_reg)
        return ida_hexrays.MERR_OK

    # op form: rXX/mXX, xmm1
    else:
        assert is_xmm_reg(insn.Op2)

        # op2 -- xmm1
        l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)
        l_mop = ida_hexrays.mop_t(l_reg, data_size)

        # op1 -- m32/m64
        if is_mem_op(insn.Op1):
            cdg.store_operand(0, l_mop)

        # op1 -- r32/r64
        else:
            d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)
            d_mop = ida_hexrays.mop_t(d_reg, data_size)
            cdg.emit(ida_hexrays.m_mov, l_mop, NO_MOP, d_mop)

            #
            # TODO: the intel manual doesn't make it entirely clear here
            # if the upper bits of a r32 operation need to be cleared ?
            #

        return ida_hexrays.MERR_OK

    # failsafe -- both branches above return, so this is unreachable
    assert False, "Unreachable..."
    return ida_hexrays.MERR_INSN
def v_math_ps(self, cdg, insn):
    """
    VADDPS xmm1, xmm2, xmm3/m128
    VADDPS ymm1, ymm2, ymm3/m256
    VSUBPS xmm1, xmm2, xmm3/m128
    VSUBPS ymm1, ymm2, ymm3/m256
    VMULPS xmm1, xmm2, xmm3/m128
    VMULPS ymm1, ymm2, ymm3/m256
    VDIVPS xmm1, xmm2, xmm3/m128
    VDIVPS ymm1, ymm2, ymm3/m256

    Lift packed single-precision arithmetic to the matching
    _mm[256]_{add,sub,mul,div}_ps intrinsic call.
    """
    assert is_avx_reg(insn.Op1) and is_avx_reg(insn.Op2)
    op_size = XMM_SIZE if is_xmm_reg(insn.Op1) else YMM_SIZE

    # op3 -- m128/m256
    if is_mem_op(insn.Op3):
        r_reg = cdg.load_operand(2)

    # op3 -- xmm3/ymm3
    else:
        assert is_avx_reg(insn.Op3)
        r_reg = ida_hexrays.reg2mreg(insn.Op3.reg)

    # op2 -- xmm2/ymm2
    l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1/ymm1
    d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)

    # intrinsic name template for each supported instruction
    itype2name = \
    {
        ida_allins.NN_vaddps: "_mm%s_add_ps",
        ida_allins.NN_vsubps: "_mm%s_sub_ps",
        ida_allins.NN_vmulps: "_mm%s_mul_ps",
        ida_allins.NN_vdivps: "_mm%s_div_ps",
    }

    #
    # create the intrinsic
    #
    # NOTE: the 128-bit intrinsics carry no size infix (e.g. "_mm_add_ps"),
    # only the 256-bit forms are named "_mm256_...". Formatting with the
    # raw bit count would produce a bogus "_mm128_add_ps".
    #
    bit_size = bytes2bits(op_size)
    bit_str = "256" if op_size == YMM_SIZE else ""
    intrinsic_name = itype2name[insn.itype] % bit_str

    avx_intrinsic = AVXIntrinsic(cdg, intrinsic_name)
    avx_intrinsic.add_argument_reg(l_reg, "__m%u" % bit_size)
    avx_intrinsic.add_argument_reg(r_reg, "__m%u" % bit_size)
    avx_intrinsic.set_return_reg(d_reg, "__m%u" % bit_size)
    avx_intrinsic.emit()

    # clear upper 128 bits of ymm1 (VEX.128 ops zero the upper lanes)
    if op_size == XMM_SIZE:
        clear_upper(cdg, d_reg)

    return ida_hexrays.MERR_OK
def _v_math_ss_sd(self, cdg, insn, op_size):
    """
    Templated handler for scalar float/double math instructions.

    op_size is the scalar width in bytes (FLOAT_SIZE or DOUBLE_SIZE).
    Computes xmm1 = xmm2 with its low scalar replaced by (xmm2 op xmm3/mem),
    then clears the upper lanes. Returns MERR_OK.
    """
    assert is_avx_reg(insn.Op1) and is_avx_reg(insn.Op2)

    # op3 -- m32/m64
    if is_mem_op(insn.Op3):
        r_reg = cdg.load_operand(2)

    # op3 -- xmm3
    else:
        assert is_xmm_reg(insn.Op3)
        r_reg = ida_hexrays.reg2mreg(insn.Op3.reg)

    # op2 -- xmm2
    l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1
    d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)

    # map each instruction to its microcode floating-point opcode
    itype2mcode = \
    {
        ida_allins.NN_vaddss: ida_hexrays.m_fadd,
        ida_allins.NN_vaddsd: ida_hexrays.m_fadd,
        ida_allins.NN_vsubss: ida_hexrays.m_fsub,
        ida_allins.NN_vsubsd: ida_hexrays.m_fsub,
        ida_allins.NN_vmulss: ida_hexrays.m_fmul,
        ida_allins.NN_vmulsd: ida_hexrays.m_fmul,
        ida_allins.NN_vdivss: ida_hexrays.m_fdiv,
        ida_allins.NN_vdivsd: ida_hexrays.m_fdiv,
    }

    # get the hexrays microcode op to use for this instruction
    mcode_op = itype2mcode[insn.itype]
    op_dtype = ida_ua.dt_float if op_size == FLOAT_SIZE else ida_ua.dt_double

    # create a temp register to compute the final result into
    t0_result = cdg.mba.alloc_kreg(XMM_SIZE)

    # emit the microcode for this insn: copy xmm2 (upper lanes pass
    # through), apply the scalar op into the low element, then move the
    # finished value into the real destination
    cdg.emit(ida_hexrays.m_mov, XMM_SIZE, l_reg, 0, t0_result, 0)
    cdg.emit_micro_mvm(mcode_op, op_dtype, l_reg, r_reg, t0_result, 0)
    cdg.emit(ida_hexrays.m_mov, XMM_SIZE, t0_result, 0, d_reg, 0)

    # free with the same size it was allocated with
    cdg.mba.free_kreg(t0_result, XMM_SIZE)

    # clear upper 128 bits of ymm1
    assert is_xmm_reg(insn.Op1)
    clear_upper(cdg, d_reg)

    return ida_hexrays.MERR_OK
def v_bitwise_ps(self, cdg, insn):
    """
    VORPS xmm1, xmm2, xmm3/m128
    VORPS ymm1, ymm2, ymm3/m256
    VXORPS xmm1, xmm2, xmm3/m128
    VXORPS ymm1, ymm2, ymm3/m256
    VANDPS xmm1, xmm2, xmm3/m128
    VANDPS ymm1, ymm2, ymm3/m256

    Lift packed bitwise ops directly to the equivalent microcode opcode.
    """
    assert is_avx_reg(insn.Op1) and is_avx_reg(insn.Op2)
    op_size = XMM_SIZE if is_xmm_reg(insn.Op1) else YMM_SIZE

    # fetch op3 (either a loaded m128/m256, or xmm3/ymm3 directly)
    if is_mem_op(insn.Op3):
        rhs_mreg = cdg.load_operand(2)
    else:
        assert is_avx_reg(insn.Op3)
        rhs_mreg = ida_hexrays.reg2mreg(insn.Op3.reg)

    # select the microcode opcode matching this instruction
    opcode_map = \
    {
        ida_allins.NN_vorps:  ida_hexrays.m_or,
        ida_allins.NN_vandps: ida_hexrays.m_and,
        ida_allins.NN_vxorps: ida_hexrays.m_xor,
    }
    mcode_op = opcode_map[insn.itype]

    # op2 -- xmm2/ymm2 (left-hand source)
    lhs_mreg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1/ymm1 (destination)
    dst_mreg = ida_hexrays.reg2mreg(insn.Op1.reg)

    # wrap everything as sized micro-operands and emit the operation
    lhs_mop = ida_hexrays.mop_t(lhs_mreg, op_size)
    rhs_mop = ida_hexrays.mop_t(rhs_mreg, op_size)
    dst_mop = ida_hexrays.mop_t(dst_mreg, op_size)
    cdg.emit(mcode_op, lhs_mop, rhs_mop, dst_mop)

    # VEX.128 forms zero the upper half of the ymm register
    if op_size == XMM_SIZE:
        clear_upper(cdg, dst_mreg)

    return ida_hexrays.MERR_OK
def vcvtsi2ss(self, cdg, insn):
    """
    VCVTSI2SS xmm1, xmm2, r/m32
    VCVTSI2SS xmm1, xmm2, r/m64

    Convert a signed integer (op3) to a scalar float in the low dword of
    xmm1, passing the upper three dwords of xmm2 through.
    """
    # integer source width in bytes (4 or 8)
    src_size = size_of_operand(insn.Op3)

    # op3 -- m32/m64
    if is_mem_op(insn.Op3):
        r_reg = cdg.load_operand(2)

    # op3 -- r32/r64
    else:
        assert is_reg_op(insn.Op3)
        r_reg = ida_hexrays.reg2mreg(insn.Op3.reg)

    # op2 -- xmm2
    l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1
    d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)

    # create a temp register to compute the final result into
    # (t0_mop views only its low float-sized element)
    t0_result = cdg.mba.alloc_kreg(XMM_SIZE)
    t0_mop = ida_hexrays.mop_t(t0_result, FLOAT_SIZE)

    # create a temp register to downcast a double to a float (if needed)
    t1_i2f = cdg.mba.alloc_kreg(src_size)
    t1_mop = ida_hexrays.mop_t(t1_i2f, src_size)

    # copy xmm2 into the temp result reg, as we need its upper 3 dwords
    cdg.emit(ida_hexrays.m_mov, XMM_SIZE, l_reg, 0, t0_result, 0)

    # convert the integer (op3) to a float/double depending on its size
    cdg.emit(ida_hexrays.m_i2f, src_size, r_reg, 0, t1_i2f, 0)

    # reduce precision on the converted floating point value if needed (only r64/m64)
    cdg.emit(ida_hexrays.m_f2f, t1_mop, NO_MOP, t0_mop)

    # transfer the fully computed temp register to the real dest reg
    cdg.emit(ida_hexrays.m_mov, XMM_SIZE, t0_result, 0, d_reg, 0)
    cdg.mba.free_kreg(t0_result, XMM_SIZE)
    cdg.mba.free_kreg(t1_i2f, src_size)

    # clear upper 128 bits of ymm1
    clear_upper(cdg, d_reg)

    return ida_hexrays.MERR_OK
def vshufps(self, cdg, insn):
    """
    VSHUFPS xmm1, xmm2, xmm3/m128, imm8
    VSHUFPS ymm1, ymm2, ymm3/m256, imm8

    Lift packed single-precision shuffles to the matching
    _mm[256]_shuffle_ps intrinsic call.
    """
    op_size = XMM_SIZE if is_xmm_reg(insn.Op1) else YMM_SIZE

    # op4 -- imm8 shuffle selector
    assert insn.Op4.type == ida_ua.o_imm
    shuffle_imm = insn.Op4.value

    # op3 -- m128/m256 or xmm3/ymm3
    if is_mem_op(insn.Op3):
        rhs_mreg = cdg.load_operand(2)
    else:
        assert is_avx_reg(insn.Op3)
        rhs_mreg = ida_hexrays.reg2mreg(insn.Op3.reg)

    # op2 -- xmm2/ymm2
    lhs_mreg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1/ymm1
    dst_mreg = ida_hexrays.reg2mreg(insn.Op1.reg)

    #
    # intrinsics:
    #   __m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8)
    #   __m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8)
    #
    bit_size = bytes2bits(op_size)
    bit_str = "256" if op_size == YMM_SIZE else ""
    vec_type = "__m%u" % bit_size

    avx_intrinsic = AVXIntrinsic(cdg, "_mm%s_shuffle_ps" % bit_str)
    avx_intrinsic.add_argument_reg(lhs_mreg, vec_type)
    avx_intrinsic.add_argument_reg(rhs_mreg, vec_type)
    avx_intrinsic.add_argument_imm(shuffle_imm, ida_typeinf.BT_INT8)
    avx_intrinsic.set_return_reg(dst_mreg, vec_type)
    avx_intrinsic.emit()

    # VEX.128 forms zero the upper half of the ymm register
    if op_size == XMM_SIZE:
        clear_upper(cdg, dst_mreg)

    return ida_hexrays.MERR_OK
def vsqrtss(self, cdg, insn):
    """
    VSQRTSS xmm1, xmm2, xmm3/m32

    Compute the square root of the low float of op3 into the low float of
    xmm1, passing the upper three dwords of xmm2 through.
    """
    assert is_xmm_reg(insn.Op1) and is_xmm_reg(insn.Op2)

    # fetch op3 (either xmm3, or a loaded m32)
    if is_xmm_reg(insn.Op3):
        src_mreg = ida_hexrays.reg2mreg(insn.Op3.reg)
    else:
        assert is_mem_op(insn.Op3)
        src_mreg = cdg.load_operand(2)

    # op2 - xmm2 (pass-through source)
    base_mreg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 - xmm1 (destination)
    dst_mreg = ida_hexrays.reg2mreg(insn.Op1.reg)

    # stage the result in a temp register, seeded from xmm2 so the
    # upper three dwords carry through untouched
    scratch = cdg.mba.alloc_kreg(XMM_SIZE)
    cdg.emit(ida_hexrays.m_mov, XMM_SIZE, base_mreg, 0, scratch, 0)

    # mov.fpu call !fsqrt<fast:float xmm1_4.4>.4, t0_result_4.4
    avx_intrinsic = AVXIntrinsic(cdg, "fsqrt")
    avx_intrinsic.add_argument_reg_basic(src_mreg, ida_typeinf.BT_FLOAT)
    avx_intrinsic.set_return_reg_basic(scratch, ida_typeinf.BT_FLOAT)
    avx_intrinsic.emit()

    # commit the staged value to the real destination and release the temp
    cdg.emit(ida_hexrays.m_mov, XMM_SIZE, scratch, 0, dst_mreg, 0)
    cdg.mba.free_kreg(scratch, XMM_SIZE)

    # clear upper 128 bits of ymm1
    clear_upper(cdg, dst_mreg)

    return ida_hexrays.MERR_OK
def vcvtss2sd(self, cdg, insn): """ VCVTSS2SD xmm1, xmm2, r/m32 """ # op3 -- m32 if is_mem_op(insn.Op3): r_reg = cdg.load_operand(2) # op3 -- r32 else: assert is_reg_op(insn.Op3) r_reg = ida_hexrays.reg2mreg(insn.Op3.reg) r_mop = ida_hexrays.mop_t(r_reg, FLOAT_SIZE) # op2 -- xmm2 l_reg = ida_hexrays.reg2mreg(insn.Op2.reg) # op1 -- xmm1 d_reg = ida_hexrays.reg2mreg(insn.Op1.reg) # create a temp register to compute the final result into t0_result = cdg.mba.alloc_kreg(XMM_SIZE) t0_mop = ida_hexrays.mop_t(t0_result, DOUBLE_SIZE) # copy xmm2 into the temp result reg, as we need its upper quadword cdg.emit(ida_hexrays.m_mov, XMM_SIZE, l_reg, 0, t0_result, 0) # convert float (op3) to a double, storing it in the lower 64 of the temp result reg cdg.emit(ida_hexrays.m_f2f, r_mop, NO_MOP, t0_mop) # transfer the fully computed temp register to the real dest reg cdg.emit(ida_hexrays.m_mov, XMM_SIZE, t0_result, 0, d_reg, 0) cdg.mba.free_kreg(t0_result, XMM_SIZE) # clear upper 128 bits of ymm1 clear_upper(cdg, d_reg) return ida_hexrays.MERR_OK
def vcvtps2pd(self, cdg, insn):
    """
    VCVTPS2PD xmm1, xmm2/m64
    VCVTPS2PD ymm1, ymm2/m128

    Lift packed float->double widening to the matching
    _mm[256]_cvtps_pd intrinsic call.
    """
    # source width: 2 floats (m64) for an xmm dest, 4 floats (m128) for ymm
    src_size = QWORD_SIZE if is_xmm_reg(insn.Op1) else XMM_SIZE
    dst_size = src_size * 2

    # fetch op2 (either a loaded m64/m128, or xmm2/ymm2 directly)
    if is_mem_op(insn.Op2):
        src_mreg = cdg.load_operand(1)
    else:
        assert is_avx_reg(insn.Op2)
        src_mreg = ida_hexrays.reg2mreg(insn.Op2.reg)

    # op1 -- xmm1/ymm1
    dst_mreg = ida_hexrays.reg2mreg(insn.Op1.reg)

    #
    # intrinsics:
    #  - __m128d _mm_cvtps_pd (__m128 a)
    #  - __m256d _mm256_cvtps_pd (__m128 a)
    #
    bit_size = bytes2bits(dst_size)
    bit_str = "256" if dst_size == YMM_SIZE else ""

    avx_intrinsic = AVXIntrinsic(cdg, "_mm%s_cvtps_pd" % bit_str)
    avx_intrinsic.add_argument_reg(src_mreg, "__m128")
    avx_intrinsic.set_return_reg(dst_mreg, "__m%ud" % bit_size)
    avx_intrinsic.emit()

    # VEX.128 forms zero the upper half of the ymm register
    if src_size == QWORD_SIZE:
        clear_upper(cdg, dst_mreg)

    return ida_hexrays.MERR_OK
def get_ymm_mreg(xmm_mreg):
    """
    Return the YMM microcode register for a given XMM register.
    """
    # recover the processor register and its textual name (e.g. "xmm3")
    proc_reg = ida_hexrays.mreg2reg(xmm_mreg, XMM_SIZE)
    reg_name = ida_idp.get_reg_name(proc_reg, XMM_SIZE)

    # the trailing digits after "mm" give the register index
    reg_index = int(reg_name.split("mm")[-1])

    # resolve "ymmN" back to a microcode register id
    ymm_mreg = ida_hexrays.reg2mreg(ida_idp.str2reg("ymm%u" % reg_index))

    # sanity check... the two mreg names must agree past the x/y prefix
    xmm_name = ida_hexrays.get_mreg_name(xmm_mreg, XMM_SIZE)
    ymm_name = ida_hexrays.get_mreg_name(ymm_mreg, YMM_SIZE)
    assert xmm_name[1:] == ymm_name[
        1:], "Reg escalation did not work... (%s, %s)" % (xmm_name, ymm_name)

    # return the ymm microcode register id
    return ymm_mreg
def v_mov_ps_dq(self, cdg, insn):
    """
    VMOVAPS xmm1, xmm2/m128
    VMOVAPS ymm1, ymm2/m256
    VMOVAPS xmm2/m128, xmm1
    VMOVAPS ymm2/m256, ymm1
    VMOVUPS xmm1, xmm2/m128
    VMOVUPS ymm1, ymm2/m256
    VMOVUPS xmm2/m128, xmm1
    VMOVUPS ymm2/m256, ymm1
    VMOVDQA xmm1, xmm2/m128
    VMOVDQA xmm2/m128, xmm1
    VMOVDQA ymm1, ymm2/m256
    VMOVDQA ymm2/m256, ymm1
    VMOVDQU xmm1, xmm2/m128
    VMOVDQU xmm2/m128, xmm1
    VMOVDQU ymm1, ymm2/m256
    VMOVDQU ymm2/m256, ymm1

    Full-width vector moves, lifted to a plain microcode m_mov (or a
    store for the memory-destination forms).
    """

    # op form: reg, [mem]
    if is_avx_reg(insn.Op1):
        op_size = XMM_SIZE if is_xmm_reg(insn.Op1) else YMM_SIZE

        # op2 -- m128/m256
        if is_mem_op(insn.Op2):
            l_reg = cdg.load_operand(1)

        # op2 -- xmm1/ymm1
        else:
            assert is_avx_reg(insn.Op2)
            l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)

        # wrap the source micro-reg as a micro-operand
        l_mop = ida_hexrays.mop_t(l_reg, op_size)

        # op1 -- xmmX/ymmX
        d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)
        d_mop = ida_hexrays.mop_t(d_reg, op_size)

        # emit the microcode for this insn
        cdg.emit(ida_hexrays.m_mov, l_mop, NO_MOP, d_mop)

        # clear upper 128 bits of ymm1
        if op_size == XMM_SIZE:
            clear_upper(cdg, d_reg)

        return ida_hexrays.MERR_OK

    # op form: [mem], reg
    else:
        assert is_mem_op(insn.Op1) and is_avx_reg(insn.Op2)
        op_size = XMM_SIZE if is_xmm_reg(insn.Op2) else YMM_SIZE

        # op1 -- xmm1/ymm1
        l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)
        l_mop = ida_hexrays.mop_t(l_reg, op_size)

        # [m128/m256] = xmm1/ymm1
        cdg.store_operand(0, l_mop)
        return ida_hexrays.MERR_OK

    # failsafe -- both branches above return, so this is unreachable
    assert False, "Unreachable..."
    return ida_hexrays.MERR_INSN
def _vmov_ss_sd(self, cdg, insn, data_size):
    """
    Templated handler for scalar float/double mov instructions.

    data_size is the scalar width in bytes (4 for ss, 8 for sd). Handles
    the two-operand load/store forms and the three-register merge form.
    Returns MERR_OK.
    """

    # op form: X, Y -- (2 operands)
    if insn.Op3.type == ida_ua.o_void:

        # op form: xmm1, m32/m64
        if is_xmm_reg(insn.Op1):
            assert is_mem_op(insn.Op2)

            # op2 -- m32/m64
            l_reg = cdg.load_operand(1)
            l_mop = ida_hexrays.mop_t(l_reg, data_size)

            # op1 -- xmm1
            d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)
            d_mop = ida_hexrays.mop_t(d_reg, XMM_SIZE)

            # xmm1[:data_size] = [mem]
            # NOTE: do not rebind 'insn' here -- it is this handler's
            # instruction parameter, not a scratch variable
            cdg.emit(ida_hexrays.m_xdu, l_mop, NO_MOP, d_mop)

            # clear xmm1[data_size:] bits (through ymm1)
            clear_upper(cdg, d_reg, data_size)
            return ida_hexrays.MERR_OK

        # op form: m32/m64, xmm1
        else:
            assert is_mem_op(insn.Op1) and is_xmm_reg(insn.Op2)

            # op2 -- xmm1
            l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)
            l_mop = ida_hexrays.mop_t(l_reg, data_size)

            # store xmm1[:data_size] into memory at [m32/m64] (op1)
            store_insn = cdg.store_operand(0, l_mop)
            store_insn.set_fpinsn()
            return ida_hexrays.MERR_OK

    # op form: xmm1, xmm2, xmm3 -- (3 operands)
    else:
        assert is_xmm_reg(insn.Op1) and is_xmm_reg(
            insn.Op2) and is_xmm_reg(insn.Op3)
        d_reg = ida_hexrays.reg2mreg(insn.Op1.reg)
        l_reg = ida_hexrays.reg2mreg(insn.Op2.reg)
        r_reg = ida_hexrays.reg2mreg(insn.Op3.reg)

        # create a temp register to compute the final result into
        t0_result = cdg.mba.alloc_kreg(XMM_SIZE)

        # emit the microcode for this insn: xmm2's upper bits carry
        # through, xmm3's low scalar lands in the low element
        cdg.emit(ida_hexrays.m_mov, XMM_SIZE, l_reg, 0, t0_result, 0)
        cdg.emit(ida_hexrays.m_f2f, data_size, r_reg, 0, t0_result, 0)
        cdg.emit(ida_hexrays.m_mov, XMM_SIZE, t0_result, 0, d_reg, 0)
        cdg.mba.free_kreg(t0_result, XMM_SIZE)

        # clear xmm1[data_size:] bits (through ymm1)
        clear_upper(cdg, d_reg, data_size)
        return ida_hexrays.MERR_OK

    # failsafe -- all branches above return, so this is unreachable
    assert False, "Unreachable..."
    return ida_hexrays.MERR_INSN
import ida_hexrays

# x86 FPU status word register number -- From intel.hpp
R_fpstat = 0x36

# microcode register id corresponding to the FPU status word
g_fps_reg = ida_hexrays.reg2mreg(R_fpstat)

def ProcessBlock(mblock):
    # NOP out every 'undefine fpu-status' microinstruction in this block,
    # so spurious FPU status-word noise does not survive into the output
    curr = mblock.head
    while curr:
        if curr.opcode == ida_hexrays.m_und and curr.d.t == ida_hexrays.mop_r and curr.d.r == g_fps_reg:
            mblock.make_nop(curr)
        curr = curr.next

class FPUHooks(ida_hexrays.Hexrays_Hooks):
    # Hex-Rays hook: scrub FPU status-word writes from freshly built microcode.

    def microcode(self,mba):
        # walk every basic block of the new microcode array
        for i in range(mba.qty):
            ProcessBlock(mba.get_mblock(i))
        return ida_hexrays.MERR_OK

# (re)install the hook -- unhook any instance left over from a previous
# run of this script, then register a fresh one
try:
    fpuhooks.unhook()
    del fpuhooks
except NameError as e:
    pass
finally:
    fpuhooks = FPUHooks()
    fpuhooks.hook()