def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features found in an instruction's operands.

    every qualifying operand produces two features at `insn.ea`: a plain
    Offset, and an Offset tagged with the arch derived from `f.ctx`.

    args:
      f (IDA func_t): only `f.ctx` is read here
      bb (IDA BasicBlock): unused
      insn (IDA insn_t)

    example:
      .text:0040112F    cmp     [esi+4], ebx
    """
    ida_helpers = capa.features.extractors.ida.helpers
    wanted_types = (idaapi.o_phrase, idaapi.o_displ)

    for operand in ida_helpers.get_insn_ops(insn, target_ops=wanted_types):
        # stack variables are not structure offsets
        if ida_helpers.is_op_stack_var(insn.ea, operand.n):
            continue

        raw_offset = ida_helpers.get_op_phrase_info(operand).get("offset", 0)

        # a displacement that is itself a mapped address is skipped, e.g.:
        #
        #     mov esi, dword_1005B148[esi]
        if idaapi.is_mapped(raw_offset):
            continue

        # IDA is believed to encode all offsets as two's complement in a u32;
        # a 64-bit displacement isn't a thing, see:
        # https://stackoverflow.com/questions/31853189/x86-64-assembly-why-displacement-not-64-bits
        signed_offset = capa.features.extractors.helpers.twos_complement(raw_offset, 32)

        yield Offset(signed_offset), insn.ea
        yield Offset(signed_offset, arch=get_arch(f.ctx)), insn.ea
def test_rule_yaml_descriptions():
    """a rule mixing inline (`= ...`) and `description:` annotations parses and matches."""
    rule = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - and:
                    - number: 1 = This is the number 1
                    - string: This program cannot be run in DOS mode.
                      description: MS-DOS stub message
                    - string: '/SELECT.*FROM.*WHERE/i'
                      description: SQL WHERE Clause
                    - count(number(2 = AF_INET/SOCK_DGRAM)): 2
                    - or:
                        - and:
                            - offset: 0x50 = IMAGE_NT_HEADERS.OptionalHeader.SizeOfImage
                            - offset: 0x34 = IMAGE_NT_HEADERS.OptionalHeader.ImageBase
                          description: 32-bits
                        - and:
                            - offset: 0x50 = IMAGE_NT_HEADERS64.OptionalHeader.SizeOfImage
                            - offset: 0x30 = IMAGE_NT_HEADERS64.OptionalHeader.ImageBase
                          description: 64-bits
                      description: PE headers offsets
        """
    )
    r = capa.rules.Rule.from_yaml(rule)

    # feature -> set of locations; the offsets present are 0x50 and 0x30.
    features = {
        Number(1): {1},
        Number(2): {2, 3},
        String("This program cannot be run in DOS mode."): {4},
        String("SELECT password FROM hidden_table WHERE user == admin"): {5},
        Offset(0x50): {6},
        Offset(0x30): {7},
    }
    outcome = r.evaluate(features)
    assert (outcome == True)
def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features parsed from an instruction's operand text.

    only operands containing "ptr" are considered, and operands mentioning
    esp/ebp/rbp are skipped. each remaining operand produces a plain Offset
    and an Offset tagged with the arch derived from `f.smda_report`, both
    reported at `insn.offset`.
    """
    # examples:
    #
    #     mov eax, [esi + 4]
    #     mov eax, [esi + ecx + 16384]
    frame_registers = ("esp", "ebp", "rbp")

    for piece in insn.operands.split(","):
        operand = piece.strip()

        if "ptr" not in operand:
            continue

        if any(reg in operand for reg in frame_registers):
            continue

        # prefer a hex literal, then fall back to a decimal one;
        # when neither is present the offset stays 0.
        number = 0
        for pattern, base in ((PATTERN_HEXNUM, 16), (PATTERN_SINGLENUM, 10)):
            found = re.search(pattern, operand)
            if found is None:
                continue
            number = int(found.group("num"), base)
            # the sign is part of the overall match, not the "num" group
            if found.group().startswith("-"):
                number = -1 * number
            break

        yield Offset(number), insn.offset
        yield Offset(number, arch=get_arch(f.smda_report)), insn.offset
def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features for register+displacement memory operands.

    each qualifying operand produces a plain Offset and an Offset tagged with
    the arch derived from `f.vw`, both reported at `insn.va`.
    """
    # example:
    #
    #     .text:0040112F    cmp     [esi+4], ebx
    for operand in insn.opers:
        # the i386 operand class is used for both x32 and x64
        if not isinstance(operand, envi.archs.i386.disasm.i386RegMemOper):
            continue

        base_reg = operand.reg
        # TODO: do x64 support for real (only RBP is checked for amd64).
        is_frame_relative = (
            base_reg == envi.archs.i386.disasm.REG_ESP
            or base_reg == envi.archs.i386.disasm.REG_EBP
            or base_reg == envi.archs.amd64.disasm.REG_RBP
        )
        if is_frame_relative:
            continue

        # viv already decodes offsets as signed
        displacement = operand.disp

        yield Offset(displacement), insn.va
        yield Offset(displacement, arch=get_arch(f.vw)), insn.va
def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features parsed from an instruction's operand strings.

    operands containing any key of `syntax.registers_cat['ptr']` are skipped
    (presumably stack/frame pointer register names - confirm against the
    syntax definition). each remaining operand produces a plain Offset and an
    Offset tagged with the arch derived from `f`, both reported at `insn.ea`.

    args:
      f: function object; `f.unit.syntax` is read here
         (NOTE(review): earlier docs called this an IDA func_t - confirm)
      bb: basic block (unused)
      insn: instruction exposing `oprs` (operand strings) and `ea`

    example:
      .text:0040112F    cmp     [esi+4], ebx
    """
    syntax = f.unit.syntax
    # tried in priority order; the first pattern that matches wins
    candidate_patterns = (
        (PATTERN_HEXNUM, 16),
        (PATTERN_HEXNUM_2, 16),
        (PATTERN_SINGLENUM, 10),
    )

    for raw_operand in insn.oprs:
        text = raw_operand.lower()

        if any(name in text for name in syntax.registers_cat['ptr'].keys()):
            continue

        # when no numeric literal is found the offset stays 0
        number = 0
        for pattern, base in candidate_patterns:
            found = re.search(pattern, text)
            if found is None:
                continue
            number = int(found.group("num"), base)
            # the sign is part of the overall match, not the "num" group
            if found.group().startswith("-"):
                number = -1 * number
            break

        yield Offset(number), insn.ea
        yield Offset(number, arch=get_arch(f)), insn.ea
def test_offset_arch():
    """an `offset/x32` rule matches only an Offset tagged with ARCH_X32."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - offset/x32: 2
        """
    )
    r = capa.rules.Rule.from_yaml(rule_text)

    x32_only = {Offset(2, arch=ARCH_X32): {1}}
    assert r.evaluate(x32_only) == True

    # neither an untagged offset nor an x64 offset satisfies the rule
    untagged = {Offset(2): {1}}
    assert r.evaluate(untagged) == False

    x64_only = {Offset(2, arch=ARCH_X64): {1}}
    assert r.evaluate(x64_only) == False
def extract_insn_number_features(f, bb, insn):
    """yield number features parsed from an instruction's operand text.

    every operand that parses as a hexadecimal integer produces a Number and
    an OperandNumber at `insn.offset`. for `add` instructions, small positive
    values additionally produce an Offset and an OperandOffset.
    """
    # example:
    #
    #     push    3136B0h         ; dwControlCode
    operands = [piece.strip() for piece in insn.operands.split(",")]
    is_add = insn.mnemonic == "add"

    if is_add and operands[0] in ("esp", "rsp"):
        # stack adjustments carry no interesting constant, e.g.:
        #
        #     .text:00401140    call    sub_407E2B
        #     .text:00401145    add     esp, 0Ch
        return

    for index, operand in enumerate(operands):
        try:
            # bitwise operations behave as if the value were two's complement
            # with infinitely many sign bits, so masking to the bitness of the
            # report yields the unsigned representation.
            value = int(operand, 16) & ((1 << f.smda_report.bitness) - 1)
        except ValueError:
            # not a numeric operand
            continue

        yield Number(value), insn.offset
        yield OperandNumber(index, value), insn.offset

        if is_add and 0 < value < MAX_STRUCTURE_SIZE:
            # for a pattern like:
            #
            #     add eax, 0x10
            #
            # treat 0x10 as an offset too (imagine eax is a pointer).
            yield Offset(value), insn.offset
            yield OperandOffset(index, value), insn.offset
def extract_op_offset_features(f, bb, insn, i, oper):
    """yield structure offset features for a single operand.

    args:
      f: function object; `f.vw.probeMemory` is used to test readability
      bb: basic block (unused)
      insn: instruction; `insn.va` is the reported address
      i: index of the operand within the instruction
      oper: the operand to inspect
    """
    # example:
    #
    #     .text:0040112F    cmp     [esi+4], ebx

    # the i386 operand classes are used for both x32 and x64.
    #
    # like: [esi + 4]
    #        reg   disp
    if isinstance(oper, envi.archs.i386.disasm.i386RegMemOper):
        base_reg = oper.reg
        # TODO: do x64 support for real (only RBP is checked for amd64).
        if (
            base_reg == envi.archs.i386.regs.REG_ESP
            or base_reg == envi.archs.i386.regs.REG_EBP
            or base_reg == envi.archs.amd64.regs.REG_RBP
        ):
            return

        # viv already decodes offsets as signed
        displacement = oper.disp

        yield Offset(displacement), insn.va
        yield OperandOffset(i, displacement), insn.va

        is_lea_source = insn.mnem == "lea" and i == 1
        if is_lea_source and not f.vw.probeMemory(displacement, 1, envi.memory.MM_READ):
            # for a pattern like:
            #
            #     lea eax, [ebx + 1]
            #
            # also report 1 as a number (imagine ebx is a zero register),
            # as long as the value is not a readable address.
            yield Number(displacement), insn.va
            yield OperandNumber(i, displacement), insn.va
        return

    # like: [esi + ecx + 16384]
    #        reg   index  disp
    if isinstance(oper, envi.archs.i386.disasm.i386SibOper):
        # viv already decodes offsets as signed
        displacement = oper.disp

        yield Offset(displacement), insn.va
        yield OperandOffset(i, displacement), insn.va
def test_count_offset_symbol():
    """`count(offset(N = description))` rules count locations of the offset feature."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - or:
                    - count(offset(2 = symbol name)): 1
                    - count(offset(0x100 = symbol name)): 2 or more
                    - count(offset(0x11 = (FLAG_A | FLAG_B))): 2 or more
        """
    )
    r = capa.rules.Rule.from_yaml(rule_text)

    # offset 2 must occur exactly once
    no_locations = {Offset(2): {}}
    assert r.evaluate(no_locations) == False
    one_location = {Offset(2): {1}}
    assert r.evaluate(one_location) == True
    two_locations = {Offset(2): {1, 2}}
    assert r.evaluate(two_locations) == False

    # offset 0x100 must occur at least twice
    once = {Offset(0x100, "symbol name"): {1}}
    assert r.evaluate(once) == False
    thrice = {Offset(0x100, "symbol name"): {1, 2, 3}}
    assert r.evaluate(thrice) == True
def test_offset_arch_symbol():
    """an `offset/x32` rule with an inline description matches the described x32 Offset."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - offset/x32: 2 = some constant
        """
    )
    r = capa.rules.Rule.from_yaml(rule_text)

    features = {Offset(2, arch=ARCH_X32, description="some constant"): {1}}
    assert r.evaluate(features) == True
def test_offset_symbol():
    """offsets with inline `= description` suffixes parse into the expected child features."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - and:
                    - offset: 1
                    - offset: 2 = symbol name
                    - offset: 3  =  symbol name
                    - offset: 4  =  symbol name = another name
                    - offset: 0x100 = symbol name
        """
    )
    r = capa.rules.Rule.from_yaml(rule_text)
    parsed = list(r.statement.get_children())

    assert (Offset(1) in parsed) == True
    assert (Offset(2, "symbol name") in parsed) == True
    assert (Offset(3, "symbol name") in parsed) == True
    # only the first `=` separates the value from the description
    assert (Offset(4, "symbol name = another name") in parsed) == True
    assert (Offset(0x100, "symbol name") in parsed) == True
def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features for an instruction's memory operands.

    each qualifying operand produces a plain Offset and an Offset tagged with
    the bitness derived from `f.vw`, both reported at `insn.va`.
    """
    # example:
    #
    #     .text:0040112F    cmp     [esi+4], ebx
    for operand in insn.opers:
        # the i386 operand classes are used for both x32 and x64.
        if isinstance(operand, envi.archs.i386.disasm.i386RegMemOper):
            # like: [esi + 4]
            #        reg   disp
            base_reg = operand.reg
            # TODO: do x64 support for real (only RBP is checked for amd64).
            if (
                base_reg == envi.archs.i386.regs.REG_ESP
                or base_reg == envi.archs.i386.regs.REG_EBP
                or base_reg == envi.archs.amd64.regs.REG_RBP
            ):
                continue
        elif not isinstance(operand, envi.archs.i386.disasm.i386SibOper):
            # neither [reg + disp] nor [reg + index + disp], e.g.
            # [esi + ecx + 16384]: nothing to report
            continue

        # viv already decodes offsets as signed
        displacement = operand.disp

        yield Offset(displacement), insn.va
        yield Offset(displacement, bitness=get_bitness(f.vw)), insn.va
def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features, per operand, for an instruction.

    each qualifying operand produces an Offset and an OperandOffset at
    `insn.ea`; the source operand of a non-SIB `lea` additionally produces a
    Number and an OperandNumber.

    args:
      f (IDA func_t): unused
      bb (IDA BasicBlock): unused
      insn (IDA insn_t)

    example:
      .text:0040112F    cmp     [esi+4], ebx
    """
    ida_helpers = capa.features.extractors.ida.helpers
    memory_op_types = (idaapi.o_phrase, idaapi.o_displ)

    for index, operand in enumerate(insn.ops):
        # the first void operand ends the scan
        if operand.type == idaapi.o_void:
            break

        if operand.type not in memory_op_types:
            continue

        # stack variables are not structure offsets
        if ida_helpers.is_op_stack_var(insn.ea, operand.n):
            continue

        raw_offset = ida_helpers.get_op_phrase_info(operand).get("offset", 0)

        # a displacement that is itself a mapped address is skipped, e.g.:
        #
        #     mov esi, dword_1005B148[esi]
        if idaapi.is_mapped(raw_offset):
            continue

        # IDA is believed to encode all offsets as two's complement in a u32;
        # a 64-bit displacement isn't a thing, see:
        # https://stackoverflow.com/questions/31853189/x86-64-assembly-why-displacement-not-64-bits
        offset = capa.features.extractors.helpers.twos_complement(raw_offset, 32)

        yield Offset(offset), insn.ea
        yield OperandOffset(index, offset), insn.ea

        if insn.itype != idaapi.NN_lea or index != 1:
            continue

        # o_displ is used for both:
        #
        #     [eax+1]
        #     [eax+ebx+2]
        #
        # but the SIB is only present for [eax+ebx+2], which we don't want.
        if operand.type == idaapi.o_displ and not ida_helpers.has_sib(operand):
            # for a pattern like:
            #
            #     lea eax, [ebx + 1]
            #
            # also report 1 as a number (imagine ebx is a zero register).
            yield Number(offset), insn.ea
            yield OperandNumber(index, offset), insn.ea
def extract_insn_number_features(f, bb, insn):
    """yield number features, per operand, for an instruction.

    each immediate or memory operand that is not an offset reference produces
    a Number and an OperandNumber at `insn.ea`. small positive immediates of
    an `add` additionally produce an Offset and an OperandOffset.

    args:
      f (IDA func_t): unused
      bb (IDA BasicBlock): unused
      insn (IDA insn_t)

    example:
      push    3136B0h         ; dwControlCode
    """
    if idaapi.is_ret_insn(insn):
        # skip things like:
        #     .text:0042250E retn    8
        return

    ida_helpers = capa.features.extractors.ida.helpers

    if ida_helpers.is_sp_modified(insn):
        # skip things like:
        #     .text:00401145 add     esp, 0Ch
        return

    for index, operand in enumerate(insn.ops):
        # the first void operand ends the scan
        if operand.type == idaapi.o_void:
            break

        if operand.type not in (idaapi.o_imm, idaapi.o_mem):
            continue

        # skip things like:
        #     .text:00401100 shr     eax, offset loc_C
        if ida_helpers.is_op_offset(insn, operand):
            continue

        # immediates use the masked operand value; memory operands use their address
        value = ida_helpers.mask_op_val(operand) if operand.type == idaapi.o_imm else operand.addr

        yield Number(value), insn.ea
        yield OperandNumber(index, value), insn.ea

        if insn.itype == idaapi.NN_add and 0 < value < MAX_STRUCTURE_SIZE and operand.type == idaapi.o_imm:
            # for a pattern like:
            #
            #     add eax, 0x10
            #
            # treat 0x10 as an offset too (imagine eax is a pointer).
            yield Offset(value), insn.ea
            yield OperandOffset(index, value), insn.ea
def extract_op_number_features(f, bb, insn, i, oper):
    """yield number features for a single operand.

    args:
      f: function object; `f.vw.probeMemory` is used to test readability
      bb: basic block (unused)
      insn: instruction; `insn.va` is the reported address
      i: index of the operand within the instruction
      oper: the operand to inspect
    """
    # example:
    #
    #     push    3136B0h         ; dwControlCode

    # the i386 operand classes are used for both x32 and x64.
    is_immediate = isinstance(oper, envi.archs.i386.disasm.i386ImmOper)

    if is_immediate:
        value = oper.getOperValue(oper)
    elif isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper):
        value = oper.getOperAddr(oper)
    else:
        return

    if f.vw.probeMemory(value, 1, envi.memory.MM_READ):
        # this is a valid address;
        # assume it's not also a constant.
        return

    if insn.mnem == "add":
        destination = insn.opers[0]
        if destination.isReg() and destination.reg == envi.archs.i386.regs.REG_ESP:
            # skip things like:
            #
            #     .text:00401140    call    sub_407E2B
            #     .text:00401145    add     esp, 0Ch
            return

    yield Number(value), insn.va
    yield OperandNumber(i, value), insn.va

    if insn.mnem == "add" and 0 < value < MAX_STRUCTURE_SIZE and is_immediate:
        # for a pattern like:
        #
        #     add eax, 0x10
        #
        # treat 0x10 as an offset too (imagine eax is a pointer).
        yield Offset(value), insn.va
        yield OperandOffset(i, value), insn.va
def extract_insn_offset_features(f, bb, insn):
    """yield structure offset features, per operand, parsed from operand text.

    operands mentioning esp/ebp/rbp are skipped. operands containing "ptr"
    produce an Offset and an OperandOffset at `insn.offset`. the source
    operand of a simple `lea` (one +/- term, no scaling) produces a Number
    and an OperandNumber instead.
    """
    # examples:
    #
    #     mov eax, [esi + 4]
    #     mov eax, [esi + ecx + 16384]
    frame_registers = ("esp", "ebp", "rbp")
    operands = [piece.strip() for piece in insn.operands.split(",")]

    for index, operand in enumerate(operands):
        if any(reg in operand for reg in frame_registers):
            continue

        # prefer a hex literal, then fall back to a decimal one;
        # when neither is present the value stays 0.
        number = 0
        for pattern, base in ((PATTERN_HEXNUM, 16), (PATTERN_SINGLENUM, 10)):
            found = re.search(pattern, operand)
            if found is None:
                continue
            number = int(found.group("num"), base)
            # the sign is part of the overall match, not the "num" group
            if found.group().startswith("-"):
                number = -1 * number
            break

        if "ptr" in operand:
            yield Offset(number), insn.offset
            yield OperandOffset(index, number), insn.offset
            continue

        term_count = operand.count("+") + operand.count("-")
        if insn.mnemonic == "lea" and index == 1 and term_count == 1 and "*" not in operand:
            # for a pattern like:
            #
            #     lea eax, [ebx + 1]
            #
            # report 1 as a number (imagine ebx is a zero register).
            yield Number(number), insn.offset
            yield OperandNumber(index, number), insn.offset
def extract_insn_offset_features(f, bb, insn):
    """
    yield structure offset features found in an instruction's operands.

    each qualifying operand with a non-zero, unmapped offset produces a single
    Offset feature at `insn.ea`.

    args:
        f (IDA func_t): unused
        bb (IDA BasicBlock): unused
        insn (IDA insn_t)

    example:
        .text:0040112F    cmp     [esi+4], ebx
    """
    ida_helpers = capa.features.extractors.ida.helpers
    wanted_types = (idaapi.o_phrase, idaapi.o_displ)

    for operand in ida_helpers.get_insn_ops(insn, target_ops=wanted_types):
        # stack variables are not structure offsets
        if ida_helpers.is_op_stack_var(insn.ea, operand.n):
            continue

        offset = ida_helpers.get_op_phrase_info(operand).get("offset", 0)

        # a zero offset is not reported, and neither is a displacement that is
        # itself a mapped address, e.g.:
        #
        #     mov esi, dword_1005B148[esi]
        if offset == 0 or idaapi.is_mapped(offset):
            continue

        yield Offset(offset), insn.ea