def test_rule_yaml_descriptions():
    """A rule whose features and statements carry descriptions still parses and matches."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - and:
                    - number: 1 = This is the number 1
                    - string: This program cannot be run in DOS mode.
                      description: MS-DOS stub message
                    - string: '/SELECT.*FROM.*WHERE/i'
                      description: SQL WHERE Clause
                    - count(number(2 = AF_INET/SOCK_DGRAM)): 2
                    - or:
                        - and:
                            - offset: 0x50 = IMAGE_NT_HEADERS.OptionalHeader.SizeOfImage
                            - offset: 0x34 = IMAGE_NT_HEADERS.OptionalHeader.ImageBase
                          description: 32-bits
                        - and:
                            - offset: 0x50 = IMAGE_NT_HEADERS64.OptionalHeader.SizeOfImage
                            - offset: 0x30 = IMAGE_NT_HEADERS64.OptionalHeader.ImageBase
                          description: 64-bits
                      description: PE headers offsets
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    # Number(2) is present at two locations to satisfy the count of 2;
    # the offsets 0x50 and 0x30 are the pair listed under the "64-bits" branch.
    features = {
        Number(1): {1},
        Number(2): {2, 3},
        String("This program cannot be run in DOS mode."): {4},
        String("SELECT password FROM hidden_table WHERE user == admin"): {5},
        Offset(0x50): {6},
        Offset(0x30): {7},
    }
    assert (parsed.evaluate(features) == True)
def test_rule_yaml():
    """A rule with full metadata and an `and` of two numbers matches only when both are present."""
    # The author field must be a plain scalar address. An e-mail-obfuscation
    # placeholder in square brackets would be parsed by YAML as a flow sequence
    # instead of a string.
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
                author: user@example.com
                scope: function
                examples:
                    - foo1234
                    - bar5678
            features:
                - and:
                    - number: 1
                    - number: 2
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    def features_for(*values):
        # every listed number is reported at the single location 1
        return {Number(value): {1} for value in values}

    assert parsed.evaluate(features_for(0)) == False
    assert parsed.evaluate(features_for(0, 1)) == False
    assert parsed.evaluate(features_for(0, 1, 2)) == True
    # extra, unrelated features do not prevent the match
    assert parsed.evaluate(features_for(0, 1, 2, 3)) == True
def extract_insn_number_features(f, bb, insn):
    """Yield number features for the immediate operands of an instruction.

    args:
        f (IDA func_t)
        bb (IDA BasicBlock)
        insn (IDA insn_t)

    yields:
        (Number, insn.ea) pairs. For each immediate operand whose value is not
        a mapped address: a plain Number, then a Number tagged with the
        architecture returned by get_arch(f.ctx).

    example:
        push    3136B0h         ; dwControlCode
    """
    helpers = capa.features.extractors.ida.helpers

    # skip return instructions and stack pointer adjustments, like:
    #   .text:0042250E retn 8
    #   .text:00401145 add esp, 0Ch
    if idaapi.is_ret_insn(insn) or helpers.is_sp_modified(insn):
        return

    for operand in helpers.get_insn_ops(insn, target_ops=(idaapi.o_imm,)):
        value = helpers.mask_op_val(operand)
        if idaapi.is_mapped(value):
            # the immediate is a mapped address; it is not reported as a number
            continue

        yield Number(value), insn.ea
        yield Number(value, arch=get_arch(f.ctx)), insn.ea
def extract_insn_number_features(f, bb, insn):
    """Yield number features for the immediate operands of an instruction.

    For every immediate (or immediate-memory) operand whose value is not a
    readable address in the workspace, yields a plain Number and then a Number
    tagged with the architecture from get_arch(f.vw), both at insn.va.

    example:
        push    3136B0h         ; dwControlCode
    """
    disasm = envi.archs.i386.disasm

    for operand in insn.opers:
        # these operand classes are used for both x32 and x64
        if isinstance(operand, disasm.i386ImmOper):
            value = operand.getOperValue(operand)
        elif isinstance(operand, disasm.i386ImmMemOper):
            value = operand.getOperAddr(operand)
        else:
            continue

        if f.vw.probeMemory(value, 1, envi.memory.MM_READ):
            # the value is a valid address; assume it is not also a constant
            continue

        first = insn.opers[0]
        if insn.mnem == "add" and first.isReg() and first.reg == disasm.REG_ESP:
            # stack cleanup, like:
            #
            #    .text:00401140                 call    sub_407E2B
            #    .text:00401145                 add     esp, 0Ch
            #
            # stop processing this instruction entirely
            return

        yield Number(value), insn.va
        yield Number(value, arch=get_arch(f.vw)), insn.va
def extract_insn_number_features(f, bb, insn):
    """Yield number features for the immediate and memory operands of an instruction.

    args:
        f (IDA func_t)
        bb (IDA BasicBlock)
        insn (IDA insn_t)

    yields:
        (Number, insn.ea) pairs. For each o_imm/o_mem operand that is not
        flagged as an offset: a plain Number, then a Number tagged with the
        architecture returned by get_arch(f.ctx).

    example:
        push    3136B0h         ; dwControlCode
    """
    helpers = capa.features.extractors.ida.helpers

    # skip return instructions and stack pointer adjustments, like:
    #   .text:0042250E retn 8
    #   .text:00401145 add esp, 0Ch
    if idaapi.is_ret_insn(insn) or helpers.is_sp_modified(insn):
        return

    wanted_types = (idaapi.o_imm, idaapi.o_mem)
    for operand in helpers.get_insn_ops(insn, target_ops=wanted_types):
        if helpers.is_op_offset(insn, operand):
            # skip things like:
            #   .text:00401100 shr eax, offset loc_C
            continue

        # immediates are masked by the helper; memory operands contribute their address
        value = helpers.mask_op_val(operand) if operand.type == idaapi.o_imm else operand.addr

        yield Number(value), insn.ea
        yield Number(value, arch=get_arch(f.ctx)), insn.ea
def extract_insn_number_features(f, bb, insn):
    """Yield number features for the constants found in an instruction's operands.

    args:
        f: function object; `f.unit` and `f.unit.syntax` are read here
        bb: basic block (not used by this function)
        insn: instruction; `oprs`, `oprs_tp`, `mne` and `ea` are read here

    NOTE(review): the previous docstring labelled these as IDA func_t /
    BasicBlock / insn_t, which does not match the attributes used - confirm.

    yields:
        (Number, insn.ea) pairs: for each constant returned by
        get_opr_constant, a plain Number and then one tagged with get_arch(f).

    example:
        push    3136B0h         ; dwControlCode
    """
    unit: DataUnit = f.unit
    # get from cache (AttrDict will not add new attribute to json)
    syntax: Assembly = f.unit.syntax

    # nothing to extract from an instruction without operands
    if len(insn.oprs) == 0:
        return

    # an ADD whose first operand mentions a register from the 'ptr' category is skipped
    first_operand = insn.oprs[0].lower()
    if 'ADD' in insn.mne and any(
            reg in first_operand for reg in syntax.registers_cat['ptr'].keys()):
        return

    # operations flagged as jumps by the syntax table are skipped
    if insn.mne in unit.syntax.operations and unit.syntax.operations[insn.mne].jmp:
        return

    for value in get_opr_constant(insn.oprs, insn.oprs_tp, True):
        yield Number(value), insn.ea
        yield Number(value, arch=get_arch(f)), insn.ea
def test_rule_yaml_count():
    """`count(number(100)): 1` matches only when the number occurs at exactly one location."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - count(number(100)): 1
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    no_locations = {Number(100): {}}
    one_location = {Number(100): {1}}
    two_locations = {Number(100): {1, 2}}

    assert parsed.evaluate(no_locations) == False
    assert parsed.evaluate(one_location) == True
    assert parsed.evaluate(two_locations) == False
def test_number_arch():
    """A `number/x32` feature matches only a Number tagged with ARCH_X32."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - number/x32: 2
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    tagged_x32 = {Number(2, arch=ARCH_X32): {1}}
    untagged = {Number(2): {1}}
    tagged_x64 = {Number(2, arch=ARCH_X64): {1}}

    assert parsed.evaluate(tagged_x32) == True
    assert parsed.evaluate(untagged) == False
    assert parsed.evaluate(tagged_x64) == False
def test_rule_yaml_not():
    """`number: 1` combined with `not: number: 2` matches only while number 2 is absent."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - and:
                    - number: 1
                    - not:
                        - number: 2
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    only_one = {Number(1): {1}}
    one_and_two = {Number(1): {1}, Number(2): {1}}

    assert parsed.evaluate(only_one) == True
    assert parsed.evaluate(one_and_two) == False
def extract_insn_number_features(f, bb, insn):
    """Yield number (and possible offset) features from an instruction's operand text.

    Each comma-separated operand that parses as a hexadecimal literal yields a
    Number and an OperandNumber at insn.offset. For `add` instructions with a
    value strictly between 0 and MAX_STRUCTURE_SIZE, an Offset and an
    OperandOffset are yielded as well.

    example:
        push    3136B0h         ; dwControlCode
    """
    operand_texts = [part.strip() for part in insn.operands.split(",")]

    if insn.mnemonic == "add" and operand_texts[0] in ("esp", "rsp"):
        # stack cleanup, like:
        #
        #    .text:00401140                 call    sub_407E2B
        #    .text:00401145                 add     esp, 0Ch
        return

    for index, text in enumerate(operand_texts):
        try:
            # Python applies bitwise operators as if the integer had an infinite
            # number of sign bits, so masking to the report's bitness turns a
            # negative value into its unsigned two's complement form.
            value = int(text, 16) & ((1 << f.smda_report.bitness) - 1)
        except ValueError:
            # operand is not a bare hex literal
            continue

        yield Number(value), insn.offset
        yield OperandNumber(index, value), insn.offset

        if insn.mnemonic != "add":
            continue
        if not (0 < value < MAX_STRUCTURE_SIZE):
            continue

        # for pattern like:
        #
        #     add eax, 0x10
        #
        # assume 0x10 is also an offset (imagine eax is a pointer).
        yield Offset(value), insn.offset
        yield OperandOffset(index, value), insn.offset
def test_count_number_symbol():
    """`count(number(... = symbol))` terms parse and are evaluated by location count."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - or:
                    - count(number(2 = symbol name)): 1
                    - count(number(0x100 = symbol name)): 2 or more
                    - count(number(0x11 = (FLAG_A | FLAG_B))): 2 or more
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    # number 2 must occur at exactly one location
    for locations, expected in (({}, False), ({1}, True), ({1, 2}, False)):
        assert parsed.evaluate({Number(2): locations}) == expected

    # number 0x100 must occur at two or more locations
    single_hit = {Number(0x100, "symbol name"): {1}}
    triple_hit = {Number(0x100, "symbol name"): {1, 2, 3}}
    assert parsed.evaluate(single_hit) == False
    assert parsed.evaluate(triple_hit) == True
def test_rule_yaml_complex():
    """Nested `or` / `and` / `2 or more` statements evaluate as a tree."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - or:
                    - and:
                        - number: 1
                        - number: 2
                    - or:
                        - number: 3
                        - 2 or more:
                            - number: 4
                            - number: 5
                            - number: 6
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    # 5 and 6 together satisfy the "2 or more" branch
    two_of_three = {Number(value): {1} for value in (5, 6, 7, 8)}
    assert parsed.evaluate(two_of_three) == True

    # 6 alone is only one of the required two
    one_of_three = {Number(value): {1} for value in (6, 7, 8)}
    assert parsed.evaluate(one_of_three) == False
def extract_insn_number_features(f, bb, insn):
    """Yield number features parsed from an instruction's operand text.

    Each comma-separated operand that parses as a hexadecimal literal yields a
    plain Number and then a Number tagged with the architecture returned by
    get_arch(f.smda_report), both at insn.offset. Operands that are not bare
    hex literals are skipped.

    Only the integer parse is guarded. The yields sit outside the `try`, so
    closing the generator early is not intercepted, and errors raised while
    building features propagate to the caller.

    example:
        push    3136B0h         ; dwControlCode
    """
    operands = [o.strip() for o in insn.operands.split(",")]

    if insn.mnemonic == "add" and operands[0] in ["esp", "rsp"]:
        # skip things like:
        #
        #    .text:00401140                 call    sub_407E2B
        #    .text:00401145                 add     esp, 0Ch
        return

    for operand in operands:
        try:
            value = int(operand, 16)
        except ValueError:
            # operand is not a bare hex literal
            continue

        yield Number(value), insn.offset
        yield Number(value, arch=get_arch(f.smda_report)), insn.offset
def test_number_arch_symbol():
    """A `number/x32` feature with a symbol matches the arch-tagged Number carrying that description."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - number/x32: 2 = some constant
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    features = {Number(2, arch=ARCH_X32, description="some constant"): {1}}
    assert parsed.evaluate(features) == True
def test_rule_yaml_descriptions():
    """A rule whose number, string and count features carry descriptions still parses and matches."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - and:
                    - number: 1 = This is the number 1
                    - string: This program cannot be run in DOS mode.
                      description: MS-DOS stub message
                    - string: '/SELECT.*FROM.*WHERE/i'
                      description: SQL WHERE Clause
                    - count(number(2 = AF_INET/SOCK_DGRAM)): 2
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)

    # Number(2) is present at two locations to satisfy the count of 2
    features = {
        Number(1): {1},
        Number(2): {2, 3},
        String("This program cannot be run in DOS mode."): {4},
        String("SELECT password FROM hidden_table WHERE user == admin"): {5},
    }
    assert (parsed.evaluate(features) == True)
def test_number_symbol():
    """`number: <value> = <symbol>` parses into Number children carrying the symbol as description."""
    rule_text = textwrap.dedent(
        """
        rule:
            meta:
                name: test rule
            features:
                - and:
                    - number: 1
                    - number: 0xFFFFFFFF
                    - number: 2 = symbol name
                    - number: 3  =  symbol name
                    - number: 4  =  symbol name = another name
                    - number: 0x100 = symbol name
                    - number: 0x11 = (FLAG_A | FLAG_B)
        """
    )
    parsed = capa.rules.Rule.from_yaml(rule_text)
    children = list(parsed.statement.get_children())

    expected_children = [
        Number(1),
        Number(0xFFFFFFFF),
        Number(2, description="symbol name"),
        Number(3, description="symbol name"),
        # only the first " = " separates the value from its description
        Number(4, description="symbol name = another name"),
        Number(0x100, description="symbol name"),
    ]
    for expected in expected_children:
        assert (expected in children) == True
def extract_insn_number_features(f, bb, insn):
    """Yield number features parsed from an instruction's operand text.

    Each comma-separated operand that parses as a hexadecimal literal is masked
    to the report's bitness, then yields a plain Number and a Number tagged
    with get_bitness(f.smda_report), both at insn.offset. Operands that are
    not bare hex literals are skipped.

    Only the parse/mask step is guarded, and only against ValueError. The
    yields sit outside the `try`, so closing the generator early is not
    intercepted, and errors raised while building features propagate to the
    caller.

    example:
        push    3136B0h         ; dwControlCode
    """
    operands = [o.strip() for o in insn.operands.split(",")]

    if insn.mnemonic == "add" and operands[0] in ["esp", "rsp"]:
        # skip things like:
        #
        #    .text:00401140                 call    sub_407E2B
        #    .text:00401145                 add     esp, 0Ch
        return

    for operand in operands:
        try:
            # The result of bitwise operations is calculated as though carried out
            # in two's complement with an infinite number of sign bits
            value = int(operand, 16) & ((1 << f.smda_report.bitness) - 1)
        except ValueError:
            # operand is not a bare hex literal
            continue

        yield Number(value), insn.offset
        yield Number(value, bitness=get_bitness(f.smda_report)), insn.offset
def extract_insn_offset_features(f, bb, insn):
    """Yield structure offset features for the phrase/displacement operands of an instruction.

    args:
        f (IDA func_t)
        bb (IDA BasicBlock)
        insn (IDA insn_t)

    yields:
        (feature, insn.ea) pairs: an Offset and an OperandOffset per qualifying
        operand, plus a Number and an OperandNumber for the second operand of
        a `lea` that is a displacement without a SIB byte.

    example:
        .text:0040112F cmp [esi+4], ebx
    """
    ida_helpers = capa.features.extractors.ida.helpers

    for index, operand in enumerate(insn.ops):
        if operand.type == idaapi.o_void:
            # the first void operand ends the scan
            break

        if operand.type not in (idaapi.o_phrase, idaapi.o_displ):
            continue

        if ida_helpers.is_op_stack_var(insn.ea, operand.n):
            continue

        phrase_info = ida_helpers.get_op_phrase_info(operand)
        offset = phrase_info.get("offset", 0)

        if idaapi.is_mapped(offset):
            # Ignore:
            #   mov esi, dword_1005B148[esi]
            continue

        # I believe that IDA encodes all offsets as two's complement in a u32.
        # a 64-bit displacement isn't a thing, see:
        # https://stackoverflow.com/questions/31853189/x86-64-assembly-why-displacement-not-64-bits
        offset = capa.features.extractors.helpers.twos_complement(offset, 32)

        yield Offset(offset), insn.ea
        yield OperandOffset(index, offset), insn.ea

        if insn.itype != idaapi.NN_lea or index != 1:
            continue

        # o_displ is used for both:
        #   [eax+1]
        #   [eax+ebx+2]
        if operand.type != idaapi.o_displ:
            continue

        # but the SIB is only present for [eax+ebx+2], which we don't want
        if ida_helpers.has_sib(operand):
            continue

        # for pattern like:
        #
        #     lea eax, [ebx + 1]
        #
        # also report 1 as a number (imagine ebx is a zero register).
        yield Number(offset), insn.ea
        yield OperandNumber(index, offset), insn.ea
def extract_op_offset_features(f, bb, insn, i, oper):
    """Yield structure offset features for a single operand.

    Handles register+displacement operands (except those based on ESP, EBP or
    RBP) and SIB operands; any other operand type yields nothing. `i` is the
    operand index and is recorded in the Operand* features.

    example:
        .text:0040112F    cmp     [esi+4], ebx
    """
    disasm = envi.archs.i386.disasm

    # these operand classes are used for both x32 and x64

    # like [esi + 4]
    #       reg   ^
    #             disp
    if isinstance(oper, disasm.i386RegMemOper):
        # TODO: do x64 support for real.
        frame_registers = (
            envi.archs.i386.regs.REG_ESP,
            envi.archs.i386.regs.REG_EBP,
            envi.archs.amd64.regs.REG_RBP,
        )
        if oper.reg in frame_registers:
            return

        # viv already decodes offsets as signed
        displacement = oper.disp

        yield Offset(displacement), insn.va
        yield OperandOffset(i, displacement), insn.va

        if insn.mnem != "lea" or i != 1:
            return
        if f.vw.probeMemory(displacement, 1, envi.memory.MM_READ):
            # the displacement is a readable address; do not report it as a number
            return

        # for pattern like:
        #
        #     lea eax, [ebx + 1]
        #
        # also report 1 as a number (imagine ebx is a zero register).
        yield Number(displacement), insn.va
        yield OperandNumber(i, displacement), insn.va
        return

    # like: [esi + ecx + 16384]
    #        reg   ^     ^
    #              index ^
    #                    disp
    if isinstance(oper, disasm.i386SibOper):
        # viv already decodes offsets as signed
        displacement = oper.disp

        yield Offset(displacement), insn.va
        yield OperandOffset(i, displacement), insn.va
def extract_insn_number_features(f, bb, insn):
    """Yield number (and possible offset) features for an instruction's operands.

    args:
        f (IDA func_t)
        bb (IDA BasicBlock)
        insn (IDA insn_t)

    yields:
        (feature, insn.ea) pairs: a Number and an OperandNumber per o_imm/o_mem
        operand not flagged as an offset, plus an Offset and an OperandOffset
        for immediates of `add` strictly between 0 and MAX_STRUCTURE_SIZE.

    example:
        push    3136B0h         ; dwControlCode
    """
    ida_helpers = capa.features.extractors.ida.helpers

    # skip return instructions and stack pointer adjustments, like:
    #   .text:0042250E retn 8
    #   .text:00401145 add esp, 0Ch
    if idaapi.is_ret_insn(insn) or ida_helpers.is_sp_modified(insn):
        return

    for index, operand in enumerate(insn.ops):
        if operand.type == idaapi.o_void:
            # the first void operand ends the scan
            break

        if operand.type not in (idaapi.o_imm, idaapi.o_mem):
            continue

        if ida_helpers.is_op_offset(insn, operand):
            # skip things like:
            #   .text:00401100 shr eax, offset loc_C
            continue

        is_immediate = operand.type == idaapi.o_imm
        # immediates are masked by the helper; memory operands contribute their address
        value = ida_helpers.mask_op_val(operand) if is_immediate else operand.addr

        yield Number(value), insn.ea
        yield OperandNumber(index, value), insn.ea

        if insn.itype == idaapi.NN_add and 0 < value < MAX_STRUCTURE_SIZE and is_immediate:
            # for pattern like:
            #
            #     add eax, 0x10
            #
            # assume 0x10 is also an offset (imagine eax is a pointer).
            yield Offset(value), insn.ea
            yield OperandOffset(index, value), insn.ea
def extract_op_number_features(f, bb, insn, i, oper):
    """Yield number (and possible offset) features for a single operand.

    Only immediate and immediate-memory operands are considered. Values that
    are readable addresses in the workspace are skipped, as is any operand of
    an `add` whose first operand is the ESP register. `i` is the operand index
    and is recorded in the Operand* features.

    example:
        push    3136B0h         ; dwControlCode
    """
    disasm = envi.archs.i386.disasm

    # these operand classes are used for both x32 and x64
    is_immediate = isinstance(oper, disasm.i386ImmOper)
    if is_immediate:
        value = oper.getOperValue(oper)
    elif isinstance(oper, disasm.i386ImmMemOper):
        value = oper.getOperAddr(oper)
    else:
        return

    if f.vw.probeMemory(value, 1, envi.memory.MM_READ):
        # the value is a valid address; assume it is not also a constant
        return

    is_add = insn.mnem == "add"

    if is_add and insn.opers[0].isReg() and insn.opers[0].reg == envi.archs.i386.regs.REG_ESP:
        # stack cleanup, like:
        #
        #    .text:00401140                 call    sub_407E2B
        #    .text:00401145                 add     esp, 0Ch
        return

    yield Number(value), insn.va
    yield OperandNumber(i, value), insn.va

    if is_add and 0 < value < MAX_STRUCTURE_SIZE and is_immediate:
        # for pattern like:
        #
        #     add eax, 0x10
        #
        # assume 0x10 is also an offset (imagine eax is a pointer).
        yield Offset(value), insn.va
        yield OperandOffset(i, value), insn.va
def extract_insn_offset_features(f, bb, insn):
    """Yield structure offset features parsed from an instruction's operand text.

    Operands mentioning esp, ebp or rbp are skipped. For the rest, the number
    is taken from the first PATTERN_HEXNUM match, else the first
    PATTERN_SINGLENUM match, else 0. Operands containing "ptr" yield an Offset
    and an OperandOffset. Operands without "ptr" yield a Number and an
    OperandNumber only for the simple `lea` form described below.

    examples:
        mov eax, [esi + 4]
        mov eax, [esi + ecx + 16384]
    """
    operand_texts = [part.strip() for part in insn.operands.split(",")]

    for index, text in enumerate(operand_texts):
        if any(register in text for register in ("esp", "ebp", "rbp")):
            continue

        hex_match = re.search(PATTERN_HEXNUM, text)
        int_match = re.search(PATTERN_SINGLENUM, text)

        # the hex form wins when both patterns match
        if hex_match:
            magnitude = int(hex_match.group("num"), 16)
            matched_text = hex_match.group()
        elif int_match:
            magnitude = int(int_match.group("num"))
            matched_text = int_match.group()
        else:
            magnitude = 0
            matched_text = ""

        # the sign is carried by the matched text, not by the "num" group
        if matched_text.startswith("-"):
            number = -1 * magnitude
        else:
            number = magnitude

        if "ptr" in text:
            yield Offset(number), insn.offset
            yield OperandOffset(index, number), insn.offset
            continue

        # exactly one "+" or "-" and no scaling
        single_term = (text.count("+") + text.count("-")) == 1 and text.count("*") == 0
        if insn.mnemonic == "lea" and index == 1 and single_term:
            # for pattern like:
            #
            #     lea eax, [ebx + 1]
            #
            # report 1 as a number (imagine ebx is a zero register).
            yield Number(number), insn.offset
            yield OperandNumber(index, number), insn.offset
def test_rule_ctor():
    """A Rule built directly from a Number statement matches only when that number is present."""
    rule = capa.rules.Rule("test rule", capa.rules.FUNCTION_SCOPE, Number(1), {})

    without_number_one = {Number(0): {1}}
    with_number_one = {Number(1): {1}}

    assert rule.evaluate(without_number_one) == False
    assert rule.evaluate(with_number_one) == True
def extract_insn_number_features(
        f: CilMethodBody, bb: CilMethodBody,
        insn: Instruction) -> Iterator[Tuple[Number, int]]:
    """Yield a number feature for an instruction that reports itself as an ldc.

    When insn.is_ldc() is true, yields one (Number, insn.offset) pair built
    from insn.get_ldc(); otherwise yields nothing. `f` and `bb` are not used.
    """
    if not insn.is_ldc():
        return

    yield Number(insn.get_ldc()), insn.offset