def resolve_op(ist, opnum):
    """Translate operand ``opnum`` of a decoded instruction into a symbolic value.

    ist   -- decoded instruction exposing ``operands`` and ``mnemonic``
    opnum -- index into ``ist.operands``

    Returns:
      * ``AbsoluteMemory``: ``index * scale + base + disp`` wrapped in
        ``DEREF(op_size, ...)``; for ``lea`` the bare address expression is
        returned instead, because ``lea`` never reads memory.
      * ``Register``: the lower-cased register symbol.
      * ``Immediate``: the immediate lifted with ``symbolic.symbolic``.
      * ``AbsoluteMemoryAddress``: ``DEREF(op_size, disp)``.

    Raises BaseException for any other operand type (kept as-is so existing
    callers that catch it keep working).
    """
    op = ist.operands[opnum]

    if op.type == 'AbsoluteMemory':
        rv = 0
        if op.index is not None:
            rv += symbolic.symbols(distorm3.Registers[op.index].lower()) * op.scale
        if op.base is not None:
            rv += symbolic.symbols(distorm3.Registers[op.base].lower())
        if op.disp is not None:
            # lift the displacement so rv is always a symbolic expression and
            # the .simplify() call below cannot land on a plain int
            rv += symbolic.symbolic(op.disp)
        if ist.mnemonic.lower() == 'lea':
            return rv
        return DEREF(op.op_size, rv.simplify())

    if op.type == 'Register':
        return symbolic.symbols(distorm3.Registers[op.index].lower())

    if op.type == 'Immediate':
        return symbolic.symbolic(op.value)

    if op.type == 'AbsoluteMemoryAddress':
        return DEREF(op.op_size, op.disp)

    raise BaseException("Unknown Operand Type %s" % (op.type))
def reg_mask(exp):
    """Express a sub-register in terms of its 32-bit parent register.

    8-bit low registers become ``0xff & E?X``, 8-bit high registers become
    ``(0xff00 & E?X) >> 8`` and 16-bit registers become ``0xffff & E??``.
    Anything else is returned unchanged.
    """
    low_bytes = (AL, BL, CL, DL)
    high_bytes = (AH, BH, CH, DH)
    words = (AX, BX, CX, DX, DI, SI, BP, SP)

    if exp in low_bytes:
        parent = symbols('E%sX' % (exp.name[0]))
        return 0xff & parent

    if exp in high_bytes:
        parent = symbols('E%sX' % (exp.name[0]))
        return (0xff00 & parent) >> 8

    if exp in words:
        parent = symbols('E%s' % (exp.name))
        return 0xffff & parent

    return exp
def xtest_print_edit_distance_metric(self):
    '''
    Print the memoized sub-results of an edit_distance call as a matrix.

    Disabled (the "x" prefix hides it from the test runner) because
    _tuple_edit_distance no longer memoizes, so .clear_results() and
    .results are not available any more.
    '''
    import symath.algorithms.editdistance as ed
    # explicit import: a wildcard import inside a function is a SyntaxWarning
    # on Python 2 and a SyntaxError on Python 3
    import numpy

    ed._tuple_edit_distance.clear_results()

    x, y, z, w = symath.symbols('x y z w')
    exp1 = x(y, z, w, w, x)
    exp2 = x(w, z, w, y)

    print('')
    print('edit_distance(%s, %s) = %d' % (exp1, exp2, ed.edit_distance(exp1, exp2)))

    rv = ed._tuple_edit_distance.results
    util.pretty(rv)

    # -1 marks cells for which no sub-result was recorded
    m = numpy.full((len(exp1), len(exp2)), -1, dtype=int)
    for k in rv:
        m[len(k[0][0]), len(k[0][1])] = rv[k][0]

    print(m)
def test_summation(self):
    """Differentiating Sum(n, x(n) ** y) by x yields Sum(n, y * x(n) ** (y - 1))."""
    x, n, y = symath.symbols('x n y')

    series = symath.functions.Sum(n, x(n) ** y)
    derivative = diff(series, x).simplify()

    expected = symath.functions.Sum(n, y * (x(n) ** (y - 1)))
    self.assertEqual(expected.simplify(), derivative)
def setUp(self):
    """Create six symbols and a small directed graph fixture on ``self.g``."""
    x, y, z, w, e1, e2 = symath.symbols('x y z w e1 e2')
    self.x, self.y, self.z, self.w, self.e1, self.e2 = x, y, z, w, e1, e2

    self.g = symath.graph.directed.DirectedGraph()
    # (source, destination[, edge value]) in the original insertion order
    edges = (
        (x, y, e1),
        (y, z, e2),
        (x, y, e2),
        (z, w),
        (x, w),
    )
    for edge in edges:
        self.g.connect(*edge)
def _set_big_reg(dst, src):
    """Store ``src`` into ``dst`` inside ``context``.

    Writes to 8/16-bit sub-registers are folded into the 32-bit parent
    register by masking out the affected bits and or-ing the new value in.
    Writes to DEREF(...) targets resolve the address using only the
    non-DEREF entries of ``context``.  Everything else is stored directly.
    """
    def _merge_into(parent, keep_mask, bits):
        # value first, key second: same evaluation order as a plain
        # ``context[key] = value`` statement
        merged = ((parent.substitute(context) & keep_mask) | bits).simplify()
        context[parent.simplify()] = merged

    if dst in (AX, BX, CX, DX, SI, DI, BP, SP):
        _merge_into(symath.symbols('E' + dst.name), 0xffff0000, src)
        return

    if dst in (AL, BL, CL, DL):
        _merge_into(symath.symbols('E' + dst.name[0] + 'X'), 0xffffff00, src)
        return

    if dst in (AH, BH, CH, DH):
        _merge_into(symath.symbols('E' + dst.name[0] + 'X'), 0xffff00ff, src << 8)
        return

    if dst.match(DEREF(a, b)):
        # substitute only register values into the address expression
        regsonly = dict(
            (k, context[k]) for k in context if not k.match(DEREF(a, b)))
        context[dst.substitute(regsonly).simplify()] = src.simplify()
        return

    context[dst.simplify()] = src.simplify()
def test_more_complicated_solver(self):
    """The z3 constraint set must find a solution for five inequalities."""
    x, y = symath.symbols("x y")
    constraint_set = solvers.z3.ConstraintSet()

    inequalities = (
        x < 0,
        x ** 3 < y ** 2,
        y ** 2 < 9,
        y > 0,
        x < y,
    )
    for inequality in inequalities:
        constraint_set.add(inequality)

    solution = constraint_set.solve()
    self.assertNotEqual(solution, None)
def test_more_complicated_solver(self):
    """solve() returns something other than None for a satisfiable system."""
    x, y = symath.symbols('x y')
    system = solvers.z3.ConstraintSet()

    y_squared_bound = [x**3 < y**2, y**2 < 9]
    for constraint in [x < 0] + y_squared_bound + [y > 0, x < y]:
        system.add(constraint)

    self.assertNotEqual(system.solve(), None)
def _get_operand_sym(op):
    """Convert a single decoded operand into a symath expression.

    Returns:
      * ``Immediate``: the value lifted with ``symath.symbolic``.
      * ``AbsoluteMemoryAddress``: ``DEREF(op_size, disp)``.
      * ``Register``: the upper-cased register symbol.
      * ``AbsoluteMemory``: ``DEREF(op_size, index * scale + base + disp)``
        where each missing component is simply left out.

    Raises BaseException for any other operand type (exception type kept
    unchanged for existing callers).
    """
    if op.type == 'Immediate':
        return symath.symbolic(op.value)

    if op.type == 'AbsoluteMemoryAddress':
        return DEREF(op.op_size, op.disp)

    if op.type == 'Register':
        return symath.symbols(distorm3.Registers[op.index].upper())

    if op.type == 'AbsoluteMemory':
        rv = 0
        # identity checks: None is a singleton and must not go through ==
        if op.index is not None:
            rv += symath.symbols(distorm3.Registers[op.index].upper()) * op.scale
        if op.base is not None:
            rv += symath.symbols(distorm3.Registers[op.base].upper())
        if op.disp is not None:
            rv += symath.symbolic(op.disp)
        return DEREF(op.op_size, rv)

    raise BaseException("Unknown operand type %s (%s)" % (op.type, op))
def _get_operand_sym(op):
    """Convert a single decoded operand into a symath expression.

    Immediates are lifted with ``symath.symbolic``, registers become
    upper-cased symbols, and both memory operand kinds are wrapped in
    ``DEREF(op_size, address)``.  For ``AbsoluteMemory`` the address is the
    sum of whichever of ``index * scale``, ``base`` and ``disp`` are present.

    Raises BaseException for any other operand type (exception type kept
    unchanged for existing callers).
    """
    kind = op.type

    if kind == 'Immediate':
        return symath.symbolic(op.value)
    elif kind == 'AbsoluteMemoryAddress':
        return DEREF(op.op_size, op.disp)
    elif kind == 'Register':
        return symath.symbols(distorm3.Registers[op.index].upper())
    elif kind == 'AbsoluteMemory':
        address = 0
        # identity checks: None is a singleton and must not go through ==
        if op.index is not None:
            index_reg = symath.symbols(distorm3.Registers[op.index].upper())
            address += index_reg * op.scale
        if op.base is not None:
            address += symath.symbols(distorm3.Registers[op.base].upper())
        if op.disp is not None:
            address += symath.symbolic(op.disp)
        return DEREF(op.op_size, address)

    raise BaseException("Unknown operand type %s (%s)" % (op.type, op))
def test_wilds_dont_substitute(self):
    '''
    substitute() must stay purely structural: a wild used as a key in the
    substitution table only replaces that very wild, it does not pattern
    match.  Other parts of the code base silently rely on this, so this
    test guards against the behavior changing unnoticed.
    '''
    a, b = symath.wilds('a b')
    x, y = symath.symbols('x y')
    subs = {x(a): x(x)}

    expectations = [
        (x(y), x(y)),
        (x(a), x(x)),  # the only one that *should* substitute
        (x(b), x(b)),
    ]
    for expression, expected in expectations:
        self.assertEqual(expression.substitute(subs), expected)
def forward_data_flow(source, ea=None, calldepth=0):
    """Propagate taint forward from ``source`` through the function at ``ea``.

    source    -- expression that is initially tainted
    ea        -- address to start from; defaults to ``ScreenEA()``
    calldepth -- accepted but not read anywhere in this body

    Walks the function graph depth-first, and for every instruction that
    matches ``inst(dst, src)`` with ``inst`` in ``tainted_dst_src_insts``
    updates the tainted set.  Returns the tainted set as it stands when the
    walk finishes.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.
    """
    if ea == None:
        ea = ScreenEA()

    # reset any colouring left over from a previous run
    _clear_colored()

    inst, dst, src = wilds("inst dst src")
    w = WildResults()

    tainted = VersionedSet()
    tainted.version = -1
    tainted.add(source)

    # not called anywhere in this function
    def _fix_esp(ea, exp):
        spd = GetSpd(ea)
        return exp.substitute({esp: (esp + spd).simplify()})

    fg = FunctionGraph(ea)

    # data connections graph
    # (built here but not read again in this function)
    TAINTED = symbols("TAINTED")
    dg = DirectedGraph()
    dg.connect(TAINTED, source)

    for addr, level in fg.walk(ea, depthfirst=True):
        # the walk came back up to an already seen level: restore the
        # tainted set as it was one level above
        if level <= tainted.version:
            print "reverting to version %s" % (level - 1)
            tainted = tainted.get_version(level - 1)

        tainted.version = level

        syminst = symdecode(addr)

        if syminst.match(inst(dst, src), w) and w.inst in tainted_dst_src_insts:
            print "analyzing %s" % (syminst,)

            # untaint cleared registers
            if syminst.match(XOR(dst, dst)) and w.dst in tainted:
                tainted.remove(w.dst)

            # tainted source: the destination becomes tainted too
            elif w.src in tainted:
                _color(addr)
                print "tainting %s" % (w.dst,)
                tainted.add(w.dst)

            # destination overwritten from an untainted source
            elif w.dst in tainted:
                tainted.remove(w.dst)

    return tainted
def test_wilds_dont_substitute(self):
    '''
    Guard test: substitute() is assumed to be too "dumb" to treat wilds
    in its table as patterns.  Parts of the code base depend on that, so
    any change here has to come with an update of those places.
    '''
    a, b = symath.wilds('a b')
    x, y = symath.symbols('x y')
    subs = {x(a): x(x)}

    def substituted(expression):
        return expression.substitute(subs)

    self.assertEqual(substituted(x(y)), x(y))
    # only the literal key itself is replaced
    self.assertEqual(substituted(x(a)), x(x))
    self.assertEqual(substituted(x(b)), x(b))
def signature(ea=None):
    """Build a position-independent signature expression for a function.

    ea -- address inside the function; defaults to ``idc.ScreenEA()``

    Every node address of the function graph is decoded (in ascending
    address order) and rewritten so that:
      * numbers that equal a node address become ``off(<index of node>)``
      * registers other than ESP become wilds named after the register
      * everything else is left untouched

    Returns ``signature(<rewritten node>, ...)``.
    """
    if ea is None:
        ea = idc.ScreenEA()

    off = symath.symbols('off')
    _signature = symath.symbols('signature')

    fg = functiongraph.FunctionGraph(ea)
    ns = sorted(fg.nodes.keys())
    # address -> position lookup, replaces a linear `in` + `.index()` pair
    # that ran for every number in every instruction
    position = dict((addr, idx) for idx, addr in enumerate(ns))

    def _(exp):
        if isinstance(exp, symath.Number) and int(exp.n) in position:
            return off(position[int(exp.n)])
        elif is_register(exp) and exp not in (ESP,):
            return symath.wild(str(exp))
        else:
            return exp

    rv = [decode(i).walk(_) for i in ns]
    return _signature(*rv)
def test_edit_distance(self):
    """Check edit_distance on plain, wild and nested expressions."""
    from symath.algorithms.editdistance import edit_distance,edit_substitutions

    x, y, z = symath.symbols('x y z')
    a, b, c = symath.wilds('a b c')

    # (left, right, expected distance)
    flat_cases = [
        (x(x, y, x), y(x, x, x), 2),
        (x(y, x), x(y, y, x), 1),
        (x(y, x), x(x), 1),
        (x(y, y, x), x(x), 2),
        (a, x(y, y), 0),
        (a(x, x), x(y, x), 1),
    ]
    for left, right, expected in flat_cases:
        self.assertEqual(edit_distance(left, right), expected)

    # wilds in swapped positions must not be treated as equal
    self.assertNotEqual(
        edit_distance(y(x(a, b), x(b, a)), y(x(a, b), x(a, b))), 0)
    self.assertEqual(
        edit_distance(y(x(a, b), x(b, a)), y(x(a, b), x(a, b))), 2)
    self.assertEqual(
        edit_distance(y(x(a, b), x(b, a)), x(x(a, b), x(a, b))), 3)

    longer = y(x, x, x, y, y, x, x, x, y)
    shorter = y(x, y, y, x, y, x, x, y)
    self.assertEqual(edit_distance(longer, shorter), 3)
def powerhash(graph):
    '''
    TODO: MAY BE BROKEN SINCE SWITCHING TO SPARSE MATRIX FORMAT FOR GRAPHS

    based on: Approaches to Solving The Graph Isomorphism Problem
    by Jordy Eikenberry

    returns: GraphPowerHash(nodecount : int, hash : string)

    The algorithm:
      Take the adjacency matrix A of the graph.  (A ** n)[a,b] is the number
      of paths of length n from a to b, so the diagonal of A ** n counts the
      closed paths of length n through each node.

      For every power 1 .. len(A) the diagonal is sorted (to make it
      independent of node order) and used as one token of the final hash.

      Isomorphic graphs produce the same hash; equal hashes only make two
      graphs candidates, they do not prove isomorphism.
    '''
    adjacency = graph.adjacency_matrix()[1]
    power = copy.copy(adjacency)
    node_count = len(adjacency)
    diagonal = symbols('d')

    tokens = []
    for _ in range(len(adjacency)):
        tokens.append(diagonal(*sorted(numpy.diag(power))))
        power = numpy.dot(power, adjacency)

    digest = hashlib.sha256(str(GraphPowerHash(*tokens))).hexdigest()
    return GraphPowerHash(node_count, digest)
def powerhash(graph):
    """
    TODO: MAY BE BROKEN SINCE SWITCHING TO SPARSE MATRIX FORMAT FOR GRAPHS

    based on: Approaches to Solving The Graph Isomorphism Problem
    by Jordy Eikenberry

    returns: GraphPowerHash(nodecount : int, hash : string)

    The algorithm:
      (A ** n)[a,b] of an adjacency matrix A is the number of length-n paths
      from a to b.  For each power 1 .. len(A) the sorted diagonal (closed
      path counts, canonicalized by sorting) becomes one token; the string
      form of all tokens is hashed with SHA-256.

      A matching hash only marks two graphs as isomorphism candidates.
    """
    matrix = graph.adjacency_matrix()[1]
    current_power = copy.copy(matrix)
    size = len(matrix)
    d = symbols("d")

    def _canonical_diagonal(m):
        entries = list(numpy.diag(m))
        entries.sort()
        return d(*entries)

    per_power = []
    for _step in range(len(matrix)):
        per_power.append(_canonical_diagonal(current_power))
        current_power = numpy.dot(current_power, matrix)

    text = str(GraphPowerHash(*per_power))
    return GraphPowerHash(size, hashlib.sha256(text).hexdigest())
def test_edit_distance(self):
    """Check edit_distance on plain, wild and nested expressions."""
    from symath.algorithms.editdistance import edit_distance, edit_substitutions

    x, y, z = symath.symbols('x y z')
    a, b, c = symath.wilds('a b c')

    def expect(left, right, distance):
        self.assertEqual(edit_distance(left, right), distance)

    expect(x(x, y, x), y(x, x, x), 2)
    expect(x(y, x), x(y, y, x), 1)
    expect(x(y, x), x(x), 1)
    expect(x(y, y, x), x(x), 2)
    expect(a, x(y, y), 0)
    expect(a(x, x), x(y, x), 1)

    # swapped wilds are a real difference
    self.assertNotEqual(
        edit_distance(y(x(a, b), x(b, a)), y(x(a, b), x(a, b))), 0)
    expect(y(x(a, b), x(b, a)), y(x(a, b), x(a, b)), 2)
    expect(y(x(a, b), x(b, a)), x(x(a, b), x(a, b)), 3)

    exp1 = y(x, x, x, y, y, x, x, x, y)
    exp2 = y(x, y, y, x, y, x, x, y)
    expect(exp1, exp2, 3)
def xtest_print_edit_distance_metric(self):
    '''
    Print the memoized sub-results of an edit_distance call as a matrix.

    Disabled (the "x" prefix hides it from the test runner) because
    _tuple_edit_distance no longer memoizes, so .clear_results() and
    .results are not available any more.
    '''
    import symath.algorithms.editdistance as ed
    # explicit import: a wildcard import inside a function is a SyntaxWarning
    # on Python 2 and a SyntaxError on Python 3
    import numpy

    ed._tuple_edit_distance.clear_results()

    x, y, z, w = symath.symbols('x y z w')
    exp1 = x(y, z, w, w, x)
    exp2 = x(w, z, w, y)

    print('')
    print('edit_distance(%s, %s) = %d' % (exp1, exp2, ed.edit_distance(exp1, exp2)))

    rv = ed._tuple_edit_distance.results
    util.pretty(rv)

    # -1 marks cells for which no sub-result was recorded
    m = numpy.full((len(exp1), len(exp2)), -1, dtype=int)
    for k in rv:
        m[len(k[0][0]), len(k[0][1])] = rv[k][0]

    print(m)
def setUp(self):
    """Provide the symbols x and y to every test."""
    x, y = symath.symbols("x y")
    self.x = x
    self.y = y
#!/usr/bin/env python from symath import symbols # data move Mov = symbols('mov') Movzx = symbols('movzx') Movsx = symbols('movsx') Push = symbols('push') Pop = symbols('pop') Lea = symbols('lea') # arithmetic operations Sub = symbols('sub') Add = symbols('add') Xor = symbols('xor') And = symbols('and') Or = symbols('or') Shr = symbols('shr') Shl = symbols('shl') Sar = symbols('sar') Sal = symbols('sal') # comparison Cmp = symbols('cmp') Test = symbols('test') # call instructions are weird in their definition # they need to pass as a second argument the stack change # and as the third argument the address at which the call # takes place
def test_log(self):
    """d/dx Log(x * y) simplifies to y / (x * y)."""
    x, y = symath.symbols('x y')

    derivative = diff(symath.functions.Log(x * y), x).simplify()
    expected = (y / (x * y)).simplify()

    self.assertEqual(derivative, expected)
def test_diff_quotient_rule(self):
    """d/dx (1 / x) simplifies to -1 / x**2."""
    x = symath.symbols('x')

    # raw (unsimplified) derivative, printed for inspection
    print(diff(1 / x, x))

    simplified = diff(1 / x, x).simplify()
    expected = (-1 / x**2).simplify()
    self.assertEqual(simplified, expected)
def test_diff_chain_rule(self):
    """d/dx Exp(2 * x) simplifies to 2 * Exp(2 * x)."""
    Exp, x, y = symath.symbols('Exp x y')

    derivative = diff(Exp(2 * x), x).simplify()
    expected = (2 * Exp(2 * x)).simplify()

    self.assertEqual(derivative, expected)
def test_diff_non_var(self):
    """Differentiating a symbol by a different symbol gives 0."""
    x, y = symath.symbols('x y')
    self.assertEqual(diff(y, x), 0)
def setUp(self):
    """Provide the symbols x, y and z to every test."""
    x, y, z = symath.symbols('x y z')
    self.x = x
    self.y = y
    self.z = z
def test_symbol_inequal_wild(self):
    """A wild and a symbol that share a name must not compare equal."""
    wild_a = symath.wilds('a')
    symbol_a = symath.symbols('a')
    self.assertNotEqual(symbol_a, wild_a)
def test_union(self):
    """After union() the other graph contains the x -> y edge of self.g."""
    other = symath.graph.directed.DirectedGraph()
    extra_node = symath.symbols('ognode')
    other.connect(self.x, extra_node)

    other.union(self.g)

    linked = other.connectedQ(self.x, self.y)
    self.assertTrue(linked)
def __symbolic_column_name__(self, colnum):
    """Return the expression x(colnum) used to name column ``colnum``."""
    return symbols('x')(colnum)
def setUp(self):
    """Provide wilds w, v and symbols x, y, head to every test."""
    w, v = symath.wilds('w v')
    self.w = w
    self.v = v

    x, y = symath.symbols('x y')
    self.x = x
    self.y = y

    self.head = symath.symbols('head')
defines a number of "signature" heuristics for graphs which can be used for trimming out candidate graphs in isomorphism testing some signatures are specific to specific types of graphs, in that case, it is documented in the help for the particular signature function ''' import symath.util from symath import symbols, wilds, WildResults import numpy import hashlib import copy GraphSummation = symbols('GraphSummation') GraphDensity = symbols('GraphDensity') GraphComplexity = symbols('GraphComplexity') GraphPowerHash = symbols('GraphPowerHash') def summation(graph): ''' returns GraphSummation(nodes : int, edges : int) ''' sum_out = 0 sum_in = 0 for n in graph.nodes.values(): sum_out += len(n.outgoing)
def test_diff_power_rule(self):
    """d/dx x ** 2 simplifies to 2 * x."""
    x = symath.symbols('x')

    derivative = diff(x ** 2, x)
    expected = (2 * x).simplify()

    self.assertEqual(derivative.simplify(), expected)
#!/usr/bin/env from symath import symbols,wilds,WildResults,symbolic CALLRESULT = symbols('CALLRESULT') DEREF = symbols('DEREF') EAX,EBX,ECX,EDX = symbols('EAX EBX ECX EDX') EDI,ESI,ESP,EBP = symbols('EDI ESI ESP EBP') EFLAGS = symbols('EFLAGS') AX,BX,CX,DX,DI,SI,BP,SP = symbols('AX BX CX DX DI SI BP SP') AL,AH,BL,BH,CL,CH,DL,DH = symbols('AL AH BL BH CL CH DL DH') def reg_size(reg): a,b = wilds('a b') val = WildResults() if reg in (AX,BX,CX,DX,DI,SI,BP,SP): return symbolic(2) elif reg in (AL,AH,BL,BH,CL,CH,DL,DH): return symbolic(1) elif reg in (EAX,EBX,ECX,EDX,EDI,ESI,EBP,ESP,EFLAGS): return symbolic(4) elif reg.match(DEREF(a, b), val): return val.a else: raise BaseException('Unknown Register %s' % reg) def is_register(exp): return exp in (AX,BX,CX,DX,DI,SI,BP,SP,AL,AH,BL,BH,CL,CH,DL,DH,EAX,EBX,ECX,EDX,EDI,ESI,EBP,ESP,EFLAGS) def reg_mask(exp):
def test_diff_product(self):
    """Product rule with a factor that does not depend on x."""
    x, y = symath.symbols('x y')

    linear = diff(x * y, x).simplify()
    self.assertEqual(linear, y)

    cubic = diff(y * x ** 3, x).simplify()
    expected_cubic = (3 * y * x ** 2).simplify()
    self.assertEqual(cubic, expected_cubic)
def test_diff_fail_on_unknown_function(self):
    """diff() raises DifferentiationError for a function it has no rule for."""
    with self.assertRaises(DifferentiationError):
        unknown_fn, variable = symath.symbols('Unknown x')
        application = unknown_fn(variable)
        diff(application, variable)
# Module-level symbols for upper-case instruction mnemonics, grouped by kind.
from symath import symbols

# arithmetic
ADD,SUB,MUL,IMUL,DIV,IDIV = symbols('ADD SUB MUL IMUL DIV IDIV')

# data movement
MOV,LEA = symbols('MOV LEA')

# stack operations
PUSH,POP,PUSHA,POPA = symbols('PUSH POP PUSHA POPA')

# bitwise logic
XOR,AND,OR = symbols('XOR AND OR')

# shifts
SAR,SHR,SAL,SHL = symbols('SAR SHR SAL SHL')

# increment / decrement
INC,DEC = symbols('INC DEC')

# extending moves
MOVSX,MOVZX = symbols('MOVSX MOVZX')

# comparison
CMP,TEST = symbols('CMP TEST')

# chained assignment: the whole result of symbols() is kept as
# control_flow_instructions AND unpacked into the individual names
control_flow_instructions = (JMP,JZ,JNZ,JA,JB,JNA,JNB,JE,JNE,JG,JL,JNG,JNL) = symbols('JMP JZ JNZ JA JB JNA JNB JE JNE JG JL JNG JNL')
# -*- coding: utf-8 -*- # <nbformat>3.0</nbformat> # <codecell> from symath import symbols, wilds, WildResults, functions, stdops from IPython.display import Latex _greek = symbols('theta gamma Theta Gamma alpha beta Alpha Beta Delta delta pi Pi phi Phi') def _idisplay(exp): x,y,z,n = wilds('x y z n') ws = WildResults() if exp.match(x ** y, ws): return r"{%s} ^ {%s}" % (_idisplay(ws.x), _idisplay(ws.y)) elif exp in _greek: return r'\%s' % (str(exp),) elif exp.match(-1 * x, ws): return r'-{%s}' % (_idisplay(ws.x),) elif exp.match(x + y, ws): return r'{%s} + {%s}' % (_idisplay(ws.x), _idisplay(ws.y)) elif exp.match(x - y, ws): return r'{%s} - {%s}' % (_idisplay(ws.x), _idisplay(ws.y)) elif exp.match(x * y, ws): return r'{%s} {%s}' % (_idisplay(ws.x), _idisplay(ws.y))
class NFA(object):
    """Nondeterministic finite automaton that is compiled lazily to VM bytecode.

    States and transition tokens are arbitrary hashable values.  Two special
    tokens exist: ``NFA.EPSILON`` (transition taken without consuming input)
    and ``NFA.ANY`` (default transition for tokens with no explicit edge).

    Transitions may carry tags (see ``tag``); ``self.choose(a, b)`` decides
    which of two competing values wins and defaults to the first one.
    """

    EPSILON, ANY = symbols('NFA_EPSILON NFA_ANY')

    def __init__(self, start_state, magic=None):
        """Create an automaton with no transitions.

        start_state -- the initial state
        magic       -- optional callable; given a state set it returns extra
                       states that are unioned in during compilation
        """
        self._start_state = start_state
        # state -> token -> set(next states)
        self._transitions = {}
        # reverse index: state -> token -> set(previous states)
        self._transitions_to = {}
        self._final_states = set()
        self._has_epsilons = False
        # compiled code, reset to None whenever the automaton changes
        self._bytecode = None
        self._interupt_states = set()
        self._magic = magic
        self._gcrefs = []
        self._tag_assocs = util.Associations()
        self._tcounter = 0
        self._states = set()
        self._state_hooks = {}
        self.do_tags = False
        self.choose = lambda a, b: a

    def _choose(self, arglist):
        """Fold ``self.choose`` over ``arglist`` and return the winner.

        Returns None for an empty ``arglist``.
        """
        rv = None
        for i in arglist:
            if rv is None:
                rv = i
            else:
                rv = self.choose(rv, i)
        # the result was previously computed but never returned
        return rv

    def set_state_hook(self, state, hook):
        """Attach ``hook`` to ``state``; passing None removes an existing hook."""
        if hook is None:
            if state in self._state_hooks:
                del self._state_hooks[state]
        else:
            self._state_hooks[state] = hook

    def transitions_to(self, dst):
        '''
        returns enumerable of (prevstate, t) tuples
        this is super slow and needs to be sped up
        '''
        if dst in self._transitions_to:
            for t in self._transitions_to[dst]:
                for s in self._transitions_to[dst][t]:
                    yield (s, t)

    def tag(self, transition, src, dst, tagid=None):
        """Associate (transition, src, dst) with ``tagid`` and return the association."""
        return self._tag_assocs.associate((transition, src, dst), tagid)

    def is_tagged(self, transition, src, dst):
        """Return True when (transition, src, dst) has a tag association."""
        return (transition, src, dst) in self._tag_assocs

    def reltags(self, src, cache=None):
        '''
        returns all the tags that are relevant at this state
        cache should be a dictionary and it is updated by the function
        '''
        if not self._tag_assocs:
            return set()

        # done iteratively with explicit queues rather than recursively,
        # which makes this more involved than the recursive formulation
        if cache is None:
            cache = {}

        q = _otq()
        q.append(src)
        updateq = _otq()

        # pass 1: walk backwards over incoming transitions, recording the
        # tags found directly on each edge
        while q:
            i = q.popleft()
            if i in cache:
                continue
            cache[i] = set()
            for (s, t) in self.transitions_to(i):
                q.append(s)
                if self.is_tagged(t, s, i):
                    cache[i].add((self.tag(t, s, i), s, i))
                updateq.appendleft((i, s))

        # pass 2: merge each predecessor's tags into its successor
        while updateq:
            i = updateq.popleft()
            cache[i[0]].update(cache[i[1]])

        return cache[src]

    def _add_epsilon_states(self, stateset, gathered_epsilons):
        '''
        stateset is the list of initial states
        gathered_epsilons is a dictionary of (dst: src) epsilon dictionaries
        '''
        for i in list(stateset):
            if i not in gathered_epsilons:
                gathered_epsilons[i] = {}
                q = _otq()
                q.append(i)
                # NOTE(review): every epsilon target is re-queued without a
                # visited check -- looks like an epsilon cycle would never
                # terminate here; confirm before relying on cyclic input
                while q:
                    s = q.popleft()
                    for j in self._transitions.setdefault(s, {}).setdefault(
                            NFA.EPSILON, set()):
                        gathered_epsilons[i][j] = (
                            s if j not in gathered_epsilons[i]
                            else self.choose(s, j))
                        q.append(j)
            stateset.update(gathered_epsilons[i].keys())

    def add_interupt_state(self, state):
        """Mark ``state`` as an interrupt state."""
        self._interupt_states.add(state)

    def transitions(self, current_states, cached_transitions=None):
        """Return the explicit tokens (not ANY/EPSILON) leaving ``current_states``.

        Only tokens with a non-empty target set are reported.
        ``cached_transitions`` (state -> set of tokens) is filled as a side
        effect and reused on later calls.
        """
        if cached_transitions is None:
            cached_transitions = {}

        rv = set()
        for cs in current_states:
            if cs not in cached_transitions:
                cached_transitions[cs] = set()
                for t in self._transitions.setdefault(cs, {}):
                    if t in set([NFA.ANY, NFA.EPSILON]):
                        continue
                    if self._transitions[cs][t]:
                        cached_transitions[cs].add(t)
            rv.update(cached_transitions[cs])
        return rv

    def nextstates(self, current_states, transition):
        """Return the states reachable from ``current_states`` via ``transition``.

        For ordinary tokens the targets of the ANY transition are included
        as well.
        """
        rv = set()
        for cs in current_states:
            rv.update(
                self._transitions.setdefault(cs, {}).setdefault(transition, set()))

        if transition not in (NFA.ANY, NFA.EPSILON):
            for cs in current_states:
                rv.update(self._transitions[cs].setdefault(NFA.ANY, set()))

        return rv

    def _write_transition_code(self, utags, ltags, codeblock):
        """Append UpdateTagV / LoadTagV instructions to ``codeblock``.

        utags -- iterable of (tag, value) pairs to update
        ltags -- iterable of (tag, a, b) triples to load; skipped for tags
                 that are already being updated
        Conflicts for the same tag are resolved with ``self.choose``.
        """
        utagd = {}
        for i in utags:
            if i[0] in utagd:
                utagd[i[0]] = self.choose(i[1], utagd[i[0]])
            else:
                utagd[i[0]] = i[1]

        ltagd = {}
        for i in ltags:
            if i[0] in utagd:
                continue
            elif i[0] not in ltagd or self.choose(ltagd[i[0]][0], i[1]) == i[1]:
                ltagd[i[0]] = (i[1], i[2])

        for k in utagd:
            codeblock.append(VM.UpdateTagV(k, utagd[k]))
        for k in ltagd:
            codeblock.append(VM.LoadTagV(k, ltagd[k][0], ltagd[k][1]))

    def _transitions_to_dfa_bytecode(self, sources, trn,
                                     cached_tcode,
                                     debug=False,
                                     compiled_states=None,
                                     gathered_epsilons=None,
                                     cached_transitions=None,
                                     reltags_cache=None):
        """Compile taking token ``trn`` from the state set ``sources``.

        Returns the block to jump to: a dedicated transition block when tag
        code had to be emitted, otherwise the target state block itself.
        Results are memoized in ``cached_tcode``.
        """
        key = (trn, tuple(sources))
        if key in cached_tcode:
            return cached_tcode[key]

        # get the stateblock
        sb = self._states_to_dfa_bytecode(
            sources, tran=trn, debug=debug,
            compiled_states=compiled_states,
            gathered_epsilons=gathered_epsilons,
            cached_transitions=cached_transitions,
            cached_tcode=cached_tcode,
            reltags_cache=reltags_cache)

        # build the transition block
        tb = self._bytecode.newblock("Transition 0x%x" % (self._tcounter))
        self._tcounter += 1

        # get a list of tags to emit code for, and reltags to copy previous
        # values from
        if self.do_tags:
            tags = set()
            rtags = set()
            for s in sources:
                for d in self._transitions[s].setdefault(trn, set()):
                    rtags.update(self.reltags(d, reltags_cache))
                    if self.is_tagged(trn, s, d):
                        tags.add((self.tag(trn, s, d), d))
            self._write_transition_code(tags, rtags, tb)

        # if tb is empty, just return the stateblock, no need for an extra jmp
        if not tb:
            cached_tcode[key] = sb
            return sb

        # jump to the state block
        tb.append(VM.Jmp(sb))

        cached_tcode[key] = tb
        return tb

    def _states_to_dfa_bytecode(self, states,
                                tran=None,
                                debug=False,
                                compiled_states=None,
                                gathered_epsilons=None,
                                cached_transitions=None,
                                cached_tcode=None,
                                reltags_cache=None):
        '''returns the instruction pointer to the bytecode added'''
        # snapshot taken before epsilon expansion mutates ``states``
        pstates = copy.copy(states)

        if reltags_cache is None:
            reltags_cache = {}
        if cached_tcode is None:
            cached_tcode = {}
        if cached_transitions is None:
            cached_transitions = {}
        if gathered_epsilons is None:
            gathered_epsilons = {}

        self._add_epsilon_states(states, gathered_epsilons)

        if tran is not None:
            states = self.nextstates(states, tran)
            self._add_epsilon_states(states, gathered_epsilons)

        if self._magic is not None:
            states = states.union(self._magic(states))

        tstates = tuple(states)

        # this is used so we only compile each stateset once
        if compiled_states is None:
            compiled_states = {}
        if tstates in compiled_states:
            return compiled_states[tstates]

        # grab the ip from our codeblock
        ip = self._bytecode.newblock(tstates)
        compiled_states[tstates] = ip

        # TODO
        # epsilon transitions are never 'taken' so we need
        # to insert any ltagv/utagv instructions required
        # for all epsilon transitions
        # gathered_epsilons[state] holds a dictionary of dst: src mappings,
        # so we can use that data
        if self.do_tags:
            tags = set()
            rtags = set()
            for ts in pstates:
                for dst in gathered_epsilons[ts]:
                    rtags.update(self.reltags(dst, reltags_cache))
                    src = gathered_epsilons[ts][dst]
                    if self.is_tagged(NFA.EPSILON, src, dst):
                        tags.add((self.tag(NFA.EPSILON, src, dst), dst))
            self._write_transition_code(tags, rtags, ip)

        # run any defined state hooks
        for s in tstates:
            if s in self._state_hooks:
                ip.append(VM.PyCode(self._state_hooks[s]))

        # do a multi-match for any final states
        finals = self._final_states.intersection(states)
        if len(finals) > 0:
            ip.append(VM.MultiMatch(finals))

        # do any interupts required
        interupts = self._interupt_states.intersection(states)
        if len(interupts) > 0:
            ip.append(VM.MultiInterupt(interupts))

        # consume a character
        ip.append(VM.Consume())

        ts = self.transitions(states, cached_transitions)
        # computed before the debug print: it used to be assigned only
        # further down, so debug=True failed on an unbound local
        defaults = self.nextstates(states, NFA.ANY)

        if debug:
            print('compiling bytecode for stateset:\n\t%s\n\t0x%x: %s' % (
                states, ip, (defaults, ts)))

        def mkbytecode(t):
            # factory so each lambda captures its own ``t``
            return lambda: self._transitions_to_dfa_bytecode(
                states, t, cached_tcode,
                debug=debug,
                compiled_states=compiled_states,
                gathered_epsilons=gathered_epsilons,
                cached_transitions=cached_transitions,
                reltags_cache=reltags_cache)

        # for any of the non-default states add a conditional jmp
        for k in ts:
            if k in (NFA.ANY, NFA.EPSILON):
                continue
            jmppoint = VM.DelayedArg(mkbytecode(k))
            ip.append(VM.Compare(k))
            ip.append(VM.CondJmp(jmppoint))

        # jmp to default state if there is one, otherwise leave
        if len(defaults) > 0:
            jmppoint = VM.DelayedArg(mkbytecode(NFA.ANY))
            ip.append(VM.Jmp(jmppoint))
        else:
            ip.append(VM.Leave())

        # return the instruction pointer
        return ip

    def copy(self):
        """Return a new NFA with copies of the transitions and final states.

        The compiled bytecode object is shared with the original; the other
        attributes keep the defaults set by ``__init__``.
        """
        rv = NFA(self._start_state)
        rv._final_states = copy.deepcopy(self._final_states)
        rv._has_epsilons = self._has_epsilons
        rv._transitions = {}
        rv._transitions_to = {}
        rv._bytecode = self._bytecode

        for i in self._transitions:
            for j in self._transitions[i]:
                rv._transitions.setdefault(
                    i, {})[j] = self._transitions[i][j].copy()

        for i in self._transitions_to:
            for j in self._transitions_to[i]:
                rv._transitions_to.setdefault(
                    i, {})[j] = self._transitions_to[i][j].copy()

        return rv

    def all_states(self):
        """Return the start state plus every state that is a transition target."""
        rv = set([self._start_state])
        for s in self._transitions:
            for ns in self._transitions[s].values():
                for nns in ns:
                    rv.add(nns)
        return rv

    def add_final_state(self, state):
        """Mark ``state`` as final and invalidate the compiled bytecode."""
        self._final_states.add(state)
        self._bytecode = None

    def clear_final_states(self):
        """Remove all final states and invalidate the compiled bytecode."""
        self._final_states = set()
        self._bytecode = None

    def find_epsilon_states(self, state, rv=None):
        """Return the set of states reachable from ``state`` via epsilon edges.

        rv -- optional set to accumulate into (used by the recursion).  A
              fresh set is created per call when omitted; the former
              ``rv=set()`` default was shared between all calls and all
              instances, so results leaked from one call into the next.
        """
        if rv is None:
            rv = set()
        for i in self._transitions.setdefault(state, {}).setdefault(
                NFA.EPSILON, set()):
            if i not in rv:
                rv.add(i)
                self.find_epsilon_states(i, rv=rv)
        return rv

    def get_starting_states(self):
        """Return the start state together with its epsilon closure."""
        epstates = self.find_epsilon_states(self._start_state)
        return set.union(epstates, set([self._start_state]))

    def bytecode(self, debug=False):
        """Return the entry code block, creating it on first use.

        The state blocks are produced lazily through ``VM.DelayedArg``.
        """
        if self._bytecode is None:
            self._bytecode = VM.CodeBlock('EntryPoint')
            self._bytecode.append(
                VM.Jmp(
                    VM.DelayedArg(lambda: self._states_to_dfa_bytecode(
                        set([self._start_state]), debug=debug))))
        return self._bytecode

    def execute(self, tokenstring, debug=False):
        """Run the compiled automaton over ``tokenstring`` and return the VM result."""
        bc = self.bytecode()
        rv = bc.execute(tokenstring,
                        debug=debug,
                        state_count=len(self.all_states()),
                        tag_count=len(self._tag_assocs))
        return rv

    def add_transition(self, oldstate, token, newstate):
        """Add ``oldstate --token--> newstate`` and invalidate the bytecode."""
        self._transitions.setdefault(oldstate, {}).setdefault(token, set())
        self._transitions[oldstate][token].add(newstate)

        self._transitions_to.setdefault(newstate, {}).setdefault(token, set())
        self._transitions_to[newstate][token].add(oldstate)

        if token == NFA.EPSILON:
            self._has_epsilons = True

        self._bytecode = None

    def locate_final_states(self):
        """Set the final states to every target state without outgoing tokens.

        The start state is never considered final.  Returns the new set.
        """
        dstates = set()
        for i in self._transitions:
            for t in self._transitions[i]:
                dstates = dstates.union(self._transitions[i][t])

        sstates = set([self._start_state])
        for i in self._transitions:
            if len(self._transitions[i]) > 0:
                sstates.add(i)

        self._final_states = dstates.difference(sstates)
        return self._final_states

    def get_transitions(self, oldstate, newstate):
        """Return the set of tokens that lead from ``oldstate`` to ``newstate``."""
        rv = set()
        for t in self._transitions.setdefault(oldstate, {}):
            if newstate in self._transitions[oldstate][t]:
                rv.add(t)
        return rv

    def get_following_states(self, oldstate):
        """Return every state directly reachable from ``oldstate``."""
        rv = set()
        for i in self._transitions.setdefault(oldstate, {}).values():
            rv = set.union(rv, i)
        return rv

    def final_states(self, states):
        """Return the members of ``states`` that are final states."""
        return set.intersection(states, self._final_states)

    def to_graph(self):
        """Render the automaton as a DirectedGraph with labelled edges.

        Edge labels: 'E' for epsilon, '*' for ANY, otherwise the quoted
        token (or ``chr(<ordinal>)`` when its ordinal is outside
        0x30..0x7a), followed by ``/<tag>`` for tagged transitions.
        Final states are coloured red, hooked states blue and the start
        state green.
        """
        from symath.graph import directed

        g = directed.DirectedGraph()

        for s in self.all_states():
            g.add_node(s)
            for t in self._transitions.setdefault(s, {}):
                for dest in self._transitions[s][t]:
                    lbl = None
                    if t == NFA.EPSILON:
                        lbl = 'E'
                    elif t == NFA.ANY:
                        lbl = '*'
                    else:
                        lbl = "'%s'" % (t if 0x30 <= ord(str(t)) <= 0x7a
                                        else 'chr(%s)' % (ord(str(t))), )

                    if self.is_tagged(t, s, dest):
                        lbl = "%s/%s" % (lbl, self.tag(t, s, dest))

                    g.connect(s, dest, lbl)

        for fs in self._final_states:
            g.set_color(fs, 'red')
        for hs in self._state_hooks:
            g.set_color(hs, 'blue')
        g.set_color(self._start_state, 'green')

        return g

    @staticmethod
    def _test():
        """Manual smoke test: build a small automaton, run it, print the bytecode."""
        print('----- NFA TEST -----')
        nfa = NFA(0)

        # should match [ab].abcdef
        nfa.add_transition(0, 'a', 1)
        nfa.add_transition(0, 'b', 3)
        nfa.add_transition(1, NFA.ANY, 2)
        rest = "cdef"
        for i, token in enumerate(rest):
            nfa.add_transition(2 + i, token, 3 + i)
        nfa.locate_final_states()

        def _exec(s):
            nothing = True
            for i in nfa.execute(s):
                print("execute('%s') = %s" % (s, i))
                nothing = False
            if nothing:
                print("execute('%s') = No Results" % (s))

        _exec('abcdefhi')
        _exec('ccdef')
        _exec('bdef')

        bc = nfa.bytecode()
        bc = bc.link()
        print(bc)
def setUp(self):
    """Provide the symbols x and y to every test."""
    created = symath.symbols('x y')
    self.x, self.y = created
from symath import symbols

# markers that wrap the version number of an add / delete operation
VER_ADD, VER_DELETE = symbols('VER_ADD VER_DELETE')


def op_add(obj, version):
    """Return the pair (VER_ADD(version), obj) describing an addition."""
    marker = VER_ADD(version)
    return (marker, obj)


def op_del(obj, version):
    """Return the pair (VER_DELETE(version), obj) describing a deletion."""
    marker = VER_DELETE(version)
    return (marker, obj)
#!/usr/bin/env python import distorm3 import symath as symbolic import copy import memoize import symath.graph.algorithms as algorithms from memoize import Memoize from functiongraph import FunctionGraph from idafun import * # registers eax,ebx,ecx,edx,esi,edi,ebp,esp = symbolic.symbols('eax ebx ecx edx esi edi ebp esp') ax,bx,cx,dx,si,di,bp,sp = symbolic.symbols('ax bx cx dx si di bp sp') al,ah,bl,bh,cl,ch,dl,dh = symbolic.symbols('al ah bl bh cl ch dl dh') eflags = symbolic.symbols('eflags') # functions DEREF = symbolic.symbols('DEREF') PHI = symbolic.symbols('PHI', associative=True, commutative=True) AT = symbolic.symbols('@') CALL = symbolic.symbols('CALL') LOOKUP = symbolic.symbols('=>') regmasks = \ { ax: eax & 0xffff, bx: ebx & 0xffff, cx: ecx & 0xffff, dx: edx & 0xffff,