def diff_ins(self, di):
    """Interleave added/removed instructions with the originals in ``self.ins``.

    ``di`` is stored on ``self.di``.  Each entry of ``di.add_ins`` and
    ``di.remove_ins`` is indexed by its first field (a position) and carries
    the instruction object in its third field.  Every instruction appended
    to ``self.ins`` receives a ``diff_tag`` attribute looked up in
    ``DIFF_INS_TAG`` under "ADD", "REMOVE" or "ORIG".
    """
    self.di = di

    # Index pending changes by position; a later entry with the same
    # position replaces an earlier one.
    pending_add = dict((entry[0], entry) for entry in self.di.add_ins)
    pending_rm = dict((entry[0], entry) for entry in self.di.remove_ins)

    def flush(pending, position, fmt, tag_name):
        """Emit the pending entry at ``position``; return True if one existed."""
        if position not in pending:
            return False
        instruction = pending[position][2]
        debug(fmt % (position, instruction.get_name(), instruction.get_operands()))
        self.ins.append(instruction)
        instruction.diff_tag = DIFF_INS_TAG[tag_name]
        del pending[position]
        return True

    position = 0
    for original in self.bb1.ins:
        flush(pending_add, position, "%d ADD %s %s", "ADD")
        removed = flush(pending_rm, position, "%d RM %s %s", "REMOVE")

        # A removed instruction stands in for the original at this position.
        if not removed:
            self.ins.append(original)
            debug("%d %s %s" % (position, original.get_name(), original.get_operands()))
            original.diff_tag = DIFF_INS_TAG["ORIG"]

        position += 1

    # Entries positioned at or beyond the end of bb1.ins are still pending.
    last = position
    if pending_add:
        last = max(pending_add)
    if pending_rm:
        last = max(last, max(pending_rm))

    while position <= last:
        flush(pending_add, position, "%d ADD %s %s", "ADD")
        flush(pending_rm, position, "%d RM %s %s", "REMOVE")
        position += 1
def _init_similarity(self):
    """Split the methods of both files into matched and differing sets.

    A method whose ``getsha256()`` value is present in the other file's
    HASHSUM filter is recorded in ``self.filters[MATCHMETHODS]``.  Methods
    of the first file without such a match are compared, via
    ``similarity``, with every unmatched method of the second file and
    recorded in ``self.filters[DIFFMETHODS]``.
    """
    filters = self.filters
    first = self.vm1[0]
    second = self.vm2[0]

    # Second-file methods with no identical counterpart in the first file.
    candidates = []
    for method in filters[METHODS][second]:
        if method.getsha256() in filters[HASHSUM][first]:
            # Identical hash: perfect match.
            if method not in filters[MATCHMETHODS]:
                filters[MATCHMETHODS].append(method)
        else:
            candidates.append(method)

    # Look for first-file methods that have been modified.
    for method in filters[METHODS][first]:
        debug("SIM FOR %s %s %s" % (method.m.get_class_name(),
                                    method.m.get_name(),
                                    method.m.get_descriptor()))

        if method.getsha256() in filters[HASHSUM][second]:
            # Identical hash: perfect match.
            if method not in filters[MATCHMETHODS]:
                filters[MATCHMETHODS].append(method)
            continue

        for candidate in candidates:
            method.similarity(candidate, filters[BASE][FILTER_SIM_METH])
            # NOTE(review): the registration is kept inside the candidate
            # loop, so a method is only recorded as differing when at least
            # one candidate exists -- confirm against the upstream layout.
            if method not in filters[DIFFMETHODS]:
                filters[DIFFMETHODS].append(method)
def _init_similarity(self):
    """Split the methods of both files into matched and differing sets.

    A method whose ``getsha256()`` value is present in the other file's
    HASHSUM filter is recorded in ``self.filters[MATCHMETHODS]``.  Methods
    of the first file without such a match are compared, via
    ``similarity``, with every unmatched method of the second file and
    recorded in ``self.filters[DIFFMETHODS]``.
    """
    filters = self.filters
    vm1_key = self.vm1[0]
    vm2_key = self.vm2[0]

    # Cache of second-file methods that have no exact twin in the first file.
    unmatched_vm2 = []
    for meth2 in filters[METHODS][vm2_key]:
        if meth2.getsha256() in filters[HASHSUM][vm1_key]:
            # Exact twin found in the first file.
            if meth2 not in filters[MATCHMETHODS]:
                filters[MATCHMETHODS].append(meth2)
        else:
            unmatched_vm2.append(meth2)

    # Check whether methods of the first file have been modified.
    for meth1 in filters[METHODS][vm1_key]:
        debug("SIM FOR %s %s %s" % (meth1.m.get_class_name(),
                                    meth1.m.get_name(),
                                    meth1.m.get_descriptor()))

        if meth1.getsha256() in filters[HASHSUM][vm2_key]:
            # Exact twin found in the second file.
            if meth1 not in filters[MATCHMETHODS]:
                filters[MATCHMETHODS].append(meth1)
            continue

        for meth2 in unmatched_vm2:
            meth1.similarity(meth2, filters[BASE][FILTER_SIM_METH])
            # NOTE(review): registration happens inside the candidate loop,
            # so nothing is recorded when there are no candidates -- confirm
            # against the upstream layout.
            if meth1 not in filters[DIFFMETHODS]:
                filters[DIFFMETHODS].append(meth1)
def diff_ins(self, di):
    """Interleave added/removed instructions with the originals in ``self.ins``.

    ``di`` is stored on ``self.di``.  Each entry of ``di.add_ins`` and
    ``di.remove_ins`` is indexed by its first field (a position) and carries
    the instruction object in its third field.  Every instruction appended
    to ``self.ins`` receives a ``diff_tag`` attribute looked up in
    ``DIFF_INS_TAG`` under "ADD", "REMOVE" or "ORIG".
    """
    self.di = di

    # Position -> entry; a later entry at the same position wins.
    added_at = dict((item[0], item) for item in self.di.add_ins)
    removed_at = dict((item[0], item) for item in self.di.remove_ins)

    def emit(table, offset, fmt, tag_key):
        """Append and tag the entry stored at ``offset``; report if one existed."""
        if offset not in table:
            return False
        ins = table[offset][2]
        debug(fmt % (offset, ins.get_name(), ins.get_operands()))
        self.ins.append(ins)
        ins.diff_tag = DIFF_INS_TAG[tag_key]
        del table[offset]
        return True

    offset = 0
    for current in self.bb1.ins:
        emit(added_at, offset, "%d ADD %s %s", "ADD")
        was_removed = emit(removed_at, offset, "%d RM %s %s", "REMOVE")

        # The original is only kept when it was not reported as removed.
        if not was_removed:
            self.ins.append(current)
            debug("%d %s %s" % (offset, current.get_name(), current.get_operands()))
            current.diff_tag = DIFF_INS_TAG["ORIG"]

        offset += 1

    # Drain whatever is positioned at or after the end of bb1.ins.
    upper = offset
    if added_at:
        upper = max(added_at)
    if removed_at:
        upper = max(upper, max(removed_at))

    while offset <= upper:
        emit(added_at, offset, "%d ADD %s %s", "ADD")
        emit(removed_at, offset, "%d RM %s %s", "REMOVE")
        offset += 1
def set_childs(self, abb):
    """Rebuild ``self.childs`` from ``self.bb.childs``.

    Each child is a 3-item sequence whose third item has a ``name``.  When
    that name is a key of ``abb`` the third item is replaced by
    ``abb[name]``; otherwise the child is kept unchanged.
    """
    def remap(child):
        target = child[2]
        if target.name in abb:
            replacement = abb[target.name]
            debug("SET %s %s " % (target, replacement))
            return (child[0], child[1], replacement)
        debug("SET ORIG %s" % str(child))
        return child

    self.childs = [remap(child) for child in self.bb.childs]
def filter_sort_meth_basic(x, value):
    """Return the best-scoring candidate of ``x`` if it is within ``value``.

    ``x`` maps candidate objects to numeric scores.  Candidates are ranked
    in ascending ``(score, candidate)`` order and the first one is kept.

    Returns a one-element list ``[(candidate, score)]``, or ``[]`` when
    ``x`` is empty or the best score is greater than ``value``.
    """
    # Index-based key instead of a tuple-unpacking lambda: same ordering,
    # but valid syntax on every Python version.
    ranked = sorted(x.items(), key=lambda item: (item[1], item[0]))

    if get_debug():
        for candidate, score in ranked:
            debug("\t %s %s %s %d %f" % (candidate.m.get_class_name(),
                                         candidate.m.get_name(),
                                         candidate.m.get_descriptor(),
                                         candidate.m.get_length(),
                                         score))

    best = ranked[:1]
    # An empty mapping used to raise IndexError here; report "no match".
    if not best or best[0][1] > value:
        return []

    return best
def set_childs(self, abb):
    """Rebuild ``self.childs`` from ``self.bb.childs``.

    Each child is a 3-item sequence whose third item has a ``name``.  When
    that name is a key of ``abb`` the third item is replaced by
    ``abb[name]``; otherwise the child is kept unchanged.
    """
    def translate(edge):
        node = edge[2]
        if node.name in abb:
            mapped = abb[node.name]
            debug("SET %s %s " % (node, mapped))
            return (edge[0], edge[1], mapped)
        debug("SET ORIG %s" % str(edge))
        return edge

    self.childs = [translate(edge) for edge in self.bb.childs]
def _init_sort_methods(self):
    """Sort the candidates of every differing method and drop the unmatched.

    Calls ``sort`` on each entry of ``self.filters[DIFFMETHODS]`` with the
    configured sort function and threshold.  Entries for which ``sort``
    returns a value equal to ``False`` are moved from
    ``self.filters[DIFFMETHODS]`` to ``self.filters[DELETEMETHODS]``.
    """
    rejected = []
    for j in self.filters[DIFFMETHODS]:
        debug("%s %s %s %d" % (j.m.get_class_name(), j.m.get_name(),
                               j.m.get_descriptor(), j.m.get_length()))
        ret = j.sort(self.filters[BASE][FILTER_SORT_METH],
                     self.filters[BASE][FILTER_SORT_VALUE])
        # Equality test kept deliberately: the return type of ``sort`` is
        # not visible here, and ``not ret`` would also catch None.
        if ret == False:
            rejected.append(j)

    # Moved in a second pass so the list is not mutated while iterated.
    # The former ``pos = ....index(j)`` lookup was never used and only
    # added a redundant linear scan, so it is gone.
    for j in rejected:
        self.filters[DELETEMETHODS].append(j)
        self.filters[DIFFMETHODS].remove(j)
def filter_sort_meth_basic(x, value):
    """Return the best-scoring candidate of ``x`` if it is within ``value``.

    ``x`` maps candidate objects to numeric scores.  Candidates are ranked
    in ascending ``(score, candidate)`` order and the first one is kept.

    Returns a one-element list ``[(candidate, score)]``, or ``[]`` when
    ``x`` is empty or the best score is greater than ``value``.
    """
    # Index-based key instead of a tuple-unpacking lambda: same ordering,
    # but valid syntax on every Python version.
    ranked = sorted(x.items(), key=lambda pair: (pair[1], pair[0]))

    if get_debug():
        for meth, score in ranked:
            debug("\t %s %s %s %d %f" % (meth.m.get_class_name(),
                                         meth.m.get_name(),
                                         meth.m.get_descriptor(),
                                         meth.m.get_length(),
                                         score))

    head = ranked[:1]
    # An empty mapping used to raise IndexError here; report "no match".
    if not head or head[0][1] > value:
        return []

    return head
def _init_sort_methods(self):
    """Sort the candidates of every differing method and drop the unmatched.

    Calls ``sort`` on each entry of ``self.filters[DIFFMETHODS]`` with the
    configured sort function and threshold.  Entries for which ``sort``
    returns a value equal to ``False`` are moved from
    ``self.filters[DIFFMETHODS]`` to ``self.filters[DELETEMETHODS]``.
    """
    delete_methods = []
    for j in self.filters[DIFFMETHODS]:
        debug("%s %s %s %d" % (j.m.get_class_name(), j.m.get_name(),
                               j.m.get_descriptor(), j.m.get_length()))
        ret = j.sort(self.filters[BASE][FILTER_SORT_METH],
                     self.filters[BASE][FILTER_SORT_VALUE])
        # Equality test kept deliberately: the return type of ``sort`` is
        # not visible here, and ``not ret`` would also catch None.
        if ret == False:
            delete_methods.append(j)

    # Second pass so the list being iterated above is never mutated.  The
    # unused ``pos = ....index(j)`` scan from the earlier version has been
    # dropped; ``remove`` already raises ValueError for a missing entry.
    for j in delete_methods:
        self.filters[DELETEMETHODS].append(j)
        self.filters[DIFFMETHODS].remove(j)
def set_childs(self, abb):
    """Set block-level and instruction-level children.

    ``self.childs`` is taken from ``self.bb1.childs``.  Every instruction
    in ``self.ins`` that compares equal to the last instruction of
    ``self.bb2`` receives a ``childs`` attribute built from
    ``self.bb2.childs``, where the third item of each child is replaced by
    ``abb[name]`` whenever its ``name`` is a key of ``abb``.
    """
    def remap(child):
        target = child[2]
        if target.name in abb:
            replacement = abb[target.name]
            debug("SET %s %s" % (target, replacement))
            return (child[0], child[1], replacement)
        debug("SET ORIG %s" % str(child))
        return child

    self.childs = self.bb1.childs

    for instruction in self.ins:
        # Looked up per iteration on purpose: with an empty self.ins the
        # (possibly empty) bb2.ins is never indexed.
        if instruction == self.bb2.ins[-1]:
            instruction.childs = [remap(child) for child in self.bb2.childs]
def set_childs(self, abb):
    """Set block-level and instruction-level children.

    ``self.childs`` is taken from ``self.bb1.childs``.  Every instruction
    in ``self.ins`` that compares equal to the last instruction of
    ``self.bb2`` receives a ``childs`` attribute built from
    ``self.bb2.childs``, where the third item of each child is replaced by
    ``abb[name]`` whenever its ``name`` is a key of ``abb``.
    """
    def translate(edge):
        node = edge[2]
        if node.name in abb:
            mapped = abb[node.name]
            debug("SET %s %s" % (node, mapped))
            return (edge[0], edge[1], mapped)
        debug("SET ORIG %s" % str(edge))
        return edge

    self.childs = self.bb1.childs

    for ins in self.ins:
        # bb2.ins[-1] is read inside the loop so that an empty self.ins
        # never triggers the lookup.
        if ins == self.bb2.ins[-1]:
            ins.childs = [translate(edge) for edge in self.bb2.childs]
def _init_mark_methods(self):
    """Append one mark per differing and per matched method.

    Switches ``self.sim`` to ``XZ_COMPRESS`` first.  For each entry of
    ``self.filters[DIFFMETHODS]`` the similarity with its first sorted
    candidate is recomputed through ``quick_similarity`` and passed through
    the configured mark filter.  Each entry of
    ``self.filters[MATCHMETHODS]`` contributes the mark filter's result
    for ``0.0``.  All marks go to ``self.diff_methods_marks``.
    """
    # Different compression for the one-to-one comparison, to obtain a
    # more accurate percentage.
    self.sim.set_compress_type(XZ_COMPRESS)

    filters = self.filters

    # Marks for differing methods.
    for method in filters[DIFFMETHODS]:
        debug("%s %s %s" % (method.m.get_class_name(),
                            method.m.get_name(),
                            method.m.get_descriptor()))

        closest = method.get_meth_first_sort()
        raw_mark = method.quick_similarity(closest, filters[BASE][FILTER_SIM_METH])

        # The mark filter is what discards totally different methods.
        mark = filters[BASE][FILTER_MARK_METH](raw_mark)
        self.diff_methods_marks.append(mark)

    # Marks for perfectly matched methods; the filter is still invoked
    # once per method.
    for _ in filters[MATCHMETHODS]:
        mark = filters[BASE][FILTER_MARK_METH](0.0)
        self.diff_methods_marks.append(mark)
def _init_mark_methods(self):
    """Append one mark per differing and per matched method.

    Switches ``self.sim`` to ``XZ_COMPRESS`` first.  For each entry of
    ``self.filters[DIFFMETHODS]`` the similarity with its first sorted
    candidate is recomputed through ``quick_similarity`` and passed through
    the configured mark filter.  Each entry of
    ``self.filters[MATCHMETHODS]`` contributes the mark filter's result
    for ``0.0``.  All marks go to ``self.diff_methods_marks``.
    """
    # Use another compression for the one-to-one comparison so that the
    # resulting percentage is more accurate.
    self.sim.set_compress_type(XZ_COMPRESS)

    flt = self.filters

    # Differing methods.
    for diff_meth in flt[DIFFMETHODS]:
        debug("%s %s %s" % (diff_meth.m.get_class_name(),
                            diff_meth.m.get_name(),
                            diff_meth.m.get_descriptor()))

        first_match = diff_meth.get_meth_first_sort()
        similarity = diff_meth.quick_similarity(first_match, flt[BASE][FILTER_SIM_METH])

        # Filtering the mark eliminates totally different methods.
        filtered = flt[BASE][FILTER_MARK_METH](similarity)
        self.diff_methods_marks.append(filtered)

    # Matched methods: one filter call (on 0.0) per method.
    for _ in flt[MATCHMETHODS]:
        filtered = flt[BASE][FILTER_MARK_METH](0.0)
        self.diff_methods_marks.append(filtered)
def getDiff(C, X, Y, i, j, a, r):
    """Backtrack through table ``C`` and record how ``X`` turns into ``Y``.

    Starting from cell ``(i, j)`` the walk goes back to the origin.  For
    every element only present in ``Y`` a ``(index, element)`` pair is
    appended to ``a``; for every element only present in ``X`` a pair is
    appended to ``r``.  Pairs and debug lines are emitted from the start of
    the sequences to the end.  Elements must be single characters, since
    they are passed to ``ord`` for the debug output.

    ``C`` is indexed as ``C[i][j]``; presumably it is the LCS length table
    of ``X`` and ``Y`` -- confirm against the caller.

    The walk is iterative: the earlier recursive form needed one stack
    frame per step and exceeded the interpreter recursion limit on long
    sequences.
    """
    steps = []
    while True:
        if i > 0 and j > 0 and X[i - 1] == Y[j - 1]:
            steps.append((" ", None, i - 1, X[i - 1]))
            i -= 1
            j -= 1
        elif j > 0 and (i == 0 or C[i][j - 1] >= C[i - 1][j]):
            steps.append((" + ", a, j - 1, Y[j - 1]))
            j -= 1
        elif i > 0 and (j == 0 or C[i][j - 1] < C[i - 1][j]):
            steps.append((" - ", r, i - 1, X[i - 1]))
            i -= 1
        else:
            break

    # Steps were collected end-to-start; replay them start-to-end so the
    # output order matches the recursive unwinding.
    for prefix, target, index, element in reversed(steps):
        if target is not None:
            target.append((index, element))
        debug(prefix + "%02X" % ord(element))
def getDiff(C, X, Y, i, j, a, r):
    """Backtrack through table ``C`` and record how ``X`` turns into ``Y``.

    Starting from cell ``(i, j)`` the walk goes back to the origin.  For
    every element only present in ``Y`` a ``(index, element)`` pair is
    appended to ``a``; for every element only present in ``X`` a pair is
    appended to ``r``.  Pairs and debug lines are emitted from the start of
    the sequences to the end.  Elements must be single characters, since
    they are passed to ``ord`` for the debug output.

    ``C`` is indexed as ``C[i][j]``; presumably it is the LCS length table
    of ``X`` and ``Y`` -- confirm against the caller.

    Implemented with an explicit list instead of recursion, because the
    recursive form used one stack frame per step and failed with a
    recursion-depth error on long sequences.
    """
    trail = []
    while True:
        if i > 0 and j > 0 and X[i-1] == Y[j-1]:
            trail.append((" ", None, i-1, X[i-1]))
            i -= 1
            j -= 1
        elif j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
            trail.append((" + ", a, j-1, Y[j-1]))
            j -= 1
        elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
            trail.append((" - ", r, i-1, X[i-1]))
            i -= 1
        else:
            break

    # The trail runs end-to-start; emit it reversed to keep the original
    # start-to-end ordering of appended pairs and debug lines.
    for marker, sink, position, symbol in reversed(trail):
        if sink is not None:
            sink.append((position, symbol))
        debug(marker + "%02X" % ord(symbol))
def filter_diff_ins_basic(dbb, sim):
    """Compute the added/removed instructions of ``dbb`` and apply them.

    Both blocks of ``dbb`` are converted with ``toString`` (sharing the
    same two dictionaries), compared through ``LCS`` and ``getDiff``, and
    the resulting positions are turned into ``(index, 0, instruction)``
    triples.  The triples are wrapped in a ``DiffINS`` and handed to
    ``dbb.diff_ins``.  ``sim`` is accepted but not used.
    """
    # Shared between both toString calls.
    # NOTE(review): their exact role is defined by toString, not visible here.
    hS = {}
    rS = {}

    X = toString(dbb.bb1, hS, rS)
    Y = toString(dbb.bb2, hS, rS)

    debug("%s %d" % (repr(X), len(X)))
    debug("%s %d" % (repr(Y), len(Y)))

    table = LCS(X, Y)

    added = []
    removed = []
    getDiff(table, X, Y, len(X), len(Y), added, removed)
    debug(added)
    debug(removed)

    def collect(header, positions, block):
        """Log and build the (index, 0, instruction) triples for ``positions``."""
        debug(header)
        triples = []
        for entry in positions:
            index = entry[0]
            instruction = block.ins[index]
            debug(" \t %s %s %s" % (index, instruction.get_name(), instruction.get_operands()))
            triples.append((index, 0, instruction))
        return triples

    final_add = collect("DEBUG ADD", added, dbb.bb2)
    final_rm = collect("DEBUG REMOVE", removed, dbb.bb1)

    dbb.diff_ins(DiffINS(final_add, final_rm))
def filter_diff_ins_basic(dbb, sim):
    """Compute the added/removed instructions of ``dbb`` and apply them.

    Both blocks of ``dbb`` are converted with ``toString`` (sharing the
    same two dictionaries), compared through ``LCS`` and ``getDiff``, and
    the resulting positions are turned into ``(index, 0, instruction)``
    triples.  The triples are wrapped in a ``DiffINS`` and handed to
    ``dbb.diff_ins``.  ``sim`` is accepted but not used.
    """
    # Both dictionaries are shared by the two toString calls.
    # NOTE(review): their exact role is defined by toString, not visible here.
    hS = {}
    rS = {}

    X = toString(dbb.bb1, hS, rS)
    Y = toString(dbb.bb2, hS, rS)

    debug("%s %d" % (repr(X), len(X)))
    debug("%s %d" % (repr(Y), len(Y)))

    lcs_table = LCS(X, Y)

    additions = []
    removals = []
    getDiff(lcs_table, X, Y, len(X), len(Y), additions, removals)
    debug(additions)
    debug(removals)

    def gather(title, hits, source_block):
        """Log and build the (index, 0, instruction) triples for ``hits``."""
        debug(title)
        out = []
        for hit in hits:
            pos = hit[0]
            ins = source_block.ins[pos]
            debug(" \t %s %s %s" % (pos, ins.get_name(), ins.get_operands()))
            out.append((pos, 0, ins))
        return out

    final_add = gather("DEBUG ADD", additions, dbb.bb2)
    final_rm = gather("DEBUG REMOVE", removals, dbb.bb1)

    dbb.diff_ins(DiffINS(final_add, final_rm))
def diff(self, func_sim_bb, func_diff_ins):
    """Compute the differing and new basic blocks and store them on ``self``.

    ``func_sim_bb(block1, block2, self.sim)`` supplies the score of a pair
    of basic blocks; ``func_diff_ins(diff_block, self.sim)`` is called on
    every ``DiffBB`` that gets created.

    When ``self.sort_h`` is an empty list, ``self.dbb`` and ``self.nbb``
    are set to empty dicts and nothing else happens.  Otherwise
    ``self.dbb`` maps block names to ``DiffBB`` objects, ``self.nbb`` maps
    basic blocks to ``NewBB`` objects, ``set_childs`` is called on all of
    them and ``self.create_bbs()`` runs last.

    NOTE(review): the statements from ``sorted_bb = ...`` onwards are laid
    out inside the ``for i in self.sort_h`` loop; that nesting should be
    confirmed against the upstream source.  With a single-element
    ``sort_h`` either placement behaves the same.
    """
    if self.sort_h == [] :
        self.dbb = {}
        self.nbb = {}
        return

    bb1 = self.bb

    ### Dict for diff basic blocks
    ### vm1 basic block : vm2 basic blocks -> value (0.0 to 1.0)
    diff_bb = {}

    ### List to get directly all diff basic blocks
    direct_diff_bb = []

    ### Dict for new basic blocks
    new_bb = {}

    ### Reverse Dict with matches diff basic blocks
    associated_bb = {}

    for b1 in bb1 :
        # Starts as a dict of candidate scores; replaced below by a
        # (block, score) tuple, or deleted when an exact match exists.
        diff_bb[ bb1[ b1 ] ] = {}

        debug("%s 0x%x" % (b1, bb1[ b1 ].basic_block.end))
        for i in self.sort_h :
            bb2 = i[0].bb
            b_z = diff_bb[ bb1[ b1 ] ]

            bb2hash = i[0].bb_sha256

            # If b1 is in bb2 :
            # we can have one or more identical basic blocks to b1, we must add them
            if bb1[ b1 ].get_hash() in bb2hash :
                for equal_bb in bb2hash[ bb1[ b1 ].get_hash() ] :
                    b_z[ equal_bb.basic_block.name ] = 0.0

            # If b1 is not in bb2 :
            # we must check similarities between all bb2
            else :
                for b2 in bb2 :
                    b_z[ b2 ] = func_sim_bb( bb1[ b1 ], bb2[ b2 ], self.sim )

            # Ascending by (score, key): the closest candidate comes first.
            sorted_bb = sorted(b_z.iteritems(), key=lambda (k,v): (v,k))

            debug("\t\t%s" % sorted_bb[:2])

            for new_diff in sorted_bb :
                # Every candidate key is associated with the current block;
                # a later b1 overwrites an earlier association.
                associated_bb[ new_diff[0] ] = bb1[ b1 ].basic_block

                if new_diff[1] == 0.0 :
                    direct_diff_bb.append( new_diff[0] )

            if sorted_bb[0][1] != 0.0 :
                # No exact match: keep the closest candidate and its score.
                diff_bb[ bb1[ b1 ] ] = (bb2[ sorted_bb[0][0] ], sorted_bb[0][1])
                direct_diff_bb.append( sorted_bb[0][0] )
            else :
                # Exact match: this block is not a diff block.
                del diff_bb[ bb1[ b1 ] ]

    # Blocks never selected above are treated as new.
    for i in self.sort_h :
        bb2 = i[0].bb
        for b2 in bb2 :
            if b2 not in direct_diff_bb :
                new_bb[ b2 ] = bb2[ b2 ]

    dbb = {}
    nbb = {}
    # Add all different basic blocks
    for d in diff_bb :
        dbb[ d.basic_block.name ] = DiffBB( d.basic_block, diff_bb[ d ][0].basic_block, diff_bb[ d ] )

    # Add all new basic blocks
    for n in new_bb :
        nbb[ new_bb[ n ].basic_block ] = NewBB( new_bb[ n ].basic_block )
        # A new block must not be remapped through associated_bb.
        if n in associated_bb :
            del associated_bb[ n ]

    self.dbb = dbb
    self.nbb = nbb

    # Found diff instructions
    for d in dbb :
        func_diff_ins( dbb[d], self.sim )

    # Set new childs for diff basic blocks
    # The instructions will be tag with a new flag "childs"
    for d in dbb :
        dbb[ d ].set_childs( associated_bb )

    # Set new childs for new basic blocks
    for d in nbb :
        nbb[ d ].set_childs( associated_bb )

    # Create and tag all (orig/diff/new) basic blocks
    self.create_bbs()
def diff(self, func_sim_bb, func_diff_ins):
    """Compute the differing and new basic blocks and store them on ``self``.

    ``func_sim_bb(block1, block2, self.sim)`` supplies the score of a pair
    of basic blocks; ``func_diff_ins(diff_block, self.sim)`` is called on
    every ``DiffBB`` that gets created.

    When ``self.sort_h`` is an empty list, ``self.dbb`` and ``self.nbb``
    are set to empty dicts and nothing else happens.  Otherwise
    ``self.dbb`` maps block names to ``DiffBB`` objects, ``self.nbb`` maps
    basic blocks to ``NewBB`` objects, ``set_childs`` is called on all of
    them and ``self.create_bbs()`` runs last.

    NOTE(review): the statements from ``sorted_bb = ...`` onwards are laid
    out inside the ``for i in self.sort_h`` loop; that nesting should be
    confirmed against the upstream source.  With a single-element
    ``sort_h`` either placement behaves the same.
    """
    if self.sort_h == []:
        self.dbb = {}
        self.nbb = {}
        return

    bb1 = self.bb

    ### Dict for diff basic blocks
    ### vm1 basic block : vm2 basic blocks -> value (0.0 to 1.0)
    diff_bb = {}

    ### List to get directly all diff basic blocks
    direct_diff_bb = []

    ### Dict for new basic blocks
    new_bb = {}

    ### Reverse Dict with matches diff basic blocks
    associated_bb = {}

    for b1 in bb1:
        # Starts as a dict of candidate scores; replaced below by a
        # (block, score) tuple, or deleted when an exact match exists.
        diff_bb[bb1[b1]] = {}

        debug("%s 0x%x" % (b1, bb1[b1].basic_block.end))
        for i in self.sort_h:
            bb2 = i[0].bb
            b_z = diff_bb[bb1[b1]]

            bb2hash = i[0].bb_sha256

            # If b1 is in bb2 :
            # we can have one or more identical basic blocks to b1, we must add them
            if bb1[b1].get_hash() in bb2hash:
                for equal_bb in bb2hash[bb1[b1].get_hash()]:
                    b_z[equal_bb.basic_block.name] = 0.0

            # If b1 is not in bb2 :
            # we must check similarities between all bb2
            else:
                for b2 in bb2:
                    b_z[b2] = func_sim_bb(bb1[b1], bb2[b2], self.sim)

            # Ascending by (score, key): the closest candidate comes first.
            sorted_bb = sorted(b_z.iteritems(), key=lambda (k, v): (v, k))

            debug("\t\t%s" % sorted_bb[:2])

            for new_diff in sorted_bb:
                # Every candidate key is associated with the current block;
                # a later b1 overwrites an earlier association.
                associated_bb[new_diff[0]] = bb1[b1].basic_block

                if new_diff[1] == 0.0:
                    direct_diff_bb.append(new_diff[0])

            if sorted_bb[0][1] != 0.0:
                # No exact match: keep the closest candidate and its score.
                diff_bb[bb1[b1]] = (bb2[sorted_bb[0][0]], sorted_bb[0][1])
                direct_diff_bb.append(sorted_bb[0][0])
            else:
                # Exact match: this block is not a diff block.
                del diff_bb[bb1[b1]]

    # Blocks never selected above are treated as new.
    for i in self.sort_h:
        bb2 = i[0].bb
        for b2 in bb2:
            if b2 not in direct_diff_bb:
                new_bb[b2] = bb2[b2]

    dbb = {}
    nbb = {}
    # Add all different basic blocks
    for d in diff_bb:
        dbb[d.basic_block.name] = DiffBB(d.basic_block, diff_bb[d][0].basic_block, diff_bb[d])

    # Add all new basic blocks
    for n in new_bb:
        nbb[new_bb[n].basic_block] = NewBB(new_bb[n].basic_block)
        # A new block must not be remapped through associated_bb.
        if n in associated_bb:
            del associated_bb[n]

    self.dbb = dbb
    self.nbb = nbb

    # Found diff instructions
    for d in dbb:
        func_diff_ins(dbb[d], self.sim)

    # Set new childs for diff basic blocks
    # The instructions will be tag with a new flag "childs"
    for d in dbb:
        dbb[d].set_childs(associated_bb)

    # Set new childs for new basic blocks
    for d in nbb:
        nbb[d].set_childs(associated_bb)

    # Create and tag all (orig/diff/new) basic blocks
    self.create_bbs()