def test_get_similars_basic(fdb_mem):
    """
    Exercise get_similars on a database that is filled step by step.

    Checks the empty database, one stored function, two stored functions
    (order, hash and signature of the results) and the num_similars limit.
    """
    # On an empty database the call must simply work and return a list:
    found = fdb_mem.get_similars(b'adfasdfasdf',5)
    assert isinstance(found, list)

    # Store one function; querying with its own data should give exactly
    # one result:
    fdb_mem.add_function('func_name', b'func_data', 'func_comment')
    found = fdb_mem.get_similars(b'func_data', num_similars=3)
    assert len(found) == 1

    # Store a second function and query with its data. Both functions are
    # expected back, with the second function leading the list:
    fdb_mem.add_function('func_name2', b'func_data2', 'func_comment2')
    found = fdb_mem.get_similars(b'func_data2', num_similars=3)
    assert len(found) == 2
    top, runner_up = found[0], found[1]

    # Only the first entry carries the strong hash of the queried data:
    queried_hash = strong_hash(b'func_data2')
    assert top.func_hash == queried_hash
    assert runner_up.func_hash != queried_hash

    # The first entry also carries the signature of the queried data:
    assert top.func_sig == sign(b'func_data2', NUM_HASHES)

    # Two functions are stored, but asking for a single similar must return
    # a single one:
    found = fdb_mem.get_similars(b'func_data', num_similars=1)
    assert len(found) == 1
def add_function(self, func_name, func_data, func_comment):
    """
    Add a (Reversed) function to the database.

    A row is written to the funcs table holding strong_hash(func_data) in
    func_hash, the given name and comment, and the self._num_hashes values
    of sign(func_data) in the columns c1..cN. The statement is an
    INSERT OR REPLACE.

    When sqlite3 raises an error, the current transaction is rolled back,
    a new one is begun, and the error is NOT propagated to the caller.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        digest = strong_hash(func_data)

        # One ',cN ' column fragment and one ',?' placeholder for every
        # signature value:
        sig_columns = ''.join(
            ',c' + str(col) + ' '
            for col in range(1, self._num_hashes + 1))
        sig_marks = ',?' * self._num_hashes
        statement = (
            'INSERT OR REPLACE into funcs (func_hash,func_name,func_comment'
            + sig_columns
            + ') values (?,?,?' + sig_marks + ');')

        row_values = \
            [sqlite3.Binary(digest), func_name, func_comment] + signature
        cursor.execute(statement, row_values)

        # Flush the pending functions to the db once the batch is big
        # enough.
        # NOTE(review): _funcs_pending is only read in this method;
        # presumably it is updated elsewhere -- confirm.
        if self._funcs_pending > FUNCTION_BATCH:
            self.commit_funcs()
    except sqlite3.Error:
        # Drop whatever the failed transaction held and open a fresh one:
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')
def add_function(self,func_name,func_data,func_comment):
    """
    Add a (Reversed) function to the database.

    A row is written to the funcs table holding strong_hash(func_data) in
    func_hash, the given name and comment, and the self._num_hashes values
    of sign(func_data) in the columns c1..cN. The statement is an
    INSERT OR REPLACE.

    When sqlite3 raises an error, the current transaction is rolled back,
    a new one is begun, and the error is NOT propagated to the caller.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        digest = strong_hash(func_data)

        # One ',cN ' column fragment and one ',?' placeholder for every
        # signature value:
        sig_columns = ''.join(
            ',c' + str(col) + ' '
            for col in range(1, self._num_hashes + 1))
        sig_marks = ',?' * self._num_hashes
        statement = (
            'INSERT OR REPLACE into funcs (func_hash,func_name,func_comment'
            + sig_columns
            + ') values (?,?,?' + sig_marks + ');')

        row_values = \
            [sqlite3.Binary(digest), func_name, func_comment] + signature
        cursor.execute(statement, row_values)

        # Flush the pending functions to the db once the batch is big
        # enough.
        # NOTE(review): _funcs_pending is only read in this method;
        # presumably it is updated elsewhere -- confirm.
        if self._funcs_pending > FUNCTION_BATCH:
            self.commit_funcs()
    except sqlite3.Error:
        # Drop whatever the failed transaction held and open a fresh one:
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')
def test_basic_strong_hash():
    """
    Check the basics of strong_hash: it returns bytes, repeats its result
    for the same input, and gives different results of equal length for
    two different inputs.
    """
    # It can be called and hands back bytes:
    digest = strong_hash(b'adfklasjdflkajsdflkajsdf')
    assert isinstance(digest, bytes)

    # Hashing the same input twice gives the same value:
    first = strong_hash(b'34908523904kf9034fk9032kf903f4k')
    second = strong_hash(b'34908523904kf9034fk9032kf903f4k')
    assert first == second

    # Inputs that differ only in their last byte hash differently...
    hash_a = strong_hash(b'34908523904kf9034fk9032kf903f4ka')
    hash_b = strong_hash(b'34908523904kf9034fk9032kf903f4kb')
    assert hash_a != hash_b

    # ...while both hashes have the same length:
    assert len(hash_a) == len(hash_b)
def test_basic_strong_hash():
    """
    Check the basics of strong_hash: it returns bytes, repeats its result
    for the same input, and gives different results of equal length for
    two different inputs.
    """
    # It can be called and hands back bytes:
    digest = strong_hash(b'adfklasjdflkajsdflkajsdf')
    assert isinstance(digest, bytes)

    # Hashing the same input twice gives the same value:
    first = strong_hash(b'34908523904kf9034fk9032kf903f4k')
    second = strong_hash(b'34908523904kf9034fk9032kf903f4k')
    assert first == second

    # Inputs that differ only in their last byte hash differently...
    hash_a = strong_hash(b'34908523904kf9034fk9032kf903f4ka')
    hash_b = strong_hash(b'34908523904kf9034fk9032kf903f4kb')
    assert hash_a != hash_b

    # ...while both hashes have the same length:
    assert len(hash_a) == len(hash_b)
def test_match_special_func(self):
    """
    Query the database with the exact data of the special function.

    The special function is expected to be the only result, with all of
    its stored fields intact and a grade equal to NUM_HASHES.
    """
    special_data = self.special_func_data
    sims = self.fdb_mem.get_similars(special_data,5)

    # Nothing but the special function should come back:
    assert len(sims) == 1
    match = sims[0]

    assert match.func_hash == strong_hash(special_data)
    assert match.func_name == self.special_func_name
    assert match.func_comment == self.special_func_comment
    assert match.func_sig == sign(special_data, NUM_HASHES)
    # Identical data, so every signature value is expected to match:
    assert match.func_grade == NUM_HASHES
def test_match_like_special_func(self):
    """
    Query the database with a slightly altered copy of the special
    function's data.

    The special function is still expected to be the only result, but with
    a grade below NUM_HASHES.
    """
    special_data = self.special_func_data

    # Apply five random single-byte changes to the data:
    altered = special_data
    for _ in range(5):
        altered = change_random_byte(altered)

    sims = self.fdb_mem.get_similars(altered,5)

    # Nothing but the special function should come back:
    assert len(sims) == 1
    match = sims[0]

    # The stored fields are those of the unaltered special function:
    assert match.func_hash == strong_hash(special_data)
    assert match.func_name == self.special_func_name
    assert match.func_comment == self.special_func_comment
    assert match.func_sig == sign(special_data, NUM_HASHES)
    # The query data was altered, so a full grade is not expected:
    assert match.func_grade < NUM_HASHES
def get_similars(self, func_data, num_similars):
    """
    Get a list of at most num_similars functions similar to func_data.

    Candidates are the rows of funcs that share at least one signature
    column value with sign(func_data), plus the row whose func_hash equals
    strong_hash(func_data). Each candidate is graded by the number of
    signature columns equal to the query signature, and the num_similars
    highest graded rows are returned as DBSimilar records in descending
    grade order. A row whose hash equals the query hash is moved to the
    front of the list.

    When sqlite3 raises an error, the current transaction is rolled back,
    a new one is begun, and None is returned.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        wanted_hash = strong_hash(func_data)

        # Names of the signature columns: c1, c2, ..., cN
        sig_columns = \
            ['c' + str(col) for col in range(1, self._num_hashes + 1)]

        # One SELECT per signature column finds the candidate rows; a last
        # SELECT adds the exact match by strong hash:
        candidate_selects = \
            ['SELECT * FROM funcs WHERE ' + col + '=?' for col in sig_columns]
        candidate_selects.append('SELECT * FROM funcs WHERE func_hash=?')
        candidates = "\nUNION\n".join(candidate_selects)

        # (c1=?) + (c2=?) + ... counts how many signature values of a row
        # equal the query signature. That count is the grade of the row.
        grade_expr = ' + '.join('(' + col + '=?)' for col in sig_columns)

        query = ''.join([
            'SELECT func_hash,func_name,func_comment,',
            ",".join(sig_columns),
            ',(' + grade_expr + ') AS grade ',
            'FROM (' + candidates + ') ',
            # Keep only the num_similars rows with the highest grade:
            'ORDER BY grade DESC LIMIT ?',
        ])

        # Placeholders appear in this order: the grade expression, the
        # candidate SELECTs, the strong hash, and finally the LIMIT.
        params = signature + signature + [wanted_hash, num_similars]
        cursor.execute(query, params)

        similars = []
        for row in cursor.fetchall():
            row_hash, row_name, row_comment = row[:3]
            entry = DBSimilar(
                func_hash=row_hash,
                func_name=row_name,
                func_comment=row_comment,
                # Everything between the comment and the trailing grade
                # column is the signature:
                func_sig=list(row[3:-1]),
                func_grade=row[-1])

            # An exact match (by strong hash) goes to the head of the list,
            # any other row goes to the tail:
            position = 0 if row_hash == wanted_hash else len(similars)
            similars.insert(position, entry)

        return similars
    except sqlite3.Error:
        # Drop whatever the failed transaction held and open a fresh one.
        # Nothing is returned on this path, so the caller receives None.
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')
def get_similars(self,func_data,num_similars):
    """
    Get a list of at most num_similars functions similar to func_data.

    Candidates are the rows of funcs that share at least one signature
    column value with sign(func_data), plus the row whose func_hash equals
    strong_hash(func_data). Each candidate is graded by the number of
    signature columns equal to the query signature, and the num_similars
    highest graded rows are returned as DBSimilar records in descending
    grade order. A row whose hash equals the query hash is moved to the
    front of the list.

    When sqlite3 raises an error, the current transaction is rolled back,
    a new one is begun, and None is returned.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        wanted_hash = strong_hash(func_data)

        # Names of the signature columns: c1, c2, ..., cN
        sig_columns = \
            ['c' + str(col) for col in range(1, self._num_hashes + 1)]

        # One SELECT per signature column finds the candidate rows; a last
        # SELECT adds the exact match by strong hash:
        candidate_selects = \
            ['SELECT * FROM funcs WHERE ' + col + '=?' for col in sig_columns]
        candidate_selects.append('SELECT * FROM funcs WHERE func_hash=?')
        candidates = "\nUNION\n".join(candidate_selects)

        # (c1=?) + (c2=?) + ... counts how many signature values of a row
        # equal the query signature. That count is the grade of the row.
        grade_expr = ' + '.join('(' + col + '=?)' for col in sig_columns)

        query = ''.join([
            'SELECT func_hash,func_name,func_comment,',
            ",".join(sig_columns),
            ',(' + grade_expr + ') AS grade ',
            'FROM (' + candidates + ') ',
            # Keep only the num_similars rows with the highest grade:
            'ORDER BY grade DESC LIMIT ?',
        ])

        # Placeholders appear in this order: the grade expression, the
        # candidate SELECTs, the strong hash, and finally the LIMIT.
        params = signature + signature + [wanted_hash, num_similars]
        cursor.execute(query, params)

        similars = []
        for row in cursor.fetchall():
            row_hash, row_name, row_comment = row[:3]
            entry = DBSimilar(
                func_hash=row_hash,
                func_name=row_name,
                func_comment=row_comment,
                # Everything between the comment and the trailing grade
                # column is the signature:
                func_sig=list(row[3:-1]),
                func_grade=row[-1])

            # An exact match (by strong hash) goes to the head of the list,
            # any other row goes to the tail:
            position = 0 if row_hash == wanted_hash else len(similars)
            similars.insert(position, entry)

        return similars
    except sqlite3.Error:
        # Drop whatever the failed transaction held and open a fresh one.
        # Nothing is returned on this path, so the caller receives None.
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')