def test_sign_deterministic():
    """
    Signing the same data twice with the same num_perms must give equal
    results.
    """
    data = b'3kl4jfklsdjfklasjf8934j9sjdf9adfkalsdjflkasjdflkasdf'
    first = sign(data, num_perms=20)
    second = sign(data, num_perms=20)
    assert first == second
def test_sign_similars():
    """
    Signatures of near-identical inputs should score high with calc_sim,
    while signatures of unrelated inputs should score zero.
    """
    num_perms = 16

    # The two inputs differ in a single byte ('2' vs '1'):
    near_a = sign(b'hello world he2llo world', num_perms)
    near_b = sign(b'hello world he1llo world', num_perms)
    assert calc_sim(near_a, near_b) > 6

    # Two inputs with nothing in common:
    far_a = sign(
        b'akjdflkasjflkasjlfkasjdflkjaslkdfjaslkjfsaklfdjaslkjdfsf', num_perms)
    far_b = sign(
        b'4039582903850923850928345982309589023845823458230945', num_perms)
    assert calc_sim(far_a, far_b) == 0
def test_sign_similars():
    """
    Signatures of near-identical inputs should score high with calc_sim,
    while signatures of unrelated inputs should score zero.
    """
    num_perms = 16

    # The two inputs differ in a single byte ('2' vs '1'):
    near_a = sign(b'hello world he2llo world', num_perms)
    near_b = sign(b'hello world he1llo world', num_perms)
    assert calc_sim(near_a, near_b) > 6

    # Two inputs with nothing in common:
    far_a = sign(
        b'akjdflkasjflkasjlfkasjdflkjaslkdfjaslkjfsaklfdjaslkjdfsf', num_perms)
    far_b = sign(
        b'4039582903850923850928345982309589023845823458230945', num_perms)
    assert calc_sim(far_a, far_b) == 0
def test_short_input():
    """
    Check how sign and slow_sign react to inputs around the minimal length
    (inputs below 4 bytes are expected to be rejected).
    """
    signers = (sign, slow_sign)

    # A 3 byte input should make each implementation raise Catalog1Error:
    for signer in signers:
        with pytest.raises(Catalog1Error):
            signer(b'123', 16)

    # A 4 byte input should be accepted by each implementation:
    for signer in signers:
        signer(b'1234', 16)
def test_get_similars_basic(fdb_mem):
    """
    Exercise get_similars while functions are added to the database one by
    one, checking the number and order of the results.
    """
    # Before this test adds anything, the call should still return a list:
    found = fdb_mem.get_similars(b'adfasdfasdf', 5)
    assert isinstance(found, list)

    # Store one function and query with exactly its data. A single result
    # is expected:
    fdb_mem.add_function('func_name', b'func_data', 'func_comment')
    found = fdb_mem.get_similars(b'func_data', num_similars=3)
    assert len(found) == 1

    # Store a second function and query with its data. Both functions are
    # expected back, with the second function first on the list:
    fdb_mem.add_function('func_name2', b'func_data2', 'func_comment2')
    found = fdb_mem.get_similars(b'func_data2', num_similars=3)
    assert len(found) == 2

    # Only the first result should carry the hash of the queried data:
    expected_hash = strong_hash(b'func_data2')
    assert found[0].func_hash == expected_hash
    assert found[1].func_hash != expected_hash

    # The first result should carry the signature of the queried data:
    assert found[0].func_sig == sign(b'func_data2', NUM_HASHES)

    # Ask for a single similar function. Only one result should come back,
    # although the DB holds two functions:
    found = fdb_mem.get_similars(b'func_data', num_similars=1)
    assert len(found) == 1
def add_function(self, func_name, func_data, func_comment):
    """
    Add a (Reversed) function to the database.

    The function is stored in the funcs table under the strong hash of
    func_data, together with func_name, func_comment and one column
    (c1, c2, ...) per entry of the signature of func_data. The statement is
    an INSERT OR REPLACE.

    If sqlite3 raises an error, the current transaction is rolled back, a
    new one is started, and the error is not propagated to the caller.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        func_hash = strong_hash(func_data)

        # Column list ',c1 ,c2 ,...' and one '?' placeholder per column:
        sig_columns = ''.join(
            [',c' + str(idx + 1) + ' ' for idx in range(self._num_hashes)])
        sig_placeholders = ',?' * self._num_hashes

        insert_sql = (
            'INSERT OR REPLACE into funcs (func_hash,func_name,func_comment'
            + sig_columns
            + ') values (?,?,?' + sig_placeholders + ');')

        fixed_values = [sqlite3.Binary(func_hash), func_name, func_comment]
        cursor.execute(insert_sql, fixed_values + signature)

        # Commit the inserted functions once enough of them are pending.
        # NOTE(review): _funcs_pending is not updated in this method;
        # presumably it is maintained elsewhere -- confirm.
        if self._funcs_pending > FUNCTION_BATCH:
            self.commit_funcs()
    except sqlite3.Error:
        # Give up previous transaction, and start a new one.
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')
def test_short_input():
    """
    Check how sign and slow_sign react to inputs around the minimal length
    (inputs below 4 bytes are expected to be rejected).
    """
    signers = (sign, slow_sign)

    # A 3 byte input should make each implementation raise Catalog1Error:
    for signer in signers:
        with pytest.raises(Catalog1Error):
            signer(b'123', 16)

    # A 4 byte input should be accepted by each implementation:
    for signer in signers:
        signer(b'1234', 16)
def add_function(self, func_name, func_data, func_comment):
    """
    Add a (Reversed) function to the database.

    The function is stored in the funcs table under the strong hash of
    func_data, together with func_name, func_comment and one column
    (c1, c2, ...) per entry of the signature of func_data. The statement is
    an INSERT OR REPLACE.

    If sqlite3 raises an error, the current transaction is rolled back, a
    new one is started, and the error is not propagated to the caller.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        func_hash = strong_hash(func_data)

        # Column list ',c1 ,c2 ,...' and one '?' placeholder per column:
        sig_columns = ''.join(
            [',c' + str(idx + 1) + ' ' for idx in range(self._num_hashes)])
        sig_placeholders = ',?' * self._num_hashes

        insert_sql = (
            'INSERT OR REPLACE into funcs (func_hash,func_name,func_comment'
            + sig_columns
            + ') values (?,?,?' + sig_placeholders + ');')

        fixed_values = [sqlite3.Binary(func_hash), func_name, func_comment]
        cursor.execute(insert_sql, fixed_values + signature)

        # Commit the inserted functions once enough of them are pending.
        # NOTE(review): _funcs_pending is not updated in this method;
        # presumably it is maintained elsewhere -- confirm.
        if self._funcs_pending > FUNCTION_BATCH:
            self.commit_funcs()
    except sqlite3.Error:
        # Give up previous transaction, and start a new one.
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')
def test_sign_long_data():
    """
    Sign a long input and check the length and entries of the signature.
    """
    long_data = b'asdfklasjdf' * 40
    signature = sign(long_data, num_perms=20)

    # One entry per requested permutation, each accepted by isdword:
    assert len(signature) == 20
    assert all(isdword(entry) for entry in signature)
def test_sign_long_data():
    """
    Sign a long input and check the length and entries of the signature.
    """
    long_data = b'asdfklasjdf' * 40
    signature = sign(long_data, num_perms=20)

    # One entry per requested permutation, each accepted by isdword:
    assert len(signature) == 20
    assert all(isdword(entry) for entry in signature)
def test_sign_basic():
    """
    Sign a few strings with different num_perms values and check that each
    signature has num_perms entries, all accepted by isdword.
    """
    cases = (
        (b'afdasdklfjaskljdfaklsjdf', 16),
        (b'3kl4jfklsdjfklasjf8934j9sjdf9adfkalsdjflkasjdflkasdf', 20),
        (b'4095809348529384523904582390485092384509283', 32),
    )
    for data, num_perms in cases:
        signature = sign(data, num_perms=num_perms)
        assert len(signature) == num_perms
        for entry in signature:
            assert isdword(entry)
def test_sign_basic():
    """
    Sign a few strings with different num_perms values and check that each
    signature has num_perms entries, all accepted by isdword.
    """
    cases = (
        (b'afdasdklfjaskljdfaklsjdf', 16),
        (b'3kl4jfklsdjfklasjf8934j9sjdf9adfkalsdjflkasjdflkasdf', 20),
        (b'4095809348529384523904582390485092384509283', 32),
    )
    for data, num_perms in cases:
        signature = sign(data, num_perms=num_perms)
        assert len(signature) == num_perms
        for entry in signature:
            assert isdword(entry)
def test_slow_matches_fast():
    """
    Make sure that the two implementations of catalog1 (The python and the C
    one) match.

    Every input in datas is signed with both slow_sign and sign using 4
    permutations, and the two results must be equal.
    """
    datas = [
        b'klsfjsalkdfjlksajfdlksaj340985390485ksldjflksdflksdjf',
        b'abc' * 205,
        b'349085092384590903485309485' * 300,
        # This input used to be assigned to a local named `data`, which the
        # loop variable below overwrote before it was ever signed. It is
        # part of the list so that it is really checked:
        b'kslajflksajfaiosueroiqwuroiqwer9034851283904lkfjsalkfasdfsf',
    ]

    for data in datas:
        assert slow_sign(data, 4) == sign(data, 4)
def test_match_special_func(self):
    """
    Look up functions similar to the special function, using its exact
    data. The special function itself is expected to be the only result,
    with the highest possible grade.
    """
    sims = self.fdb_mem.get_similars(self.special_func_data, 5)

    # We assume that only the special function will return:
    assert len(sims) == 1

    match = sims[0]
    assert match.func_hash == strong_hash(self.special_func_data)
    assert match.func_name == self.special_func_name
    assert match.func_comment == self.special_func_comment
    assert match.func_sig == sign(self.special_func_data, NUM_HASHES)
    # Exact data, so every signature entry is expected to match:
    assert match.func_grade == NUM_HASHES
def test_slow_matches_fast():
    """
    Make sure that the two implementations of catalog1 (The python and the C
    one) match.

    Every input in datas is signed with both slow_sign and sign using 4
    permutations, and the two results must be equal.
    """
    datas = [
        b'klsfjsalkdfjlksajfdlksaj340985390485ksldjflksdflksdjf',
        b'abc' * 205,
        b'349085092384590903485309485' * 300,
        # This input used to be assigned to a local named `data`, which the
        # loop variable below overwrote before it was ever signed. It is
        # part of the list so that it is really checked:
        b'kslajflksajfaiosueroiqwuroiqwer9034851283904lkfjsalkfasdfsf',
    ]

    for data in datas:
        assert slow_sign(data, 4) == sign(data, 4)
def test_match_like_special_func(self):
    """
    Look up functions similar to a slightly modified copy of the special
    function. The special function is expected to be the only result, with
    a grade below the maximum.
    """
    # Apply change_random_byte five times, starting from the special
    # function's data:
    mutated = self.special_func_data
    for _ in range(5):
        mutated = change_random_byte(mutated)

    sims = self.fdb_mem.get_similars(mutated, 5)

    # We assume that only the special function will return:
    assert len(sims) == 1

    match = sims[0]
    assert match.func_hash == strong_hash(self.special_func_data)
    assert match.func_name == self.special_func_name
    assert match.func_comment == self.special_func_comment
    assert match.func_sig == sign(self.special_func_data, NUM_HASHES)
    # The queried data was modified, so a full grade is not expected:
    assert match.func_grade < NUM_HASHES
def get_dist(i):
    """
    Return num_matches between the signature of res[i] and the signature of
    f1 (computed with NUM_HASHES).

    res and f1 are taken from the enclosing scope.
    """
    candidate_sig = res[i].func_sig
    reference_sig = sign(f1, NUM_HASHES)
    return num_matches(candidate_sig, reference_sig)
def get_similars(self, func_data, num_similars):
    """
    Get a list of at most num_similars functions similar to a given
    function, ordered by similarity. The first element is the most similar
    one.

    Every returned element is a DBSimilar holding the stored function's
    hash, name, comment, signature and grade. The grade is the number of
    signature columns that equal the matching entry of func_data's
    signature.

    If sqlite3 raises an error, the current transaction is rolled back, a
    new one is started, and None is returned implicitly.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        func_hash = strong_hash(func_data)

        # Names of the signature columns: c1, c2, ...
        columns = ['c' + str(idx + 1) for idx in range(self._num_hashes)]

        # Candidates: every row sharing at least one signature column with
        # the query, plus the row with the exact strong hash (if any):
        candidate_queries = [
            'SELECT * FROM funcs WHERE ' + col + '=?' for col in columns]
        candidate_queries.append('SELECT * FROM funcs WHERE func_hash=?')
        candidates = "\nUNION\n".join(candidate_queries)

        # The grade of a row is (c1=sig[0]) + (c2=sig[1]) + ..., which is
        # the amount of signature entries that match:
        grade_expr = ' + '.join(['(' + col + '=?)' for col in columns])

        # Pick the num_similars candidates with the highest grade:
        query = ''.join([
            'SELECT func_hash,func_name,func_comment,',
            ",".join(columns),
            ',(' + grade_expr + ') AS grade ',
            'FROM (' + candidates + ') ',
            'ORDER BY grade DESC LIMIT ?',
        ])

        # Placeholders appear in this order: the grade expression, the
        # candidate selects, the strong hash select, and the limit.
        cursor.execute(
            query, signature + signature + [func_hash, num_similars])

        similars = []
        for row in cursor.fetchall():
            row_hash, row_name, row_comment = row[:3]
            entry = DBSimilar(
                func_hash=row_hash,
                func_name=row_name,
                func_comment=row_comment,
                # Everything between the first three columns and the
                # trailing grade column is the signature:
                func_sig=list(row[3:-1]),
                func_grade=row[-1])

            # An exact match (by strong hash) goes to the front of the
            # list. Any other result is appended to the end.
            if row_hash == func_hash:
                similars.insert(0, entry)
            else:
                similars.append(entry)

        return similars
    except sqlite3.Error:
        # Give up previous transaction, and start a new one.
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')
def get_similars(self, func_data, num_similars):
    """
    Get a list of at most num_similars functions similar to a given
    function, ordered by similarity. The first element is the most similar
    one.

    Every returned element is a DBSimilar holding the stored function's
    hash, name, comment, signature and grade. The grade is the number of
    signature columns that equal the matching entry of func_data's
    signature.

    If sqlite3 raises an error, the current transaction is rolled back, a
    new one is started, and None is returned implicitly.
    """
    self._check_is_open()
    cursor = self._conn.cursor()
    try:
        signature = sign(func_data, self._num_hashes)
        func_hash = strong_hash(func_data)

        # Names of the signature columns: c1, c2, ...
        columns = ['c' + str(idx + 1) for idx in range(self._num_hashes)]

        # Candidates: every row sharing at least one signature column with
        # the query, plus the row with the exact strong hash (if any):
        candidate_queries = [
            'SELECT * FROM funcs WHERE ' + col + '=?' for col in columns]
        candidate_queries.append('SELECT * FROM funcs WHERE func_hash=?')
        candidates = "\nUNION\n".join(candidate_queries)

        # The grade of a row is (c1=sig[0]) + (c2=sig[1]) + ..., which is
        # the amount of signature entries that match:
        grade_expr = ' + '.join(['(' + col + '=?)' for col in columns])

        # Pick the num_similars candidates with the highest grade:
        query = ''.join([
            'SELECT func_hash,func_name,func_comment,',
            ",".join(columns),
            ',(' + grade_expr + ') AS grade ',
            'FROM (' + candidates + ') ',
            'ORDER BY grade DESC LIMIT ?',
        ])

        # Placeholders appear in this order: the grade expression, the
        # candidate selects, the strong hash select, and the limit.
        cursor.execute(
            query, signature + signature + [func_hash, num_similars])

        similars = []
        for row in cursor.fetchall():
            row_hash, row_name, row_comment = row[:3]
            entry = DBSimilar(
                func_hash=row_hash,
                func_name=row_name,
                func_comment=row_comment,
                # Everything between the first three columns and the
                # trailing grade column is the signature:
                func_sig=list(row[3:-1]),
                func_grade=row[-1])

            # An exact match (by strong hash) goes to the front of the
            # list. Any other result is appended to the end.
            if row_hash == func_hash:
                similars.insert(0, entry)
            else:
                similars.append(entry)

        return similars
    except sqlite3.Error:
        # Give up previous transaction, and start a new one.
        cursor.execute('ROLLBACK')
        cursor.execute('BEGIN TRANSACTION')