def test_update(self):
    """
    Check that a new Simhash has no tokens and that `update` fills
    `tokens` with the words of the text it is given.
    """
    hasher = Simhash()
    # Nothing has been fed in yet.
    assert hasher.tokens == []

    hasher.update('This is for testing purpose \n It should work fine')

    # The embedded newline and surrounding spaces do not show up as tokens.
    assert hasher.tokens == [
        'This', 'is', 'for', 'testing', 'purpose',
        'It', 'should', 'work', 'fine',
    ]
def test_hex_digest3(self):
    """
    Check the hex digest computed from the content of the
    `fingerprint-test3.py` test file.
    """
    location = self.get_test_loc('fingerprint/fingerprint-test3.py')
    with open(location, 'r') as test_input:
        content = test_input.read()

    hasher = Simhash()
    hasher.update(content)

    assert hasher.hex_digest() == '7f43e1b18f9c0e705fcf28007bc41754'
def test_hex_digest2(self):
    """
    Check the hex digest computed from the content of the
    `fingerprint-test2.c` test file.
    """
    location = self.get_test_loc('fingerprint/fingerprint-test2.c')
    with open(location, 'r') as test_input:
        content = test_input.read()

    hasher = Simhash()
    hasher.update(content)

    assert hasher.hex_digest() == 'baa2d1d169be06a306c1873afe6db4da'
def test_hex_digest1(self):
    """
    Check the hex digest computed from the content of the
    `fingerprint-test1.java` test file.
    """
    location = self.get_test_loc('fingerprint/fingerprint-test1.java')
    with open(location, 'r') as test_input:
        content = test_input.read()

    hasher = Simhash()
    hasher.update(content)

    assert hasher.hex_digest() == '4e42d8c0ed6693654866425451210417'
def get_fingerprint(location, **kwargs):
    """
    Return a mapping with a single `fingerprint` key whose value is the
    Simhash hex digest of the text read from the file at `location`.

    The file is opened in text mode and read in full. Extra keyword
    arguments are accepted but not used.
    """
    with open(location, 'r') as source:
        content = source.read()

    hasher = Simhash()
    hasher.update(content)
    return {'fingerprint': hasher.hex_digest()}
def test_generate_fingerprint(self):
    """
    Check the fingerprint bits produced after a first `update`, then
    again after a second `update` on the same Simhash instance.
    """
    hasher = Simhash()

    hasher.update('This should work')
    bits_after_first_update = bitarray(
        '11100010001110100111100010101000101110111111110000010011000110000110001110000000100000111011101110111100110001011011110001011100'
    )
    assert hasher.generate_fingerprint() == bits_after_first_update

    # Feeding more text into the same instance yields a different fingerprint.
    hasher.update('this will get added too!')
    bits_after_second_update = bitarray(
        '00000010000000000011110010100000101000001111100000000001010110000110101110111000100000110101000000010100100000000010110011010010'
    )
    assert hasher.generate_fingerprint() == bits_after_second_update
def test_similarity_matching3(self):
    """
    Check the Hamming distance between the fingerprints of the
    `similarity_matching5.py` and `similarity_matching6.py` test files.
    """
    fixture_paths = (
        'fingerprint/similarity_matching5.py',
        'fingerprint/similarity_matching6.py',
    )

    # Build one Simhash per test file, in the order listed above.
    hashers = []
    for fixture_path in fixture_paths:
        hasher = Simhash()
        with open(self.get_test_loc(fixture_path), 'r') as test_input:
            hasher.update(test_input.read())
        hashers.append(hasher)

    first, second = hashers
    distance = first.hamming_distance(
        first.generate_fingerprint(),
        second.generate_fingerprint(),
    )
    assert distance == 13
def test_get_weighted_hash2(self):
    """
    Check the weighted hash computed from the content of the
    `get_weighted_hash-test2.txt` test file against a known list of
    +1/-1 values.
    """
    location = self.get_test_loc('fingerprint/get_weighted_hash-test2.txt')
    with open(location, 'r') as test_input:
        content = test_input.read()

    hasher = Simhash()
    hasher.update(content)

    # Expected values, laid out 16 per row for readability only.
    assert hasher.get_weighted_hash() == [
        -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, -1, 1,
        -1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1,
        1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, 1,
        1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1,
        -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, 1, 1,
        1, -1, -1, 1, 1, -1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1,
        -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, 1, 1, 1,
        -1, -1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1,
    ]