コード例 #1
0
    def test_update(self):
        """
        Check that a new Simhash has no tokens and that `update` stores the
        words of the given text, without the surrounding whitespace or the
        embedded newline, in `tokens`.
        """
        hasher = Simhash()
        assert hasher.tokens == []

        text = 'This is for testing purpose \n It should work fine'
        hasher.update(text)

        assert hasher.tokens == [
            'This', 'is', 'for', 'testing', 'purpose',
            'It', 'should', 'work', 'fine',
        ]
コード例 #2
0
    def test_hex_digest3(self):
        """
        Check the hex digest computed from the content of the
        `fingerprint/fingerprint-test3.py` test file.
        """
        hasher = Simhash()
        location = self.get_test_loc('fingerprint/fingerprint-test3.py')

        with open(location, 'r') as source:
            content = source.read()

        hasher.update(content)
        assert hasher.hex_digest() == '7f43e1b18f9c0e705fcf28007bc41754'
コード例 #3
0
    def test_hex_digest2(self):
        """
        Check the hex digest computed from the content of the
        `fingerprint/fingerprint-test2.c` test file.
        """
        hasher = Simhash()
        location = self.get_test_loc('fingerprint/fingerprint-test2.c')

        with open(location, 'r') as source:
            content = source.read()

        hasher.update(content)
        assert hasher.hex_digest() == 'baa2d1d169be06a306c1873afe6db4da'
コード例 #4
0
    def test_hex_digest1(self):
        """
        Check the hex digest computed from the content of the
        `fingerprint/fingerprint-test1.java` test file.
        """
        hasher = Simhash()
        location = self.get_test_loc('fingerprint/fingerprint-test1.java')

        with open(location, 'r') as source:
            content = source.read()

        hasher.update(content)
        assert hasher.hex_digest() == '4e42d8c0ed6693654866425451210417'
コード例 #5
0
def get_fingerprint(location, **kwargs):
    """
    Return a mapping with a single `fingerprint` key holding the Simhash hex
    digest of the text read from the file at `location`.

    Extra keyword arguments are accepted but not used.
    """
    with open(location, 'r') as source:
        content = source.read()

    hasher = Simhash()
    hasher.update(content)

    return {'fingerprint': hasher.hex_digest()}
コード例 #6
0
    def test_generate_fingerprint(self):
        """
        Check the fingerprint bits produced after a first `update`, then again
        after a second `update` on the same Simhash instance.
        """
        hasher = Simhash()

        hasher.update('This should work')
        first_bits = bitarray(
            '11100010001110100111100010101000101110111111110000010011000110000110001110000000100000111011101110111100110001011011110001011100'
        )
        assert hasher.generate_fingerprint() == first_bits

        # The second update is applied to the same instance as the first.
        hasher.update('this will get added too!')
        second_bits = bitarray(
            '00000010000000000011110010100000101000001111100000000001010110000110101110111000100000110101000000010100100000000010110011010010'
        )
        assert hasher.generate_fingerprint() == second_bits
コード例 #7
0
    def test_similarity_matching3(self):
        """
        Check the hamming distance between the fingerprints of the
        `similarity_matching5.py` and `similarity_matching6.py` test files.
        """
        left = Simhash()
        right = Simhash()

        left_loc = self.get_test_loc('fingerprint/similarity_matching5.py')
        right_loc = self.get_test_loc('fingerprint/similarity_matching6.py')

        with open(left_loc, 'r') as left_file:
            left_text = left_file.read()

        with open(right_loc, 'r') as right_file:
            right_text = right_file.read()

        left.update(left_text)
        right.update(right_text)

        left_bits = left.generate_fingerprint()
        right_bits = right.generate_fingerprint()

        assert left.hamming_distance(left_bits, right_bits) == 13
コード例 #8
0
    def test_get_weighted_hash2(self):
        """
        Check that `get_weighted_hash` returns the expected list of 1 / -1
        values for the content of the `get_weighted_hash-test2.txt` test file.
        """
        hasher = Simhash()
        location = self.get_test_loc(
            'fingerprint/get_weighted_hash-test2.txt')

        with open(location, 'r') as source:
            content = source.read()

        hasher.update(content)
        weighted = hasher.get_weighted_hash()

        assert weighted == [
            -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1,
            1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, -1, -1,
            -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1,
            1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1,
            -1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, 1, -1, -1, -1,
            1, -1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1,
            -1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1
        ]