Esempio n. 1
0
 def test_simhash128_3(self):
     """Plain and u-prefixed non-ASCII literals must hash differently (128-bit).

     NOTE(review): this presumably relies on Python 2 semantics, where the
     plain literal is a byte string and the u-prefixed one is unicode --
     confirm against the interpreter the suite targets.
     """
     hasher = SimHashSignature(128)
     plain_sig = hasher.get_signature("♡♥❤❥")
     unicode_sig = hasher.get_signature(u"♡♥❤❥")
     self.assertNotEqual(plain_sig, unicode_sig)
Esempio n. 2
0
    def test_simhash_similarity(self):
        """Check pinned Hamming distances between 64-bit signatures.

        Each case hashes two strings with the same ``SimHashSignature(64)``
        instance and compares the Hamming distance of the two signatures
        with a fixed expected value.
        """
        hasher = SimHashSignature(64)

        # (first text, second text, expected Hamming distance)
        cases = (
            ("abracadabra", "", 25),
            ("abracadabra", "abracadabra", 0),
            ("abracadabra", "abracdabra", 6),
        )
        for first_text, second_text, expected in cases:
            first_sig = hasher.get_signature(first_text)
            second_sig = hasher.get_signature(second_text)
            self.assertEqual(hamming(first_sig, second_sig), expected)
Esempio n. 3
0
    def test_simhash_similarity(self):
        """Check pinned Hamming distances between 64-bit signatures.

        Identical inputs must be at distance 0; the other two pairs are
        compared against fixed expected distances.
        """
        hasher = SimHashSignature(64)

        def distance(first_text, second_text):
            # Hash the first text before the second, then compare signatures.
            first_sig = hasher.get_signature(first_text)
            second_sig = hasher.get_signature(second_text)
            return hamming(first_sig, second_sig)

        # (first text, second text, expected Hamming distance)
        for first_text, second_text, expected in (
            ("abracadabra", "abracadabra", 0),
            ("abracadabra", "arbcd", 13),
            ("abracadabra", "", 32),
        ):
            self.assertEqual(distance(first_text, second_text), expected)
Esempio n. 4
0
    def test_simhash64(self):
        """Compare 64-bit signatures of plain and u-prefixed string literals.

        The empty string and an ASCII string must produce the same signature
        in both literal forms, while a non-ASCII string must produce
        different signatures.

        NOTE(review): the non-ASCII expectation presumably relies on Python 2
        byte-string vs. unicode semantics -- confirm.
        """
        hasher = SimHashSignature(64)

        # Empty and ASCII inputs: both literal forms must agree.
        for plain, prefixed in (("", u""), ("abracadabra", u"abracadabra")):
            self.assertEqual(
                hasher.get_signature(plain),
                hasher.get_signature(prefixed),
            )

        # Non-ASCII input: the two literal forms are expected to differ.
        plain_hearts = "♡♥❤❥"
        prefixed_hearts = u"♡♥❤❥"
        self.assertNotEqual(
            hasher.get_signature(plain_hearts),
            hasher.get_signature(prefixed_hearts),
        )
Esempio n. 5
0
    def test_simhash128(self):
        """Compare 128-bit signatures of plain and u-prefixed string literals.

        The empty string must hash to 0 in both literal forms, an ASCII
        string must hash identically in both forms, and a non-ASCII string
        must produce different signatures.

        NOTE(review): the non-ASCII expectation presumably relies on Python 2
        byte-string vs. unicode semantics -- confirm.
        """
        hasher = SimHashSignature(128)

        # Empty input: zero signature, identical for both literal forms.
        empty_plain = hasher.get_signature("")
        empty_prefixed = hasher.get_signature(u"")
        self.assertEqual(empty_plain, 0)
        self.assertEqual(empty_plain, empty_prefixed)

        # ASCII input: both literal forms must agree.
        ascii_plain = hasher.get_signature("abracadabra")
        ascii_prefixed = hasher.get_signature(u"abracadabra")
        self.assertEqual(ascii_plain, ascii_prefixed)

        # Non-ASCII input: the two literal forms are expected to differ.
        hearts_plain = hasher.get_signature("♡♥❤❥")
        hearts_prefixed = hasher.get_signature(u"♡♥❤❥")
        self.assertNotEqual(hearts_plain, hearts_prefixed)
Esempio n. 6
0
    def test_simhash_feature_weights(self):
        """Check how an extra (feature, weight) argument affects distances.

        Each case calls ``get_signature`` twice on one ``SimHashSignature(64)``
        instance, optionally passing a ``(name, weight)`` tuple as a second
        positional argument, and compares the Hamming distance of the two
        signatures with a fixed expected value.
        """
        hasher = SimHashSignature(64)

        # (args of first call, args of second call, expected Hamming distance)
        cases = (
            (("abracadabra",), ("abracdabra",), 6),
            (("abracadabra", ("cats", 0)), ("abracdabra", ("dogs", 0)), 6),
            (("abracadabra", ("cats", 0)), ("abracadabra", ("dogs", 0)), 0),
            (("abracadabra", ("ca", 4)), ("abracadabra", ("do", 4)), 0),
            (("abracadabra", ("ca", 5)), ("abracadabra", ("do", 5)), 7),
            (("abracadabra", ("cats", 200)), ("abracadabra", ("dogs", 200)), 17),
            (("abracadabra", ("cats", 10)), ("abracadabra", ("cats", 10)), 0),
        )
        for first_args, second_args, expected in cases:
            first_sig = hasher.get_signature(*first_args)
            second_sig = hasher.get_signature(*second_args)
            self.assertEqual(hamming(first_sig, second_sig), expected)
Esempio n. 7
0
    def test_simhash_feature_weights(self):
        """Check how an extra (feature, weight) argument affects distances.

        Every case hashes two inputs with the same ``SimHashSignature(64)``
        instance, with or without a ``(name, weight)`` tuple as a second
        positional argument, and pins the Hamming distance between the
        resulting signatures.
        """
        hasher = SimHashSignature(64)

        def distance(first_args, second_args):
            # Hash the first argument set before the second, then compare.
            first_sig = hasher.get_signature(*first_args)
            second_sig = hasher.get_signature(*second_args)
            return hamming(first_sig, second_sig)

        # (args of first call, args of second call, expected Hamming distance)
        for first_args, second_args, expected in (
            (("abracadabra",), ("abracdabra",), 3),
            (("abracadabra", ("cats", 0)), ("abracdabra", ("dogs", 0)), 3),
            (("abracadabra", ("cats", 0)), ("abracadabra", ("dogs", 0)), 0),
            (("abracadabra", ("ca", 4)), ("abracadabra", ("do", 4)), 9),
            (("abracadabra", ("ca", 5)), ("abracadabra", ("do", 5)), 9),
            (("abracadabra", ("cats", 200)), ("abracadabra", ("dogs", 200)), 21),
            (("abracadabra", ("cats", 10)), ("abracadabra", ("cats", 10)), 0),
        ):
            self.assertEqual(distance(first_args, second_args), expected)
Esempio n. 8
0
 def test_simhash_similarity_3(self):
     """64-bit signatures of "abracadabra" and "" must be 37 bits apart."""
     hasher = SimHashSignature(64)
     word_sig = hasher.get_signature("abracadabra")
     empty_sig = hasher.get_signature("")
     self.assertEqual(37, hamming(word_sig, empty_sig))
Esempio n. 9
0
 def test_simhash128_2(self):
     """Plain and u-prefixed ASCII literals must share one 128-bit signature."""
     hasher = SimHashSignature(128)
     plain_sig = hasher.get_signature("abracadabra")
     prefixed_sig = hasher.get_signature(u"abracadabra")
     self.assertEqual(plain_sig, prefixed_sig)
Esempio n. 10
0
 def test_simhash128_1(self):
     """The empty string must hash to 0 (128-bit) in both literal forms."""
     hasher = SimHashSignature(128)
     plain_empty = hasher.get_signature("")
     prefixed_empty = hasher.get_signature(u"")
     # Check the zero value first, then agreement between the two forms.
     self.assertEqual(0, plain_empty)
     self.assertEqual(plain_empty, prefixed_empty)
Esempio n. 11
0
 def test_simhash64_1(self):
     """Plain and u-prefixed empty literals must share one 64-bit signature."""
     hasher = SimHashSignature(64)
     plain_empty = hasher.get_signature("")
     prefixed_empty = hasher.get_signature(u"")
     self.assertEqual(plain_empty, prefixed_empty)
Esempio n. 12
0
 def test_simhash_feature_weights_7(self):
     """Identical text and (feature, weight) arguments must give distance 0."""
     hasher = SimHashSignature(64)
     first_sig = hasher.get_signature("abracadabra", ("cats", 10))
     second_sig = hasher.get_signature("abracadabra", ("cats", 10))
     self.assertEqual(0, hamming(first_sig, second_sig))
Esempio n. 13
0
 def test_simhash_feature_weights_1(self):
     """64-bit signatures of "abracadabra" and "abracdabra" must be 3 bits apart."""
     hasher = SimHashSignature(64)
     full_sig = hasher.get_signature("abracadabra")
     shortened_sig = hasher.get_signature("abracdabra")
     self.assertEqual(3, hamming(full_sig, shortened_sig))