def test_hamming(self):
    """Check hamming() on integers against the iterable-based version.

    Draws 100 random pairs of non-negative integers and asserts that
    ``hamming(num1, num2)`` equals ``hamming_from_iter`` applied to the
    ``bitlist`` expansions of the same two numbers.
    """
    # sys.maxint exists only on Python 2; sys.maxsize is available on
    # Python 2.6+ and on Python 3, so the test can run on both.
    upper_bound = sys.maxsize
    for _ in range(100):
        num1 = random.randint(0, upper_bound)
        num2 = random.randint(0, upper_bound)
        self.assertEqual(
            hamming(num1, num2),
            hamming_from_iter(bitlist(num1), bitlist(num2))
        )
def test_simhash_similarity(self):
    """SimHash distances from "abracadabra" should match known values.

    The first case signs the same string twice and expects distance 0;
    the remaining cases compare against other strings.  Cases run in
    order and the first mismatch fails the test.
    """
    simhash = SimHashSignature(64)
    # (string compared against "abracadabra", expected Hamming distance)
    # NOTE(review): test_simhash_similarity_3 expects 37 for the
    # ("abracadabra", "") pair, while 32 is expected here -- confirm which
    # value is current.
    expectations = (
        ("abracadabra", 0),
        ("arbcd", 13),
        ("", 32),
    )
    for other_text, expected in expectations:
        # The reference signature is recomputed for every case.
        reference = simhash.get_signature("abracadabra")
        candidate = simhash.get_signature(other_text)
        self.assertEqual(hamming(reference, candidate), expected)
def test_simhash_feature_weights(self):
    """Weighted extra features should affect SimHash distances as expected.

    Each case builds two signatures with ``SimHashSignature(64)`` and
    compares their Hamming distance with a fixed expected value.  Cases
    run in order and the first mismatch fails the test.
    """
    simhash = SimHashSignature(64)
    # Each case: (get_signature args for the first signature,
    #             get_signature args for the second signature,
    #             expected Hamming distance)
    cases = (
        # No extra features.
        (("abracadabra",), ("abracdabra",), 3),
        # Weight 0: same expected distance as the feature-free pair.
        (("abracadabra", ("cats", 0)), ("abracdabra", ("dogs", 0)), 3),
        # Weight 0 on identical text: expected distance is zero.
        (("abracadabra", ("cats", 0)), ("abracadabra", ("dogs", 0)), 0),
        # Non-zero weights on differing features, identical text.
        (("abracadabra", ("ca", 4)), ("abracadabra", ("do", 4)), 9),
        (("abracadabra", ("ca", 5)), ("abracadabra", ("do", 5)), 9),
        (("abracadabra", ("cats", 200)), ("abracadabra", ("dogs", 200)), 21),
        # Identical text and identical weighted feature.
        (("abracadabra", ("cats", 10)), ("abracadabra", ("cats", 10)), 0),
    )
    for first_args, second_args, expected in cases:
        first = simhash.get_signature(*first_args)
        second = simhash.get_signature(*second_args)
        self.assertEqual(hamming(first, second), expected)
def test_simhash_similarity_3(self):
    """Check the SimHash distance between "abracadabra" and ""."""
    signer = SimHashSignature(64)
    text_signature = signer.get_signature("abracadabra")
    empty_signature = signer.get_signature("")
    # NOTE(review): test_simhash_similarity asserts 32 for this same pair
    # of inputs -- confirm which expected value is correct.
    self.assertEqual(37, hamming(text_signature, empty_signature))
def test_simhash_feature_weights_7(self):
    """Identical text and identical weighted feature give distance 0."""
    signer = SimHashSignature(64)
    feature = ("cats", 10)
    # Sign the same (text, feature) input twice.
    first = signer.get_signature("abracadabra", feature)
    second = signer.get_signature("abracadabra", feature)
    self.assertEqual(0, hamming(first, second))
def test_simhash_feature_weights_1(self):
    """Check the SimHash distance of "abracadabra" vs "abracdabra"."""
    signer = SimHashSignature(64)
    # No extra weighted features are passed to either call.
    original = signer.get_signature("abracadabra")
    variant = signer.get_signature("abracdabra")
    self.assertEqual(3, hamming(original, variant))
def test_minhash_sketch_similarity_3(self):
    """Check the MinHash sketch distance between "abracadabra" and ""."""
    sketcher = MinHashSketchSignature(64)
    text_signature = sketcher.get_signature("abracadabra")
    empty_signature = sketcher.get_signature("")
    self.assertEqual(32, hamming(text_signature, empty_signature))