def test_simhash128_3(self):
    """A plain and a ``u``-prefixed non-ASCII literal should sign differently.

    NOTE(review): presumably relies on Python 2, where the plain literal is
    a byte string and the prefixed one is unicode -- confirm.
    """
    signer = SimHashSignature(128)
    plain_text = "♡♥❤❥"
    unicode_text = u"♡♥❤❥"
    plain_sig = signer.get_signature(plain_text)
    unicode_sig = signer.get_signature(unicode_text)
    self.assertNotEqual(plain_sig, unicode_sig)
def test_simhash_feature_weights(self):
    """Extra weighted features should influence the distance between signatures.

    Each case signs two texts, optionally with one extra tuple argument
    (presumably a ``(feature, weight)`` pair -- confirm against
    ``SimHashSignature.get_signature``), and pins the expected ``hamming``
    distance between the two 64-bit signatures.
    """
    signer = SimHashSignature(64)

    # Baseline: no extra features, texts differ by one character.
    left = signer.get_signature("abracadabra")
    right = signer.get_signature("abracdabra")
    self.assertEqual(hamming(left, right), 6)

    # (left text, left feature, right text, right feature, expected distance)
    cases = [
        ("abracadabra", ("cats", 0), "abracdabra", ("dogs", 0), 6),
        ("abracadabra", ("cats", 0), "abracadabra", ("dogs", 0), 0),
        ("abracadabra", ("ca", 4), "abracadabra", ("do", 4), 0),
        ("abracadabra", ("ca", 5), "abracadabra", ("do", 5), 7),
        ("abracadabra", ("cats", 200), "abracadabra", ("dogs", 200), 17),
        ("abracadabra", ("cats", 10), "abracadabra", ("cats", 10), 0),
    ]
    for left_text, left_feature, right_text, right_feature, expected in cases:
        left = signer.get_signature(left_text, left_feature)
        right = signer.get_signature(right_text, right_feature)
        distance = hamming(left, right)
        self.assertEqual(distance, expected)
def test_simhash_similarity(self):
    """Signatures should be consistent.

    Identical inputs must be at ``hamming`` distance zero; the other pairs
    pin the distances expected for 64-bit signatures.
    """
    signer = SimHashSignature(64)

    # (first text, second text, expected distance)
    cases = [
        ("abracadabra", "abracadabra", 0),
        ("abracadabra", "arbcd", 13),
        ("abracadabra", "", 32),
    ]
    for first_text, second_text, expected in cases:
        first = signer.get_signature(first_text)
        second = signer.get_signature(second_text)
        distance = hamming(first, second)
        self.assertEqual(distance, expected)
def test_simhash64(self):
    """Compare 64-bit signatures of plain and ``u``-prefixed literals.

    Empty and ASCII inputs must produce equal signatures for both literal
    forms, while the non-ASCII input is expected to produce different ones.

    NOTE(review): the non-ASCII expectation presumably relies on Python 2
    byte-string vs unicode semantics -- confirm.
    """
    signer = SimHashSignature(64)

    for plain_text, unicode_text in (("", u""), ("abracadabra", u"abracadabra")):
        plain_sig = signer.get_signature(plain_text)
        unicode_sig = signer.get_signature(unicode_text)
        self.assertEqual(plain_sig, unicode_sig)

    plain_hearts = "♡♥❤❥"
    unicode_hearts = u"♡♥❤❥"
    plain_sig = signer.get_signature(plain_hearts)
    unicode_sig = signer.get_signature(unicode_hearts)
    self.assertNotEqual(plain_sig, unicode_sig)
def test_simhash_similarity(self):
    """Signatures should be consistent.

    Identical inputs must be at ``hamming`` distance zero; the other pairs
    pin the distances expected for 64-bit signatures.
    """
    signer = SimHashSignature(64)

    # (first text, second text, expected distance)
    cases = [
        ("abracadabra", "", 25),
        ("abracadabra", "abracadabra", 0),
        ("abracadabra", "abracdabra", 6),
    ]
    for first_text, second_text, expected in cases:
        first = signer.get_signature(first_text)
        second = signer.get_signature(second_text)
        distance = hamming(first, second)
        self.assertEqual(distance, expected)
def test_simhash128(self):
    """Compare 128-bit signatures of plain and ``u``-prefixed literals.

    The empty string must sign to zero in both forms, ASCII input must
    sign identically in both forms, and the non-ASCII input is expected to
    produce different signatures.

    NOTE(review): the non-ASCII expectation presumably relies on Python 2
    byte-string vs unicode semantics -- confirm.
    """
    signer = SimHashSignature(128)

    empty_plain = signer.get_signature("")
    empty_unicode = signer.get_signature(u"")
    self.assertEqual(empty_plain, 0)
    self.assertEqual(empty_plain, empty_unicode)

    ascii_plain = signer.get_signature("abracadabra")
    ascii_unicode = signer.get_signature(u"abracadabra")
    self.assertEqual(ascii_plain, ascii_unicode)

    plain_hearts = "♡♥❤❥"
    unicode_hearts = u"♡♥❤❥"
    hearts_plain_sig = signer.get_signature(plain_hearts)
    hearts_unicode_sig = signer.get_signature(unicode_hearts)
    self.assertNotEqual(hearts_plain_sig, hearts_unicode_sig)
def __init__(self, cfg, content_filter=None, trace_every=0, get_body=None,
             get_label=None, get_prefix=None, min_support=None, seed=0,
             normalizer=None, tokenizer=None):
    """Read configuration and build the signing and sketching components.

    :param cfg: mapping with 'sig_width', 'lsh_options', 'kmin',
        'shingler' and 'sketch' entries, an optional 'preprocessor'
        entry, and a 'min_support' entry that is read only when the
        ``min_support`` argument is None. The mapping is stored on the
        instance and is not modified here.
    :param content_filter: stored as ``self.content_filter``
    :param trace_every: stored as ``self.trace_every``
    :param get_body: stored as ``self._get_body``
    :param get_label: stored as ``self._get_label``
    :param get_prefix: stored as ``self._get_prefix``
    :param min_support: overrides ``cfg['min_support']`` when not None
    :param seed: passed to the sketch signer when one is built
    :param normalizer: used instead of the default normalizer when given
    :param tokenizer: used instead of the default tokenizer when given
    :raises RuntimeError: if sketching is enabled and the configured
        algorithm is unknown or unsupported
    """
    self.cfg = cfg
    self._get_body = get_body
    self._get_label = get_label
    self._get_prefix = get_prefix
    self.trace_every = trace_every

    # Set options
    self.content_filter = content_filter
    if min_support is None:
        self.min_support = cfg['min_support']
    else:
        self.min_support = min_support

    # normalizer and tokenizer (defaults are only built when needed)
    if normalizer is None:
        normalizer_options = cfg.get('preprocessor', {}).get('normalizer', {})
        self.normalizer = get_default_normalizer(**normalizer_options)
    else:
        self.normalizer = normalizer
    if tokenizer is None:
        self.tokenizer = get_default_tokenizer()
    else:
        self.tokenizer = tokenizer

    # Configure minhash signer
    sig_width = cfg['sig_width']
    lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
    self.signer = MinHashSignature(sig_width,
                                   lsh_hasher=lsh_hasher,
                                   kmin=cfg['kmin'])

    # Configure shingler
    cfg_key_shingle = cfg['shingler']
    self.shingler = get_default_shingler(**cfg_key_shingle)

    # Configure sketch comparison algorithm
    cfg_sketch = cfg['sketch']
    self.sketch_enabled = cfg_sketch['enabled']
    self.sketch_dist_fn = None
    self.max_dist = None
    if self.sketch_enabled:
        algorithm_name = cfg_sketch['algorithm']
        try:
            sketch_algorithm = getattr(SketchModel, algorithm_name)
        except AttributeError:
            raise RuntimeError("Unknown sketch model specified: '%s'"
                               % algorithm_name)
        self.sketch_bits = cfg_sketch['size']
        cfg_sketch_shingler = cfg_sketch['shingler']
        if not cfg_sketch_shingler['enabled']:
            # if sketch shingler is disabled, we also disable signer
            # as we will use default signer
            self.sketch_shingler = None
            self.sketch_signer = None
        else:
            if sketch_algorithm == SketchModel.simhash:
                sketch_signer_cls = SimHashSignature
            elif sketch_algorithm == SketchModel.minhash:
                sketch_signer_cls = MinHashSketchSignature
            else:
                # The name resolved to an attribute of SketchModel that
                # is not one of the models handled here.
                raise RuntimeError(
                    "Unsupported sketch model specified: '%s'"
                    % algorithm_name)
            # 'enabled' is a switch, not a Shingler option. Filter it
            # into a copy instead of deleting it from the caller's
            # config, so the same cfg can be used to build another
            # instance.
            shingler_options = dict(
                (key, value)
                for key, value in cfg_sketch_shingler.items()
                if key != 'enabled')
            self.sketch_shingler = Shingler(**shingler_options)
            self.sketch_signer = sketch_signer_cls(self.sketch_bits,
                                                   seed=seed)
            # NOTE(review): the reset below is applied to every sketch
            # shingler that gets built (simhash and minhash alike) and
            # is skipped when the shingler is disabled, where it would
            # otherwise dereference None -- confirm this matches the
            # intended placement.
            self.sketch_shingler._tokenizer = None
            self.sketch_shingler._normalizer = None
        self.max_dist = \
            int(floor(self.sketch_bits *
                      (1.0 - float(cfg_sketch['resemblance']))))
        self.sketch_dist_fn = hamming

    # Resolved whether or not sketching is enabled, because the Cluster
    # construction below always reads self.sketch_operator.
    self.sketch_operator = OPERATOR_MAP[cfg_sketch.get('operator', 'and')]
    self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                   max_dist=self.max_dist,
                                   min_support=self.min_support,
                                   sketch_operator=self.sketch_operator)
def test_simhash_feature_weights(self):
    """Extra weighted features should influence the distance between signatures.

    Each case signs two texts, optionally with one extra tuple argument
    (presumably a ``(feature, weight)`` pair -- confirm against
    ``SimHashSignature.get_signature``), and pins the expected ``hamming``
    distance between the two 64-bit signatures.
    """
    signer = SimHashSignature(64)

    # Baseline: no extra features, texts differ by one character.
    left = signer.get_signature("abracadabra")
    right = signer.get_signature("abracdabra")
    self.assertEqual(hamming(left, right), 3)

    # (left text, left feature, right text, right feature, expected distance)
    cases = [
        ("abracadabra", ("cats", 0), "abracdabra", ("dogs", 0), 3),
        ("abracadabra", ("cats", 0), "abracadabra", ("dogs", 0), 0),
        ("abracadabra", ("ca", 4), "abracadabra", ("do", 4), 9),
        ("abracadabra", ("ca", 5), "abracadabra", ("do", 5), 9),
        ("abracadabra", ("cats", 200), "abracadabra", ("dogs", 200), 21),
        ("abracadabra", ("cats", 10), "abracadabra", ("cats", 10), 0),
    ]
    for left_text, left_feature, right_text, right_feature, expected in cases:
        left = signer.get_signature(left_text, left_feature)
        right = signer.get_signature(right_text, right_feature)
        distance = hamming(left, right)
        self.assertEqual(distance, expected)
def test_simhash_similarity_3(self):
    """Pin the ``hamming`` distance between a word and the empty string."""
    signer = SimHashSignature(64)
    word_sig = signer.get_signature("abracadabra")
    empty_sig = signer.get_signature("")
    distance = hamming(word_sig, empty_sig)
    self.assertEqual(37, distance)
def test_simhash128_2(self):
    """ASCII text should sign identically as a plain or ``u``-prefixed literal."""
    signer = SimHashSignature(128)
    plain_sig = signer.get_signature("abracadabra")
    unicode_sig = signer.get_signature(u"abracadabra")
    self.assertEqual(plain_sig, unicode_sig)
def test_simhash128_1(self):
    """The empty string should sign to zero in both literal forms."""
    signer = SimHashSignature(128)
    plain_sig = signer.get_signature("")
    unicode_sig = signer.get_signature(u"")
    self.assertEqual(0, plain_sig)
    self.assertEqual(plain_sig, unicode_sig)
def test_simhash64_1(self):
    """The empty string should sign identically in both literal forms."""
    signer = SimHashSignature(64)
    plain_sig = signer.get_signature("")
    unicode_sig = signer.get_signature(u"")
    self.assertEqual(plain_sig, unicode_sig)
def test_simhash_feature_weights_7(self):
    """Identical text and identical extra feature should give distance zero."""
    signer = SimHashSignature(64)
    first = signer.get_signature("abracadabra", ("cats", 10))
    second = signer.get_signature("abracadabra", ("cats", 10))
    distance = hamming(first, second)
    self.assertEqual(0, distance)
def test_simhash_feature_weights_1(self):
    """Pin the ``hamming`` distance between two texts differing by one character."""
    signer = SimHashSignature(64)
    first = signer.get_signature("abracadabra")
    second = signer.get_signature("abracdabra")
    distance = hamming(first, second)
    self.assertEqual(3, distance)