Example #1
0
 def test_simhash128_3(self):
     """Assert that the 128-bit signatures of a non-ASCII plain literal and
     of its ``u``-prefixed twin are not equal.

     NOTE(review): on Python 3 both literals are the same ``str`` value, so
     this presumably targets Python 2 byte strings -- confirm.
     """
     signer = SimHashSignature(128)
     plain_sig, prefixed_sig = [
         signer.get_signature(text) for text in ("♡♥❤❥", u"♡♥❤❥")
     ]
     self.assertNotEqual(plain_sig, prefixed_sig)
Example #2
0
    def test_simhash_feature_weights(self):
        """Check Hamming distances between 64-bit signatures computed with
        and without an extra ``(feature, weight)`` argument.

        Every case signs two inputs in order and compares the distance
        between the two signatures against a pinned value.
        """
        signer = SimHashSignature(64)

        # (arguments of the first call, arguments of the second, distance)
        cases = (
            (("abracadabra",), ("abracdabra",), 6),
            (("abracadabra", ("cats", 0)), ("abracdabra", ("dogs", 0)), 6),
            (("abracadabra", ("cats", 0)), ("abracadabra", ("dogs", 0)), 0),
            (("abracadabra", ("ca", 4)), ("abracadabra", ("do", 4)), 0),
            (("abracadabra", ("ca", 5)), ("abracadabra", ("do", 5)), 7),
            (("abracadabra", ("cats", 200)),
             ("abracadabra", ("dogs", 200)), 17),
            (("abracadabra", ("cats", 10)), ("abracadabra", ("cats", 10)), 0),
        )
        for first_args, second_args, expected in cases:
            first_sig = signer.get_signature(*first_args)
            second_sig = signer.get_signature(*second_args)
            self.assertEqual(hamming(first_sig, second_sig), expected)
Example #3
0
    def test_simhash_similarity(self):
        """Compare pinned Hamming distances between 64-bit signatures of a
        reference string and an identical, a different and an empty string.
        """
        signer = SimHashSignature(64)

        # (first text, second text, pinned distance)
        cases = (
            ("abracadabra", "abracadabra", 0),
            ("abracadabra", "arbcd", 13),
            ("abracadabra", "", 32),
        )
        for first_text, second_text, expected in cases:
            first_sig = signer.get_signature(first_text)
            second_sig = signer.get_signature(second_text)
            self.assertEqual(hamming(first_sig, second_sig), expected)
Example #4
0
    def test_simhash64(self):
        """Compare 64-bit signatures of plain and ``u``-prefixed literals.

        The empty and the ASCII pair must sign identically; the non-ASCII
        pair is asserted to sign differently.

        NOTE(review): on Python 3 both literals of each pair are the same
        ``str`` value, so the last assertion presumably targets Python 2
        byte strings -- confirm.
        """
        signer = SimHashSignature(64)

        for plain, prefixed in (("", u""), ("abracadabra", u"abracadabra")):
            plain_sig = signer.get_signature(plain)
            prefixed_sig = signer.get_signature(prefixed)
            self.assertEqual(plain_sig, prefixed_sig)

        hearts_plain = "♡♥❤❥"
        hearts_prefixed = u"♡♥❤❥"
        self.assertNotEqual(signer.get_signature(hearts_plain),
                            signer.get_signature(hearts_prefixed))
Example #5
0
    def test_simhash_similarity(self):
        """Compare pinned Hamming distances between 64-bit signatures of a
        reference string and an empty, an identical and a near-identical
        string.
        """
        signer = SimHashSignature(64)

        # (first text, second text, pinned distance)
        cases = (
            ("abracadabra", "", 25),
            ("abracadabra", "abracadabra", 0),
            ("abracadabra", "abracdabra", 6),
        )
        for first_text, second_text, expected in cases:
            first_sig = signer.get_signature(first_text)
            second_sig = signer.get_signature(second_text)
            self.assertEqual(hamming(first_sig, second_sig), expected)
Example #6
0
    def test_simhash128(self):
        """Compare 128-bit signatures of plain and ``u``-prefixed literals.

        The empty string must sign to zero and match its ``u``-prefixed
        twin, the ASCII pair must sign identically, and the non-ASCII pair
        is asserted to sign differently.

        NOTE(review): on Python 3 both literals of each pair are the same
        ``str`` value, so the last assertion presumably targets Python 2
        byte strings -- confirm.
        """
        signer = SimHashSignature(128)

        empty_plain = signer.get_signature("")
        empty_prefixed = signer.get_signature(u"")
        for expected in (0, empty_prefixed):
            self.assertEqual(empty_plain, expected)

        self.assertEqual(signer.get_signature("abracadabra"),
                         signer.get_signature(u"abracadabra"))

        hearts_plain = "♡♥❤❥"
        hearts_prefixed = u"♡♥❤❥"
        self.assertNotEqual(signer.get_signature(hearts_plain),
                            signer.get_signature(hearts_prefixed))
Example #7
0
    def __init__(self,
                 cfg,
                 content_filter=None,
                 trace_every=0,
                 get_body=None,
                 get_label=None,
                 get_prefix=None,
                 min_support=None,
                 seed=0,
                 normalizer=None,
                 tokenizer=None):
        """Read configuration and build the signing / clustering components.

        :param cfg: mapping read for the keys ``min_support``, ``sig_width``,
            ``lsh_options``, ``kmin``, ``shingler`` and ``sketch`` (plus the
            optional ``preprocessor`` key); it is not modified
        :param content_filter: stored as ``self.content_filter``
        :param trace_every: stored as ``self.trace_every``
        :param get_body: stored as ``self._get_body``
        :param get_label: stored as ``self._get_label``
        :param get_prefix: stored as ``self._get_prefix``
        :param min_support: overrides ``cfg['min_support']`` when not None
        :param seed: seed handed to the sketch signer
        :param normalizer: used instead of the default normalizer when given
        :param tokenizer: used instead of the default tokenizer when given
        :raises RuntimeError: if ``cfg['sketch']['algorithm']`` does not name
            a sketch model, or names one this constructor cannot build a
            signer for
        """
        self.cfg = cfg
        self._get_body = get_body
        self._get_label = get_label
        self._get_prefix = get_prefix

        self.trace_every = trace_every

        # Set options
        self.content_filter = content_filter
        self.min_support = cfg[
            'min_support'] if min_support is None else min_support

        # normalizer and tokenizer
        self.normalizer = get_default_normalizer(
            **cfg.get('preprocessor', {}).get('normalizer', {})) \
            if normalizer is None else normalizer
        self.tokenizer = get_default_tokenizer(
        ) if tokenizer is None else tokenizer

        # Configure minhash signer
        sig_width = cfg['sig_width']
        lsh_hasher = LSHC(width=sig_width, **cfg['lsh_options'])
        self.signer = MinHashSignature(sig_width,
                                       lsh_hasher=lsh_hasher,
                                       kmin=cfg['kmin'])

        # Configure shingler
        cfg_key_shingle = cfg['shingler']
        self.shingler = get_default_shingler(**cfg_key_shingle)

        # Configure sketch comparison algorithm
        cfg_sketch = cfg['sketch']
        self.sketch_enabled = cfg_sketch['enabled']
        self.sketch_dist_fn = None
        self.max_dist = None
        # Defined up front so the attributes exist on every code path
        self.sketch_shingler = None
        self.sketch_signer = None
        # Resolved unconditionally: the Cluster call below reads it even
        # when sketching is disabled.
        # NOTE(review): assumes Cluster ignores the operator when
        # sketch_dist_fn is None -- confirm.
        self.sketch_operator = OPERATOR_MAP[cfg_sketch.get(
            'operator', 'and')]
        if self.sketch_enabled:
            algorithm_name = cfg_sketch['algorithm']
            try:
                sketch_algorithm = getattr(SketchModel, algorithm_name)
            except AttributeError:
                raise RuntimeError("Unknown sketch model specified: '%s'" %
                                   algorithm_name)
            self.sketch_bits = cfg_sketch['size']
            cfg_sketch_shingler = cfg_sketch['shingler']
            # if sketch shingler is disabled, we also leave the sketch
            # signer disabled as we will use default signer
            if cfg_sketch_shingler['enabled']:
                if sketch_algorithm == SketchModel.simhash:
                    sketch_signer_class = SimHashSignature
                elif sketch_algorithm == SketchModel.minhash:
                    sketch_signer_class = MinHashSketchSignature
                else:
                    raise RuntimeError(
                        "Unsupported sketch model specified: '%s'" %
                        algorithm_name)
                # Strip the 'enabled' flag on a copy so the caller's
                # configuration can be reused for another instance
                shingler_options = dict(
                    (key, value)
                    for key, value in cfg_sketch_shingler.items()
                    if key != 'enabled')
                self.sketch_shingler = Shingler(**shingler_options)
                self.sketch_signer = sketch_signer_class(self.sketch_bits,
                                                         seed=seed)
                self.sketch_shingler._tokenizer = None
                self.sketch_shingler._normalizer = None

            self.max_dist = \
                int(floor(self.sketch_bits *
                          (1.0 - float(cfg_sketch['resemblance']))))
            self.sketch_dist_fn = hamming
        self.cluster_builder = Cluster(sketch_dist_fn=self.sketch_dist_fn,
                                       max_dist=self.max_dist,
                                       min_support=self.min_support,
                                       sketch_operator=self.sketch_operator)
Example #8
0
    def test_simhash_feature_weights(self):
        """Check Hamming distances between 64-bit signatures computed with
        and without an extra ``(feature, weight)`` argument.

        Every case signs two inputs in order and compares the distance
        between the two signatures against a pinned value.
        """
        signer = SimHashSignature(64)

        # (arguments of the first call, arguments of the second, distance)
        cases = (
            (("abracadabra",), ("abracdabra",), 3),
            (("abracadabra", ("cats", 0)), ("abracdabra", ("dogs", 0)), 3),
            (("abracadabra", ("cats", 0)), ("abracadabra", ("dogs", 0)), 0),
            (("abracadabra", ("ca", 4)), ("abracadabra", ("do", 4)), 9),
            (("abracadabra", ("ca", 5)), ("abracadabra", ("do", 5)), 9),
            (("abracadabra", ("cats", 200)),
             ("abracadabra", ("dogs", 200)), 21),
            (("abracadabra", ("cats", 10)), ("abracadabra", ("cats", 10)), 0),
        )
        for first_args, second_args, expected in cases:
            first_sig = signer.get_signature(*first_args)
            second_sig = signer.get_signature(*second_args)
            self.assertEqual(hamming(first_sig, second_sig), expected)
Example #9
0
 def test_simhash_similarity_3(self):
     """Pin the Hamming distance between the 64-bit signatures of
     ``"abracadabra"`` and the empty string.
     """
     signer = SimHashSignature(64)
     text_sig, empty_sig = [
         signer.get_signature(text) for text in ("abracadabra", "")
     ]
     self.assertEqual(37, hamming(text_sig, empty_sig))
Example #10
0
 def test_simhash128_2(self):
     """Assert that an ASCII literal and its ``u``-prefixed twin produce
     equal 128-bit signatures.
     """
     signer = SimHashSignature(128)
     self.assertEqual(signer.get_signature("abracadabra"),
                      signer.get_signature(u"abracadabra"))
Example #11
0
 def test_simhash128_1(self):
     """Assert that the 128-bit signature of the empty string is zero and
     equals the signature of the ``u``-prefixed empty string.
     """
     signer = SimHashSignature(128)
     empty_plain, empty_prefixed = [
         signer.get_signature(text) for text in ("", u"")
     ]
     self.assertEqual(0, empty_plain)
     self.assertEqual(empty_plain, empty_prefixed)
Example #12
0
 def test_simhash64_1(self):
     """Assert that the plain and the ``u``-prefixed empty string produce
     equal 64-bit signatures.
     """
     signer = SimHashSignature(64)
     self.assertEqual(signer.get_signature(""),
                      signer.get_signature(u""))
Example #13
0
 def test_simhash_feature_weights_7(self):
     """Assert that signing the same text with the same ``(feature,
     weight)`` argument twice gives signatures at Hamming distance zero.
     """
     signer = SimHashSignature(64)
     first_sig, second_sig = [
         signer.get_signature("abracadabra", ("cats", 10))
         for _ in range(2)
     ]
     self.assertEqual(0, hamming(first_sig, second_sig))
Example #14
0
 def test_simhash_feature_weights_1(self):
     """Pin the Hamming distance between the 64-bit signatures of two
     strings that differ by one dropped character.
     """
     signer = SimHashSignature(64)
     full_sig, shortened_sig = [
         signer.get_signature(text)
         for text in ("abracadabra", "abracdabra")
     ]
     self.assertEqual(3, hamming(full_sig, shortened_sig))