def test_added_text(self):
        """Appending a short attribution keeps the hash within MATCH_THRESHOLD bits."""
        attribution = ' - Lewis Carroll (Alice in Wonderland)'
        original_hash = self.compute(self.jabberwocky)
        extended_hash = self.compute(self.jabberwocky + attribution)

        bit_distance = simhash.num_differing_bits(original_hash, extended_hash)
        self.assertLessEqual(bit_distance, self.MATCH_THRESHOLD)
Beispiel #2
0
    def test_added_text(self):
        """Text with a trailing attribution must still match the original text."""
        suffix = ' - Lewis Carroll (Alice in Wonderland)'
        plain = self.compute(self.jabberwocky)
        with_suffix = self.compute(self.jabberwocky + suffix)

        # The two fingerprints may differ by at most MATCH_THRESHOLD bits.
        differing = simhash.num_differing_bits(plain, with_suffix)
        self.assertLessEqual(differing, self.MATCH_THRESHOLD)
Beispiel #3
0
def test_simhash():
    """Check SimhashDuplicates fingerprints and duplicate queries.

    Four documents are hashed: the jabberwocky text, the same text with the
    author line appended, the author line alone, and the jabberwocky text
    again. The test is skipped when the ``simhash`` package cannot be
    imported.
    """
    try:
        from simhash import num_differing_bits
    except ImportError:
        raise SkipTest
    from sklearn.feature_extraction.text import HashingVectorizer
    from freediscovery.dupdet import SimhashDuplicates

    DISTANCE = 4

    fe = HashingVectorizer(ngram_range=(4, 4), analyzer='word')

    X = fe.fit_transform([
        jabberwocky, jabberwocky + jabberwocky_author, jabberwocky_author,
        jabberwocky
    ])

    sh = SimhashDuplicates()
    sh.fit(X)

    # a small change in the text results in a small number of differing bits
    assert num_differing_bits(*sh._fit_shash[:2]) <= 3
    # different text produces a large number of differing bits
    assert num_differing_bits(*sh._fit_shash[1:3]) >= 20

    # same text produces a zero bit difference
    assert num_differing_bits(*sh._fit_shash[[0, -1]]) == 0

    simhash, cluster_id, dup_pairs = sh.query(distance=DISTANCE, blocks=42)
    # Check the dtype of each of the three returned arrays exactly once.
    # NOTE(review): the fingerprints are expected to be unsigned 64-bit like
    # the duplicate pairs built from them - confirm against query().
    assert str(simhash.dtype) == 'uint64'
    assert str(cluster_id.dtype) == 'int64'
    assert str(dup_pairs.dtype) == 'uint64'

    # duplicate documents have the same simhash
    assert simhash[0] == simhash[-1]
    # and belong to the same cluster
    assert cluster_id[0] == cluster_id[-1]

    for idx, shash in enumerate(simhash):
        # only hashes that occur once map back to a single document index
        if (shash == simhash).sum() == 1:
            assert sh.get_index_by_hash(shash) == idx

    # every reported pair must respect the requested bit distance
    for pairs in dup_pairs:
        assert num_differing_bits(*pairs) <= DISTANCE
Beispiel #4
0
def hammingCompare(outtweets, innerTwitter):
    """Collect simhash matches between each outer tweet and ``innerTwitter``.

    For every entry of ``outtweets`` the element at index 2 is passed to
    ``getSimHash`` together with ``innerTwitter`` and a Retina client. When
    the returned result holds more than one entry, its ``'out_hash'`` and
    ``'in_hash'`` values are handed to ``simhash.find_all``.

    Args:
        outtweets: Iterable of indexable records; only element ``[2]`` of
            each record is used.
        innerTwitter: Passed through unchanged to ``getSimHash``.

    Returns:
        A list of ``[index, outtweet[2], matches]`` entries, one for each
        tweet whose hash pair was obtained.
    """
    client = retinasdk.FullClient(apiKey.retina_token,
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")

    # Matching parameters are the same for every tweet, so set them once.
    blocks = 4  # Number of blocks to use
    distance = 3  # Number of bits that may differ in matching pairs

    res = []
    for index, outtweet in enumerate(outtweets):
        simhash_pair = getSimHash(outtweet[2], innerTwitter, client)
        if len(simhash_pair) > 1:
            hashes = [simhash_pair['out_hash'], simhash_pair['in_hash']]
            matches = simhash.find_all(hashes, blocks, distance)
            res.append([index, outtweet[2], matches])
    return res
Beispiel #5
0
 def test_inverse(self):
     """Two bitwise-inverse 64-bit values differ in all bits and combine to 0."""
     value = 0xDEADBEEFDEADBEEF
     inverse = 0x2152411021524110
     self.assertEqual(64, simhash.num_differing_bits(value, inverse))
     self.assertEqual(0, simhash.compute([value, inverse]))
Beispiel #6
0
 def test_different(self):
     """The jabberwocky and pope texts differ by more than MATCH_THRESHOLD bits."""
     jabberwocky_hash = self.compute(self.jabberwocky)
     pope_hash = self.compute(self.pope)
     bit_distance = simhash.num_differing_bits(jabberwocky_hash, pope_hash)
     self.assertGreater(bit_distance, self.MATCH_THRESHOLD)
Beispiel #7
0
 def test_identical_text(self):
     """Hashing the same text twice yields fingerprints with no differing bits."""
     first = self.compute(self.jabberwocky)
     second = self.compute(self.jabberwocky)
     differing = simhash.num_differing_bits(first, second)
     self.assertEqual(0, differing)
Beispiel #8
0
 def test_basic(self):
     """0xDEADBEEF and 0xDEADBEAD differ in exactly two bits."""
     left, right = 0xDEADBEEF, 0xDEADBEAD
     differing = simhash.num_differing_bits(left, right)
     self.assertEqual(2, differing)
Beispiel #9
0
 def distance(self, hash):
     """Return the number of bits in which ``self.value`` and ``hash.value`` differ."""
     own_value = self.value
     other_value = hash.value
     return num_differing_bits(own_value, other_value)
 def test_basic(self):
     """0xDEADBEEF and 0xDEADBEAD differ in exactly two bits."""
     a = 0xDEADBEEF
     b = 0xDEADBEAD
     # The leftover debug print of the bare name ``pope`` was dropped: it is
     # unrelated to this assertion and only adds noise to the test output.
     self.assertEqual(2, simhash.num_differing_bits(a, b))
 def test_inverse(self):
     """Bitwise-inverse 64-bit values: 64 differing bits, combined hash of 0."""
     original = 0xDEADBEEFDEADBEEF
     flipped = 0x2152411021524110
     pair = [original, flipped]
     self.assertEqual(64, simhash.num_differing_bits(original, flipped))
     self.assertEqual(0, simhash.compute(pair))
 def test_different(self):
     """Fingerprints of the jabberwocky and pope texts exceed MATCH_THRESHOLD."""
     hash_jabberwocky = self.compute(self.jabberwocky)
     hash_pope = self.compute(self.pope)
     differing = simhash.num_differing_bits(hash_jabberwocky, hash_pope)
     self.assertGreater(differing, self.MATCH_THRESHOLD)
 def test_identical_text(self):
     """Computing the jabberwocky fingerprint twice gives a zero-bit difference."""
     hash_one = self.compute(self.jabberwocky)
     hash_two = self.compute(self.jabberwocky)
     bit_distance = simhash.num_differing_bits(hash_one, hash_two)
     self.assertEqual(0, bit_distance)
Beispiel #14
0
def calculate_fingerprint_diff(row):
    """Return the number of bits that differ between a row's old and new fingerprints.

    ``row`` must support lookup by the keys ``'fingerprint_old'`` and
    ``'fingerprint_new'``.
    """
    old_fingerprint = row['fingerprint_old']
    new_fingerprint = row['fingerprint_new']
    return simhash.num_differing_bits(old_fingerprint, new_fingerprint)