Python num_differing_bits Examples

Programming Language: Python

Namespace/Package Name: simhash

Method/Function: num_differing_bits

Examples at hotexamples.com: 14

Python num_differing_bits - 14 examples found. These are the top-rated real-world Python examples of simhash.num_differing_bits extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: test.py Project: manojkmr63712/Text_Difference-Nirmal

    def test_added_text(self):
        """Appending a short attribution keeps the hashes within MATCH_THRESHOLD bits."""
        original_hash = self.compute(self.jabberwocky)
        attributed_text = (self.jabberwocky +
                           ' - Lewis Carroll (Alice in Wonderland)')
        attributed_hash = self.compute(attributed_text)

        # The two fingerprints may differ, but by no more than the threshold.
        differing = simhash.num_differing_bits(original_hash, attributed_hash)
        self.assertLessEqual(differing, self.MATCH_THRESHOLD)

Example #2

Show file

File: test.py Project: rlugojr/simhash-py

    def test_added_text(self):
        """Text with an appended attribution must stay within the match threshold."""
        suffix = ' - Lewis Carroll (Alice in Wonderland)'
        base_hash = self.compute(self.jabberwocky)
        extended_hash = self.compute(self.jabberwocky + suffix)

        bit_distance = simhash.num_differing_bits(base_hash, extended_hash)
        self.assertLessEqual(bit_distance, self.MATCH_THRESHOLD)

Example #3

Show file

def test_simhash():
    """Exercise SimhashDuplicates on four Jabberwocky-derived documents.

    The test is skipped (via ``SkipTest``) when the optional ``simhash``
    package cannot be imported.
    """
    try:
        from simhash import num_differing_bits
    except ImportError:
        raise SkipTest
    from sklearn.feature_extraction.text import HashingVectorizer
    from freediscovery.dupdet import SimhashDuplicates

    DISTANCE = 4

    fe = HashingVectorizer(ngram_range=(4, 4), analyzer='word')

    # Documents 0 and 3 are the same text, document 1 is that text with
    # ``jabberwocky_author`` appended, document 2 is ``jabberwocky_author``
    # alone.
    X = fe.fit_transform([
        jabberwocky, jabberwocky + jabberwocky_author, jabberwocky_author,
        jabberwocky
    ])

    sh = SimhashDuplicates()
    sh.fit(X)

    # make sure small changes in the text result in a small number of
    # differing bits
    assert num_differing_bits(*sh._fit_shash[:2]) <= 3
    # different text produces a large number of differing bits
    assert num_differing_bits(*sh._fit_shash[1:3]) >= 20

    # same text produces a zero bit difference
    assert num_differing_bits(*sh._fit_shash[[0, -1]]) == 0

    simhash, cluster_id, dup_pairs = sh.query(distance=DISTANCE, blocks=42)
    # NOTE(review): the original asserted the dup_pairs dtype twice; the first
    # occurrence was presumably meant to check ``simhash.dtype`` -- confirm
    # against SimhashDuplicates.query before adding that assertion.
    assert str(cluster_id.dtype) == 'int64'
    assert str(dup_pairs.dtype) == 'uint64'

    assert simhash[0] == simhash[
        -1]  # duplicate documents have the same simhash
    assert cluster_id[0] == cluster_id[-1]  # and belong to the same cluster

    for idx, shash in enumerate(simhash):
        if (shash == simhash).sum() == 1:  # ignore duplicates
            assert sh.get_index_by_hash(shash) == idx

    # every reported pair must lie within the queried bit distance
    for pairs in dup_pairs:
        assert num_differing_bits(*pairs) <= DISTANCE

Example #4

Show file

File: helpers.py Project: shanpy/aiCompetition

def hammingCompare(outtweets, innerTwitter):
    """Collect simhash matches between each outer tweet and ``innerTwitter``.

    For every entry of ``outtweets`` the element at index 2 is passed to
    ``getSimHash`` together with ``innerTwitter`` and a retina client. When
    the returned mapping holds more than one entry, its ``'out_hash'`` and
    ``'in_hash'`` values are handed to ``simhash.find_all``.

    Args:
        outtweets: Sequence of indexable records; element 2 of each record is
            the value that gets hashed (presumably the tweet text -- verify
            against the caller).
        innerTwitter: Passed through unchanged to ``getSimHash``.

    Returns:
        A list of ``[index, outtweet[2], matches]`` entries, one for each
        record whose hash pair could be computed.
    """
    client = retinasdk.FullClient(apiKey.retina_token,
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")
    blocks = 4  # Number of blocks to use
    distance = 3  # Number of bits that may differ in matching pairs
    res = []

    for index, outtweet in enumerate(outtweets):
        # get simHash
        simhash_pair = getSimHash(outtweet[2], innerTwitter, client)
        if len(simhash_pair) <= 1:
            # Not enough hashes to compare; skip this record.
            continue
        hashes = [simhash_pair['out_hash'], simhash_pair['in_hash']]
        matches = simhash.find_all(hashes, blocks, distance)
        res.append([index, outtweet[2], matches])
    return res

Example #5

Show file

File: test.py Project: rlugojr/simhash-py

 def test_inverse(self):
     """A 64-bit value and its bitwise complement differ in all 64 bits."""
     value = 0xDEADBEEFDEADBEEF
     complement = 0x2152411021524110
     pair = [value, complement]

     differing = simhash.num_differing_bits(*pair)
     self.assertEqual(64, differing)

     # The combined hash of the two complementary values is expected to be 0.
     combined = simhash.compute(pair)
     self.assertEqual(0, combined)

Example #6

Show file

File: test.py Project: rlugojr/simhash-py

 def test_different(self):
     """Hashes of the two different texts differ by more than MATCH_THRESHOLD bits."""
     jabberwocky_hash = self.compute(self.jabberwocky)
     pope_hash = self.compute(self.pope)

     differing = simhash.num_differing_bits(jabberwocky_hash, pope_hash)
     self.assertGreater(differing, self.MATCH_THRESHOLD)

Example #7

Show file

File: test.py Project: rlugojr/simhash-py

 def test_identical_text(self):
     """Hashing the same text twice yields fingerprints with no differing bits."""
     first_hash = self.compute(self.jabberwocky)
     second_hash = self.compute(self.jabberwocky)

     differing = simhash.num_differing_bits(first_hash, second_hash)
     self.assertEqual(0, differing)

Example #8

Show file

File: test.py Project: rlugojr/simhash-py

 def test_basic(self):
     """0xDEADBEEF and 0xDEADBEAD differ in exactly two bit positions."""
     left, right = 0xDEADBEEF, 0xDEADBEAD
     differing = simhash.num_differing_bits(left, right)
     self.assertEqual(2, differing)

Example #9

Show file

 def distance(self, hash):
     """Return the number of differing bits between ``self.value`` and ``hash.value``."""
     own_value = self.value
     other_value = hash.value
     return num_differing_bits(own_value, other_value)

Example #10

Show file

File: test.py Project: manojkmr63712/Text_Difference-Nirmal

 def test_basic(self):
     """0xDEADBEEF and 0xDEADBEAD differ in exactly two bit positions."""
     a = 0xDEADBEEF
     b = 0xDEADBEAD
     # The stray debug ``print(pope)`` was removed: it asserted nothing and
     # referenced a bare ``pope`` name rather than ``self.pope``.
     self.assertEqual(2, simhash.num_differing_bits(a, b))

Example #11

Show file

File: test.py Project: manojkmr63712/Text_Difference-Nirmal

 def test_inverse(self):
     """Bitwise-complementary 64-bit values differ everywhere and combine to 0."""
     complementary = [0xDEADBEEFDEADBEEF, 0x2152411021524110]

     bit_distance = simhash.num_differing_bits(*complementary)
     self.assertEqual(64, bit_distance)

     combined_hash = simhash.compute(complementary)
     self.assertEqual(0, combined_hash)

Example #12

Show file

File: test.py Project: manojkmr63712/Text_Difference-Nirmal

 def test_different(self):
     """Different texts must be further apart than MATCH_THRESHOLD bits."""
     first_hash = self.compute(self.jabberwocky)
     second_hash = self.compute(self.pope)

     bit_distance = simhash.num_differing_bits(first_hash, second_hash)
     self.assertGreater(bit_distance, self.MATCH_THRESHOLD)

Example #13

Show file

File: test.py Project: manojkmr63712/Text_Difference-Nirmal

 def test_identical_text(self):
     """The same text hashed twice has a bit distance of zero."""
     hash_one = self.compute(self.jabberwocky)
     hash_two = self.compute(self.jabberwocky)

     bit_distance = simhash.num_differing_bits(hash_one, hash_two)
     self.assertEqual(0, bit_distance)

Example #14

Show file

def calculate_fingerprint_diff(row):
    """Return the number of differing bits between a row's old and new fingerprints.

    ``row`` is indexed with the keys ``'fingerprint_old'`` and
    ``'fingerprint_new'``; both values are passed to
    ``simhash.num_differing_bits``.
    """
    old_fingerprint = row['fingerprint_old']
    new_fingerprint = row['fingerprint_new']
    return simhash.num_differing_bits(old_fingerprint, new_fingerprint)