def test_added_text(self):
    # Appending a short attribution to the text must keep the two computed
    # values within MATCH_THRESHOLD differing bits of each other.
    original = self.compute(self.jabberwocky)
    attributed = self.compute(
        self.jabberwocky + ' - Lewis Carroll (Alice in Wonderland)')
    differing = simhash.num_differing_bits(original, attributed)
    self.assertLessEqual(differing, self.MATCH_THRESHOLD)
def test_added_text(self):
    # The text with an attribution suffix is compared against the plain text;
    # the number of differing bits may not exceed MATCH_THRESHOLD.
    suffix = ' - Lewis Carroll (Alice in Wonderland)'
    base_hash = self.compute(self.jabberwocky)
    extended_hash = self.compute(self.jabberwocky + suffix)
    bit_delta = simhash.num_differing_bits(base_hash, extended_hash)
    self.assertLessEqual(bit_delta, self.MATCH_THRESHOLD)
def test_simhash():
    """Exercise ``SimhashDuplicates`` on near-duplicate, distinct and identical texts.

    Four documents are vectorized: the base text, the base text with an
    author suffix, the author text alone, and the base text again.  The test
    then checks bit distances between the fitted hashes and the consistency
    of the values returned by ``query``.

    Raises ``SkipTest`` when the ``simhash`` package cannot be imported.
    """
    try:
        from simhash import num_differing_bits
    except ImportError:
        raise SkipTest

    from sklearn.feature_extraction.text import HashingVectorizer
    from freediscovery.dupdet import SimhashDuplicates

    DISTANCE = 4

    fe = HashingVectorizer(ngram_range=(4, 4), analyzer='word')
    X = fe.fit_transform([
        jabberwocky,
        jabberwocky + jabberwocky_author,
        jabberwocky_author,
        jabberwocky,
    ])

    sh = SimhashDuplicates()
    sh.fit(X)

    # a small change in the text results in a small number of differing bits
    assert num_differing_bits(*sh._fit_shash[:2]) <= 3
    # different text produces a large number of differing bits
    assert num_differing_bits(*sh._fit_shash[1:3]) >= 20
    # same text produces a zero bit difference
    assert num_differing_bits(*sh._fit_shash[[0, -1]]) == 0

    simhash, cluster_id, dup_pairs = sh.query(distance=DISTANCE, blocks=42)

    # NOTE(review): the dup_pairs dtype check used to appear twice; the
    # redundant copy was dropped.  One of them may have been meant to check
    # the dtype of the returned simhash array -- confirm and add if so.
    assert str(dup_pairs.dtype) == 'uint64'
    assert str(cluster_id.dtype) == 'int64'

    # duplicate documents have the same simhash
    assert simhash[0] == simhash[-1]
    # and belong to the same cluster
    assert cluster_id[0] == cluster_id[-1]

    for idx, shash in enumerate(simhash):
        # ignore duplicates: only hashes that occur exactly once are looked up
        if (shash == simhash).sum() == 1:
            assert sh.get_index_by_hash(shash) == idx

    # every reported pair must be within the requested bit distance
    for pairs in dup_pairs:
        assert num_differing_bits(*pairs) <= DISTANCE
def hammingCompare(outtweets, innerTwitter):
    """Collect simhash matches between each outer tweet and ``innerTwitter``.

    For every entry of ``outtweets`` the element at index 2 is handed to
    ``getSimHash`` together with ``innerTwitter`` and a retina client.  When
    the returned mapping has more than one entry, its ``'out_hash'`` and
    ``'in_hash'`` values are passed to ``simhash.find_all``.

    Args:
        outtweets: Iterable of indexable records; only index 2 is read here.
        innerTwitter: Forwarded unchanged to ``getSimHash``.

    Returns:
        A list of ``[index, outtweet[2], matches]`` lists, one for each
        record whose hash mapping had more than one entry.
    """
    client = retinasdk.FullClient(apiKey.retina_token,
                                  apiServer="http://api.cortical.io/rest",
                                  retinaName="en_associative")

    # Loop-invariant search parameters for simhash.find_all.
    blocks = 4  # Number of blocks to use
    distance = 3  # Number of bits that may differ in matching pairs

    res = []
    for index, outtweet in enumerate(outtweets):
        payload = outtweet[2]
        # get simHash
        simhash_pair = getSimHash(payload, innerTwitter, client)
        if len(simhash_pair) <= 1:
            # No usable hash pair for this record; nothing to compare.
            continue
        hashes = [simhash_pair['out_hash'], simhash_pair['in_hash']]
        matches = simhash.find_all(hashes, blocks, distance)
        res.append([index, payload, matches])
    return res
def test_inverse(self):
    # The two values are bitwise complements over 64 bits, so all 64 bits
    # differ; computing over the pair is expected to yield 0.
    first, second = 0xDEADBEEFDEADBEEF, 0x2152411021524110
    differing = simhash.num_differing_bits(first, second)
    self.assertEqual(64, differing)
    combined = simhash.compute([first, second])
    self.assertEqual(0, combined)
def test_different(self):
    # Two different texts must differ in more than MATCH_THRESHOLD bits.
    jabberwocky_hash = self.compute(self.jabberwocky)
    pope_hash = self.compute(self.pope)
    bit_delta = simhash.num_differing_bits(jabberwocky_hash, pope_hash)
    self.assertGreater(bit_delta, self.MATCH_THRESHOLD)
def test_identical_text(self):
    # Computing over the same text twice must give zero differing bits.
    hashes = [self.compute(self.jabberwocky) for _ in range(2)]
    differing = simhash.num_differing_bits(*hashes)
    self.assertEqual(0, differing)
def test_basic(self):
    # The inputs differ only in the low byte: 0xEF ^ 0xAD == 0x42, which has
    # exactly two set bits.
    left, right = 0xDEADBEEF, 0xDEADBEAD
    differing = simhash.num_differing_bits(left, right)
    self.assertEqual(2, differing)
def distance(self, hash):
    """Return ``num_differing_bits`` of this object's ``value`` and ``hash.value``."""
    own_value = self.value
    other_value = hash.value
    return num_differing_bits(own_value, other_value)
def test_basic(self):
    # The inputs differ only in the low byte (0xEF vs 0xAD), i.e. in exactly
    # two bit positions.
    a = 0xDEADBEEF
    b = 0xDEADBEAD
    self.assertEqual(2, simhash.num_differing_bits(a, b))
def test_different(self):
    # Different source texts are expected to be further apart than
    # MATCH_THRESHOLD differing bits.
    first = self.compute(self.jabberwocky)
    second = self.compute(self.pope)
    distance = simhash.num_differing_bits(first, second)
    self.assertGreater(distance, self.MATCH_THRESHOLD)
def calculate_fingerprint_diff(row):
    """Return the number of differing bits between a row's two fingerprints.

    Reads ``row['fingerprint_old']`` and ``row['fingerprint_new']`` and passes
    them to ``simhash.num_differing_bits``.
    """
    old_fingerprint = row['fingerprint_old']
    new_fingerprint = row['fingerprint_new']
    return simhash.num_differing_bits(old_fingerprint, new_fingerprint)