def getSimHash(outtweet, innerTwitter, client):
    """Compute simhashes for the outer and inner fingerprints of a tweet pair.

    The three arguments are forwarded unchanged to ``getFingerPrint``, whose
    result is expected to be a mapping with ``"out_fingerprint"`` and
    ``"inner_fingerprint"`` entries.

    Returns:
        ``{'out_hash': <simhash>, 'in_hash': <simhash>}`` when the fingerprint
        mapping holds more than one entry, otherwise ``{'out_hash': False}``.
        Any exception raised while fingerprinting or hashing is swallowed and
        reported as ``{'out_hash': False}`` as well.
    """
    failure = {'out_hash': False}
    try:
        prints = getFingerPrint(outtweet, innerTwitter, client)
        if len(prints) <= 1:
            return failure
        return {
            'out_hash': simhash.compute(prints["out_fingerprint"]),
            'in_hash': simhash.compute(prints["inner_fingerprint"]),
        }
    except Exception:
        # Deliberate best-effort: callers only ever see the failure marker.
        return failure
def sim_shi4_mm3(text):
    """Return two simhashes built from 4-word shingles of ``text``.

    Words are located with the module-level ``WORD_RE``. Each shingle is the
    slice of ``text`` running from the start of one word to the end of the
    word three matches later, so whatever sits between those words is part of
    the shingle. Every shingle is hashed with ``mmh3.hash64``; both members of
    the returned pair are masked to unsigned 64-bit values and one simhash is
    computed per member.

    NB: It makes quite little sense to use both 64bit numbers to compare
    hashes, as pairwise Hamming distance using the high 64 bits is highly
    correlated with the distance computed using the low 64 bits. That is
    expected, but it means that summing these distances is not linear and
    should be avoided.
    -- https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e

    Returns:
        tuple: ``(simhash of first members, simhash of second members)``.
    """
    mask = 0xffffffffffffffff
    words = list(WORD_RE.finditer(text))
    # Pair word k with word k + 3 (4 words per shingle). Built-in zip() works
    # on Python 2 and 3 alike (xrange / itertools.izip are Python 2 only) and
    # stops at the shorter input, so fewer than 4 words yields no shingles.
    pairs = [
        mmh3.hash64(text[first.start():last.end()])
        for first, last in zip(words, words[3:])
    ]
    return (simhash.compute([pair[0] & mask for pair in pairs]),
            simhash.compute([pair[1] & mask for pair in pairs]))
def sim_shi4_mm3(text):
    """Compute a pair of simhashes over the 4-word shingles of ``text``.

    ``WORD_RE`` (module level) finds the words. A shingle spans from the start
    of a word to the end of the third word after it, taken as a slice of
    ``text``. ``mmh3.hash64`` turns each shingle into a pair of 64-bit values;
    each member is masked to the unsigned range and the two resulting series
    are simhashed separately.

    NB: It makes quite little sense to use both 64bit numbers to compare
    hashes, as pairwise Hamming distance using the high 64 bits is highly
    correlated with the distance computed using the low 64 bits. That is
    expected, but it means that summing these distances is not linear and
    should be avoided.
    -- https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e

    Returns:
        tuple: ``(simhash of first members, simhash of second members)``.
    """
    u64 = 0xffffffffffffffff
    heads, tails = itertools.tee(WORD_RE.finditer(text))
    # Skip three matches on the trailing iterator: 4 words per shingle.
    tails = itertools.islice(tails, 3, None)
    # Built-in zip() replaces the Python-2-only itertools.izip.
    digests = [
        mmh3.hash64(text[head.start():tail.end()])
        for head, tail in zip(heads, tails)
    ]
    first_members = [digest[0] & u64 for digest in digests]
    second_members = [digest[1] & u64 for digest in digests]
    return (simhash.compute(first_members), simhash.compute(second_members))
def fit(self, X, y=None):
    """Compute and store one simhash per row of ``X``.

    Parameters
    ----------
    X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
        NOTE(review): each row's ``.indices`` attribute is read below, which
        sparse rows provide -- confirm dense input is really supported.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        Returns self.
    """
    from simhash import compute

    self._fit_X = X = check_array(X, accept_sparse='csr')
    n_features = X.shape[1]

    # Multiplier that spreads column indices [0, n_features) over the
    # unsigned 64-bit range; loop-invariant, so computed once.
    scale = (2**64 - 1) // n_features

    shash = [
        compute(X[row].indices.astype('uint64') * scale)
        for row in range(X.shape[0])
    ]
    self._fit_shash = np.asarray(shash, dtype='uint64')
    # Reverse lookup: simhash value -> row position. If two rows share a
    # simhash, the later row wins.
    self._fit_shash_dict = {
        val: key for key, val in enumerate(self._fit_shash)
    }
    # The docstring promises self, but the original fell off the end and
    # returned None.
    return self
def compute_simhash(text):
    """Return the simhash of ``text`` based on 4-character shingles.

    The text is lower-cased and split on runs of non-word characters
    (Unicode-aware); the pieces are concatenated without separators and the
    resulting string is shingled with a window of 4. Each shingle is UTF-8
    encoded and hashed with ``simhash.unsigned_hash``.
    """
    squashed = ''.join(re.split(r'\W+', text.lower(), flags=re.UNICODE))
    hashes = []
    for window in simhash.shingle(squashed, 4):
        piece = ''.join(window)
        hashes.append(simhash.unsigned_hash(piece.encode('utf8')))
    return simhash.compute(hashes)
def tokens_to_fingerprint(tokens):
    """Return the simhash fingerprint of a token sequence.

    The tokens are concatenated without separators and the resulting string
    is shingled with a window of 8. Each shingle is UTF-8 encoded (characters
    that cannot be encoded are dropped) and hashed with
    ``simhash.unsigned_hash``.
    """
    merged = ''.join(tokens)
    hashes = []
    for window in simhash.shingle(merged, 8):
        encoded = ''.join(window).encode('utf8', 'ignore')
        hashes.append(simhash.unsigned_hash(encoded))
    return simhash.compute(hashes)
def compute(self, data, token_size):
    """Return the simhash of ``data`` using shingles of ``token_size`` items.

    ``data`` is lower-cased, reduced to the concatenation of everything
    ``self.filter.findall`` matches, shingled, and the shingles are hashed in
    sorted order.

    NOTE(review): ``shingle``, ``unsigned_hash`` and ``compute`` are looked up
    as module-level names here -- presumably imported from ``simhash``;
    confirm at the top of the file.
    """
    cleaned = ''.join(self.filter.findall(data.lower()))
    ordered = sorted(''.join(piece) for piece in shingle(cleaned, token_size))
    return compute([unsigned_hash(item.encode("utf-8")) for item in ordered])
def compute(text):
    """Compute the simhash of a document from its 4-word shingles.

    The document is split on whitespace; every run of 4 consecutive tokens is
    joined with single spaces and hashed with ``simhash.unsigned_hash``.
    """
    words = text.split()
    hashes = [
        simhash.unsigned_hash(' '.join(window))
        for window in simhash.shingle(words, 4)
    ]
    return simhash.compute(hashes)
def hash_text(t):
    """Attach a simhash to a ``(text, domain, year, season)`` record.

    Parameters:
        t: 4-tuple ``(text, domain, year, season)``; only ``text`` is hashed,
            the other fields are passed through untouched.

    Returns:
        tuple: ``(simhash, text, domain, year, season)``.

    NOTE: the built-in ``hash()`` of a ``str`` is salted per process on
    Python 3 unless ``PYTHONHASHSEED`` is fixed, so fingerprints produced
    here are only comparable within one interpreter run.
    """
    text, domain, year, season = t
    tokens = nltk.tokenize.word_tokenize(text)

    # https://github.com/seomoz/simhash-py/issues/47
    # A generator for ' '-joined strings of consecutive tokens
    shingles = (' '.join(shingle) for shingle in simhash.shingle(tokens, 4))

    # They need to be unsigned 64-bit ints. c_uint64 is 64 bits wide on every
    # platform, whereas c_ulong is only 32 bits wide on some (e.g. Windows),
    # which silently truncated the hashes there.
    h = simhash.compute(
        [ctypes.c_uint64(hash(shingle)).value for shingle in shingles])
    return (h, text, domain, year, season)
def compute(text):
    """Compute the simhash of a document from its 4-word shingles."""
    # With no argument, split() cuts on whitespace and yields the token list.
    tokens = text.split()
    # Each shingle of 4 consecutive tokens becomes one space-joined phrase.
    joined = map(' '.join, simhash.shingle(tokens, 4))
    return simhash.compute(map(simhash.unsigned_hash, joined))
def sim_shi4_mm3_layout(text):
    """Return a signed 64-bit simhash over the 4-word shingles of ``text``.

    Matches of ``SPACE_RE`` are first replaced by a single space. Words are
    then located with ``WORD_RE``; each shingle is the slice of the text from
    the start of one word to the end of the word three matches later. Only
    the first member of each ``mmh3.hash64`` pair is used, masked to the
    unsigned 64-bit range.

    Returns:
        int: the simhash, with values above ``0x7fffffffffffffff`` shifted
        down by ``2**64`` so the result fits a signed 64-bit integer.
    """
    text = SPACE_RE.sub(' ', text)  # replace all runs of spaces
    words = list(WORD_RE.finditer(text))
    hash64 = mmh3.hash64
    # NB: `array` of i64 & ui64 is Python 3.3+
    # Pair word k with word k + 3 (4 words per shingle). Built-in zip() works
    # on Python 2 and 3 alike; xrange / itertools.izip are Python 2 only.
    mm = [
        hash64(text[first.start():last.end()])[0] & 0xffffffffffffffff
        for first, last in zip(words, words[3:])
    ]
    r = simhash.compute(mm)
    return r - 0x10000000000000000 if r > 0x7fffffffffffffff else r
def sim_shi4_mm3_layout(text):
    """Compute a signed 64-bit simhash from 4-word shingles of ``text``.

    Every match of ``SPACE_RE`` is replaced by one space, then ``WORD_RE``
    locates the words. A shingle is the slice of the text from the start of a
    word to the end of the third word after it. Each shingle contributes the
    first member of its ``mmh3.hash64`` pair, masked to unsigned 64 bits.

    Returns:
        int: the simhash, re-interpreted as a signed 64-bit value (results
        above ``0x7FFFFFFFFFFFFFFF`` have ``2**64`` subtracted).
    """
    text = SPACE_RE.sub(" ", text)  # replace all runs of spaces
    heads, tails = itertools.tee(WORD_RE.finditer(text))
    # Skip three matches on the trailing iterator: 4 words per shingle.
    tails = itertools.islice(tails, 3, None)
    hash64 = mmh3.hash64
    # NB: `array` of i64 & ui64 is Python 3.3+
    # Built-in zip() replaces the Python-2-only itertools.izip.
    mm = [
        hash64(text[head.start():tail.end()])[0] & 0xFFFFFFFFFFFFFFFF
        for head, tail in zip(heads, tails)
    ]
    r = simhash.compute(mm)
    if r > 0x7FFFFFFFFFFFFFFF:
        return r - 0x10000000000000000
    return r
def sim_shi4_mm3_text(text):
    """Return a signed 64-bit simhash over 4-word shingles of cleaned text.

    Before shingling, matches of the module-level patterns
    ``HTML_COMMENT_RE``, ``SCRIPT_RE``, ``STYLE_RE``, ``HTML_TAGS_RE``,
    ``HTML_ENTITIES_RE`` and ``NONTEXT_RE`` are replaced, in that order, by a
    single space (judging by their names this strips HTML markup -- the
    patterns themselves are defined elsewhere). Words are then located with
    ``WORD_RE``; each shingle is the slice from the start of one word to the
    end of the word three matches later. Only the first member of each
    ``mmh3.hash64`` pair is used, masked to the unsigned 64-bit range.

    Returns:
        int: the simhash, with values above ``0x7fffffffffffffff`` shifted
        down by ``2**64`` so the result fits a signed 64-bit integer.
    """
    for pattern in (HTML_COMMENT_RE, SCRIPT_RE, STYLE_RE, HTML_TAGS_RE,
                    HTML_ENTITIES_RE):
        text = pattern.sub(' ', text)
    text = NONTEXT_RE.sub(' ', text)  # replace all runs of spaces and punctuation
    words = list(WORD_RE.finditer(text))
    hash64 = mmh3.hash64
    # NB: `array` of i64 & ui64 is Python 3.3+
    # Pair word k with word k + 3 (4 words per shingle). Built-in zip() works
    # on Python 2 and 3 alike; xrange / itertools.izip are Python 2 only.
    mm = [
        hash64(text[first.start():last.end()])[0] & 0xffffffffffffffff
        for first, last in zip(words, words[3:])
    ]
    r = simhash.compute(mm)
    return r - 0x10000000000000000 if r > 0x7fffffffffffffff else r
def sim_shi4_mm3_text(text):
    """Compute a signed 64-bit simhash from 4-word shingles of cleaned text.

    Matches of ``HTML_COMMENT_RE``, ``SCRIPT_RE``, ``STYLE_RE``,
    ``HTML_TAGS_RE``, ``HTML_ENTITIES_RE`` and ``NONTEXT_RE`` are each
    replaced by one space, in that order (presumably removing HTML markup;
    the patterns are defined elsewhere in the module). ``WORD_RE`` then
    locates the words and a shingle is the slice from the start of a word to
    the end of the third word after it. Each shingle contributes the first
    member of its ``mmh3.hash64`` pair, masked to unsigned 64 bits.

    Returns:
        int: the simhash, re-interpreted as a signed 64-bit value (results
        above ``0x7FFFFFFFFFFFFFFF`` have ``2**64`` subtracted).
    """
    text = HTML_COMMENT_RE.sub(" ", text)
    text = SCRIPT_RE.sub(" ", text)
    text = STYLE_RE.sub(" ", text)
    text = HTML_TAGS_RE.sub(" ", text)
    text = HTML_ENTITIES_RE.sub(" ", text)
    text = NONTEXT_RE.sub(" ", text)  # replace all runs of spaces and punctuation
    heads, tails = itertools.tee(WORD_RE.finditer(text))
    # Skip three matches on the trailing iterator: 4 words per shingle.
    tails = itertools.islice(tails, 3, None)
    hash64 = mmh3.hash64
    # NB: `array` of i64 & ui64 is Python 3.3+
    # Built-in zip() replaces the Python-2-only itertools.izip.
    mm = [
        hash64(text[head.start():tail.end()])[0] & 0xFFFFFFFFFFFFFFFF
        for head, tail in zip(heads, tails)
    ]
    r = simhash.compute(mm)
    if r > 0x7FFFFFFFFFFFFFFF:
        return r - 0x10000000000000000
    return r
def fit(self, X, y=None):
    """Compute and store one simhash per row of ``X``.

    Column indices are first mapped through ``self.hash_func`` (called as
    ``hash_func(index, 0)``); when ``self.hash_func_nbytes == 32`` the
    resulting values are additionally scaled up before being simhashed.

    Parameters
    ----------
    X : {array, sparse matrix}, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row
        corresponds to a single data point.
        NOTE(review): each row's ``.indices`` attribute is read below, which
        sparse rows provide -- confirm dense input is really supported.
    y : ignored
        Not used by this method.

    Returns
    -------
    self : object
        Returns self.
    """
    from simhash import compute

    self._fit_X = X = check_array(X, accept_sparse='csr')
    n_features = X.shape[1]

    # NOTE(review): by operator precedence this evaluates to
    # ((2**64 - 1) // 2**32) - 1 == 2**32 - 2. Confirm this is the intended
    # 32 -> 64 bit scale factor before touching it: changing the value
    # changes every fingerprint produced here.
    scale_32_to_64 = (2**64 - 1) // 2**32 - 1

    hash_func = self.hash_func
    # Pre-hash every possible column index once so rows only need a lookup.
    hashing_table = np.array(
        [hash_func(el, 0) for el in range(n_features)], dtype='uint64')

    shash = []
    for idx in range(X.shape[0]):
        # get hashes of indices
        mhash = hashing_table[X[idx].indices]
        if self.hash_func_nbytes == 32:
            mhash = mhash * scale_32_to_64
        shash.append(compute(mhash))

    self._fit_shash = np.asarray(shash, dtype='uint64')
    # Reverse lookup: simhash value -> row position. If two rows share a
    # simhash, the later row wins.
    self._fit_shash_dict = {
        val: key for key, val in enumerate(self._fit_shash)
    }
    # The docstring promises self, but the original fell off the end and
    # returned None.
    return self
def test_repeat(self):
    """A single hash repeated 100 times must simhash to itself."""
    value = 0xDEADBEEF
    repeated = [value] * 100
    self.assertEqual(value, simhash.compute(repeated))
def __hash(shingles):
    """Return the simhash of an iterable of hashable shingles.

    Each shingle is hashed with the built-in ``hash()`` and wrapped into the
    unsigned 64-bit range before being handed to ``simhash.compute``.

    NOTE: the built-in ``hash()`` of a ``str`` is salted per process on
    Python 3 unless ``PYTHONHASHSEED`` is fixed, so results are only
    comparable within one interpreter run.
    """
    # c_uint64 is 64 bits wide on every platform; c_ulong is only 32 bits
    # wide on some (e.g. Windows), which silently truncated the hashes there.
    return simhash.compute(
        [ctypes.c_uint64(hash(shingle)).value for shingle in shingles])
def test_basic(self):
    """Three hashes combine to 0xADCF, their per-bit majority value."""
    expected = 0xADCF
    actual = simhash.compute([0xABCD, 0xBCDE, 0xCDEF])
    self.assertEqual(expected, actual)
def test_inverse(self):
    """Two bitwise-complementary 64-bit hashes differ in 64 bits and simhash to 0."""
    first, second = 0xDEADBEEFDEADBEEF, 0x2152411021524110
    self.assertEqual(64, simhash.num_differing_bits(first, second))
    self.assertEqual(0, simhash.compute([first, second]))
def test_empty(self):
    """The simhash of no hashes at all is 0."""
    no_hashes = []
    self.assertEqual(0, simhash.compute(no_hashes))
def compute(self, text):
    """Return the simhash of ``text`` based on 4-character shingles.

    The text is lower-cased and split on runs of non-word characters
    (Unicode-aware); the pieces are concatenated without separators and the
    resulting string is shingled with a window of 4. Each shingle is UTF-8
    encoded and hashed with ``simhash.unsigned_hash``.
    """
    squashed = ''.join(re.split(r'\W+', text.lower(), flags=re.UNICODE))
    hashes = []
    for window in simhash.shingle(squashed, 4):
        piece = ''.join(window)
        hashes.append(simhash.unsigned_hash(piece.encode('utf8')))
    return simhash.compute(hashes)
def compute(self, text):
    """Return the simhash of ``text`` based on 4-word shingles.

    The text is split on runs of non-word characters (Unicode-aware). Every
    run of 4 consecutive tokens is joined with single spaces and hashed with
    ``simhash.unsigned_hash``. Note that ``re.split`` yields empty strings at
    the edges when the text starts or ends with a separator; those are kept
    as tokens.
    """
    words = re.split(r'\W+', text, flags=re.UNICODE)
    hashes = [
        simhash.unsigned_hash(' '.join(window))
        for window in simhash.shingle(words, 4)
    ]
    return simhash.compute(hashes)