Example #1
0
def getSimHash(outtweet, innerTwitter, client):
    """Compute simhashes for the outer and inner fingerprints of a tweet pair.

    The fingerprints are obtained from ``getFingerPrint(outtweet,
    innerTwitter, client)``.

    Returns
    -------
    dict
        ``{'out_hash': ..., 'in_hash': ...}`` when the fingerprint container
        holds more than one entry; ``{'out_hash': False}`` when it holds one
        entry or fewer, or when any exception is raised along the way.
    """
    failure = {'out_hash': False}
    try:
        fingerprints = getFingerPrint(outtweet, innerTwitter, client)
        # Guard clause: both fingerprints are needed to produce a result.
        if len(fingerprints) <= 1:
            return failure
        return {
            'out_hash': simhash.compute(fingerprints["out_fingerprint"]),
            'in_hash': simhash.compute(fingerprints["inner_fingerprint"]),
        }
    except Exception:
        # Best-effort: any failure is reported as "no hash available".
        return failure
def sim_shi4_mm3(text):
    """Return a pair of simhashes over 4-word shingles hashed with mmh3.

    Every shingle spans from the start of one ``WORD_RE`` match to the end
    of the match three positions later. Each shingle is hashed with
    ``mmh3.hash64``; the first simhash is computed from element 0 of every
    hash pair, the second from element 1, both masked to 64 bits.

    NB: comparing with both 64-bit values makes little sense, as the
    pairwise Hamming distance over the high 64 bits is highly correlated
    with the distance over the low 64 bits. That is expected, but it means
    that summing these distances is not linear and should be avoided.
    -- https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e
    """
    mask = 0xffffffffffffffff
    heads, tails = itertools.tee(WORD_RE.finditer(text))
    # Advance the trailing iterator by three matches: 4 words per shingle.
    next(itertools.islice(tails, 3, 3), None)

    pairs = []
    for first, last in itertools.izip(heads, tails):
        pairs.append(mmh3.hash64(text[first.start():last.end()]))

    element0 = [pair[0] & mask for pair in pairs]
    element1 = [pair[1] & mask for pair in pairs]
    return (simhash.compute(element0), simhash.compute(element1))
def sim_shi4_mm3(text):
    """Compute two simhashes of ``text`` from mmh3-hashed 4-word shingles.

    Shingles are slices of ``text`` running from the start of a ``WORD_RE``
    match to the end of the match three positions further on. The returned
    tuple holds the simhash built from element 0 of every ``mmh3.hash64``
    result followed by the simhash built from element 1 (each masked to
    64 bits).
    """
    # NB: using both 64-bit numbers to compare hashes makes little sense:
    # the pairwise Hamming distance over the high 64 bits is highly
    # correlated with the one over the low 64 bits. This is expected, but
    # it means summing these distances is not linear and should be avoided.
    # -- https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e
    leading, trailing = itertools.tee(WORD_RE.finditer(text))
    for _skip in xrange(3):  # 4 words per shingle
        next(trailing, None)

    digests = []
    for start_match, end_match in itertools.izip(leading, trailing):
        shingle_text = text[start_match.start():end_match.end()]
        digests.append(mmh3.hash64(shingle_text))

    sim_from_first = simhash.compute(
        [digest[0] & 0xffffffffffffffff for digest in digests])
    sim_from_second = simhash.compute(
        [digest[1] & 0xffffffffffffffff for digest in digests])
    return (sim_from_first, sim_from_second)
Example #4
0
    def fit(self, X, y=None):
        """
        Compute a simhash fingerprint for every row of X and store them.

        The validated input is kept in ``self._fit_X``, the fingerprints in
        ``self._fit_shash`` (uint64 array) and a fingerprint -> row position
        lookup in ``self._fit_shash_dict``.

        Parameters
        ----------
        X : array_like or sparse (CSR) matrix, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        y : ignored
            Not used by this method.

        Returns
        -------
        self : object
            Returns self.
        """
        from simhash import compute
        self._fit_X = X = check_array(X, accept_sparse='csr')

        n_features = X.shape[1]

        def _scale_hash_64bit(indices, n_features):
            # Cast to uint64 and multiply every index by
            # floor((2**64 - 1) / n_features).
            return indices.astype('uint64') * ((2**64 - 1) // n_features)

        # One fingerprint per row, built from the row's `.indices`.
        # NOTE(review): this presumes every row exposes `.indices` (as CSR
        # rows do) -- confirm how dense input is expected to be handled.
        shash = [
            compute(_scale_hash_64bit(X[idx].indices, n_features))
            for idx in range(X.shape[0])
        ]
        self._fit_shash = np.asarray(shash, dtype='uint64')
        # Reverse lookup; when two rows share a fingerprint the later row
        # position overwrites the earlier one.
        self._fit_shash_dict = {
            val: key
            for key, val in enumerate(self._fit_shash)
        }
        # The docstring promises self; previously the method returned None.
        return self
Example #5
0
def compute_simhash(text):
    """Return the simhash of ``text`` computed over character 4-shingles.

    The text is lower-cased and split on runs of non-word characters, the
    pieces are concatenated back into one string, and every 4-item shingle
    of that string is UTF-8 encoded and hashed with
    ``simhash.unsigned_hash`` before being combined by ``simhash.compute``.
    """
    words = re.split(r'\W+', text.lower(), flags=re.UNICODE)
    squashed = ''.join(words)

    hashes = []
    for window in simhash.shingle(squashed, 4):
        gram = ''.join(window)
        hashes.append(simhash.unsigned_hash(gram.encode('utf8')))
    return simhash.compute(hashes)
Example #6
0
def tokens_to_fingerprint(tokens):
    """Return the simhash fingerprint of ``tokens``.

    The tokens are concatenated without a separator, the result is cut into
    shingles of 8 items, and each shingle is UTF-8 encoded (characters that
    cannot be encoded are ignored) and hashed with
    ``simhash.unsigned_hash``.
    """
    joined = ''.join(tokens)

    hashes = []
    for window in simhash.shingle(joined, 8):
        encoded = ''.join(window).encode('utf8', 'ignore')
        hashes.append(simhash.unsigned_hash(encoded))
    return simhash.compute(hashes)
Example #7
0
 def compute(self, data, token_size):
     """Return the simhash of ``data`` using shingles of ``token_size`` items.

     ``data`` is lower-cased, reduced to the concatenation of everything
     ``self.filter.findall`` returns, and shingled. The shingles are sorted
     before being UTF-8 encoded and hashed.
     """
     kept = self.filter.findall(data.lower())
     content = ''.join(kept)
     ordered = sorted(
         ''.join(window) for window in shingle(content, token_size))
     hashes = [unsigned_hash(gram.encode("utf-8")) for gram in ordered]
     # `compute` here is the module-level function, not this method.
     return compute(hashes)
Example #8
0
def compute(text):
    """
        Return the simhash of a document, built from 4-token shingles.

        The text is split on whitespace; every window of 4 tokens is joined
        with single spaces and hashed with ``simhash.unsigned_hash``.
    """
    words = text.split()
    windows = simhash.shingle(words, 4)
    joined = (' '.join(window) for window in windows)
    return simhash.compute(map(simhash.unsigned_hash, joined))
Example #9
0
def hash_text(t):
    """Attach a simhash fingerprint to a ``(text, domain, year, season)`` record.

    Parameters
    ----------
    t : tuple
        A 4-item sequence unpacked as ``text, domain, year, season``.

    Returns
    -------
    tuple
        ``(h, text, domain, year, season)`` where ``h`` is the simhash of the
        4-token shingles of ``text`` (tokenized with
        ``nltk.tokenize.word_tokenize``).
    """
    text, domain, year, season = t
    tokens = nltk.tokenize.word_tokenize(text)

    # https://github.com/seomoz/simhash-py/issues/47
    # A generator for ' '-joined strings of consecutive tokens
    shingles = (' '.join(shingle) for shingle in simhash.shingle(tokens, 4))
    # The hashes need to be unsigned 64-bit ints. c_uint64 is always 64 bits
    # wide, whereas c_ulong is only 32 bits on Windows and 32-bit platforms
    # and would silently truncate the hash there.
    # NOTE: the built-in hash() of str is salted per process on Python 3
    # unless PYTHONHASHSEED is fixed, so fingerprints are only comparable
    # within one process in that case.
    h = simhash.compute(
        [ctypes.c_uint64(hash(shingle)).value for shingle in shingles])

    return (h, text, domain, year, season)
Example #10
0
def compute(text):
    """
        Compute the simhash of a document from its 4-token shingles.
    """
    # str.split() with no argument splits on whitespace, yielding the list
    # of tokens.
    token_list = text.split()

    def _phrases():
        # ' '-joined strings of 4 consecutive tokens
        for window in token_windows:
            yield ' '.join(window)

    token_windows = simhash.shingle(token_list, 4)
    hashes = map(simhash.unsigned_hash, _phrases())
    return simhash.compute(hashes)
Example #11
0
def sim_shi4_mm3_layout(text):
    """Return a signed 64-bit simhash of ``text`` over 4-word shingles.

    Every ``SPACE_RE`` match is first replaced by a single space. Shingles
    run from the start of a ``WORD_RE`` match to the end of the match three
    positions later; element 0 of ``mmh3.hash64`` of each shingle, masked
    to 64 bits, feeds ``simhash.compute``. A result above
    ``0x7fffffffffffffff`` has ``2**64`` subtracted so that the return
    value lies in the signed 64-bit range.
    """
    text = SPACE_RE.sub(' ', text)  # replace all runs of spaces

    heads, tails = itertools.tee(WORD_RE.finditer(text))
    # Advance the trailing iterator by three matches: 4 words per shingle.
    next(itertools.islice(tails, 3, 3), None)

    hash64 = mmh3.hash64
    # NB: `array` of i64 & ui64 is Python 3.3+
    values = []
    for first, last in itertools.izip(heads, tails):
        values.append(
            hash64(text[first.start():last.end()])[0] & 0xffffffffffffffff)

    r = simhash.compute(values)
    if r > 0x7fffffffffffffff:
        return r - 0x10000000000000000
    return r
Example #12
0
def sim_shi4_mm3_layout(text):
    """Simhash ``text`` over 4-word shingles and return it as a signed int.

    ``SPACE_RE`` matches are collapsed to one space, then each slice from
    the start of a ``WORD_RE`` match to the end of the third following
    match is hashed with ``mmh3.hash64``. Only element 0 of each hash
    (masked to 64 bits) is used. Results greater than
    ``0x7FFFFFFFFFFFFFFF`` are shifted down by ``2**64``.
    """
    normalized = SPACE_RE.sub(" ", text)  # replace all runs of spaces

    leading, trailing = itertools.tee(WORD_RE.finditer(normalized))
    for _skip in xrange(3):  # 4 words per shingle
        next(trailing, None)

    # NB: `array` of i64 & ui64 is Python 3.3+
    hash64 = mmh3.hash64
    shingle_hashes = []
    for start_match, end_match in itertools.izip(leading, trailing):
        piece = normalized[start_match.start():end_match.end()]
        shingle_hashes.append(hash64(piece)[0] & 0xFFFFFFFFFFFFFFFF)

    unsigned = simhash.compute(shingle_hashes)
    if unsigned > 0x7FFFFFFFFFFFFFFF:
        return unsigned - 0x10000000000000000
    return unsigned
Example #13
0
def sim_shi4_mm3_text(text):
    """Return a signed 64-bit simhash of the textual content of ``text``.

    Matches of ``HTML_COMMENT_RE``, ``SCRIPT_RE``, ``STYLE_RE``,
    ``HTML_TAGS_RE``, ``HTML_ENTITIES_RE`` and ``NONTEXT_RE`` (patterns
    defined elsewhere) are replaced, in that order, by a single space. The
    remaining text is hashed over 4-word shingles: element 0 of
    ``mmh3.hash64`` per shingle, masked to 64 bits, combined with
    ``simhash.compute``. A result above ``0x7fffffffffffffff`` has
    ``2**64`` subtracted.
    """
    cleaners = (
        HTML_COMMENT_RE,
        SCRIPT_RE,
        STYLE_RE,
        HTML_TAGS_RE,
        HTML_ENTITIES_RE,
        NONTEXT_RE,  # replace all runs of spaces and punctuation
    )
    for pattern in cleaners:
        text = pattern.sub(' ', text)

    heads, tails = itertools.tee(WORD_RE.finditer(text))
    # Advance the trailing iterator by three matches: 4 words per shingle.
    next(itertools.islice(tails, 3, 3), None)

    hash64 = mmh3.hash64
    # NB: `array` of i64 & ui64 is Python 3.3+
    values = []
    for first, last in itertools.izip(heads, tails):
        values.append(
            hash64(text[first.start():last.end()])[0] & 0xffffffffffffffff)

    r = simhash.compute(values)
    if r > 0x7fffffffffffffff:
        return r - 0x10000000000000000
    return r
Example #14
0
def sim_shi4_mm3_text(text):
    """Simhash the text left after blanking out markup-related matches.

    Six substitutions are applied in sequence (``HTML_COMMENT_RE``,
    ``SCRIPT_RE``, ``STYLE_RE``, ``HTML_TAGS_RE``, ``HTML_ENTITIES_RE``,
    ``NONTEXT_RE``), each replacing its matches with one space. The result
    is then hashed over 4-word shingles using element 0 of ``mmh3.hash64``
    (masked to 64 bits), and the simhash is returned in the signed 64-bit
    range.
    """
    stripped = HTML_COMMENT_RE.sub(" ", text)
    stripped = SCRIPT_RE.sub(" ", stripped)
    stripped = STYLE_RE.sub(" ", stripped)
    stripped = HTML_TAGS_RE.sub(" ", stripped)
    stripped = HTML_ENTITIES_RE.sub(" ", stripped)
    # replace all runs of spaces and punctuation
    stripped = NONTEXT_RE.sub(" ", stripped)

    leading, trailing = itertools.tee(WORD_RE.finditer(stripped))
    for _skip in xrange(3):  # 4 words per shingle
        next(trailing, None)

    # NB: `array` of i64 & ui64 is Python 3.3+
    hash64 = mmh3.hash64
    shingle_hashes = []
    for start_match, end_match in itertools.izip(leading, trailing):
        piece = stripped[start_match.start():end_match.end()]
        shingle_hashes.append(hash64(piece)[0] & 0xFFFFFFFFFFFFFFFF)

    unsigned = simhash.compute(shingle_hashes)
    if unsigned > 0x7FFFFFFFFFFFFFFF:
        return unsigned - 0x10000000000000000
    return unsigned
Example #15
0
    def fit(self, X, y=None):
        """
        Compute a simhash fingerprint for every row of X and store them.

        Column positions are first mapped through ``self.hash_func`` (called
        as ``hash_func(position, 0)``); the hashes of each row's ``.indices``
        are then combined with ``simhash.compute``. The validated input is
        kept in ``self._fit_X``, the fingerprints in ``self._fit_shash`` and
        a fingerprint -> row position lookup in ``self._fit_shash_dict``.

        Parameters
        ----------
        X : {array, sparse matrix}, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        y : ignored
            Not used by this method.

        Returns
        -------
        self : object
            Returns self.
        """
        from simhash import compute
        self._fit_X = X = check_array(X, accept_sparse='csr')

        n_features = X.shape[1]

        def _scale_hash_32_64bit(indices):
            # NOTE(review): by operator precedence this multiplies by
            # ((2**64 - 1) // 2**32) - 1 == 2**32 - 2. If
            # (2**64 - 1) // (2**32 - 1) was intended, confirm before
            # changing it -- doing so alters every stored fingerprint.
            return indices * ((2**64 - 1) // 2**32 - 1)

        hash_func = self.hash_func

        # Precompute the hash of every column position once.
        hashing_table = np.array(
            [hash_func(el, 0) for el in range(n_features)], dtype='uint64')

        shash = []
        for idx in range(X.shape[0]):
            # get hashes of indices
            mhash = hashing_table[X[idx].indices]
            if self.hash_func_nbytes == 32:
                mhash = _scale_hash_32_64bit(mhash)
            shash.append(compute(mhash))
        self._fit_shash = np.asarray(shash, dtype='uint64')
        # Reverse lookup; when two rows share a fingerprint the later row
        # position overwrites the earlier one.
        self._fit_shash_dict = {
            val: key
            for key, val in enumerate(self._fit_shash)
        }
        # The docstring promises self; previously the method returned None.
        return self
 def test_repeat(self):
     """A hash repeated 100 times yields that same hash as the simhash."""
     repeated = [0xDEADBEEF] * 100
     self.assertEqual(0xDEADBEEF, simhash.compute(repeated))
Example #17
0
 def __hash(shingles):
     """Return the simhash of ``shingles`` using the built-in ``hash``.

     Each shingle's ``hash()`` is reinterpreted as an unsigned 64-bit
     integer before being passed to ``simhash.compute``.
     """
     # c_uint64 is always 64 bits wide; c_ulong is only 32 bits on Windows
     # and 32-bit platforms and would silently truncate the hash there.
     return simhash.compute(
         [ctypes.c_uint64(hash(shingle)).value for shingle in shingles])
 def test_basic(self):
     """Three distinct hashes combine to 0xADCF, their per-bit majority."""
     expected = 0xADCF
     actual = simhash.compute([0xABCD, 0xBCDE, 0xCDEF])
     self.assertEqual(expected, actual)
Example #19
0
 def test_basic(self):
     """simhash.compute of 0xABCD, 0xBCDE and 0xCDEF must equal 0xADCF."""
     inputs = (0xABCD, 0xBCDE, 0xCDEF)
     result = simhash.compute(list(inputs))
     self.assertEqual(0xADCF, result)
Example #20
0
 def test_inverse(self):
     """Two bitwise-complementary 64-bit hashes differ in 64 bits and combine to 0."""
     original = 0xDEADBEEFDEADBEEF
     complement = 0x2152411021524110
     self.assertEqual(64, simhash.num_differing_bits(original, complement))
     self.assertEqual(0, simhash.compute([original, complement]))
Example #21
0
 def test_repeat(self):
     """simhash.compute over 100 copies of one value returns that value."""
     value = 0xDEADBEEF
     copies = [value for _ in range(100)]
     self.assertEqual(value, simhash.compute(copies))
Example #22
0
 def test_empty(self):
     """An empty list of hashes produces a simhash of 0."""
     result = simhash.compute([])
     self.assertEqual(0, result)
Example #23
0
 def compute(self, text):
     """Return the simhash of ``text`` over character 4-shingles.

     The text is lower-cased, split on runs of non-word characters and
     re-joined without separators; each 4-item shingle of the result is
     UTF-8 encoded and hashed with ``simhash.unsigned_hash``.
     """
     words = re.split(r'\W+', text.lower(), flags=re.UNICODE)
     squashed = ''.join(words)

     hashes = []
     for window in simhash.shingle(squashed, 4):
         gram = ''.join(window)
         hashes.append(simhash.unsigned_hash(gram.encode('utf8')))
     return simhash.compute(hashes)
 def test_inverse(self):
     """A hash and its bitwise complement: 64 differing bits, simhash 0."""
     pair = [0xDEADBEEFDEADBEEF, 0x2152411021524110]
     differing = simhash.num_differing_bits(pair[0], pair[1])
     self.assertEqual(64, differing)
     combined = simhash.compute(pair)
     self.assertEqual(0, combined)
 def test_empty(self):
     """simhash.compute of no hashes at all must return 0."""
     no_hashes = []
     self.assertEqual(0, simhash.compute(no_hashes))
Example #26
0
 def compute(self, text):
     """Return the simhash of ``text`` built from 4-token shingles.

     Tokens are obtained by splitting on runs of non-word characters; each
     window of 4 tokens is joined with single spaces and hashed with
     ``simhash.unsigned_hash``.
     """
     words = re.split(r'\W+', text, flags=re.UNICODE)
     windows = simhash.shingle(words, 4)
     joined = (' '.join(window) for window in windows)
     return simhash.compute(map(simhash.unsigned_hash, joined))