def _run_minhash(data, seed, p):
    """Sketch ``data`` with a MinHash and return the sketch's ``count()``.

    Every item is hashed with a ``pyhash.murmur3_32`` hasher using ``seed``,
    wrapped in ``Hash``, and passed to ``MinHash.digest``.

    Args:
        data: Iterable of items to feed into the sketch.
        seed: Seed forwarded to each murmur3 hasher call.
        p: Exponent; the sketch is built with ``2 ** p`` permutations.

    Returns:
        The value of ``MinHash.count()`` once all items are consumed --
        presumably the estimated number of distinct items (confirm against
        the library version in use).
    """
    murmur = pyhash.murmur3_32()
    sketch = MinHash(num_perm=2 ** p)
    for item in data:
        hashed = murmur(item, seed=seed)
        # NOTE(review): Hash is defined outside this block; it appears to
        # adapt the raw hash value to what digest() expects -- confirm.
        sketch.digest(Hash(hashed))
    return sketch.count()
def _run_minhash(data, seed, p):
    """Sketch ``data`` with a MinHash and return the sketch's ``count()``.

    The sketch is created with ``Hash`` as its ``hashobj``. Every item is
    hashed with a ``pyhash.murmur3_32`` hasher using ``seed`` and the result
    is passed to ``MinHash.update``.

    Args:
        data: Iterable of items to feed into the sketch.
        seed: Seed forwarded to each murmur3 hasher call.
        p: Exponent; the sketch is built with ``2 ** p`` permutations.

    Returns:
        The value of ``MinHash.count()`` once all items are consumed --
        presumably the estimated number of distinct items (confirm against
        the library version in use).
    """
    murmur = pyhash.murmur3_32()
    # NOTE(review): Hash is defined outside this block; it is handed to the
    # sketch as its hash object -- confirm the interface it must satisfy.
    sketch = MinHash(num_perm=2 ** p, hashobj=Hash)
    for item in data:
        hashed = murmur(item, seed=seed)
        sketch.update(hashed)
    return sketch.count()
'estimating', 'the', 'similarity', 'between', 'documents'] m1, m2 = MinHash(), MinHash() for d in data1: m1.update(d.encode('utf8')) for d in data2: m2.update(d.encode('utf8')) print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2)) s1 = set(data1) s2 = set(data2) actual_jaccard = float(len(s1.intersection(s2)))/float(len(s1.union(s2))) print("Actual Jaccard for data1 and data2 is", actual_jaccard) >>> >>> m = MinHash(num_perm=256) >>> m.count() 0.0 >>> from sparselsh import LSH from scipy.sparse import csr_matrix X = csr_matrix( [ [ 3, 0, 0, 0, 0, 0, -1], [ 0, 1, 0, 0, 0, 0, 1], [ 1, 1, 1, 1, 1, 1, 1] ]) # One class number for each input point y = [ 0, 3, 10] X_sim = csr_matrix( [ [ 1, 1, 1, 1, 1, 1, 0]]) lsh = LSH( 4,