def test_jaccard(self):
    """Check minhash.jaccard on two MinHash objects as their contents diverge.

    Three states are exercised: both objects untouched, only one object
    having digested a value, and each object having digested a value
    built from a different FakeHash argument.
    """
    # Both objects are built with the same constructor arguments.
    # NOTE(review): presumably (num_perm, seed) - confirm against MinHash.
    m1 = minhash.MinHash(4, 1)
    m2 = minhash.MinHash(4, 1)
    # Nothing digested yet: the two objects are expected to match exactly.
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the check fails.
    self.assertEqual(minhash.jaccard([m1, m2]), 1.0)
    # Only m2 has digested a value.
    m2.digest(FakeHash(12))
    self.assertEqual(minhash.jaccard([m1, m2]), 0.0)
    # m1 digests a value built from a different argument than m2 did.
    m1.digest(FakeHash(13))
    self.assertLess(minhash.jaccard([m1, m2]), 1.0)
def test_jaccard(self):
    """Check minhash.jaccard on two MinHash objects as their contents diverge.

    Three states are exercised: both objects untouched, only one object
    having digested a value, and each object having digested the sha1 of
    a different bytes value.
    """
    # Both objects are built with the same constructor arguments.
    # NOTE(review): presumably (num_perm, seed) - confirm against MinHash.
    m1 = minhash.MinHash(4, 1)
    m2 = minhash.MinHash(4, 1)
    # Nothing digested yet: the two objects are expected to match exactly.
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the check fails.
    self.assertEqual(minhash.jaccard([m1, m2]), 1.0)
    # Only m2 has digested a value.
    m2.digest(sha1(bytes(12)))
    self.assertEqual(minhash.jaccard([m1, m2]), 0.0)
    # m1 digests the hash of a different bytes value than m2 did.
    m1.digest(sha1(bytes(13)))
    self.assertLess(minhash.jaccard([m1, m2]), 1.0)
def run_acc(size, num_perm):
    """Return the absolute error of the estimated Jaccard similarity.

    ``_run_acc`` is called twice with the same ``size`` and ``num_perm``
    (second argument 1, then 4). Each call yields a pair whose first
    element is handed to ``jaccard`` and whose second element is used as
    a set to compute the exact similarity.

    Raises ZeroDivisionError when the union of the two sets is empty.
    """
    # Let logging do the %-formatting so it is skipped when INFO is off.
    logging.info("MinHash using %d permutation functions", num_perm)
    m1, s1 = _run_acc(size, 1, num_perm)
    m2, s2 = _run_acc(size, 4, num_perm)
    # Exact Jaccard: |intersection| / |union|; float() forces true division.
    j = float(len(s1.intersection(s2))) / float(len(s1.union(s2)))
    # jaccard is called with a single list everywhere else in this file.
    j_e = jaccard([m1, m2])
    return abs(j - j_e)
def run_acc(size, num_perm):
    """Measure how far the jaccard() estimate is from the exact value.

    Two runs of ``_run_acc`` (second argument 1 and 4, same ``size`` and
    ``num_perm``) each return a pair; the first elements go to
    ``jaccard`` and the second elements are treated as sets for the
    exact computation. The absolute difference is returned.
    """
    logging.info("MinHash using %d permutation functions" % num_perm)
    sketch_a, items_a = _run_acc(size, 1, num_perm)
    sketch_b, items_b = _run_acc(size, 4, num_perm)
    # Exact similarity is |intersection| / |union|, computed in floats.
    shared = float(len(items_a.intersection(items_b)))
    combined = float(len(items_a.union(items_b)))
    exact = shared / combined
    estimate = jaccard([sketch_a, sketch_b])
    return abs(exact - estimate)
def _run_minhash(A, B, data, seed, num_perm):
    """Return jaccard() of two MinHash objects fed from index ranges of data.

    ``A`` and ``B`` are each a (start, end) pair. For every index in the
    half-open range, ``data[index]`` is hashed with pyhash's murmur3_32
    using ``seed``, wrapped in ``Hash`` and digested: range A goes into
    the first MinHash, range B into the second. Both MinHash objects are
    created with ``num_perm``.
    """
    a_lo, a_hi = A
    b_lo, b_hi = B
    hash_fn = pyhash.murmur3_32()
    first = MinHash(num_perm=num_perm)
    second = MinHash(num_perm=num_perm)
    # Feed each sketch from its own index range, first A and then B.
    for sketch, lo, hi in ((first, a_lo, a_hi), (second, b_lo, b_hi)):
        for idx in xrange(lo, hi):
            sketch.digest(Hash(hash_fn(data[idx], seed=seed)))
    return jaccard([first, second])
def eg1():
    """Print the estimated and the exact Jaccard similarity of data1 and data2.

    Every item of the module-level ``data1`` / ``data2`` is UTF-8 encoded,
    hashed with sha1 and digested into its own default-constructed
    MinHash. The estimate comes from ``jaccard``; the exact value is
    computed from the two sets of items.
    """
    m1 = MinHash()
    m2 = MinHash()
    for d in data1:
        m1.digest(sha1(d.encode('utf8')))
    for d in data2:
        m2.digest(sha1(d.encode('utf8')))
    # jaccard is called with a single list everywhere else in this file.
    print("Estimated Jaccard for data1 and data2 is", jaccard([m1, m2]))
    s1 = set(data1)
    s2 = set(data2)
    # Exact Jaccard: |intersection| / |union|; float() forces true division.
    actual_jaccard = (float(len(s1.intersection(s2))) /
                      float(len(s1.union(s2))))
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def eg1():
    """Show the jaccard() estimate next to the exact Jaccard of data1/data2.

    Each item of the module-level ``data1`` and ``data2`` is encoded as
    UTF-8, hashed with sha1 and digested into a separate MinHash. Both
    the estimate and the set-based exact value are printed.
    """
    sketch1 = MinHash()
    sketch2 = MinHash()
    for sketch, items in ((sketch1, data1), (sketch2, data2)):
        for item in items:
            sketch.digest(sha1(item.encode('utf8')))
    print("Estimated Jaccard for data1 and data2 is",
          jaccard([sketch1, sketch2]))
    unique1 = set(data1)
    unique2 = set(data2)
    # Exact value is |intersection| / |union|, computed in floats.
    overlap = float(len(unique1.intersection(unique2)))
    combined = float(len(unique1.union(unique2)))
    actual_jaccard = overlap / combined
    print("Actual Jaccard for data1 and data2 is", actual_jaccard)
def _minhash_inclusion(m1, m2):
    """Return j / (j + 1) * (1 + c2 / c1) for two MinHash-like objects.

    ``j`` is ``jaccard([m1, m2])`` and ``c1`` / ``c2`` are ``m1.count()``
    and ``m2.count()``. Assuming count() returns the (estimated) set
    cardinality, this expression equals |A & B| / |A|, i.e. the fraction
    of m1's set that is contained in m2's set - TODO confirm.

    Raises ZeroDivisionError when ``m1.count()`` is zero.
    """
    c1 = m1.count()
    c2 = m2.count()
    # jaccard is called with a single list everywhere else in this file.
    j = jaccard([m1, m2])
    return (j / (j + 1.0)) * (1.0 + float(c2) / float(c1))
def test_union(self):
    """Check that the union of an untouched MinHash with m2 matches m2."""
    # Both objects are built with the same constructor arguments.
    # NOTE(review): presumably (num_perm, seed) - confirm against MinHash.
    m1 = minhash.MinHash(4, 1)
    m2 = minhash.MinHash(4, 1)
    # Only m2 digests a value; m1 stays untouched.
    m2.digest(FakeHash(12))
    u = minhash.MinHash.union(m1, m2)
    # minhash.jaccard takes a single list in the other tests of this file;
    # assertEqual reports both values when the check fails.
    self.assertEqual(minhash.jaccard([u, m2]), 1.0)
def test_merge(self):
    """Check that merging m2 into an untouched MinHash makes it match m2."""
    # Both objects are built with the same constructor arguments.
    # NOTE(review): presumably (num_perm, seed) - confirm against MinHash.
    m1 = minhash.MinHash(4, 1)
    m2 = minhash.MinHash(4, 1)
    # Only m2 digests a value before the merge.
    m2.digest(FakeHash(12))
    m1.merge(m2)
    # minhash.jaccard takes a single list in the other tests of this file;
    # assertEqual reports both values when the check fails.
    self.assertEqual(minhash.jaccard([m1, m2]), 1.0)
def test_merge(self):
    """Check that merging m2 into an untouched MinHash makes it match m2."""
    # Both objects are built with the same constructor arguments.
    # NOTE(review): presumably (num_perm, seed) - confirm against MinHash.
    m1 = minhash.MinHash(4, 1)
    m2 = minhash.MinHash(4, 1)
    # Only m2 digests a value before the merge.
    m2.digest(sha1(bytes(12)))
    m1.merge(m2)
    # assertEqual (rather than assertTrue(a == b)) reports both values
    # when the check fails.
    self.assertEqual(minhash.jaccard([m1, m2]), 1.0)