def test_theta_set_operations(self):
    """Check theta union, intersection and A-not-B on two overlapping sketches.

    Two sketches of ``n`` values each are built with
    ``self.generate_theta_sketch``; the second one is given an offset of
    ``3n/4`` so that a quarter of the values overlap. Every set operation
    must hand back a ``compact_theta_sketch`` whose bounds at one standard
    deviation bracket the exact cardinality.
    """
    k = 12  # 2^k = 4096 rows in the table
    n = 1 << 18  # ~256k unique values

    # we'll have 1/4 of the values overlap
    offset = 3 * n // 4  # floor division keeps this an exact int

    # create a couple sketches and inject some values
    sk1 = self.generate_theta_sketch(n, k)
    sk2 = self.generate_theta_sketch(n, k, offset)

    # UNIONS
    # create a union object
    union = theta_union(k)
    union.update(sk1)
    union.update(sk2)

    # getting result from union returns a compact_theta_sketch
    # compact theta sketches can be used in additional unions
    # or set operations but cannot accept further item updates
    union_result = union.get_result()
    self.assertIsInstance(union_result, compact_theta_sketch)

    # since our process here is deterministic, we have
    # checked and know the exact answer is within one
    # standard deviation of the estimate
    expected_union = 7 * n / 4
    self.assertLessEqual(union_result.get_lower_bound(1), expected_union)
    self.assertGreaterEqual(union_result.get_upper_bound(1), expected_union)

    # INTERSECTIONS
    # create an intersection object
    intersect = theta_intersection()  # no lg_k
    intersect.update(sk1)
    intersect.update(sk2)

    # has_result() indicates the intersection has been used,
    # although the result may be the empty set
    self.assertTrue(intersect.has_result())

    # as with unions, the result is a compact sketch
    intersect_result = intersect.get_result()
    self.assertIsInstance(intersect_result, compact_theta_sketch)

    # we know the sets overlap by 1/4
    expected_overlap = n / 4
    self.assertLessEqual(intersect_result.get_lower_bound(1), expected_overlap)
    self.assertGreaterEqual(intersect_result.get_upper_bound(1), expected_overlap)

    # A NOT B
    # create an a_not_b object
    anb = theta_a_not_b()  # no lg_k
    anb_result = anb.compute(sk1, sk2)

    # as with unions, the result is a compact sketch
    self.assertIsInstance(anb_result, compact_theta_sketch)

    # we know the sets overlap by 1/4, so the remainder is 3/4
    expected_remainder = 3 * n / 4
    self.assertLessEqual(anb_result.get_lower_bound(1), expected_remainder)
    self.assertGreaterEqual(anb_result.get_upper_bound(1), expected_remainder)
def __init__(self, theta_sketch=None, union=None, compact_theta=None):
    """
    Set up the updatable sketch and the union held by this object

    Parameters
    ----------
    theta_sketch : optional
        Sketch stored as ``self.theta_sketch``. When omitted, a new
        ``datasketches.update_theta_sketch()`` is created.
    union : optional
        Union to start from. When given, a copy made with ``_copy_union``
        is stored instead of the argument itself; when omitted, a new
        ``datasketches.theta_union()`` is created.
    compact_theta : optional
        When given, it is passed to ``update`` on the stored union.
    """
    sketch = theta_sketch
    if sketch is None:
        sketch = datasketches.update_theta_sketch()

    # Never keep a reference to the caller's union: start from a fresh
    # one or from a copy.
    if union is not None:
        own_union = _copy_union(union)
    else:
        own_union = datasketches.theta_union()

    if compact_theta is not None:
        own_union.update(compact_theta)

    self.theta_sketch = sketch
    self.union = own_union
def get_result(self):
    """
    Combine the stored union and the updatable sketch into one sketch

    A temporary union is used, so neither ``self.union`` nor
    ``self.theta_sketch`` is updated by this call.

    Returns
    -------
    compact_sketch : datasketches.compact_theta_sketch
        Read-only compact theta sketch with full statistics.
    """
    combined = datasketches.theta_union()
    for part in (self.union.get_result(), self.theta_sketch):
        combined.update(part)
    return combined.get_result()
def merge(self, other):
    """
    Combine this `ThetaSketch` with another one into a new object

    Parameters
    ----------
    other : ThetaSketch
        Sketch whose result is merged with this one's

    Returns
    -------
    new : ThetaSketch
        New theta sketch built from the union of both results
    """
    merged = datasketches.theta_union()
    # Fold in this object's result first, then the other's.
    for sketch in (self, other):
        merged.update(sketch.get_result())
    return ThetaSketch(union=merged)
def _copy_union(union):
    """Return a new ``datasketches.theta_union`` updated with ``union``'s result."""
    duplicate = datasketches.theta_union()
    snapshot = union.get_result()
    duplicate.update(snapshot)
    return duplicate
def test_theta_set_operations(self):
    """Check theta set operations and Jaccard similarity on two sketches.

    Two sketches of ``n`` values each are built with
    ``self.generate_theta_sketch``; the second one is given an offset of
    ``3n/4`` so that a quarter of the values overlap. Union, intersection
    and A-not-B must each hand back a ``compact_theta_sketch`` whose bounds
    at one standard deviation bracket the exact cardinality. The Jaccard
    helpers are then checked against the same sketches and the A-not-B
    result.
    """
    k = 12  # 2^k = 4096 rows in the table
    n = 1 << 18  # ~256k unique values

    # we'll have 1/4 of the values overlap
    offset = 3 * n // 4  # floor division keeps this an exact int

    # create a couple sketches and inject some values
    sk1 = self.generate_theta_sketch(n, k)
    sk2 = self.generate_theta_sketch(n, k, offset)

    # UNIONS
    # create a union object
    union = theta_union(k)
    union.update(sk1)
    union.update(sk2)

    # getting result from union returns a compact_theta_sketch
    # compact theta sketches can be used in additional unions
    # or set operations but cannot accept further item updates
    union_result = union.get_result()
    self.assertIsInstance(union_result, compact_theta_sketch)

    # since our process here is deterministic, we have
    # checked and know the exact answer is within one
    # standard deviation of the estimate
    expected_union = 7 * n / 4
    self.assertLessEqual(union_result.get_lower_bound(1), expected_union)
    self.assertGreaterEqual(union_result.get_upper_bound(1), expected_union)

    # INTERSECTIONS
    # create an intersection object
    intersect = theta_intersection()  # no lg_k
    intersect.update(sk1)
    intersect.update(sk2)

    # has_result() indicates the intersection has been used,
    # although the result may be the empty set
    self.assertTrue(intersect.has_result())

    # as with unions, the result is a compact sketch
    intersect_result = intersect.get_result()
    self.assertIsInstance(intersect_result, compact_theta_sketch)

    # we know the sets overlap by 1/4
    expected_overlap = n / 4
    self.assertLessEqual(intersect_result.get_lower_bound(1), expected_overlap)
    self.assertGreaterEqual(intersect_result.get_upper_bound(1), expected_overlap)

    # A NOT B
    # create an a_not_b object
    anb = theta_a_not_b()  # no lg_k
    anb_result = anb.compute(sk1, sk2)

    # as with unions, the result is a compact sketch
    self.assertIsInstance(anb_result, compact_theta_sketch)

    # we know the sets overlap by 1/4, so the remainder is 3/4
    expected_remainder = 3 * n / 4
    self.assertLessEqual(anb_result.get_lower_bound(1), expected_remainder)
    self.assertGreaterEqual(anb_result.get_upper_bound(1), expected_remainder)

    # JACCARD SIMILARITY
    # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
    jac = theta_jaccard_similarity.jaccard(sk1, sk2)

    # we can check that results are in the expected order
    self.assertLess(jac[0], jac[1])
    self.assertLess(jac[1], jac[2])

    # checks for sketch equivalency
    self.assertTrue(theta_jaccard_similarity.exactly_equal(sk1, sk1))
    self.assertFalse(theta_jaccard_similarity.exactly_equal(sk1, sk2))

    # we can apply a check for similarity or dissimilarity at a
    # given threshold, at 97.7% confidence.

    # check that the Jaccard Index is at most (upper bound) 0.2.
    # exact result would be 1/7
    self.assertTrue(
        theta_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))

    # check that the Jaccard Index is at least (lower bound) 0.7
    # exact result would be 3/4, using result from A NOT B test
    self.assertTrue(
        theta_jaccard_similarity.similarity_test(sk1, anb_result, 0.7))