Example #1
0
    def test_theta_set_operations(self):
        """Check union, intersection and A-NOT-B results on two theta sketches.

        Two sketches are built with ``self.generate_theta_sketch``; the second
        is generated with an offset of ``3n/4`` so that, per the setup below,
        one quarter of the values are shared. Each set operation must return a
        ``compact_theta_sketch`` whose one-standard-deviation bounds bracket
        the expected cardinality.
        """
        k = 12       # 2^k = 4096 rows in the table
        n = 1 << 18  # ~256k unique values

        # we'll have 1/4 of the values overlap; floor division keeps the
        # offset an int without going through a float
        offset = 3 * n // 4

        # create a couple sketches and inject some values
        sk1 = self.generate_theta_sketch(n, k)
        sk2 = self.generate_theta_sketch(n, k, offset)

        # UNIONS
        # create a union object
        union = theta_union(k)
        union.update(sk1)
        union.update(sk2)

        # getting result from union returns a compact_theta_sketch
        # compact theta sketches can be used in additional unions
        # or set operations but cannot accept further item updates
        union_result = union.get_result()
        self.assertIsInstance(union_result, compact_theta_sketch)

        # since our process here is deterministic, we have
        # checked and know the exact answer is within one
        # standard deviation of the estimate
        expected_union = 7 * n / 4
        self.assertLessEqual(union_result.get_lower_bound(1), expected_union)
        self.assertGreaterEqual(union_result.get_upper_bound(1), expected_union)

        # INTERSECTIONS
        # create an intersection object
        intersect = theta_intersection()  # no lg_k
        intersect.update(sk1)
        intersect.update(sk2)

        # has_result() indicates the intersection has been used,
        # although the result may be the empty set
        self.assertTrue(intersect.has_result())

        # as with unions, the result is a compact sketch
        intersection_result = intersect.get_result()
        self.assertIsInstance(intersection_result, compact_theta_sketch)

        # we know the sets overlap by 1/4
        expected_overlap = n / 4
        self.assertLessEqual(
            intersection_result.get_lower_bound(1), expected_overlap)
        self.assertGreaterEqual(
            intersection_result.get_upper_bound(1), expected_overlap)

        # A NOT B
        # create an a_not_b object
        anb = theta_a_not_b()  # no lg_k
        a_not_b_result = anb.compute(sk1, sk2)

        # as with unions, the result is a compact sketch
        self.assertIsInstance(a_not_b_result, compact_theta_sketch)

        # we know the sets overlap by 1/4, so the remainder is 3/4
        expected_remainder = 3 * n / 4
        self.assertLessEqual(
            a_not_b_result.get_lower_bound(1), expected_remainder)
        self.assertGreaterEqual(
            a_not_b_result.get_upper_bound(1), expected_remainder)
Example #2
0
    def __init__(self, theta_sketch=None, union=None, compact_theta=None):
        """
        Set up the update sketch and the union backing this object

        Parameters
        ----------
        theta_sketch : optional
            Sketch stored as ``self.theta_sketch``. When omitted, a new
            ``datasketches.update_theta_sketch()`` is created.
        union : optional
            Union to start from. It is not stored directly: a copy made
            with ``_copy_union`` is kept instead. When omitted, a new
            ``datasketches.theta_union()`` is created.
        compact_theta : optional
            When given, it is fed into the stored union via ``update``.
        """
        sketch = (
            datasketches.update_theta_sketch()
            if theta_sketch is None
            else theta_sketch
        )

        # Work on a private union so the caller's object is left untouched.
        own_union = (
            datasketches.theta_union() if union is None else _copy_union(union)
        )
        if compact_theta is not None:
            own_union.update(compact_theta)

        self.theta_sketch = sketch
        self.union = own_union
Example #3
0
    def get_result(self):
        """
        Combine the stored union and update sketch into one compact sketch

        A temporary union is used, so neither ``self.union`` nor
        ``self.theta_sketch`` is updated by this call.

        Returns
        -------
        compact_sketch : datasketches.compact_theta_sketch
            Read-only compact theta sketch with full statistics.
        """
        combined = datasketches.theta_union()

        # First the result of the stored union, then the update sketch.
        union_snapshot = self.union.get_result()
        combined.update(union_snapshot)
        combined.update(self.theta_sketch)

        return combined.get_result()
Example #4
0
    def merge(self, other):
        """
        Combine this `ThetaSketch` with another one into a new object

        Parameters
        ----------
        other : ThetaSketch
            Sketch whose result is merged with this one's

        Returns
        -------
        new : ThetaSketch
            New theta sketch built from the union of both results
        """
        merged = datasketches.theta_union()
        # Feed both operands' results into the new union, self first.
        for operand in (self, other):
            merged.update(operand.get_result())
        return ThetaSketch(union=merged)
Example #5
0
def _copy_union(union):
    """
    Build a new theta union seeded with the current result of `union`

    Parameters
    ----------
    union : datasketches.theta_union
        Union to copy from; only its ``get_result()`` is read.

    Returns
    -------
    duplicate : datasketches.theta_union
        New union, constructed with default arguments, that has been
        updated with the result of `union`.
    """
    duplicate = datasketches.theta_union()
    snapshot = union.get_result()
    duplicate.update(snapshot)
    return duplicate
Example #6
0
    def test_theta_set_operations(self):
        """Check theta set operations and Jaccard similarity on two sketches.

        Two sketches are built with ``self.generate_theta_sketch``; the second
        is generated with an offset of ``3n/4`` so that, per the setup below,
        one quarter of the values are shared. Union, intersection and A-NOT-B
        must each return a ``compact_theta_sketch`` whose one-standard-deviation
        bounds bracket the expected cardinality. The Jaccard helpers are then
        checked against the same sketches.
        """
        k = 12  # 2^k = 4096 rows in the table
        n = 1 << 18  # ~256k unique values

        # we'll have 1/4 of the values overlap; floor division keeps the
        # offset an int without going through a float
        offset = 3 * n // 4

        # create a couple sketches and inject some values
        sk1 = self.generate_theta_sketch(n, k)
        sk2 = self.generate_theta_sketch(n, k, offset)

        # UNIONS
        # create a union object
        union = theta_union(k)
        union.update(sk1)
        union.update(sk2)

        # getting result from union returns a compact_theta_sketch
        # compact theta sketches can be used in additional unions
        # or set operations but cannot accept further item updates
        union_result = union.get_result()
        self.assertIsInstance(union_result, compact_theta_sketch)

        # since our process here is deterministic, we have
        # checked and know the exact answer is within one
        # standard deviation of the estimate
        expected_union = 7 * n / 4
        self.assertLessEqual(union_result.get_lower_bound(1), expected_union)
        self.assertGreaterEqual(union_result.get_upper_bound(1), expected_union)

        # INTERSECTIONS
        # create an intersection object
        intersect = theta_intersection()  # no lg_k
        intersect.update(sk1)
        intersect.update(sk2)

        # has_result() indicates the intersection has been used,
        # although the result may be the empty set
        self.assertTrue(intersect.has_result())

        # as with unions, the result is a compact sketch
        intersection_result = intersect.get_result()
        self.assertIsInstance(intersection_result, compact_theta_sketch)

        # we know the sets overlap by 1/4
        expected_overlap = n / 4
        self.assertLessEqual(
            intersection_result.get_lower_bound(1), expected_overlap)
        self.assertGreaterEqual(
            intersection_result.get_upper_bound(1), expected_overlap)

        # A NOT B
        # create an a_not_b object
        anb = theta_a_not_b()  # no lg_k
        a_not_b_result = anb.compute(sk1, sk2)

        # as with unions, the result is a compact sketch
        self.assertIsInstance(a_not_b_result, compact_theta_sketch)

        # we know the sets overlap by 1/4, so the remainder is 3/4
        expected_remainder = 3 * n / 4
        self.assertLessEqual(
            a_not_b_result.get_lower_bound(1), expected_remainder)
        self.assertGreaterEqual(
            a_not_b_result.get_upper_bound(1), expected_remainder)

        # JACCARD SIMILARITY
        # Jaccard Similarity measure returns (lower_bound, estimate, upper_bound)
        jac_lower, jac_estimate, jac_upper = (
            theta_jaccard_similarity.jaccard(sk1, sk2)[:3])

        # we can check that results are in the expected order
        self.assertLess(jac_lower, jac_estimate)
        self.assertLess(jac_estimate, jac_upper)

        # checks for sketch equivalency
        self.assertTrue(theta_jaccard_similarity.exactly_equal(sk1, sk1))
        self.assertFalse(theta_jaccard_similarity.exactly_equal(sk1, sk2))

        # we can apply a check for similarity or dissimilarity at a
        # given threshold, at 97.7% confidence.

        # check that the Jaccard Index is at most (upper bound) 0.2.
        # exact result would be 1/7
        self.assertTrue(
            theta_jaccard_similarity.dissimilarity_test(sk1, sk2, 0.2))

        # check that the Jaccard Index is at least (lower bound) 0.7
        # exact result would be 3/4; sk1 is compared against the
        # A NOT B result computed above
        self.assertTrue(
            theta_jaccard_similarity.similarity_test(sk1, a_not_b_result, 0.7))