Esempio n. 1
0
    def test_sample_noisy_large(self):
        # Two compatible sketches receive overlapping id ranges (0..299999 and
        # 200000..399999), are noised with Noiser(0.25) and Noiser(0.1), and
        # are then run through the Sampler. The sampled sketches must report
        # about 300k and 200k ids, and the merge of the original (un-noised)
        # sketches about 400k, each within 5%.
        first = liquid_legions.LiquidLegions(2.0, 100000, random_seed=42)
        second = first.get_compatible_sketch()

        first.add_ids(list(range(300000)))
        second.add_ids(list(range(200000, 400000)))

        noised_first = liquid_legions.Noiser(0.25)(first)
        noised_second = liquid_legions.Noiser(0.1)(second)
        sampler = liquid_legions.Sampler([noised_first, noised_second])

        # The sampled matrix is only logged; nothing is asserted on it.
        column_sums = sampler.sample_matrix().sum(axis=0)
        logging.info('Sampled matrix: %s', column_sums)

        sampled_first, sampled_second = sampler.sample()

        logging.info('s1 legionaries: %d', sampled_first.legionaries_count())
        logging.info('s1 cardinality: %.3f', sampled_first.get_cardinality())
        logging.info('s2 cardinality: %.3f', sampled_second.get_cardinality())

        # Note: the union is built from the original sketches, not from the
        # sampled ones.
        merged = liquid_legions.LiquidLegions.merge_of([first, second])
        logging.info('s1|s2 cardinality: %.3f', merged.get_cardinality())

        expectations = (
            (sampled_first, 300000, 15000),
            (sampled_second, 200000, 10000),
            (merged, 400000, 20000),
        )
        for sketch, truth, tolerance in expectations:
            self.assertAlmostEqual(sketch.get_cardinality(),
                                   truth,
                                   delta=tolerance)
Esempio n. 2
0
    def test_posteriors_noisy(self):
        # Builds two overlapping sketches (ids 0..29999 and 20000..39999),
        # noises them with Noiser(0.3) and Noiser(0.2), and derives
        # cardinalities from the column sums of the Sampler's posteriors.
        # Expected: about 30k, 20k and 40k (union), each within 5%.
        first = liquid_legions.LiquidLegions(2.0, 100000, random_seed=42)
        second = first.get_compatible_sketch()

        first.add_ids(list(range(30000)))
        second.add_ids(list(range(20000, 40000)))

        first = liquid_legions.Noiser(0.3)(first)
        second = liquid_legions.Noiser(0.2)(second)

        venn = liquid_legions.VennEstimator([first, second])
        logging.info('Venn: %s', venn.estimate_from_all())

        sampler = liquid_legions.Sampler([first, second])
        posteriors = sampler.get_all_posteriors()
        logging.info('Row sums: %s', posteriors.sum(axis=1))

        counts = posteriors.sum(axis=0)
        logging.info('Bit count expectations: %s', counts)

        # Judging by how the columns are combined below, column 1 counts
        # positions set only in s1, column 2 only in s2, and column 3 in both
        # -- TODO confirm against Sampler.
        only_first, only_second, both = counts[1], counts[2], counts[3]

        first_size = sampler.sketch.get_cardinality_for_legionaries_count(
            only_first + both)
        logging.info('s1 cardinality: %.3f', first_size)

        second_size = sampler.sketch.get_cardinality_for_legionaries_count(
            only_second + both)
        logging.info('s2 cardinality: %.3f', second_size)

        union_size = sampler.sketch.get_cardinality_for_legionaries_count(
            only_first + only_second + both)
        logging.info('s1 | s2 cardinality: %.3f', union_size)

        for estimate, truth, tolerance in ((first_size, 30000, 1500),
                                           (second_size, 20000, 1000),
                                           (union_size, 40000, 2000)):
            self.assertAlmostEqual(estimate, truth, delta=tolerance)
Esempio n. 3
0
    def test_sample_matrix_noisy(self):
        # Noises two overlapping sketches (ids 0..29999 and 20000..39999) with
        # one shared Noiser(0.25), draws a sample matrix from the Sampler and
        # turns its first two column sums into cardinalities. Expected: about
        # 30k and 20k, each within 5%.
        first = liquid_legions.LiquidLegions(2.0, 100000, random_seed=42)
        second = first.get_compatible_sketch()

        first.add_ids(list(range(30000)))
        second.add_ids(list(range(20000, 40000)))

        flip_noiser = liquid_legions.Noiser(0.25)
        first, second = [flip_noiser(sketch) for sketch in (first, second)]

        venn = liquid_legions.VennEstimator([first, second])
        logging.info('Venn: %s', venn.estimate_from_all())

        sampler = liquid_legions.Sampler([first, second])
        # Posteriors are computed before sampling and only logged here.
        row_sums = sampler.get_all_posteriors().sum(axis=1)
        logging.info('Row sums: %s', row_sums)

        drawn = sampler.sample_matrix()
        logging.info('Sample shape: %s', drawn.shape)
        column_sums = drawn.sum(axis=0)
        logging.info('Bit count expectations: %s', column_sums)

        to_cardinality = sampler.sketch.get_cardinality_for_legionaries_count
        first_size = to_cardinality(column_sums[0])
        second_size = to_cardinality(column_sums[1])
        logging.info('s1 cardinality: %s', first_size)
        logging.info('s2 cardinality: %s', second_size)

        self.assertAlmostEqual(first_size, 30000, delta=1500)
        self.assertAlmostEqual(second_size, 20000, delta=1000)
Esempio n. 4
0
 def test_venn_estimator_noised(self):
     # Noises every sketch of a chain built by self.make_chain(2, 5, 100000,
     # 50000) with Noiser(0.25) and checks that entries 1, 2 and 3 of the
     # Venn estimate are each about 50000 (tolerance 6000).
     chain = self.make_chain(2, 5, 100000, 50000)
     dp_p = 0.25
     noiser = liquid_legions.Noiser(dp_p)
     noised_chain = list(map(noiser, chain))
     e = liquid_legions.VennEstimator(noised_chain)
     # Compute the estimate once and log it *before* asserting, so the
     # diagnostic line is still emitted when one of the assertions fails.
     estimate = e.estimate_from_all()
     logging.info('Venn: %s', estimate)
     for region in (1, 2, 3):
         self.assertAlmostEqual(estimate[region], 50000, delta=6000)
Esempio n. 5
0
 def test_noisy_union(self):
     # Estimates the union of four noised sketches from self.make_chain and
     # requires the result to be within 10% of (num_sketches + 1) * halfsize.
     flip_probability = 0.25
     noiser = liquid_legions.Noiser(flip_probability)
     sketch_count = 4
     half = 100000
     clean_sketches = self.make_chain(sketch_count, 3, 100000, half)
     noised_sketches = [noiser(sketch) for sketch in clean_sketches]
     # The estimator is configured with the same value passed to the Noiser.
     union_estimator = liquid_legions.Estimator(flip_probability)
     estimated = union_estimator(noised_sketches)
     expected = (sketch_count + 1) * half
     relative_error = estimated / expected - 1
     logging.info('Estimate: %.3f, Truth: %.3f, RelError: %.3f.',
                  estimated, expected, relative_error)
     self.assertAlmostEqual(estimated, expected, delta=expected * 0.1)
Esempio n. 6
0
 def test_sequential_estimator(self):
     # Feeds ten noised sketches, each built from 10000 ids drawn with
     # numpy.random.choice out of 0..999999, to a SequentialEstimator and
     # requires the final estimate to be within 25% of the number of
     # distinct ids drawn.
     # NOTE(review): no seed is set inside this test, so the drawn ids
     # presumably differ between runs unless seeding happens elsewhere.
     noiser = liquid_legions.Noiser(0.25)
     estimator = liquid_legions.SequentialEstimator()
     seen_ids = set()
     universe = list(range(1000000))
     noised_sketches = []
     for step in range(10):
         drawn_ids = numpy.random.choice(universe, size=10000)
         fresh_sketch = liquid_legions.LiquidLegions(10,
                                                     10000,
                                                     random_seed=42)
         fresh_sketch.add_ids(drawn_ids)
         seen_ids |= set(drawn_ids)
         noised_sketches.append(noiser(fresh_sketch))
         # The running estimate is recomputed over all sketches so far.
         logging.info('Step %d, cardinality %.3f, true_cardinality %d.',
                      step, estimator(noised_sketches), len(seen_ids))
     distinct_total = len(seen_ids)
     self.assertAlmostEqual(distinct_total,
                            estimator(noised_sketches),
                            delta=distinct_total * 0.25)
Esempio n. 7
0
    def test_distribution_given_observation(self):
        # Builds a Sampler over two tiny noised sketches (Noiser(0.01)) and
        # checks two entries of its transition matrix: [0, 0] is about 0.98
        # and [0, 1] about 0.0099. The distribution for observation 1 is only
        # logged.
        small = liquid_legions.LiquidLegions(20.0, 10, random_seed=42)
        large = small.get_compatible_sketch()

        small.add_ids(list(range(10)))
        large.add_ids(list(range(10000)))

        light_noiser = liquid_legions.Noiser(0.01)
        small, large = [light_noiser(sketch) for sketch in (small, large)]

        sampler = liquid_legions.Sampler([small, large])
        logging.info('Transition matrix: %s', sampler.transition_matrix)
        observed_distribution = sampler.distribution_given_observation(1)
        logging.info('Distribution: %s', observed_distribution)

        # Presumably 0.99 * 0.99 and 0.99 * 0.01 for a flip probability of
        # 0.01 -- TODO confirm against Sampler.
        for column, expected, tolerance in ((0, 0.98, 0.001),
                                            (1, 0.0099, 0.0001)):
            self.assertAlmostEqual(sampler.transition_matrix[0, column],
                                   expected,
                                   delta=tolerance)
Esempio n. 8
0
    def test_manual_sequential_merge_large_overlap_pure(self):
        # Merges ten heavily overlapping sketches one by one into an
        # accumulator, sampling each against the accumulator first. With
        # Noiser(0.0) the final cardinality must be within 10% of the true
        # number of distinct ids.
        passthrough = liquid_legions.Noiser(0.0)  # No noise.

        accumulator = liquid_legions.LiquidLegions(10.0,
                                                   50000,
                                                   random_seed=42)

        seen_ids = set()
        for step in range(10):
            incoming = accumulator.get_compatible_sketch()
            # Consecutive windows are shifted by 1000 and span 10000 ids.
            window = range(step * 1000, step * 1000 + 10000)
            incoming.add_ids(window)
            seen_ids |= set(window)
            noised_incoming = passthrough(incoming)

            pair = [accumulator, noised_incoming]
            sampler = liquid_legions.Sampler(pair)
            _, sampled_incoming = sampler.sample()
            venn = liquid_legions.VennEstimator([accumulator, noised_incoming])
            logging.info('Venn: %s', venn())

            accumulator.merge_in(sampled_incoming)
            logging.info('Step %d, cardinality %.3f, true cardinality: %d',
                         step, accumulator.get_cardinality(), len(seen_ids))

        distinct_total = len(seen_ids)
        self.assertAlmostEqual(distinct_total,
                               accumulator.get_cardinality(),
                               delta=distinct_total * 0.1)
Esempio n. 9
0
 def test_set_difference_noised(self):
     # Samples the difference s1 - s2 from two noised sketches and checks
     # that its cardinality is about 40000 (tolerance 4000). Afterwards it
     # logs how the estimated sketch's positions agree with a sketch built
     # from the true difference.
     noiser = liquid_legions.Noiser(0.25)
     minuend = liquid_legions.LiquidLegions(3.0, 100000, random_seed=42)
     subtrahend = liquid_legions.LiquidLegions(3.0, 100000, random_seed=42)
     minuend.add_ids(list(range(0, 60000)))
     subtrahend.add_ids(list(range(40000, 150000)))
     # Expected diff is range(0, 40000), i.e. size 40k.
     sampler = liquid_legions.Sampler([noiser(minuend), noiser(subtrahend)])
     sampled_diff = sampler.sample_diff()
     true_diff = minuend.get_compatible_sketch()
     true_diff.add_ids(range(0, 40000))
     logging.info('Diff cardinality: %.3f', sampled_diff.get_cardinality())
     self.assertAlmostEqual(sampled_diff.get_cardinality(),
                            40000,
                            delta=4000)
     # Tally (true bit, estimated bit) pairs over positions 0..99999; the
     # tally is only logged, never asserted on.
     tally = {}
     for position in range(100000):
         estimated_set = sampled_diff.sketch.get(position, 0) > 0
         truly_set = true_diff.sketch.get(position, 0) > 0
         pair = (truly_set, estimated_set)
         tally[pair] = tally.get(pair, 0) + 1
     logging.info('Match / mismatch counts: %s', sorted(tally.items()))
        denoiser=bloom_filters.SurrealDenoiser(probability=0.25)),
    sketch_noiser=bloom_filters.BlipNoiser(epsilon=np.log(3)))

# Exponential Bloom filter (length 10**5, decay_rate 10) paired with a
# FirstMomentEstimator using METHOD_EXP. No sketch_noiser is passed; the
# "infty" in the name presumably stands for infinite epsilon (no noise).
# NOTE(review): the constant's name ends in _LOG while both the `name` string
# and the estimator method say "exp" -- confirm which one is intended.
EXP_BLOOM_FILTER_1E5_10_INFTY_FIRST_MOMENT_LOG = SketchEstimatorConfig(
    name='exp_bloom_filter-1e5_10-infty-first_moment_exp',
    sketch_factory=bloom_filters.ExponentialBloomFilter.get_sketch_factory(
        length=10**5, decay_rate=10),
    estimator=bloom_filters.FirstMomentEstimator(
        method=bloom_filters.FirstMomentEstimator.METHOD_EXP))

# Liquid Legions sketch (a=10, m=10**5) estimated with SequentialEstimator
# and noised with Noiser(flip_probability=0.25).
# The "ln3" in the name presumably refers to epsilon = ln(3), for which
# 1 / (1 + e**epsilon) = 0.25 -- TODO confirm this is the intended mapping.
LIQUID_LEGIONS_1E5_10_LN3_SEQUENTIAL = SketchEstimatorConfig(
    name='liquid_legions-1e5_10-ln3-sequential',
    sketch_factory=liquid_legions.LiquidLegions.get_sketch_factory(
        a=10, m=10**5),
    estimator=liquid_legions.SequentialEstimator(),
    sketch_noiser=liquid_legions.Noiser(flip_probability=0.25))

# Liquid Legions sketch (a=10, m=10**5) estimated with SequentialEstimator.
# No sketch_noiser is passed; the "infty" in the name presumably stands for
# infinite epsilon (no noise).
LIQUID_LEGIONS_1E5_10_INFTY_SEQUENTIAL = SketchEstimatorConfig(
    name='liquid_legions-1e5_10-infty-sequential',
    sketch_factory=liquid_legions.LiquidLegions.get_sketch_factory(
        a=10, m=10**5),
    estimator=liquid_legions.SequentialEstimator())

# Vector-of-Counts sketch with 4096 buckets, estimated with the
# vector_of_counts SequentialEstimator and noised with a LaplaceNoiser at
# epsilon = ln(3).
VECTOR_OF_COUNTS_4096_LN3_SEQUENTIAL = SketchEstimatorConfig(
    name='vector_of_counts-4096-ln3-sequential',
    sketch_factory=vector_of_counts.VectorOfCounts.get_sketch_factory(
        num_buckets=4096),
    estimator=vector_of_counts.SequentialEstimator(),
    sketch_noiser=vector_of_counts.LaplaceNoiser(epsilon=np.log(3)))

VECTOR_OF_COUNTS_4096_INFTY_SEQUENTIAL = SketchEstimatorConfig(