Example 1
def text_test(words_file=DEFAULT_TEST_DATA_FILE,
              epsilon=DEFAULT_EPSILON,
              delta=DEFAULT_DELTA,
              n=count_min_sketch.DEFAULT_N,
              n_factor=5):

    words = read_words(words_file)

    n *= n_factor
    sketches = OrderedDict(
        list=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=sketch_tables.ListBackedSketchTable),
        array=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=sketch_tables.ArrayBackedSketchTable),
        matrix=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=sketch_tables.NumpyMatrixBackedSketchTable),
        bitarray=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=sketch_tables.BitarrayBackedSketchTable),
        counter=Counter())

    benchmark(words, sketches)
    most_common_comparison(sketches, n // n_factor)
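
The read_words, benchmark and most_common_comparison helpers are defined elsewhere in the test module and are not shown in these examples. A minimal, hypothetical sketch of what the two reporting helpers could look like, assuming every sketch exposes insert() and most_common() analogous to collections.Counter (the actual helpers may differ):

import time
from collections import Counter

def benchmark(words, sketches):
    # Illustrative only: feed the same word stream to every structure
    # and time the inserts.
    for name, sketch in sketches.items():
        start = time.time()
        if isinstance(sketch, Counter):
            sketch.update(words)      # exact baseline
        else:
            for word in words:
                sketch.insert(word)   # assumed sketch API
        print(name, 'took', time.time() - start, 'seconds')

def most_common_comparison(sketches, n):
    # Illustrative only: report how much of the exact top n each sketch recovers.
    exact = set(item for item, _ in sketches['counter'].most_common(n))
    for name, sketch in sketches.items():
        if name == 'counter':
            continue
        approx = set(item for item, _ in sketch.most_common(n))
        print(name, 'matched', len(exact & approx), 'of', n)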
Example 2
def test_update_strategies(words_file=DEFAULT_TEST_DATA_FILE,
                           epsilon=DEFAULT_EPSILON,
                           delta=DEFAULT_DELTA,
                           n=count_min_sketch.DEFAULT_N,
                           table_class=sketch_tables.ArrayBackedSketchTable,
                           n_factor=5):
    words = read_words(words_file)
    # Using a factor here to account for different values in the exact top n
    n *= n_factor

    sketches = OrderedDict(
        array=count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class),
        array_hash_pair=double_hashing.HashPairCMSketch(
            delta, epsilon, n=n, table_class=table_class),
        array_conservative=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy),
        array_hash_pair_conservative=double_hashing.HashPairCMSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy),
        counter=Counter())

    # Note: the hash-pair sketches and the plain sketches end up with tables
    # of different sizes here. Forcing the plain sketches to use the
    # hash-pair sizing scheme only produces worse results.

    benchmark(words, sketches)
    most_common_comparison(sketches, n)
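
ConservativeUpdateStrategy presumably applies the standard conservative-update rule (Estan and Varghese): on an increment of count, each of the item's d counters is raised at most to the new minimum estimate, never beyond it. An illustrative sketch of that rule on a plain list-of-lists table (the table layout and the hash_funcs argument are assumptions, not the library's classes):

def conservative_update(table, hash_funcs, item, count=1):
    # One bucket per row, chosen by that row's hash function.
    cells = [h(item) % len(table[i]) for i, h in enumerate(hash_funcs)]
    new_estimate = min(table[i][j] for i, j in enumerate(cells)) + count
    for i, j in enumerate(cells):
        # Counters already above the new estimate belong to colliding items
        # and stay put; the rest are lifted, never overshooting.
        table[i][j] = max(table[i][j], new_estimate)

Compared with the plain update, which adds count to every row unconditionally, this keeps per-row overestimates smaller, which is why the conservative variants are benchmarked separately here.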
Example 3
def basic_count_min_test():
    count = 100000
    numbers = [
        random.randint(1, hash_strategy.ARBITRARY_LARGE_PRIME_NUMBER - 1)
        for _ in range(count)
    ]
    counts = [random.randint(10, 1000) for _ in range(count)]

    cms = count_min_sketch.TopNCountMinSketch(10e-7, 0.005)

    # Use a distinct name so the loop does not shadow `count`, which is
    # reused as the loop bound below
    for (number, item_count) in izip(numbers, counts):
        cms.insert(number, item_count)

    print(cms.table.depth, cms.table.width, cms.table.depth * cms.table.width)

    total_error = 0
    percent_error = 0

    for i in range(count):
        error = float(cms.get(numbers[i]) - counts[i])
        total_error += error
        percent_error += error / counts[i]

    print('Total error (relative to total count):', total_error / sum(counts))
    print('Average relative error:', percent_error / count)
    print(cms)
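
The depth and width printed above follow the usual Cormode-Muthukrishnan sizing, which the constructor presumably applies: width is about ceil(e / epsilon) and depth about ceil(ln(1 / delta)). For the parameters used in this test (delta = 10e-7, epsilon = 0.005) that works out roughly as follows (the library may round differently):

import math

delta, epsilon = 10e-7, 0.005
width = math.ceil(math.e / epsilon)      # ~544 counters per row
depth = math.ceil(math.log(1 / delta))   # ~14 rows
print(depth, width, depth * width)       # roughly 14, 544, 7616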
Example 4
def test_lossy_strategy(words_file=DEFAULT_TEST_DATA_FILE,
                        epsilon=DEFAULT_EPSILON,
                        delta=DEFAULT_DELTA,
                        gamma=0.01,
                        bucket_size=100,
                        n=count_min_sketch.DEFAULT_N,
                        table_class=sketch_tables.ArrayBackedSketchTable):
    words = read_words(words_file)
    # To test lossy strategies, we process the file the same way as before
    # and compare results by buckets, akin to Goyal and Daumé (2010)
    counter = Counter()
    sketches = OrderedDict(
        no_lossy=count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class),
        lossy_no_threshold=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.no_threshold_func)),
        lossy_1_threshold=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.one_threshold_func)),
        lossy_window_size=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.window_size_threshold_func)),
        lossy_sqrt_window=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.sqrt_window_size_threshold_func)),
        counter=counter)

    benchmark(words, sketches)
    benchmark_by_buckets(bucket_size, sketches)
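
The threshold functions passed to LossyUpdateStrategy mirror the lossy-counting variants of Goyal and Daumé (2010): the stream is processed in windows of roughly 1/gamma items, and at each window boundary counters that fall at or below a window-dependent threshold are decremented. Below is a hypothetical illustration of that windowed rule; the exact semantics of the library's threshold functions (e.g. whether no_threshold_func means "decrement everything") are assumptions here:

import math

# Illustrative thresholds: decrement everything, only 1-counts, counts up
# to the window index, or counts up to its square root.
THRESHOLDS = {
    'no_threshold': lambda t: float('inf'),
    'one_threshold': lambda t: 1,
    'window_size': lambda t: t,
    'sqrt_window': lambda t: math.sqrt(t),
}

def lossy_sweep(table, window_index, threshold_func):
    # At a window boundary (every ~1/gamma items), decrement every non-zero
    # counter that does not exceed this window's threshold.
    threshold = threshold_func(window_index)
    for row in table:
        for j, value in enumerate(row):
            if 0 < value <= threshold:
                row[j] = value - 1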
Example 5
def test_sketch_hybrid(words_file=DEFAULT_TEST_DATA_FILE,
                       epsilon=DEFAULT_EPSILON,
                       delta=DEFAULT_DELTA,
                       n=count_min_sketch.DEFAULT_N,
                       table_class=sketch_tables.ArrayBackedSketchTable,
                       n_factor=5):
    words = read_words(words_file)
    # Using a factor here to account for different values in the exact top n
    n *= n_factor

    sketches = OrderedDict(
        array_conservative=count_min_sketch.TopNCountMinSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy),
        array_hash_pair_conservative=double_hashing.HashPairCMSketch(
            delta,
            epsilon,
            n=n,
            table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy),
        hybrid_conservative=counter_sketch_hybrid.SketchCounterHybrid(
            count_min_sketch.TopNCountMinSketch(
                delta,
                epsilon,
                n=n,
                table_class=table_class,
                update_strategy=update_strategy.ConservativeUpdateStrategy)),
        hybrid_hash_pair_conservative=counter_sketch_hybrid.SketchCounterHybrid(
            double_hashing.HashPairCMSketch(
                delta,
                epsilon,
                n=n,
                table_class=table_class,
                update_strategy=update_strategy.ConservativeUpdateStrategy)),
        counter=Counter())

    benchmark(words, sketches)
    most_common_comparison(sketches, n)
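
SketchCounterHybrid is not shown in these examples. One common way to build such a hybrid is to keep exact counts in a small dictionary while it stays under a size budget and route everything else through the wrapped sketch; the class below is a hypothetical illustration of that idea, not the library's implementation, and assumes the insert(item, count)/get(item) API seen in the basic test above:

class SimpleSketchCounterHybrid(object):
    # Hypothetical hybrid: exact counts while small, sketch estimates after.

    def __init__(self, sketch, max_exact_keys=10000):
        self.sketch = sketch
        self.exact = {}
        self.max_exact_keys = max_exact_keys

    def insert(self, item, count=1):
        if item in self.exact or len(self.exact) < self.max_exact_keys:
            self.exact[item] = self.exact.get(item, 0) + count
        else:
            self.sketch.insert(item, count)

    def get(self, item):
        # Exact count if we have one, otherwise the sketch's estimate.
        if item in self.exact:
            return self.exact[item]
        return self.sketch.get(item)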
Example 6
def test_double_hashing(words_file=DEFAULT_TEST_DATA_FILE,
                        epsilon=DEFAULT_EPSILON,
                        delta=DEFAULT_DELTA,
                        n=count_min_sketch.DEFAULT_N,
                        table_class=sketch_tables.ArrayBackedSketchTable):

    words = read_words(words_file)
    sketches = OrderedDict(
        array=count_min_sketch.TopNCountMinSketch(
            delta, epsilon, table_class=table_class),
        array_hash_pair=double_hashing.HashPairCMSketch(
            delta=delta, epsilon=epsilon, table_class=table_class),
        array_hash_pair_multi=double_hashing.MultiHashPairTopNCMSketch(
            delta, epsilon, table_class=table_class),
        counter=Counter())

    benchmark(words, sketches)
    most_common_comparison(sketches, n)
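
HashPairCMSketch and MultiHashPairTopNCMSketch presumably rely on the Kirsch-Mitzenmacher double-hashing trick: instead of d independent hash functions, row i's bucket is derived from a single pair of base hashes as (h1(x) + i * h2(x)) mod width. A minimal, self-contained illustration (the helper below is hypothetical, not the library's code):

import zlib

def pairwise_indices(item, depth, width):
    # Derive one bucket index per row from two base hashes of the item.
    data = str(item).encode('utf-8')
    h1 = zlib.crc32(data)
    h2 = zlib.adler32(data) | 1   # make the step odd (hence non-zero) so rows differ
    return [(h1 + i * h2) % width for i in range(depth)]

# Example: bucket indices for the word 'sketch' in a 14 x 544 table.
print(pairwise_indices('sketch', depth=14, width=544))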