def text_test(words_file=DEFAULT_TEST_DATA_FILE,
              epsilon=DEFAULT_EPSILON,
              delta=DEFAULT_DELTA,
              n=count_min_sketch.DEFAULT_N,
              n_factor=5):
    """Benchmark TopN count-min sketches backed by different table types.

    Runs the same word stream through sketches using list, array, numpy
    matrix, and bitarray backing tables, plus an exact Counter baseline,
    then compares their most-common results.

    :param words_file: path of the word list to stream through the sketches
    :param epsilon: sketch error bound
    :param delta: sketch failure probability
    :param n: requested top-N size for the comparison
    :param n_factor: multiplier applied to n internally so the sketches track
        a larger candidate set than is finally compared
    """
    words = read_words(words_file)
    # Track a larger top-N internally to account for near-boundary values.
    n *= n_factor
    # Build from (key, value) pairs: OrderedDict(**kwargs) does not preserve
    # insertion order before Python 3.7, which defeats the point of using an
    # OrderedDict at all.
    sketches = OrderedDict([
        ('list', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n,
            table_class=sketch_tables.ListBackedSketchTable)),
        ('array', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n,
            table_class=sketch_tables.ArrayBackedSketchTable)),
        ('matrix', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n,
            table_class=sketch_tables.NumpyMatrixBackedSketchTable)),
        ('bitarray', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n,
            table_class=sketch_tables.BitarrayBackedSketchTable)),
        ('counter', Counter()),
    ])
    benchmark(words, sketches)
    # Compare at the originally requested top-N size; integer division keeps
    # the argument an int under Python 3 (plain / would yield a float).
    most_common_comparison(sketches, n // n_factor)
def test_update_strategies(words_file=DEFAULT_TEST_DATA_FILE,
                           epsilon=DEFAULT_EPSILON,
                           delta=DEFAULT_DELTA,
                           n=count_min_sketch.DEFAULT_N,
                           table_class=sketch_tables.ArrayBackedSketchTable,
                           n_factor=5):
    """Compare plain vs. hash-pair sketches with and without conservative updates.

    :param words_file: path of the word list to stream through the sketches
    :param epsilon: sketch error bound
    :param delta: sketch failure probability
    :param n: requested top-N size for the comparison
    :param table_class: backing table implementation shared by all sketches
    :param n_factor: multiplier applied to n to widen the tracked candidate set
    """
    words = read_words(words_file)
    # Using a factor here to account for different values in the exact top n
    n *= n_factor
    # Build from (key, value) pairs: OrderedDict(**kwargs) does not preserve
    # insertion order before Python 3.7, which defeats the point of using an
    # OrderedDict at all.
    sketches = OrderedDict([
        ('array', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class)),
        ('array_hash_pair', double_hashing.HashPairCMSketch(
            delta, epsilon, n=n, table_class=table_class)),
        ('array_conservative', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy)),
        ('array_hash_pair_conservative', double_hashing.HashPairCMSketch(
            delta, epsilon, n=n, table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy)),
        ('counter', Counter()),
    ])
    # This test does generate the hash-pair ones and the non-hash-pair ones
    # with different sizes. However, forcing the non-hash-pair ones into the
    # hash-pair scheme only generates worse results
    benchmark(words, sketches)
    most_common_comparison(sketches, n)
def basic_count_min_test():
    """Smoke-test TopNCountMinSketch on random data and print error statistics.

    Inserts `item_count` random (number, count) pairs and reports the total
    and average percent estimation error of the sketch versus the inserted
    counts.
    """
    # BUG FIX: the original reused the name `count` as the loop variable in
    # the insertion loop below, clobbering this total. The error loop then
    # ran over only `counts[-1]` items and divided by the wrong denominator.
    item_count = 100000
    numbers = [
        random.randint(1, hash_strategy.ARBITRARY_LARGE_PRIME_NUMBER - 1)
        for _ in range(item_count)
    ]
    counts = [random.randint(10, 1000) for _ in range(item_count)]
    cms = count_min_sketch.TopNCountMinSketch(10e-7, 0.005)
    for number, number_count in izip(numbers, counts):
        cms.insert(number, number_count)
    print(cms.table.depth, cms.table.width, cms.table.depth * cms.table.width)
    # NOTE(review): duplicate values in `numbers` would make counts[i] an
    # underestimate of the true inserted total for numbers[i]; with a large
    # prime range duplicates are rare, so the stats remain representative.
    total_error = 0
    percent_error = 0
    for i in range(item_count):
        error = float(cms.get(numbers[i]) - counts[i])
        total_error += error
        percent_error += error / counts[i]
    print('Total error:', total_error / sum(counts))
    print('Average percent error:', percent_error / item_count)
    print(cms)
def test_lossy_strategy(words_file=DEFAULT_TEST_DATA_FILE,
                        epsilon=DEFAULT_EPSILON,
                        delta=DEFAULT_DELTA,
                        gamma=0.01,
                        bucket_size=100,
                        n=count_min_sketch.DEFAULT_N,
                        table_class=sketch_tables.ArrayBackedSketchTable):
    """Benchmark lossy-counting threshold variants against a non-lossy sketch.

    :param words_file: path of the word list to stream through the sketches
    :param epsilon: sketch error bound
    :param delta: sketch failure probability
    :param gamma: lossy-counting decrement frequency parameter
    :param bucket_size: bucket width for the per-bucket comparison
    :param n: top-N size tracked by each sketch
    :param table_class: backing table implementation shared by all sketches
    """
    words = read_words(words_file)
    # To test lossy strategies, we process the files similarly as before
    # And compare by buckets, akin to Goyal and Daumé (2010)
    counter = Counter()
    # Build from (key, value) pairs: OrderedDict(**kwargs) does not preserve
    # insertion order before Python 3.7, which defeats the point of using an
    # OrderedDict at all.
    sketches = OrderedDict([
        ('no_lossy', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class)),
        ('lossy_no_threshold', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.no_threshold_func))),
        ('lossy_1_threshold', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.one_threshold_func))),
        ('lossy_window_size', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.window_size_threshold_func))),
        ('lossy_sqrt_window', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class,
            lossy_strategy=lossy_strategy.LossyUpdateStrategy(
                gamma, lossy_strategy.sqrt_window_size_threshold_func))),
        ('counter', counter),
    ])
    benchmark(words, sketches)
    benchmark_by_buckets(bucket_size, sketches)
def test_sketch_hybrid(words_file=DEFAULT_TEST_DATA_FILE,
                       epsilon=DEFAULT_EPSILON,
                       delta=DEFAULT_DELTA,
                       n=count_min_sketch.DEFAULT_N,
                       table_class=sketch_tables.ArrayBackedSketchTable,
                       n_factor=5):
    """Compare conservative-update sketches to their counter-hybrid wrappers.

    :param words_file: path of the word list to stream through the sketches
    :param epsilon: sketch error bound
    :param delta: sketch failure probability
    :param n: requested top-N size for the comparison
    :param table_class: backing table implementation shared by all sketches
    :param n_factor: multiplier applied to n to widen the tracked candidate set
    """
    words = read_words(words_file)
    # Using a factor here to account for different values in the exact top n
    n *= n_factor
    # Build from (key, value) pairs: OrderedDict(**kwargs) does not preserve
    # insertion order before Python 3.7, which defeats the point of using an
    # OrderedDict at all.
    sketches = OrderedDict([
        ('array_conservative', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, n=n, table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy)),
        ('array_hash_pair_conservative', double_hashing.HashPairCMSketch(
            delta, epsilon, n=n, table_class=table_class,
            update_strategy=update_strategy.ConservativeUpdateStrategy)),
        ('hybrid_conservative', counter_sketch_hybrid.SketchCounterHybrid(
            count_min_sketch.TopNCountMinSketch(
                delta, epsilon, n=n, table_class=table_class,
                update_strategy=update_strategy.ConservativeUpdateStrategy))),
        ('hybrid_hash_pair_conservative',
         counter_sketch_hybrid.SketchCounterHybrid(
             double_hashing.HashPairCMSketch(
                 delta, epsilon, n=n, table_class=table_class,
                 update_strategy=update_strategy.ConservativeUpdateStrategy))),
        ('counter', Counter()),
    ])
    benchmark(words, sketches)
    most_common_comparison(sketches, n)
def test_double_hashing(words_file=DEFAULT_TEST_DATA_FILE,
                        epsilon=DEFAULT_EPSILON,
                        delta=DEFAULT_DELTA,
                        n=count_min_sketch.DEFAULT_N,
                        table_class=sketch_tables.ArrayBackedSketchTable):
    """Compare the plain sketch against single and multi hash-pair variants.

    :param words_file: path of the word list to stream through the sketches
    :param epsilon: sketch error bound
    :param delta: sketch failure probability
    :param n: top-N size used for the final comparison
    :param table_class: backing table implementation shared by all sketches
    """
    words = read_words(words_file)
    # Build from (key, value) pairs: OrderedDict(**kwargs) does not preserve
    # insertion order before Python 3.7, which defeats the point of using an
    # OrderedDict at all.
    sketches = OrderedDict([
        ('array', count_min_sketch.TopNCountMinSketch(
            delta, epsilon, table_class=table_class)),
        # BUG FIX: the original passed delta=n here, silently configuring the
        # hash-pair sketch with the top-N size as its failure probability.
        ('array_hash_pair', double_hashing.HashPairCMSketch(
            delta, epsilon, table_class=table_class)),
        ('array_hash_pair_multi', double_hashing.MultiHashPairTopNCMSketch(
            delta, epsilon, table_class=table_class)),
        ('counter', Counter()),
    ])
    benchmark(words, sketches)
    most_common_comparison(sketches, n)