def count_hash_incremental_move(output=True, decimals=4):
    """
    For all English words, starting with a hashtable of size 1,023 and a load factor
    of 0.75, time every put() and emit a row whenever an operation is more costly
    than any operation before it. Compares normal geometric resizing against
    incremental resizing for several values of delta.
    """
    from ch03.book import CountableHash
    from ch03.hashtable_linked import DynamicHashtable

    print('Each emitted row contains an operation more costly than any before...')
    ht_dynamic = DynamicHashtable(1023)
    tbl = DataTable([20, 10, 10], ['Word', 'N', 'cost'], output=output, decimals=decimals)
    tbl.format('Word', 's')
    tbl.format('N', ',d')
    max_cost = 0
    now = time.time()
    for w in english_words():
        before = time.time()
        ht_dynamic.put(CountableHash(w), w)
        cost = time.time() - before
        if cost > max_cost:
            max_cost = cost
            tbl.row([w, ht_dynamic.N, cost])
    total_normal = time.time() - now
    print('Normal:{}'.format(total_normal))

    for delta in [512, 256, 128, 64, 32, 16, 8, 4]:
        ht = DynamicHashtableIncrementalResizing(1023, delta=delta)
        tbl = DataTable([20, 10, 10], ['Word', 'N', 'cost'], output=output, decimals=decimals)
        tbl.format('Word', 's')
        tbl.format('N', ',d')
        max_cost = 0
        now = time.time()
        for w in english_words():
            before = time.time()
            ht.put(CountableHash(w), w)
            cost = time.time() - before
            if cost > max_cost:
                max_cost = cost
                tbl.row([w, ht.N, cost])
        total_delta = time.time() - now
        print('delta={}, Incremental:{}'.format(delta, total_delta))
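# A minimal sketch of the incremental-resizing idea being timed above: instead of
# rehashing every stored key in the single put() that triggers a resize, keep the
# old bucket array around and migrate a fixed number of old buckets ("delta") on
# every subsequent put(). This is an illustration only -- the class name, the
# 0.75 threshold, and the 2*M+1 growth rule are assumptions, not the book's
# DynamicHashtableIncrementalResizing.
class IncrementalResizeSketch:
    def __init__(self, M=1023, delta=16):
        self.table = [[] for _ in range(M)]   # separate-chaining buckets
        self.old = None                       # old buckets still awaiting migration
        self.next_to_move = 0
        self.N = 0
        self.delta = delta

    def put(self, k, v):
        # If a migration is in progress, move the next 'delta' old buckets.
        if self.old is not None:
            for bucket in self.old[self.next_to_move:self.next_to_move + self.delta]:
                for (ok, ov) in bucket:
                    self.table[hash(ok) % len(self.table)].append((ok, ov))
            self.next_to_move += self.delta
            if self.next_to_move >= len(self.old):
                self.old = None
        # Insert-only sketch: no duplicate-key check, no get()/remove().
        self.table[hash(k) % len(self.table)].append((k, v))
        self.N += 1
        # Start a new migration once the load-factor threshold is crossed.
        if self.old is None and self.N >= 0.75 * len(self.table):
            self.old, self.table = self.table, [[] for _ in range(2 * len(self.table) + 1)]
            self.next_to_move = 0

# Example: every put() does at most 'delta' migration moves, so no single
# operation ever pays for a full rehash.
sketch = IncrementalResizeSketch(delta=8)
for i in range(10000):
    sketch.put('key{}'.format(i), i)
print(sketch.N, len(sketch.table))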
def test_challenge(self):
    from ch03.challenge import DynamicHashtableIncrementalResizing as Hashtable
    from resources.english import english_words

    ht = Hashtable(31, 5)
    for w in english_words():
        ht.put(w, w)

    # make sure all still present
    for w in english_words():
        self.assertEqual(w, ht.get(w))

    # now remove them one at a time
    for w in english_words():
        self.assertEqual(w, ht.remove(w))
def time_results_linked(output=True, decimals=3):
    """Average time to insert keys into fixed-size separate-chaining hashtables (hashtable_linked), in microseconds."""
    sizes = [8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
    tbl = DataTable([8] + [8] * len(sizes), ['N'] + [comma(sz) for sz in sizes],
                    output=output, decimals=decimals)

    # Insert N words into a table of each size M, doubling N from 32 up to 16,384.
    words = english_words()
    for num_to_add in [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384]:
        all_words = words[:num_to_add]

        line = [num_to_add]
        for size in sizes:
            time1 = min(timeit.repeat(stmt='''
table = Hashtable({})
for word in words:
    table.put(word, 99)'''.format(size), setup='''
from ch03.hashtable_linked import Hashtable
words={}'''.format(all_words), repeat=1, number=100))
            line.append(1000000 * time1 / size)
        tbl.row(line)
    return tbl
def test_ordered_pq(self):
    from ch04.ordered import PQ
    from resources.english import english_words

    words = english_words()[:10000]
    pair = self.priority_queue_stress_test(PQ(len(words)), len(words))

    # Note: we cannot guarantee individual words BUT we can guarantee length
    self.assertEqual((len('acetylphenylhydrazine'), len('a')),
                     (len(pair[0]), len(pair[1])))
def test_builtin_heap_pq(self):
    from ch04.builtin import PQ
    from resources.english import english_words

    words = english_words()[:1000]
    pair = self.priority_queue_stress_test(PQ(len(words)), len(words))

    # Note: we cannot guarantee individual words BUT we can guarantee length
    self.assertEqual((len('abdominohysterectomy'), len('a')),
                     (len(pair[0]), len(pair[1])))
def compare_dynamic_build_and_access_time(repeat=25, num=10, output=True):
    """Generate tables for build and access for AVL trees."""
    from ch06.symbol import BinaryTree
    from resources.english import english_words

    bt = BinaryTree()
    for w in english_words():
        bt.put(w, w)

    total = len(english_words())
    if output:
        print('This will take several minutes...')
        print('total number of words =', total)
        print('height of AVL tree for all English words =', bt.root.height)
        print('has to at least be =', math.log(total + 1) / math.log(2) - 1)

    # When 'ht = BinaryTree()' is inside the STMT, it measures BUILD TIME.
    # When it is included in the SETUP, we are measuring ACCESS TIME.
    t_build = min(timeit.repeat(stmt='''
ht = BinaryTree()
for w in words:
    ht.put(w,w)''', setup='''
from ch06.symbol import BinaryTree
from resources.english import english_words
words = english_words()''', repeat=repeat, number=num)) / num

    t_access = min(timeit.repeat(stmt='''
for w in words:
    ht.get(w)''', setup='''
from ch06.symbol import BinaryTree
from resources.english import english_words
ht = BinaryTree()
words = english_words()
for w in words:
    ht.put(w,w)''', repeat=repeat, number=num)) / num

    if output:
        print('Build-time =', t_build, ', Access-time = ', t_access)
    return (t_build, t_access)
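# The lower bound printed above follows from a standard fact: a binary tree of
# height h holds at most 2^(h+1) - 1 nodes, so any tree storing N keys has
# height at least ceil(log2(N+1)) - 1. A quick check of that arithmetic -- the
# dictionary size of 321,129 words is an illustrative assumption:
import math

def minimum_possible_height(n):
    """Smallest possible height of a binary tree storing n keys (root at height 0)."""
    return math.ceil(math.log2(n + 1)) - 1

assert minimum_possible_height(7) == 2     # perfect tree with 1 + 2 + 4 nodes
print(minimum_possible_height(321129))     # -> 18 for the assumed dictionary size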
def test_binary_tree_from_chapter_06(self):
    from ch06.pq import PQ
    from ch04.test import TestChapter4
    from resources.english import english_words

    words = english_words()
    pair = TestChapter4().priority_queue_stress_test(PQ(), len(words))

    # Note: we cannot guarantee individual words BUT we can guarantee length
    self.assertEqual((len('formaldehydesulphoxylate'), len('a')),
                     (len(pair[0]), len(pair[1])))
def count_collisions(num_rows=0, output=True, decimals=1):
    """Generate table counting collisions."""
    all_words = english_words()
    N = len(all_words)

    from ch03.hashtable_linked import Hashtable as HL
    from ch03.hashtable_linked import stats_linked_lists
    from ch03.hashtable_open import Hashtable as OHL
    from ch03.hashtable_open import stats_open_addressing

    tbl = DataTable([10, 8, 8, 8, 8], ['M', 'Avg LL', 'Max LL', 'Avg OA', 'Max OA'],
                    output=output, decimals=decimals)
    tbl.format('Max LL', 'd')
    tbl.format('Max OA', 'd')

    M = 20 * N
    hl = HL(M)
    ohl = OHL(M)
    for w in all_words:
        hl.put(w, 1)
        ohl.put(w, 1)
    avg_size_linked = stats_linked_lists(hl)
    avg_size_open = stats_open_addressing(ohl)
    tbl.row([M, avg_size_linked[0], avg_size_linked[1], avg_size_open[0], avg_size_open[1]])

    M = 2 * N
    while M > N / 16:
        hl = HL(M)
        ohl = OHL(M)
        for w in all_words:
            hl.put(w, 1)
            if M > N:       # otherwise, open addressing will fail...
                ohl.put(w, 1)

        avg_size_linked = stats_linked_lists(hl)
        if N < M:
            avg_size_open = stats_open_addressing(ohl)
        else:
            tbl.format('Avg OA', 's')
            tbl.format('Max OA', 's')
            avg_size_open = [SKIP, SKIP]

        num_rows -= 1
        tbl.row([M, avg_size_linked[0], avg_size_linked[1], avg_size_open[0], avg_size_open[1]])

        # While M exceeds N, reduce slowly by 5%; once below, go down at a 60% clip.
        if M > N:
            M = (M * 95) // 100
        else:
            M = (M * 6) // 10

        # To allow for testing, simple way to break out after a number of rows are generated.
        if num_rows == 0:
            break
    return tbl
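# Under uniform hashing, the 'Avg LL' column is governed by the load factor
# alpha = N / M: with N keys spread over M separate-chaining buckets, the
# expected number of keys per bucket is N/M (whether the measured average from
# stats_linked_lists counts empty buckets depends on that helper). A quick
# sanity check of the ratio; the 321,129 word count is an illustrative assumption:
def load_factor(n, m):
    """Expected keys per bucket when n keys hash uniformly into m buckets."""
    return n / m

print(load_factor(321129, 20 * 321129))    # 0.05  -> almost no collisions when M = 20*N
print(load_factor(321129, 321129 // 16))   # ~16.0 -> long chains once M falls to N/16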
def check_for_duplicates():
    """
    Determine if there are any hash() clashes on the words in the English language.

    Because Python uses 64-bit hash codes, the likelihood is tremendously small.
    Also remember that Python now salts hash code values, so they are not the same
    from one run to the next.

    The Python code below finds no clashes on hash() values. The following Java code,
    which uses 32-bit hashCode() values, finds 11 clashes::

        import java.util.*;

        public class EnglishClash {
            public static void main(String[] args) throws Exception {
                java.io.File f = new java.io.File("words.english.txt");
                Scanner sc = new Scanner(f);
                Hashtable<Integer,String> ht = new Hashtable<>();
                while (sc.hasNextLine()) {
                    String s = sc.nextLine();
                    int i = s.hashCode();
                    if (ht.containsKey(i)) {
                        System.out.println("clash on " + s + " and " + ht.get(i));
                    } else {
                        ht.put(i, s);
                    }
                }
                sc.close();
            }
        }

    The above code finds these 11 clashes::

        clash on hazardless and agarwal
        clash on hierarch and crinolines
        clash on isohel and epistolaries
        clash on kindergartener and acouasm
        clash on misused and horsemints
        clash on poised and dentinalgia
        clash on proselytized and nonguard
        clash on righto and buzzards
        clash on unapprehending and fineable
        clash on unheavenly and hypoplankton
        clash on variants and gelato
    """
    hash_values = {}
    clashes = 0
    for w in english_words():
        hc = hash(w)
        if hc in hash_values:
            print('clash on', w, 'and', hash_values[hc])
            clashes += 1
        hash_values[hc] = w
    print('Number of duplicate hashcodes found for dictionary:', clashes)
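# Why the Python run finds no clashes while the Java run finds 11: Java's
# String.hashCode() produces 32-bit values, while CPython's hash() on a 64-bit
# build is effectively 64 bits. The birthday bound p ~= 1 - exp(-n*(n-1)/(2*d))
# makes the difference concrete; the word count of 321,129 is an illustrative
# assumption for the dictionary size:
import math

def clash_probability(n, bits):
    """Approximate probability that n uniformly random hash codes of the given bit width collide."""
    d = 2.0 ** bits
    return 1.0 - math.exp(-n * (n - 1) / (2.0 * d))

n = 321129
print(clash_probability(n, 32))   # ~1.0     -> collisions essentially certain with 32-bit codes
print(clash_probability(n, 64))   # ~2.8e-9  -> vanishingly unlikely with 64-bit codes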
def count_collisions_dynamic(num_rows=0, output=True, decimals=2):
    """Generate data counting collisions for dynamic hashtables. Not used in book."""
    all_words = english_words()

    # start twice as big as the number of words, and reduce steadily, counting collisions
    N = len(all_words)
    M = 2 * N

    from ch03.hashtable_linked import DynamicHashtable as DHL
    from ch03.hashtable_linked import stats_linked_lists
    from ch03.hashtable_open import DynamicHashtable as ODHL
    from ch03.hashtable_open import stats_open_addressing

    tbl = DataTable([10, 8, 8, 8, 8], ['M', 'Avg LL', 'Max LL', 'Avg OA', 'Max OA'],
                    output=output, decimals=decimals)
    tbl.format('Max LL', 'd')
    tbl.format('Max OA', 'd')

    while M > N / 16:
        dhl = DHL(M)
        odhl = ODHL(M)
        for w in all_words:
            dhl.put(w, 1)
            odhl.put(w, 1)

        avg_size_linked_dynamic = stats_linked_lists(dhl)
        avg_size_open_dynamic = stats_open_addressing(odhl)

        num_rows -= 1
        tbl.row([M, avg_size_linked_dynamic[0], avg_size_linked_dynamic[1],
                 avg_size_open_dynamic[0], avg_size_open_dynamic[1]])

        # While M exceeds N, reduce slowly by 5%; once below, drop at a 60% clip.
        if M > N:
            M = (M * 95) // 100
        else:
            M = (M * 6) // 10

        # To allow for testing, simple way to break out after a number of rows are generated.
        if num_rows == 0:
            break
    return tbl
def count_hash(output=True, decimals=2):
    """
    For all English words, starting with a hashtable of size 1,023 and a load factor
    of 0.75, count how many times the hash code (i.e., %) is invoked.
    """
    from ch03.hashtable_linked import DynamicHashtable

    ht = DynamicHashtable(1023)
    tbl = DataTable([20, 10, 10, 10, 10], ['Word', 'M', 'N', '#insert', 'average'],
                    output=output, decimals=decimals)
    tbl.format('Word', 's')
    tbl.format('N', ',d')
    tbl.format('M', ',d')
    tbl.format('#insert', ',d')

    last_word = None
    for w in english_words():
        last_word = w
        last_m = ht.M
        last = CountableHash.hash_count
        ht.put(CountableHash(w), w)
        if CountableHash.hash_count != last + 1:
            tbl.row([w, last_m, ht.N, CountableHash.hash_count,
                     CountableHash.hash_count / ht.N])
    tbl.row([last_word, last_m, ht.N, CountableHash.hash_count,
             CountableHash.hash_count / ht.N])

    # determine when next resize event would occur...
    for i in range(1, 200000):
        last = CountableHash.hash_count
        last_m = ht.M
        ht.put(CountableHash(last_word + str(i)), last_word)
        if CountableHash.hash_count != last + 1:
            tbl.row([last_word + str(i), last_m, ht.N, CountableHash.hash_count,
                     CountableHash.hash_count / ht.N])
            break
    return tbl
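# The rows above appear whenever a put() performs more than one '%' computation,
# i.e., when the table resizes and every stored key is rehashed. A rough model
# of where those resize events land: with load factor 0.75, a table of size M
# resizes once N reaches about 0.75*M, and each resize rehashes all N keys.
# The 2*M + 1 doubling rule below is an assumption for illustration, not
# necessarily the exact growth rule of DynamicHashtable.
def resize_points(M=1023, load_factor=0.75, limit=400000):
    """Return (N, old_M, new_M) for each modeled resize while inserting up to 'limit' keys."""
    points = []
    N = 0
    while N < limit:
        threshold = int(load_factor * M)
        if threshold >= limit:
            break
        N = threshold              # resize fires once this many keys are stored
        old_M, M = M, 2 * M + 1    # assumed growth rule
        points.append((N, old_M, M))
    return points

for (n, old_m, new_m) in resize_points():
    print('resize at N={:,} : M {:,} -> {:,}'.format(n, old_m, new_m))
# -> resizes near N = 767; 1,535; 3,071; ... each one rehashing all N keys at once.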
def priority_queue_stress_test(self, pq, max_length=None):
    """
    Given an empty priority queue, add words from the English dictionary where the
    priority of a word is its length. Because some PQ implementations are so
    inefficient, allow a caller to restrict the number of words added via max_length.
    """
    from resources.english import english_words
    words = english_words()
    if max_length:
        words = words[:max_length]
    for w in words:
        pq.enqueue(w, len(w))

    # First word out is longest... / Last one out is smallest
    first = pq.dequeue()
    while pq:
        last = pq.dequeue()

    # Should be drained
    with self.assertRaises(RuntimeError):
        pq.dequeue()

    return (first, last)
def time_results_open_addressing(num_rows=0, output=True, decimals=3):
    """Average time to insert a key in growing hashtable_open (in microseconds)."""
    sizes = [8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576]
    headers = [comma(s) for s in sizes]
    headers.insert(0, 'N')
    tbl = DataTable([8, 8, 8, 8, 8, 8, 8, 8, 10], headers, output=output, decimals=decimals)

    # Insert N words into an open-addressing table of each size M, doubling N from 32 up to 32,768.
    for num_to_add in [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]:
        all_words = english_words()[:num_to_add]

        line = [len(all_words)]
        for size in sizes:
            try:
                tbl.format(comma(size), '.3f')
                timing = min(timeit.repeat(stmt='''
table = Hashtable({})
for word in all_words:
    table.put(word, 99)'''.format(size), setup='''
from ch03.hashtable_open import Hashtable
from resources.english import english_words
all_words=english_words()[:{}]'''.format(num_to_add), repeat=1, number=100))
                timing = (100000.0 * timing) / size
            except RuntimeError:
                timing = SKIP

            line.append(timing)

        num_rows -= 1
        tbl.row(line)

        # Provide effective way to terminate early for testing.
        if num_rows == 0:
            break
    return tbl
#######################################################################
if __name__ == '__main__':
    print('Average search Times with separate chaining hashtables (time in ns)')
    time_results_linked()
    print()

    print('Count how many times hashcode is computed (i.e., when % is invoked) on PUT')
    probability_of_failure()
    print()

    print('Statistics from the perfect hash on just a few words')
    ewords = english_words()
    simple_stats(ewords[:10])
    print()

    print('Compare performance of perfect hash with regular hashtable')
    random.shuffle(ewords)
    compare_time(ewords)
    print()

    print('Trying to find two words in the dictionary with the same Python hash() value')
    check_for_duplicates()
    print()
def test_bad_timing(self):
    from resources.english import english_words
    from ch03.challenge import bad_timing

    tbl = bad_timing(english_words()[:100], output=False)
    self.assertTrue(tbl.entry('Good', 'Max Len') > 0)
def measure_performance_resize(max_d=50, output=True):
    """Generate table of statistics for table resizing up to (but not including) max_d=50."""
    from ch03.hashtable_linked import DynamicHashtable

    try:
        # Added in Python 3.7
        from time import time_ns
        timing = time_ns
    except ImportError:
        from time import time
        timing = time

    if output:
        print('Dynamic Resizing Hashtable')
    tbl = DataTable([8, 15, 15, 10, 10], ['idx', 'word', 'time', 'old-size', 'new-size'],
                    output=output, decimals=2)
    tbl.format('idx', 'd')
    tbl.format('word', 's')
    tbl.format('old-size', ',d')
    tbl.format('new-size', ',d')

    ht = DynamicHashtable(1023)
    idx = 1
    last = None
    average = 0
    words = english_words()
    for w in words:
        before = timing()
        old_size = len(ht.table)
        ht.put(w, w)
        new_size = len(ht.table)
        after = timing()
        average += (after - before)
        if last:
            if after - before > last:
                last = after - before
                tbl.row([idx, w, last, old_size, new_size])
        else:
            last = after - before
        idx += 1
    average /= len(words)
    ht = None
    if output:
        print('Average was ', average)
        print('Incremental Resizing Hashtable')

    tbl_ir = DataTable([8, 15, 15, 10, 10], ['idx', 'word', 'time', 'old-size', 'new-size'],
                       output=output, decimals=2)
    tbl_ir.format('idx', 'd')
    tbl_ir.format('word', 's')
    tbl_ir.format('old-size', ',d')
    tbl_ir.format('new-size', ',d')

    ht = DynamicHashtableIncrementalResizing(1023, 10)
    idx = 1
    last = None
    average = 0
    words = english_words()
    for w in words:
        before = timing()
        old_size = len(ht.table)
        ht.put(w, w)
        new_size = len(ht.table)
        after = timing()
        average += (after - before)
        if last:
            if after - before > last:
                last = after - before
                tbl_ir.row([idx, w, last, old_size, new_size])
        else:
            last = after - before
        idx += 1
    ht = None
    average /= len(words)
    if output:
        print('Average was ', average)
        print('Incremental Resizing dependent on Delta')
        print()

    tbl_d = DataTable([8, 10], ['Delta', 'Average'], output=output)
    tbl_d.format('Delta', 'd')
    for delta in range(1, max_d):
        ht = DynamicHashtableIncrementalResizing(1023, delta)
        average = 0
        words = english_words()
        for w in words:
            before = timing()
            ht.put(w, w)
            after = timing()
            average += (after - before)
        average /= len(words)
        tbl_d.row([delta, average])

    return (tbl, tbl_ir, tbl_d)
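# A rough model of why even small delta values keep up in the tables above: if a
# resize from M to roughly 2*M buckets fires near N ~= 0.75*M and the next one
# near N ~= 1.5*M, there are about 0.75*M put() calls available to migrate the
# M old buckets, so migration finishes in time whenever delta >= M / (0.75*M),
# i.e., delta of 2 or more. This is a back-of-the-envelope model (the 0.75
# threshold and the doubling rule are assumptions), not the exact behavior of
# DynamicHashtableIncrementalResizing.
import math

def minimum_delta(load_factor=0.75):
    """Smallest whole number of bucket moves per put that finishes migration before the next resize."""
    return math.ceil(1 / load_factor)

print(minimum_delta())   # -> 2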
#######################################################################
if __name__ == '__main__':
    chapter = 3

    with ExerciseNum(1) as exercise_number:
        exercise_triangle_number_probing()
        print(caption(chapter, exercise_number), 'Fragment evaluation')

    with ExerciseNum(2) as exercise_number:
        evaluate_hashtable_sorted_chains()
        print(caption(chapter, exercise_number), 'Hashtable with sorted linked list chains')

    # To provide a full exercise, remove the "[:5000]" from below, otherwise takes too long for book.
    with ExerciseNum(3) as exercise_number:
        bad_timing(english_words()[:5000])
        print(caption(chapter, exercise_number), 'ValueBadHash exercise')

    with ExerciseNum(4) as exercise_number:
        prime_number_difference(english_words())
        print(caption(chapter, exercise_number), 'Prime Number exercise')

    with ExerciseNum(5) as exercise_number:
        evaluate_DynamicHashtablePlusRemove()
        print(caption(chapter, exercise_number), 'Open Addressing with Marked Elements as deleted.')

    with ExerciseNum(6) as exercise_number:
        count_hash_incremental_move()
        print(caption(chapter, exercise_number),