def __init__(self, input_file, output_file, MMB, index_type):
    """Run duplicate elimination from ``input_file`` into ``output_file``.

    The whole job runs inside the constructor: the files are opened with
    ``dup_open()``, every record returned by ``get_next()`` is looked up in
    the index, unseen records are buffered and written with
    ``flush_output()``, and the files are closed with ``dup_close()``.

    Args:
        input_file: Path of the file to read records from.
        output_file: Path of the file the unique records are written to.
        MMB: Number of main memory blocks available.
        index_type: ``"btree"`` or ``"hash"``; selects the index used to
            remember the records seen so far.

    Raises:
        SystemExit: With code ``-1`` (after printing a message) when
            ``index_type`` is not recognised.
    """
    self.input_file = input_file
    self.output_file = output_file
    self.MMB = MMB

    if index_type == "btree":
        self.index = BTree()
    elif index_type == "hash":
        self.index = MyHash()
    else:
        print("Invalid index type")
        # Same effect as the interactive ``exit(-1)`` helper, but it does
        # not rely on the ``site`` module having injected ``exit``.
        raise SystemExit(-1)

    self.dup_open()
    try:
        while True:
            rec = self.get_next()

            # input file finished
            if rec is None:
                self.flush_output()
                break

            # append the record to output buffer if unique
            if not self.index.search(rec):
                self.out_buff.append(rec)
                self.index.insert(rec)

            # if output buffer is full then flush the output
            if len(self.out_buff) >= self.NTB:
                self.flush_output()
    finally:
        # Close both files even when reading or indexing fails part-way.
        self.dup_close()
def test_keys_and_values(self):
    """Storing eight distinct keys must yield eight keys and eight values."""
    hash_table = MyHash()

    # random character generator for easy test insertion
    def random_val():
        return random.choice(string.ascii_lowercase)

    # random.sample draws without replacement, so the eight keys are
    # guaranteed distinct. Independently drawn single letters collide most
    # of the time, which made the length assertions below fail at random.
    for key in random.sample(string.ascii_lowercase, 8):
        hash_table.set(key, random_val())

    self.assertEqual(len(hash_table.values()), 8)
    self.assertEqual(len(hash_table.keys()), 8)
def test_delete(self):
    """Deleting a stored key must leave fewer values in the table."""
    table = MyHash()

    # Store a single entry and remember how many values exist before removal.
    table.set('hello', 'world')
    count_before = len(table.values())

    table.delete('hello')
    count_after = len(table.values())

    self.assertLess(count_after, count_before)
def test_resize(self):
    """The bucket array must grow under load and shrink back when emptied."""
    hash_table = MyHash()

    def random_word():
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(3))

    # test increasing in size when load factor is met
    original_length = len(hash_table._buckets)

    # Collect exactly 30 distinct keys first; independently drawn keys can
    # collide, which would make the number of stored entries vary per run.
    keys = set()
    while len(keys) < 30:
        keys.add(random_word())
    for key in keys:
        hash_table.set(key, random_word())

    new_length = len(hash_table._buckets)
    self.assertGreater(new_length, original_length)

    # test decreasing in size when load is too small
    # list(...) takes a snapshot, so slicing works whether keys() returns a
    # list or a view, and the table is not mutated while being iterated.
    for key in list(hash_table.keys())[:26]:
        hash_table.delete(key)
    new_length = len(hash_table._buckets)
    self.assertEqual(new_length, original_length)
class DuplicateElimination(object):
    """Copy a text file of records to another file, dropping duplicates.

    Each input line is one record. Records are read block-wise into
    ``MMB - 1`` input buffers, looked up in an index (``BTree`` or
    ``MyHash``), and written through a single output buffer when they have
    not been seen before. Records are served round-robin from the input
    buffers and taken from the end of each buffer, so the output order is
    not the input order.
    """

    def __init__(self, input_file, output_file, MMB, index_type):
        """Run the whole duplicate-elimination job.

        Args:
            input_file: Path of the file to read records from.
            output_file: Path of the file the unique records are written to.
            MMB: Number of main memory blocks; one is used for output and
                the remaining ``MMB - 1`` for input, so it must be >= 2.
            index_type: ``"btree"`` or ``"hash"``.

        Raises:
            SystemExit: With code ``-1`` (after printing a message) when
                ``index_type`` is not recognised.
            ValueError: When ``MMB`` is smaller than 2.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.MMB = MMB

        if index_type == "btree":
            self.index = BTree()
        elif index_type == "hash":
            self.index = MyHash()
        else:
            print("Invalid index type")
            # Same effect as the interactive ``exit(-1)`` helper, but it
            # does not rely on the ``site`` module having injected ``exit``.
            raise SystemExit(-1)

        self.dup_open()
        try:
            while True:
                rec = self.get_next()

                # input file finished
                if rec is None:
                    self.flush_output()
                    break

                # append the record to output buffer if unique
                if not self.index.search(rec):
                    self.out_buff.append(rec)
                    self.index.insert(rec)

                # if output buffer is full then flush the output
                if len(self.out_buff) >= self.NTB:
                    self.flush_output()
        finally:
            # Close both files even when reading or indexing fails part-way.
            self.dup_close()

    def flush_output(self):
        """Write the buffered records to the output file and empty the buffer."""
        if not self.out_buff:
            return
        # Every record already ends with a newline (see get_next), so the
        # records are concatenated as they are.
        self.out_fd.write("".join(self.out_buff))
        self.out_fd.flush()
        del self.out_buff[:]

    def dup_open(self):
        """Collect input statistics, allocate the buffers and open both files.

        Sets ``NR`` (non-blank lines), ``NC`` (columns of the first
        non-blank line), ``NTB`` (tuples per block), ``out_buff``,
        ``inp_buff``, ``inp_idx``, ``inp_fd`` and ``out_fd``.

        Raises:
            ValueError: When ``MMB`` is smaller than 2, because no input
                buffer would be left.
        """
        if self.MMB < 2:
            raise ValueError(
                "MMB must be at least 2 (one output block plus at least one "
                "input block)"
            )

        # One pass over the input: count the non-blank lines and take the
        # column count from the first of them.
        self.NR = 0
        self.NC = 0
        with open(self.input_file, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                if self.NR == 0:
                    self.NC = len(fields)
                self.NR += 1

        # assuming that each value in tuple is 32-bit (4-byte) int
        if self.NC:
            # Keep at least one tuple per block so that a tuple wider than
            # a block can still be read and the block count stays defined.
            self.NTB = max(1, int(BLOCK_SIZE / (4 * self.NC)))
        else:
            # Input without any tuple: any positive block size will do.
            self.NTB = 1

        print("MMB = number of main memory blocks = ", self.MMB)
        print("NTB = number of tuples in a block = ", self.NTB)
        print("NR = number of tuples in relation = ", self.NR)
        print("NC = number of cols in relation = ", self.NC)
        print("BR = number of blocks in relation = ", self.NR//self.NTB)

        self.out_buff = []
        self.inp_buff = [[] for i in range(self.MMB - 1)]
        self.inp_idx = 0

        self.inp_fd = open(self.input_file, "r")
        try:
            self.out_fd = open(self.output_file, "w")
        except (IOError, OSError):
            # Do not leak the input handle when the output cannot be opened.
            self.inp_fd.close()
            raise

    def dup_close(self):
        """Close the input and output files."""
        self.inp_fd.close()
        self.out_fd.close()

    def get_next(self):
        """Return the next input record, or ``None`` when input is exhausted.

        Every returned record ends with a newline, including a final input
        line that lacked one.
        """
        n_buffers = len(self.inp_buff)

        # One visit more than there are buffers: the extra visit comes back
        # to the starting buffer after it may have been refilled.
        for _ in range(n_buffers + 1):
            buf = self.inp_buff[self.inp_idx]
            self.inp_idx = (self.inp_idx + 1) % n_buffers

            if buf:
                return buf.pop()

            # Buffer is empty: refill it with up to one block of lines.
            for _ in range(self.NTB):
                row = self.inp_fd.readline()
                if not row:
                    break
                # A last line without a newline would otherwise be glued to
                # the following record in the output and would never match
                # an identical record that does end with a newline.
                if not row.endswith("\n"):
                    row += "\n"
                buf.append(row)
        return None
def test_set_and_get(self):
    """Values stored with set() come back from get(); unknown keys raise."""
    table = MyHash()

    # string key and value
    table.set('hello', 'world')
    self.assertEqual(table.get('hello'), 'world')

    # integer key and value
    table.set(3, 5)
    self.assertEqual(table.get(3), 5)

    # arbitrary object instances as key and value
    class KeyType(object):
        pass

    class ValueType(object):
        pass

    obj_key = KeyType()
    table.set(obj_key, ValueType())
    self.assertIsInstance(table.get(obj_key), ValueType)

    # a key that was never stored must raise KeyError
    with self.assertRaises(KeyError):
        table.get('test')