def setUp(self):
    """Prepare a 3-block index file 'foobar': an all-zero meta block
    followed by two serialized leaf nodes."""
    Node = node_factory('<i')
    left = Node(True, [2, 7, 11], [32, 87, 43, 2])
    right = Node(True, [15, 24, 31], [45, 67, 89, 0])
    # create/truncate the backing file
    with open('foobar', 'wb') as file:
        pass
    manager = BufferManager()
    payloads = (
        b'\0' * BufferManager.block_size,  # block 0: empty meta block
        bytes(left),                       # block 1: first leaf
        bytes(right),                      # block 2: second leaf
    )
    for offset, payload in enumerate(payloads):
        block = manager.get_file_block('foobar', offset)
        with pin(block):
            block.write(payload)
    self.Node = Node
    self.left = left
def test_buffer_manager(self):
    """Exercise cache hit, pinning, capacity limit, LRU swap and flush."""
    manager = BufferManager()
    first = manager.get_file_block('foo', 0)
    first.pin()
    self.assertEqual(first.read(), b'Hello')
    # the same (file, offset) must yield the cached block object
    alias = manager.get_file_block('./foo', 0)  # test cache hit
    self.assertTrue(first is alias)
    first.write(b'hello')  # a is not flushed
    second = manager.get_file_block('foo', 1)
    second.pin()
    time.sleep(0.5)
    self.assertEqual(second.read(), b' Worl')
    # both cached blocks are pinned, so a third request must fail
    with self.assertRaises(RuntimeError):
        third = manager.get_file_block('foo', 2)  # test buffer run out of space
    first.unpin()
    second.unpin()
    third = manager.get_file_block('foo', 2)  # test lru swap
    # a should be swapped out
    self.assertFalse((os.path.abspath('foo'), 0) in manager._blocks.keys())
    # b should remain in the buffer
    self.assertTrue((os.path.abspath('foo'), 1) in manager._blocks.keys())
    # the swapped out block is flushed to disk
    with open('foo', 'rb') as file:
        self.assertEqual(file.read(), b'hello World')
class LeafIterator:
    """Iterate (key, value) pairs along the chained leaf nodes of an index.

    Starts at `key_position` inside `node`; when a leaf is exhausted it
    follows the last child pointer (the next-leaf offset, 0 = end of chain).
    """

    def __init__(self, Node, index_file_path, node, key_position):
        self.Node = Node
        self.index_file_path = index_file_path
        self.node = node
        self.key_position = key_position
        self.manager = BufferManager()

    def __iter__(self):
        return self

    def __next__(self):
        pos = self.key_position
        value = self.node.children[pos]
        if pos < len(self.node.keys):
            # still inside the current leaf
            self.key_position = pos + 1
            return self.node.keys[pos], value
        # past the last key: `value` is the offset of the next leaf
        if value == 0:
            raise StopIteration
        node_block = self.manager.get_file_block(self.index_file_path, value)
        with pin(node_block):
            self.node = self.Node.frombytes(node_block.read())
        self.key_position = 1
        return self.node.keys[0], self.node.children[0]
class IndexManager:
    """B+-tree index stored in a block file managed by BufferManager.

    Block 0 is the header; every other block holds one serialized node.
    Deleted blocks are kept in a singly linked free list threaded through
    the file (``first_deleted_block`` is its head).
    """

    def __init__(self, index_file_path, fmt):
        """specify the path of the index file and the format of the keys,
        return a index manager
        if the index file exists, read data from the file
        otherwise create it and initialize its header info
        multiple index manager on the same file MUSTN'T simultaneously exist"""
        self.Node = node_factory(fmt)
        self.index_file_path = index_file_path
        self._manager = BufferManager()
        # total blocks, offset of the first deleted block,
        # offset of the root node, offset of the first leaf node
        self.meta_struct = Struct('<4i')
        try:
            meta_block = self._manager.get_file_block(self.index_file_path, 0)
            with pin(meta_block):
                self.total_blocks, self.first_deleted_block, self.root, self.first_leaf = \
                    self.meta_struct.unpack(meta_block.read()[:self.meta_struct.size])
        except FileNotFoundError:
            # create and initialize an index file if not exists
            self.total_blocks, self.first_deleted_block, self.root, self.first_leaf = 1, 0, 0, 0
            with open(index_file_path, 'wb') as f:
                f.write(self.meta_struct.pack(self.total_blocks,
                                              self.first_deleted_block,
                                              self.root,
                                              self.first_leaf).ljust(BufferManager.block_size, b'\0'))

    def dump_header(self):
        """write the header info to the index file
        MUST be called before the program exits,
        otherwise the header info in the file won't be updated"""
        meta_block = self._manager.get_file_block(self.index_file_path, 0)
        with pin(meta_block):
            meta_block.write(self.meta_struct.pack(self.total_blocks,
                                                   self.first_deleted_block,
                                                   self.root,
                                                   self.first_leaf).ljust(BufferManager.block_size, b'\0'))

    def _get_free_block(self):
        """return a free block and update header info,
        assuming this block will be used"""
        if self.first_deleted_block > 0:
            # reuse the head of the deleted-block free list
            block_offset = self.first_deleted_block
            block = self._manager.get_file_block(self.index_file_path, block_offset)
            s = Struct('<i')
            next_deleted = s.unpack(block.read()[:s.size])[0]
            self.first_deleted_block = next_deleted
            return block
        else:
            # no deleted block to reuse; grow the file by one block
            block_offset = self.total_blocks
            block = self._manager.get_file_block(self.index_file_path, block_offset)
            self.total_blocks += 1
            return block

    def _delete_node(self, node, block):
        """delete node and writes it to block
        just a shortcut to mark a block as deleted"""
        with pin(block):
            node.next_deleted = self.first_deleted_block
            block.write(bytes(node))
        self.first_deleted_block = block.block_offset

    def _find_leaf(self, key):
        """find the first leaf node where key may reside
        key may not really reside in this node,
        in this case, the index file has no such key
        returns (node, node_block, path_to_parents)"""
        key = _convert_to_tuple(key)
        node_block_offset = self.root
        path_to_parents = []
        while True:  # find the insert position
            node_block = self._manager.get_file_block(self.index_file_path,
                                                      node_block_offset)
            with pin(node_block):
                node = self.Node.frombytes(node_block.read())
            if node.is_leaf:
                return node, node_block, path_to_parents
            else:  # continue searching
                path_to_parents.append(node_block_offset)
                child_index = bisect.bisect_right(node.keys, key)
                node_block_offset = node.children[child_index]

    def _handle_overflow(self, node, block, path_to_parents):
        """split an overflowed node and propagate the split upwards"""
        if not path_to_parents:  # the root overflowed
            new_block = self._get_free_block()
            new_node, key, value = node.split(new_block.block_offset)
            with pin(block), pin(new_block):
                block.write(bytes(node))
                new_block.write(bytes(new_node))
            # grow the tree by one level: a new root with a single key
            new_root_block = self._get_free_block()
            with pin(new_root_block):
                new_root_node = self.Node(False, [key],
                                          [block.block_offset, new_block.block_offset])
                new_root_block.write(bytes(new_root_node))
            self.root = new_root_block.block_offset
            return
        else:
            parent_offset = path_to_parents.pop()
            new_block = self._get_free_block()
            new_node, key, value = node.split(new_block.block_offset)
            with pin(block), pin(new_block):
                block.write(bytes(node))
                new_block.write(bytes(new_node))
            parent_block = self._manager.get_file_block(self.index_file_path,
                                                        parent_offset)
            parent_node = self.Node.frombytes(parent_block.read())
            parent_node.insert(key, value)
            if len(parent_node.keys) <= self.Node.n:
                with pin(parent_block):
                    parent_block.write(bytes(parent_node))
            else:
                self._handle_overflow(parent_node, parent_block, path_to_parents)

    def _handle_underflow(self, node, block, path_to_parents):
        """handle underflow after deletion
        will try to transfer from the left sibling first
        then try to transfer from the right sibling
        then try to fuse with the left sibling
        then try to fuse with the right sibling"""
        if block.block_offset == self.root:
            if not node.keys:  # root has no key at all; this node is no longer needed
                if node.is_leaf:
                    self.root = 0
                    self.first_leaf = 0
                else:
                    self.root = node.children[0]
                self._delete_node(node, block)
            else:
                block.write(bytes(node))
            return  # root underflow is not a problem
        parent_offset = path_to_parents.pop()
        parent_block = self._manager.get_file_block(self.index_file_path,
                                                    parent_offset)
        with pin(parent_block):
            parent = self.Node.frombytes(parent_block.read())
        my_position = bisect.bisect_right(parent.keys, node.keys[0])
        if my_position > 0:  # try find the left sibling
            left_sibling_offset = parent.children[my_position - 1]
            left_sibling_block = self._manager.get_file_block(
                self.index_file_path, left_sibling_offset)
            with pin(left_sibling_block):
                left_sibling = self.Node.frombytes(left_sibling_block.read())
            if len(left_sibling.keys) > ceil(node.n / 2):  # a transfer is possible
                node.transfer_from_left(left_sibling, parent, my_position - 1)
                with pin(block), pin(left_sibling_block), pin(parent_block):
                    block.write(bytes(node))
                    left_sibling_block.write(bytes(left_sibling))
                    parent_block.write(bytes(parent))
                return
        else:
            left_sibling = None  # no left sibling
        # BUG FIX: the right sibling (children[my_position + 1]) exists
        # whenever my_position < len(parent.keys); the previous
        # `< len(parent.keys) - 1` check skipped the second-to-last child
        # and could leave BOTH siblings None for a 1-key parent.
        if my_position < len(parent.keys):  # try find the right sibling
            right_sibling_offset = parent.children[my_position + 1]
            right_sibling_block = self._manager.get_file_block(
                self.index_file_path, right_sibling_offset)
            with pin(right_sibling_block):
                right_sibling = self.Node.frombytes(right_sibling_block.read())
            if len(right_sibling.keys) > ceil(node.n / 2):  # a transfer is possible
                node.transfer_from_right(right_sibling, parent, my_position)
                with pin(block), pin(right_sibling_block), pin(parent_block):
                    block.write(bytes(node))
                    right_sibling_block.write(bytes(right_sibling))
                    parent_block.write(bytes(parent))
                return
        else:
            right_sibling = None  # no right sibling
        if left_sibling is not None:  # fuse with left sibling
            left_sibling.fuse_with(node, parent, my_position - 1)
            with pin(left_sibling_block):
                left_sibling_block.write(bytes(left_sibling))
            self._delete_node(node, block)
        else:  # fuse with right sibling
            node.fuse_with(right_sibling, parent, my_position)
            with pin(block):
                block.write(bytes(node))
            self._delete_node(right_sibling, right_sibling_block)
        if len(parent.keys) >= ceil(node.n / 2):
            # BUG FIX: fuse_with removed a key from the in-memory parent;
            # it must be written back, otherwise the removal is lost on disk
            with pin(parent_block):
                parent_block.write(bytes(parent))
            return
        else:
            # parent underflowed in turn; recursion will persist it
            self._handle_underflow(parent, parent_block, path_to_parents)

    def find(self, key):
        """find the smallest key >= (parameter) key
        return an iterator from this position
        raise RuntimeError if the index is empty"""
        key = _convert_to_tuple(key)
        if self.root == 0:
            raise RuntimeError('cannot find from empty index')
        else:
            node, node_block, path_to_parents = self._find_leaf(key)
            key_position = bisect.bisect_left(node.keys, key)
            return LeafIterator(self.Node, self.index_file_path, node, key_position)

    def insert(self, key, value):
        """insert a key-value pair into the index file
        if key already in this index, raise ValueError"""
        key = _convert_to_tuple(key)
        if self.root == 0:
            # empty index: the new leaf becomes both root and first leaf
            block = self._get_free_block()
            with pin(block):
                self.root = block.block_offset
                self.first_leaf = self.root
                node = self.Node(is_leaf=True, keys=[key], children=[value, 0])
                block.write(bytes(node))
        else:
            node, node_block, path_to_parents = self._find_leaf(key)
            key_position = bisect.bisect_left(node.keys, key)
            if key_position < len(node.keys) and node.keys[key_position] == key:
                raise ValueError('duplicate key {}'.format(key))
            node.insert(key, value)
            if len(node.keys) <= node.n:
                node_block.write(bytes(node))
                return
            else:  # split
                self._handle_overflow(node, node_block, path_to_parents)

    def delete(self, key):
        """delete the key-value pair with key equal the parameter
        if the index file has no such key, raise ValueError"""
        key = _convert_to_tuple(key)
        if self.root == 0:
            raise ValueError('can\'t delete from empty index')
        else:
            node, node_block, path_to_parents = self._find_leaf(key)
            key_position = bisect.bisect_left(node.keys, key)
            if key_position < len(node.keys) and node.keys[key_position] == key:  # key match
                del node.keys[key_position]
                del node.children[key_position]
                if len(node.keys) >= ceil(node.n / 2):
                    node_block.write(bytes(node))
                    return
                else:  # underflow
                    self._handle_underflow(node, node_block, path_to_parents)
            else:  # key doesn't match
                raise ValueError('index has no such key {}'.format(key))

    def iter_leaves(self):
        """return an iterator at the beginning of the leaf node chain"""
        if self.first_leaf == 0:
            raise RuntimeError('can\'t iter from empty index')
        first_leaf_block = self._manager.get_file_block(
            self.index_file_path, self.first_leaf)
        first_leaf = self.Node.frombytes(first_leaf_block.read())
        return LeafIterator(self.Node, self.index_file_path, first_leaf, 0)
class Record:
    """Fixed-size record file backed by the block buffer manager.

    Block 0 starts with a header (first free record offset, tail record
    offset).  Every record carries two trailing bookkeeping fields:
    a valid bit (b'1' = real, b'0' = deleted) and the offset of the next
    record in the free list.
    """

    # The format of header should be the same for all records files.
    header_format = '<ii'  # will be confirmed by RecordManager
    header_struct = Struct(header_format)

    def __init__(self, file_path, fmt):
        self.buffer_manager = BufferManager()
        self.filename = file_path
        # Each record in file has 2 extra info: next's record_off and valid bit
        self.record_struct = Struct(fmt + 'ci')
        self.first_free_rec, self.rec_tail = self._parse_header()

    def insert(self, attributes):
        """Insert the given record; return the record offset it was stored at"""
        record_info = convert_str_to_bytes(attributes) + (b'1', -1)  # valid bit, next free space
        self.first_free_rec, self.rec_tail = self._parse_header()
        if self.first_free_rec >= 0:  # There are space in free list
            first_free_blk, local_offset = self._calc(self.first_free_rec)
            block = self.buffer_manager.get_file_block(self.filename, first_free_blk)
            with pin(block):
                data = block.read()
                records = self._parse_block_data(data, first_free_blk)
                # BUG FIX: `records` is block-local, so it must be indexed with
                # local_offset; indexing with the global self.first_free_rec
                # was wrong for any record outside the first block
                next_free_rec = records[local_offset][-1]
                records[local_offset] = record_info
                new_data = self._generate_new_data(records, first_free_blk)
                block.write(new_data)
            position = self.first_free_rec
            self.first_free_rec = next_free_rec
        else:  # No space in free list, append the new record to the end of file
            self.rec_tail += 1
            block_offset, local_offset = self._calc(self.rec_tail)
            block = self.buffer_manager.get_file_block(self.filename, block_offset)
            with pin(block):
                data = block.read()
                records = self._parse_block_data(data, block_offset)
                records.append(record_info)
                new_data = self._generate_new_data(records, block_offset)
                block.write(new_data)
            position = self.rec_tail
        self._update_header()
        return position

    def remove(self, record_offset):
        """Remove the record at specified position and update the free list"""
        self.first_free_rec, self.rec_tail = self._parse_header()
        block_offset, local_offset = self._calc(record_offset)
        block = self.buffer_manager.get_file_block(self.filename, block_offset)
        with pin(block):
            data = block.read()
            records = self._parse_block_data(data, block_offset)
            try:
                records[local_offset][-1]
            except IndexError:
                raise IndexError('The offset points to an empty space')
            if records[local_offset][-2] == b'0':
                raise RuntimeError('Cannot remove an empty record')
            # A positive number, putting this position into free list
            records[local_offset][-1] = self.first_free_rec
            records[local_offset][-2] = b'0'
            self.first_free_rec = record_offset  # update the head of free list
            new_data = self._generate_new_data(records, block_offset)
            block.write(new_data)
        self._update_header()

    def modify(self, attributes, record_offset):
        """Modify the record at specified offset"""
        block_offset, local_offset = self._calc(record_offset)
        block = self.buffer_manager.get_file_block(self.filename, block_offset)
        record_info = convert_str_to_bytes(attributes) + (b'1', -1)  # Updated record must be real
        with pin(block):
            data = block.read()
            records = self._parse_block_data(data, block_offset)
            if records[local_offset][-2] == b'0':
                raise RuntimeError('Cannot update an empty record')
            records[local_offset] = record_info
            new_data = self._generate_new_data(records, block_offset)
            block.write(new_data)

    def read(self, record_offset):
        """Return the record at the corresponding position"""
        block_offset, local_offset = self._calc(record_offset)
        block = self.buffer_manager.get_file_block(self.filename, block_offset)
        with pin(block):
            data = block.read()
            records = self._parse_block_data(data, block_offset)
            if records[local_offset][-2] == b'0':
                raise RuntimeError('Cannot read an empty record')
            return convert_bytes_to_str(tuple(records[local_offset][:-2]))

    def scanning_select(self, conditions):
        """Return all valid records matching the conditions.
        condition should be a dict: { attribute offset : {operator : value } }"""
        total_blk = self._calc(self.rec_tail)[0] + 1
        result_set = []
        for block_offset in range(total_blk):
            block = self.buffer_manager.get_file_block(self.filename, block_offset)
            records = self._parse_block_data(block.read(), block_offset)
            result_set += tuple([
                convert_bytes_to_str(record[:-2]) for record in records
                if self._check_condition(record, conditions) is True
            ])
        return result_set

    def scanning_delete(self, conditions):
        """Mark every record matching the conditions as deleted and
        chain the freed slots into the free list"""
        total_blk = self._calc(self.rec_tail)[0] + 1
        record_offset = 0
        for block_offset in range(total_blk):
            block = self.buffer_manager.get_file_block(self.filename, block_offset)
            records = self._parse_block_data(block.read(), block_offset)
            for i, record in enumerate(records):
                # BUG FIX: _check_condition expects the raw record (it checks
                # the valid bit and converts to str itself, as scanning_select
                # does); the previous pre-conversion broke the valid-bit check
                if self._check_condition(record, conditions):
                    records[i][-2] = b'0'
                    records[i][-1] = self.first_free_rec
                    self.first_free_rec = record_offset
                record_offset += 1
            block.write(self._generate_new_data(records, block_offset))
        self._update_header()

    def scanning_update(self, conditions, attributes):
        """Overwrite every record matching the conditions with `attributes`.
        The file header won't change when updating"""
        total_blk = self._calc(self.rec_tail)[0] + 1
        new_record = convert_str_to_bytes(attributes) + (b'1', -1)
        for block_offset in range(total_blk):
            block = self.buffer_manager.get_file_block(self.filename, block_offset)
            records = self._parse_block_data(block.read(), block_offset)
            for i, record in enumerate(records):
                # BUG FIX: pass the raw record, as in scanning_select
                if self._check_condition(record, conditions):
                    records[i] = new_record
            block.write(self._generate_new_data(records, block_offset))

    def _calc(self, record_offset):
        """Map a global record offset to (block offset, offset within block);
        the first block holds fewer records because of the header"""
        rec_per_blk = BufferManager.block_size // self.record_struct.size
        rec_first_blk = (BufferManager.block_size -
                         self.header_struct.size) // self.record_struct.size
        if record_offset < rec_first_blk:  # in 1st block
            return 0, record_offset
        else:  # not in 1st block
            block_offset = (record_offset - rec_first_blk) // rec_per_blk + 1
            local_offset = record_offset - rec_first_blk - (block_offset - 1) * rec_per_blk
            return block_offset, local_offset

    @staticmethod
    def _check_condition(record, conditions):
        """Return True if the raw record is valid and satisfies every
        {position: {operator: value}} restriction in `conditions`"""
        if record[-2] == b'0':  # check the valid bit, return false when meet empty record
            return False
        str_record = convert_bytes_to_str(record[:-2])
        for position, condition in conditions.items():
            value = str_record[position]
            for operator_type, value_restriction in condition.items():
                if operator_type == '=':
                    if value != value_restriction:
                        return False
                elif operator_type == '>':
                    if value <= value_restriction:
                        return False
                elif operator_type == '<':
                    if value >= value_restriction:
                        return False
        return True

    def _generate_new_data(self, records, blk_offset):
        """Serialize a block's records back into raw block data"""
        if blk_offset == 0:
            # BUG FIX: preserve the current header instead of zeroing it;
            # modify()/scanning_update() never rewrite the header afterwards,
            # so writing zeros here destroyed it
            data = bytearray(self.header_struct.pack(self.first_free_rec,
                                                     self.rec_tail))
        else:
            data = bytearray()
        for r in records:
            data += self.record_struct.pack(*r)
        return data

    def _parse_block_data(self, data, blk_offset):
        """Unpack raw block data into a list of record field-lists"""
        upper_bound = len(data)
        # drop a trailing partial record, if any
        if (upper_bound - self.header_struct.size) % self.record_struct.size != 0:
            upper_bound -= self.record_struct.size
        if blk_offset == 0:  # is the first block, need to consider the header
            lower_bound = self.header_struct.size
        else:  # not the first block, all data are records
            lower_bound = 0
        records = [
            list(self.record_struct.unpack_from(data, offset))
            for offset in range(lower_bound, upper_bound, self.record_struct.size)
        ]
        return records

    def _parse_header(self):
        # Parse the file header, refresh corresponding info
        # and return the info with a tuple
        block = self.buffer_manager.get_file_block(self.filename, 0)  # Get the first block
        with pin(block):
            data = block.read()
            header_info = self.header_struct.unpack_from(data, 0)
        return header_info

    def _update_header(self):
        # Update the file header after modifying the records
        block = self.buffer_manager.get_file_block(self.filename, 0)
        with pin(block):
            # copy into a bytearray so the slice assignment below is
            # valid even if block.read() returns immutable bytes
            data = bytearray(block.read())
            header_info = (self.first_free_rec, self.rec_tail)
            data[:self.header_struct.size] = self.header_struct.pack(*header_info)
            block.write(data)
def test_detach(self):
    """After detaching a file, no cached blocks for it may remain."""
    buffer_manager = BufferManager()
    buffer_manager.get_file_block('foo', 0)
    buffer_manager.detach_from_file('foo')
    self.assertFalse(buffer_manager._blocks)