def write_to(self, block, dry_run=False):
    '''
    Write relevant data sections of self to block's file name.

    Arguments:
        block: target block; must have a file_name.
        dry_run: if True, only count bytes, don't touch the file system.

    Return: (total_bytes, seeks, write_time) — total number of bytes
    written, number of seeks required in block, and time spent in
    write calls. Similar to read_from but for writes.
    '''
    if not self.overlap(block):
        # Bug fix: was `return 0, 0` — every other exit path returns a
        # 3-tuple, so callers unpacking (bytes, seeks, time) would crash.
        return 0, 0, 0
    assert(block.file_name), f"Block {block} has no file name"

    data_b = self.get_data_block(block, dry_run)
    data = data_b.data
    _, _, block_offsets = block.block_offsets(data_b)
    # block offsets are now the offsets in the block to be written
    data_offset = 0
    # offsets come in (start, end) pairs, hence one seek per pair
    seeks = len(block_offsets) / 2

    mode = 'wb'
    if os.path.exists(block.file_name):
        # if file already exists, open in r+b mode
        # to modify without overwriting
        mode = 'r+b'
    write_time = 0
    with open(block.file_name, mode) as f:
        total_bytes = 0
        for i, r in enumerate(block_offsets):
            if i % 2 == 1:
                continue  # odd indices are pair ends, handled with the start
            next_data_offset = (data_offset
                                + block_offsets[i+1] - block_offsets[i] + 1)
            if dry_run:
                wrote_bytes = next_data_offset - data_offset
            else:
                start = time.time()
                f.seek(block_offsets[i])
                wrote_bytes = f.write(data.get(data_offset, next_data_offset))
                write_time += time.time() - start
            total_bytes += wrote_bytes
            data_offset = next_data_offset
        if total_bytes != 0:
            log(f' Wrote {total_bytes} bytes to {block.file_name} '
                f'({len(block_offsets)/2} seeks)', 0)
        # Note: removed the redundant f.close(); the with statement
        # closes the file.
    return total_bytes, seeks, write_time
def read_from(self, block, dry_run=False):
    '''
    Read the relevant data sections of self from block's file name.
    In general, block doesn't have the same origin or shape as self.

    Arguments:
        block: source block to read from.
        dry_run: if True, only estimate sizes, don't touch the file system.

    Return: (total_bytes, seeks, read_time) — total number of bytes read,
    number of seeks required in block, and time spent in read calls.
    Similar to write_to but for reading.
    '''
    if not self.overlap(block):
        # Bug fix: was `return 0, 0` — other exit paths return a 3-tuple.
        return 0, 0, 0
    data = bytearray()
    origin, shape, block_offsets = block.block_offsets(self)
    if len(block_offsets) == 0:
        # nothing to read (bug fix: was a 2-tuple here as well)
        return 0, 0, 0

    # Read in block; offsets come in (start, end) pairs: one seek per pair
    seeks = len(block_offsets)/2
    est_total_bytes = sum([block_offsets[i+1]-block_offsets[i] + 1
                           if i % 2 == 0 else 0
                           for i in range(len(block_offsets))])
    if dry_run:
        self.set_data_size(self.get_data_size() + est_total_bytes)
        return est_total_bytes, seeks, 0

    read_time = 0
    with open(block.file_name, 'rb') as f:
        log(f'<< Reading from {block.file_name}'
            f' ({len(block_offsets)/2} seeks)', 0)
        total_bytes = 0
        for i, r in enumerate(block_offsets):
            if i % 2 == 1:
                continue  # odd indices are pair ends, consumed with the start
            start = time.time()
            f.seek(block_offsets[i])
            data += f.read(block_offsets[i+1]-block_offsets[i] + 1)
            read_time += time.time() - start
            total_bytes += block_offsets[i+1]-block_offsets[i] + 1
        # Bug fix: the continuation strings were missing the f prefix, so
        # {total_bytes} and {block} were printed literally in the message.
        assert(len(data) == total_bytes), (f'Data size: {len(data)}, '
                                           f'read {total_bytes} bytes '
                                           f' from block {block}')
        log(f'Read {total_bytes} bytes', 0)

    # Write data block to self
    data_block = Block(origin=origin, shape=shape, data=data)
    self.put_data_block(data_block)
    assert(total_bytes == est_total_bytes)
    return total_bytes, seeks, read_time
def read(self):
    '''
    Read the block from self.file_name. The file has to contain the
    block and only the block.

    Return: (bytes_read, read_time). Similar to write but for reading.
    '''
    if self.data.mem_usage() == math.prod(self.shape):
        # don't read the block again if it was already read
        # TODO: investigate why this is happening
        # Bug fix: was returning a bare scalar here while the normal path
        # returns (bytes, time); callers unpacking two values would crash.
        return self.data.mem_usage(), 0
    log(f'<< Reading {self.file_name}', 0)
    start = time.time()
    with open(self.file_name, 'rb') as f:
        data = f.read()
    read_time = time.time() - start
    self.data.put(0, data)
    message = (f'Block contains {self.data.mem_usage()}B but shape is '
               f' {math.prod(self.shape)}B')
    assert(self.data.mem_usage() == math.prod(self.shape)), message
    return self.data.mem_usage(), read_time
def find_shape_with_constraint(in_blocks, out_blocks, m):
    '''
    Search for a read block shape that respects memory constraint m.

    Arguments:
        in_blocks: input partition (must be 3-dimensional).
        out_blocks: output partition.
        m: memory constraint in bytes, or None for no constraint.

    Return: (shape, mem) — the chosen read shape and its estimated peak
    memory. Raises AssertionError if no shape satisfies the constraint.
    '''
    assert (in_blocks.ndim == 3), 'Only supports dimension 3'

    # r_hat is the best shape; if it fits in memory or there is no memory
    # constraint, return it
    r_hat = get_r_hat(in_blocks, out_blocks)
    log(f'keep: rhat is {r_hat}')
    mc = peak_memory(r_hat, in_blocks, out_blocks)
    if m is None or mc <= m:
        return r_hat, mc

    array = in_blocks.array
    # Evaluate shapes of the form (d, r_hat[1], r_hat[2]) where d is a
    # divisor of the array's first dimension, trying the largest first.
    divs0 = sorted([x for x in divisors(array.shape[0]) if x <= r_hat[0]],
                   reverse=True)
    # Simplification: the original looped over range(min(nmax, len(divs0)))
    # with nmax = len(divs0) — the min() was redundant; iterate directly.
    for d in divs0:
        shape = (d, r_hat[1], r_hat[2])
        log(f'Evaluating shape {shape}, memory constraint is {m}', 1)
        mc = peak_memory(shape, in_blocks, out_blocks)
        log(f'Memory estimate: {mc}B', 1)
        if mc <= m:
            return shape, mc

    # We're going to have to seek in the second dimension, let's just give up
    assert (False), "Cannot find read shape that satisfies memory constraint"
def repartition(self, out_blocks, m, get_read_blocks_and_cache,
                dry_run=False):
    '''
    Write data from self in files of partition out_blocks.
    Implements Algorithm 1 in the paper.

    Arguments:
        out_blocks: a partition. The blocks of this partition are written.
        m: memory constraint.
        get_read_blocks_and_cache: function that returns read blocks and
            an initialized cache from (in_blocks, out_blocks, m, array).
        dry_run: if True, only simulate reads and writes.

    Return: (total_bytes, seeks, peak_mem, read_time, write_time) — bytes
    read or written, seeks done, peak cache memory, and I/O times.
    '''
    log('')
    log(f'repartition: # Repartitioning {self.name} in {out_blocks.name}')
    r, c, e, p = get_read_blocks_and_cache(self, out_blocks, m, self.array)
    read_blocks, cache, expected_seeks, est_peak_mem = (r, c, e, p)
    seeks = 0
    peak_mem = 0
    total_bytes = 0
    bytes_in_cache = 0
    read_time = 0
    write_time = 0
    for read_block in read_blocks.blocks:
        log(f'repartition: reading block: {read_block}', 0)
        t, s, rt = self.read_block(read_blocks.blocks[read_block], dry_run)
        bytes_in_cache += t
        total_bytes += t
        seeks += s
        read_time += rt
        log(f'repartition: inserting read block of size '
            f'{read_blocks.blocks[read_block].mem_usage()}B to cache')
        # Cache returns the output blocks completed by this insertion
        complete_blocks = cache.insert(read_blocks.blocks[read_block],
                                       dry_run)
        log(f'repartition: Cache: {str(cache)}', 0)
        peak_mem = max(peak_mem, cache.mem_usage())
        for b in complete_blocks:
            log(f'repartition: Writing complete block {b}', 0)
            t, s, wt = out_blocks.write_block(b, dry_run)
            assert (t == b.mem_usage())
            # Free the block's data as soon as it is written
            b.clear()
            bytes_in_cache -= t
            log(f'repartition: Write required {s} seeks', 0)
            log(f'repartition: Cache: {str(cache)}', 0)
            total_bytes += t
            seeks += s
            write_time += wt
            # Bug fix: removed a second, redundant b.clear() that appeared
            # here — the block was already cleared right after the write.
    message = (f'{bytes_in_cache}, {cache.mem_usage()}')
    assert (bytes_in_cache == cache.mem_usage()), message
    message = (f'Incorrect seek count. Expected: {expected_seeks}.'
               f' Real: {seeks}')
    assert (dry_run or (expected_seeks == seeks)), message
    message = (f'Incorrect memory usage. Expected: {est_peak_mem}B.'
               f' Real: {peak_mem}B.')
    assert (dry_run or (est_peak_mem == peak_mem)), message
    return total_bytes, seeks, peak_mem, read_time, write_time
def main(args=None):
    '''
    Command-line entry point.

    Parses the array shape (A), input block shape (I) and output block
    shape (O), then runs one of the mutually-exclusive commands:
    --create, --repartition, --delete or --test-data, using the chosen
    repartitioning method ("baseline" or "keep").
    '''
    parser = ArgumentParser()
    parser.add_argument("A", action="store",
                        help="shape of the reconstructed array")
    parser.add_argument(
        "I",
        action="store",
        help="shape of the input blocks. Input blocks "
        "called 'in...' must be stored on disk",
    )
    parser.add_argument(
        "O",
        action="store",
        # Bug fix: typo in user-facing help text ("outut" -> "output")
        help="shape of the output blocks. Output blocks"
        " called 'out...' will be created on disk",
    )
    commands = parser.add_mutually_exclusive_group()
    commands.add_argument(
        "--create",
        action="store_true",
        help="create input blocks on disk" " before repartitioning.",
    )
    commands.add_argument(
        "--repartition",
        action="store_true",
        help="repartition input blocks to output block dimensions",
    )
    commands.add_argument(
        "--delete",
        action="store_true",
        help="delete output blocks after repartitioning.",
    )
    commands.add_argument(
        "--test-data",
        action="store_true",
        help="reconstruct array from input blocks, "
        "reconstruct array from output blocks, "
        "check that data is identical in both "
        "reconstructions.",
    )
    parser.add_argument("--max-mem", action="store",
                        help="max memory to use, in bytes")
    parser.add_argument(
        "method",
        action="store",
        help="repartitioning method to use",
        choices=["baseline", "keep"],
    )
    args, params = parser.parse_known_args(args)
    mem = args.max_mem
    if mem is not None:
        mem = int(mem)
    # Dispatch table from method name to repartitioning strategy
    repart_func = {"baseline": keep.baseline, "keep": keep.keep}

    array = Partition(make_tuple(args.A), name="array")
    if args.create:
        fill = "random"
        log("Creating input blocks", 1)
    else:
        fill = None
        log("Using existing input blocks", 1)
    in_blocks = Partition(make_tuple(args.I), name="in", array=array,
                          fill=fill)
    in_blocks.clear()
    if not args.create:
        # out_blocks is only needed by the non-create commands; the
        # commands are mutually exclusive, so it is defined whenever used
        out_blocks = Partition(make_tuple(args.O), name="out", array=array)

    # Repartitioning
    if args.repartition:
        log("Repartitioning input blocks into output blocks", 1)
        out_blocks.delete()
        out_blocks.clear()  # shouldn't be necessary but just in case
        start = time.time()
        (
            total_bytes,
            seeks,
            peak_mem,
            read_time,
            write_time,
        ) = in_blocks.repartition(out_blocks, mem, repart_func[args.method])
        end = time.time()
        total_time = end - start
        # Sanity checks: elapsed time bounds I/O time, and every byte is
        # both read and written exactly once
        assert total_time > read_time + write_time
        assert total_bytes == 2 * math.prod(array.shape)
        log(
            f"Seeks, peak memory (B), read time (s),"
            f" write time (s), elapsed time (s):" + os.linesep +
            f"{seeks},{peak_mem},{round(read_time,2)},"
            f"{round(write_time,2)},{round(total_time,2)}",
            2,
        )

    if args.test_data:
        log("Testing data", 1)
        # Rebuild the full array from input blocks, then from output
        # blocks, and compare the reconstructed bytes
        in_blocks.repartition(array, mem, repart_func[args.method])
        with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
            in_data = f.read()
        array.delete()
        out_blocks.repartition(array, mem, repart_func[args.method])
        with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
            out_data = f.read()
        assert in_data == out_data

    if args.delete:
        log("Deleting output blocks", 1)
        out_blocks.delete()