Beispiel #1
0
    def write_to(self, block, dry_run=False):
        '''
        Write relevant data sections of self to block's file name

        Arguments:
            block: block whose file (block.file_name) receives the data.
            dry_run: if True, don't seek or write, only count the bytes that
                     would be written. The file is still opened, which
                     creates it if it doesn't exist.

        Return: (total_bytes, seeks, write_time), the total number of bytes
        written, the number of seeks required in block and the time spent
        seeking and writing, in seconds. (0, 0, 0) is returned if self and
        block don't overlap.

        Similar to read_from but for writes.
        '''
        if not self.overlap(block):
            # same number of values as in the general case, so that the
            # result can always be unpacked in the same way
            return 0, 0, 0

        assert(block.file_name), f"Block {block} has no file name"

        data_b = self.get_data_block(block, dry_run)
        data = data_b.data

        _, _, block_offsets = block.block_offsets(data_b)
        # block offsets are now the offsets in the block to be written,
        # stored as consecutive (first, last) pairs, last offset included

        # one seek per (first, last) pair. Kept as a float (true division)
        # to leave the values reported to callers and logs unchanged
        seeks = len(block_offsets) / 2

        # if file already exists, open in r+b mode
        #  to modify without overwriting
        mode = 'r+b' if os.path.exists(block.file_name) else 'wb'

        data_offset = 0
        total_bytes = 0
        write_time = 0
        with open(block.file_name, mode) as f:
            for i in range(0, len(block_offsets), 2):
                # data is consumed sequentially: each pair takes as many
                # bytes from data as the length of its range in the file
                next_data_offset = (data_offset +
                                    block_offsets[i+1] -
                                    block_offsets[i] + 1)
                if dry_run:
                    wrote_bytes = next_data_offset - data_offset
                else:
                    start = time.time()
                    f.seek(block_offsets[i])
                    wrote_bytes = f.write(data.get(data_offset,
                                                   next_data_offset))
                    write_time += time.time() - start
                total_bytes += wrote_bytes
                data_offset = next_data_offset
            if total_bytes != 0:
                log(f'  Wrote {total_bytes} bytes to {block.file_name} '
                    f'({seeks} seeks)', 0)
        # no explicit close: the with statement already closed the file
        return total_bytes, seeks, write_time
Beispiel #2
0
    def read_from(self, block, dry_run=False):
        '''
        Read the relevant data sections of self from block's file name.
        In general, block doesn't have the same origin or shape as self.

        Arguments:
            block: block whose file (block.file_name) is read.
            dry_run: if True, don't open the file. The data size of self is
                     increased by the number of bytes that would be read.

        Return: (total_bytes, seeks, read_time), the total number of bytes
        read, the number of seeks required in block and the time spent
        seeking and reading, in seconds (0 in dry-run mode). (0, 0, 0) is
        returned if there is nothing to read.

        Similar to write_to but for reading
        '''

        if not self.overlap(block):
            # same number of values as in the general case, so that the
            # result can always be unpacked in the same way
            return 0, 0, 0

        origin, shape, block_offsets = block.block_offsets(self)
        if len(block_offsets) == 0:
            return 0, 0, 0  # nothing to read

        # block_offsets stores consecutive (first, last) pairs, last offset
        # included: one seek per pair. Kept as a float (true division) to
        # leave the values reported to callers and logs unchanged
        seeks = len(block_offsets)/2
        est_total_bytes = sum(block_offsets[i+1] - block_offsets[i] + 1
                              for i in range(0, len(block_offsets), 2))
        if dry_run:
            self.set_data_size(self.get_data_size() + est_total_bytes)
            return est_total_bytes, seeks, 0

        # Read in block
        data = bytearray()
        read_time = 0
        total_bytes = 0
        with open(block.file_name, 'rb') as f:
            log(f'<< Reading from {block.file_name}'
                f' ({seeks} seeks)', 0)
            for i in range(0, len(block_offsets), 2):
                length = block_offsets[i+1] - block_offsets[i] + 1
                start = time.time()
                f.seek(block_offsets[i])
                data += f.read(length)
                read_time += time.time() - start
                total_bytes += length
            # fails if the file is shorter than the requested ranges
            assert(len(data) == total_bytes), (f'Data size: {len(data)}, '
                                               f'read {total_bytes} bytes '
                                               f' from block {block}')
            log(f'Read {total_bytes} bytes', 0)

        # Write data block to self
        data_block = Block(origin=origin, shape=shape, data=data)
        self.put_data_block(data_block)
        assert(total_bytes == est_total_bytes)
        return total_bytes, seeks, read_time
Beispiel #3
0
    def read(self):
        '''
        Read the block from self.file_name. The file has to contain
        the block and only the block

        Return: (bytes, read_time), the number of bytes held by the block
        after the read and the time spent reading the file, in seconds.
        If the block was already read, the file isn't read again and
        read_time is 0.

        Similar to write but for reading
        '''
        expected_bytes = math.prod(self.shape)
        if self.data.mem_usage() == expected_bytes:
            # don't read the block again if it was already read
            # TODO: investigate why this is happening
            # Return a pair as in the general case: nothing was read, so
            # no time was spent reading
            return self.data.mem_usage(), 0

        log(f'<< Reading {self.file_name}', 0)
        start = time.time()
        with open(self.file_name, 'rb') as f:
            data = f.read()
        read_time = time.time() - start
        self.data.put(0, data)
        # the file has to contain exactly as many bytes as the block shape
        message = (f'Block contains {self.data.mem_usage()}B but shape is '
                   f' {expected_bytes}B')
        assert(self.data.mem_usage() == expected_bytes), message
        return self.data.mem_usage(), read_time
Beispiel #4
0
def find_shape_with_constraint(in_blocks, out_blocks, m):
    '''
    Search for a read block shape that respects memory constraint m

    Arguments:
        in_blocks: input partition, has to be of dimension 3.
        out_blocks: output partition.
        m: memory constraint, or None if memory isn't constrained.

    Return: (shape, mc), the selected read shape and its peak memory
    estimate as returned by peak_memory.

    Raise AssertionError if in_blocks isn't of dimension 3 or if no
    evaluated shape satisfies the memory constraint.
    '''

    # Explicit raises rather than assert statements: the checks remain
    # active when Python runs with -O, and callers see the same exception
    if in_blocks.ndim != 3:
        raise AssertionError('Only supports dimension 3')

    # r_hat is the best shape, if it fits in memory or there is no memory
    # constraint, return it
    r_hat = get_r_hat(in_blocks, out_blocks)
    log(f'keep: rhat is {r_hat}')
    mc = peak_memory(r_hat, in_blocks, out_blocks)
    if m is None or mc <= m:
        return r_hat, mc

    array = in_blocks.array

    # evaluate shapes of the form (d, r_hat[1], r_hat[2]), where d goes
    # through the divisors of the array's first dimension that don't
    # exceed r_hat[0], largest first: the first shape that fits is returned
    divs0 = sorted([x for x in divisors(array.shape[0]) if x <= r_hat[0]],
                   reverse=True)

    for div in divs0:
        shape = (div, r_hat[1], r_hat[2])
        log(f'Evaluating shape {shape}, memory constraint is {m}', 1)
        mc = peak_memory(shape, in_blocks, out_blocks)
        log(f'Memory estimate: {mc}B', 1)
        if mc <= m:
            return shape, mc

    # We're going to have to seek in the second dimension, let's just give up
    raise AssertionError(
        "Cannot find read shape that satisfies memory constraint")
Beispiel #5
0
    def repartition(self,
                    out_blocks,
                    m,
                    get_read_blocks_and_cache,
                    dry_run=False):
        '''
        Write data from self in files of partition out_blocks. Implements
        Algorithm 1 in the paper.

        Every read block is read and inserted in the cache; the blocks that
        the cache reports as complete are then written to out_blocks and
        cleared.

        Arguments:
            out_blocks: a partition. The blocks of this partition are written.
            m: memory constraint.
            get_read_blocks_and_cache: function that returns read blocks, an
                                       initialized cache, the expected number
                                       of seeks and the estimated peak memory
                                       from (in_blocks, out_blocks, m, array)
            dry_run: passed to the read, cache insertion and write
                     operations. Seek count and peak memory are only checked
                     against their expected values when dry_run is False.

        Return: (total_bytes, seeks, peak_mem, read_time, write_time), the
        number of bytes read or written, the number of seeks done, the
        maximum memory usage reported by the cache, and the accumulated
        read and write times.
        '''
        log('')
        log(f'repartition: # Repartitioning {self.name} in {out_blocks.name}')
        plan = get_read_blocks_and_cache(self, out_blocks, m, self.array)
        read_blocks, cache, expected_seeks, est_peak_mem = plan

        total_bytes = 0
        seeks = 0
        peak_mem = 0
        read_time = 0
        write_time = 0
        # bytes read but not written yet, checked against the cache usage
        # after every read block
        bytes_in_cache = 0

        for key in read_blocks.blocks:
            log(f'repartition: reading block: {key}', 0)
            current = read_blocks.blocks[key]
            n_read, read_seeks, elapsed = self.read_block(current, dry_run)
            total_bytes += n_read
            bytes_in_cache += n_read
            seeks += read_seeks
            read_time += elapsed

            log(f'repartition: inserting read block of size '
                f'{current.mem_usage()}B to cache')
            complete_blocks = cache.insert(current, dry_run)
            log(f'repartition: Cache: {str(cache)}', 0)
            peak_mem = max(peak_mem, cache.mem_usage())

            for done in complete_blocks:
                log(f'repartition: Writing complete block {done}', 0)
                n_written, write_seeks, elapsed = out_blocks.write_block(
                    done, dry_run)
                assert (n_written == done.mem_usage())
                done.clear()
                bytes_in_cache -= n_written
                total_bytes += n_written
                seeks += write_seeks
                write_time += elapsed
                log(f'repartition: Write required {write_seeks} seeks', 0)
                log(f'repartition: Cache: {str(cache)}', 0)
                # NOTE(review): the block was already cleared above, this
                # second call looks redundant - confirm before removing
                done.clear()

            message = (f'{bytes_in_cache}, {cache.mem_usage()}')
            assert (bytes_in_cache == cache.mem_usage()), message

        # the estimates are only expected to match for real runs
        message = (f'Incorrect seek count. Expected: {expected_seeks}.'
                   f' Real: {seeks}')
        assert (dry_run or (expected_seeks == seeks)), message
        message = (f'Incorrect memory usage. Expected: {est_peak_mem}B.'
                   f' Real: {peak_mem}B.')
        assert (dry_run or (est_peak_mem == peak_mem)), message
        return total_bytes, seeks, peak_mem, read_time, write_time
Beispiel #6
0
def main(args=None):
    '''
    Command-line entry point: create, repartition, test or delete blocks.

    Arguments:
        args: list of command-line arguments, passed to the argument
              parser. If None, the parser reads the arguments of the
              process. Unknown arguments are ignored.

    The positional arguments A, I and O are the shapes of the array, of the
    input blocks and of the output blocks; method selects the
    repartitioning function. At most one of --create, --repartition,
    --delete and --test-data can be set.
    '''
    parser = ArgumentParser()

    parser.add_argument("A",
                        action="store",
                        help="shape of the reconstructed array")
    parser.add_argument(
        "I",
        action="store",
        help="shape of the input blocks. Input blocks "
        "called 'in...' must be stored on disk",
    )
    parser.add_argument(
        "O",
        action="store",
        help="shape of the output blocks. Output blocks"
        " called 'out...' will be created on disk",
    )
    commands = parser.add_mutually_exclusive_group()
    commands.add_argument(
        "--create",
        action="store_true",
        help="create input blocks on disk"
        " before repartitioning.",
    )
    commands.add_argument(
        "--repartition",
        action="store_true",
        help="repartition input blocks to output block dimensions",
    )
    commands.add_argument(
        "--delete",
        action="store_true",
        help="delete output blocks after repartitioning.",
    )
    commands.add_argument(
        "--test-data",
        action="store_true",
        help="reconstruct array from input blocks, "
        "reconstruct array from output blocks, "
        "check that data is identical in both "
        "reconstructions.",
    )
    # type=int: an invalid value is reported by the parser as a usage
    # error instead of failing later with a ValueError traceback
    parser.add_argument("--max-mem",
                        action="store",
                        type=int,
                        help="max memory to use, in bytes")
    parser.add_argument(
        "method",
        action="store",
        help="repartitioning method to use",
        choices=["baseline", "keep"],
    )

    args, _ = parser.parse_known_args(args)
    mem = args.max_mem  # None if there is no memory constraint

    repart_func = {"baseline": keep.baseline, "keep": keep.keep}

    array = Partition(make_tuple(args.A), name="array")

    if args.create:
        fill = "random"
        log("Creating input blocks", 1)
    else:
        fill = None
        log("Using existing input blocks", 1)

    in_blocks = Partition(make_tuple(args.I),
                          name="in",
                          array=array,
                          fill=fill)

    in_blocks.clear()

    if args.create:
        # input blocks were created, the other commands are excluded
        return

    out_blocks = Partition(make_tuple(args.O), name="out", array=array)

    # Repartitioning
    if args.repartition:
        log("Repartitioning input blocks into output blocks", 1)
        out_blocks.delete()
        out_blocks.clear()  # shouldn't be necessary but just in case
        start = time.time()
        (
            total_bytes,
            seeks,
            peak_mem,
            read_time,
            write_time,
        ) = in_blocks.repartition(out_blocks, mem,
                                  repart_func[args.method])
        end = time.time()
        total_time = end - start
        assert total_time > read_time + write_time
        # every byte of the array is read once and written once
        assert total_bytes == 2 * math.prod(array.shape)
        log(
            "Seeks, peak memory (B), read time (s),"
            " write time (s), elapsed time (s):" + os.linesep +
            f"{seeks},{peak_mem},{round(read_time,2)},"
            f"{round(write_time,2)},{round(total_time,2)}",
            2,
        )

    if args.test_data:
        log("Testing data", 1)
        # rebuild the array from the input blocks, then from the output
        # blocks, and compare the content of the resulting files
        in_blocks.repartition(array, mem, repart_func[args.method])
        with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
            in_data = f.read()
        array.delete()
        out_blocks.repartition(array, mem, repart_func[args.method])
        with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
            out_data = f.read()
        assert in_data == out_data

    if args.delete:
        log("Deleting output blocks", 1)
        out_blocks.delete()