Example 1
    def _test_async_queue_read(async_queue, cuda_device):
        ref_files = []
        for i in range(async_queue):
            f, _ = _do_ref_write(tmpdir, i)
            ref_files.append(f)

        aio_buffers = []
        for i in range(async_queue):
            if cuda_device:
                buf = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
            else:
                buf = torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory()
            aio_buffers.append(buf)

        single_submit = True
        overlap_events = True
        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
                                               QUEUE_DEPTH,
                                               single_submit,
                                               overlap_events,
                                               IO_PARALLEL)

        _validate_handle_state(h, single_submit, overlap_events)

        for i in range(async_queue):
            read_status = h.async_pread(aio_buffers[i], ref_files[i])
            assert read_status == 0

        wait_status = h.wait()
        assert wait_status == async_queue

        for i in range(async_queue):
            with open(ref_files[i], 'rb') as f:
                ref_buffer = list(f.read())
            assert ref_buffer == aio_buffers[i].tolist()
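
The _do_ref_write helper used by these tests is not shown in this listing. A minimal sketch of a plausible implementation, assuming it writes IO_SIZE random bytes to a reference file under tmpdir (the file-naming scheme is illustrative):

import os

def _do_ref_write(tmpdir, index=0):
    # Write IO_SIZE random bytes to a reference file; return the path and
    # the raw bytes so callers can compare against later I/O results.
    ref_file = os.path.join(tmpdir, f'ref_random_{index}.bin')
    ref_buffer = os.urandom(IO_SIZE)
    with open(ref_file, 'wb') as f:
        f.write(ref_buffer)
    return ref_file, ref_buffer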
Example 2
    def __init__(self, swap_config, aio_config, base_folder, optimizer,
                 largest_numel, device, dtype, timers):
        super(PartitionedOptimizerSwapper,
              self).__init__(swap_config, aio_config, base_folder, optimizer,
                             largest_numel, device, dtype, timers)

        aio_op = AsyncIOBuilder().load()
        self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE],
                                            aio_config[AIO_QUEUE_DEPTH],
                                            aio_config[AIO_SINGLE_SUBMIT],
                                            aio_config[AIO_OVERLAP_EVENTS],
                                            aio_config[AIO_THREAD_COUNT])

        # Overlap swapping out
        self.gradient_swapper = AsyncTensorSwapper(
            aio_handle=self.aio_handle,
            numel_alignment=self.numel_alignment,
            timers=self.timers)

        self.print_exclude_list += [
            'aio_handle', 'gradient_swapper', 'print_exclude_list'
        ]

        if torch.distributed.get_rank() == 0:
            print_object(obj=self,
                         name='PartitionedOptimizerSwapper',
                         exclude_list=self.print_exclude_list)
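
The aio_config mapping indexed above comes from the aio section of the DeepSpeed config. A sketch of that section as a Python dict, with illustrative values; the key strings are assumed to be what the AIO_* constants resolve to:

aio_config = {
    'block_size': 1048576,   # AIO_BLOCK_SIZE: bytes per I/O request
    'queue_depth': 8,        # AIO_QUEUE_DEPTH: outstanding requests per queue
    'single_submit': False,  # AIO_SINGLE_SUBMIT: submit requests one at a time
    'overlap_events': True,  # AIO_OVERLAP_EVENTS: overlap submission and completion
    'thread_count': 1        # AIO_THREAD_COUNT: parallel I/O threads
}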
Example 3
    def _test_async_read(single_submit, overlap_events, cuda_device):
        ref_file, _ = _do_ref_write(tmpdir)

        if cuda_device:
            aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda')
        else:
            aio_buffer = torch.empty(IO_SIZE,
                                     dtype=torch.uint8,
                                     device='cpu').pin_memory()

        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
                                               QUEUE_DEPTH,
                                               single_submit,
                                               overlap_events,
                                               IO_PARALLEL)

        _validate_handle_state(h, single_submit, overlap_events)

        read_status = h.async_pread(aio_buffer, ref_file)
        assert read_status == 0

        wait_status = h.wait()
        assert wait_status == 1

        with open(ref_file, 'rb') as f:
            ref_buffer = list(f.read())
        assert ref_buffer == aio_buffer.tolist()
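
The _validate_handle_state helper called by these tests is also absent from the listing. A plausible sketch, assuming the handle exposes getters that mirror its constructor arguments:

def _validate_handle_state(handle, single_submit, overlap_events):
    # Verify the handle reports the configuration it was constructed with.
    assert handle.get_single_submit() == single_submit
    assert handle.get_overlap_events() == overlap_events
    assert handle.get_block_size() == BLOCK_SIZE
    assert handle.get_queue_depth() == QUEUE_DEPTH
    assert handle.get_thread_count() == IO_PARALLEL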
Example 4
def pre_handle(args, tid, read_op):
    io_string = "Read" if read_op else "Write"
    num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size
    file = args.read_file if read_op else f'{args.write_file}.{tid}'

    task_log(tid, f'Allocate tensor of size {num_bytes} bytes')
    if args.gpu:
        buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cuda')
    else:
        buffer = torch.empty(num_bytes, dtype=torch.uint8,
                             device='cpu').pin_memory()
    task_log(
        tid,
        f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}'
    )

    io_parallel = args.io_parallel if args.io_parallel else 1
    handle = AsyncIOBuilder().load().aio_handle(args.block_size,
                                                args.queue_depth,
                                                args.single_submit,
                                                args.overlap_events,
                                                io_parallel)
    task_log(tid, 'Created deepspeed aio handle')

    ctxt = {
        'file': file,
        'num_bytes': num_bytes,
        'handle': handle,
        'buffer': buffer,
        'elapsed_sec': 0
    }

    return ctxt
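
pre_handle only prepares the context; a worker function then drives the actual I/O through ctxt['handle']. A hypothetical read worker (the name main_async_read is illustrative) consuming that context:

import time

def main_async_read(pool_params):
    args, tid, ctxt = pool_params
    handle = ctxt['handle']

    start_time = time.time()
    # Submit one asynchronous read, then block until it completes.
    assert handle.async_pread(ctxt['buffer'], ctxt['file']) == 0
    assert handle.wait() == 1
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt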
Example 5
    def _test_parallel_write(single_submit, overlap_events):
        ref_file, ref_buffer = _do_ref_write(tmpdir)

        aio_file, aio_buffer = _get_test_file_and_buffer(
            tmpdir, ref_buffer, False)

        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH,
                                               single_submit, overlap_events,
                                               IO_PARALLEL)

        _validate_handle_state(h, single_submit, overlap_events)

        write_status = h.sync_pwrite(aio_buffer, aio_file)
        assert write_status == 1

        assert os.path.isfile(aio_file)

        filecmp.clear_cache()
        assert filecmp.cmp(ref_file, aio_file, shallow=False)
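
The _get_test_file_and_buffer helper, used here and in Example 8, is not shown. A sketch assuming it copies the reference bytes into a device-appropriate tensor and picks a fresh output path (the naming scheme is illustrative):

import os
import torch

def _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device, index=0):
    test_file = os.path.join(tmpdir, f'aio_write_random_{index}.bin')
    if cuda_device:
        test_buffer = torch.tensor(list(ref_buffer), dtype=torch.uint8, device='cuda')
    else:
        test_buffer = torch.tensor(list(ref_buffer), dtype=torch.uint8,
                                   device='cpu').pin_memory()
    return test_file, test_buffer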
Example 6
    def __init__(self, swap_config, aio_config, base_folder, optimizer,
                 largest_numel, device, dtype, timers):
        super(PipelinedOptimizerSwapper,
              self).__init__(swap_config, aio_config, base_folder, optimizer,
                             largest_numel, device, dtype, timers)

        aio_op = AsyncIOBuilder().load()
        self.write_aio_handle = aio_op.aio_handle(
            aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH],
            aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS],
            aio_config[AIO_THREAD_COUNT])

        self.read_aio_handle = aio_op.aio_handle(
            aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH],
            aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS],
            aio_config[AIO_THREAD_COUNT])

        # Overlap gradient swap out
        self.gradient_swapper = AsyncTensorSwapper(
            aio_handle=self.write_aio_handle,
            numel_alignment=self.numel_alignment,
            timers=self.timers)

        self.async_swap_in = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_READ]
        self.async_swap_out = swap_config[OFFLOAD_OPTIMIZER_PIPELINE_WRITE]

        self.swap_ops = {
            SYNC_SWAP_IN: None,
            ASYNC_SWAP_IN: None,
            SYNC_SWAP_OUT: None,
            ASYNC_SWAP_OUT: None
        }

        self.print_exclude_list += [
            'gradient_swapper', 'read_aio_handle', 'write_aio_handle',
            'swap_ops', 'print_exclude_list'
        ]

        if torch.distributed.get_rank() == 0:
            print_object(obj=self,
                         name='PipelinedOptimizerSwapper',
                         exclude_list=self.print_exclude_list)
Example 7
def main_basic_write(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
    AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'],
                                      args.block_size, args.queue_depth,
                                      args.single_submit, args.overlap_events,
                                      args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt
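
A read-side counterpart fits naturally here. The sketch below assumes the builder exposes an aio_read entry point symmetric to the aio_write call above:

import time

def main_basic_read(pool_params):
    args, tid, ctxt = pool_params
    start_time = time.time()
    # Blocking read of the whole file into the pre-allocated buffer.
    AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'],
                                     args.block_size, args.queue_depth,
                                     args.single_submit, args.overlap_events,
                                     args.validate)
    end_time = time.time()
    ctxt['elapsed_sec'] += end_time - start_time

    return ctxt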
Example 8
    def _test_async_queue_write(async_queue, cuda_device):
        ref_files = []
        ref_buffers = []
        for i in range(async_queue):
            f, buf = _do_ref_write(tmpdir, i)
            ref_files.append(f)
            ref_buffers.append(buf)

        aio_files = []
        aio_buffers = []
        for i in range(async_queue):
            f, buf = _get_test_file_and_buffer(tmpdir, ref_buffers[i], cuda_device, i)
            aio_files.append(f)
            aio_buffers.append(buf)

        single_submit = True
        overlap_events = True
        h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE,
                                               QUEUE_DEPTH,
                                               single_submit,
                                               overlap_events,
                                               IO_PARALLEL)

        _validate_handle_state(h, single_submit, overlap_events)

        for i in range(async_queue):
            write_status = h.async_pwrite(aio_buffers[i], aio_files[i])
            assert write_status == 0

        wait_status = h.wait()
        assert wait_status == async_queue

        for i in range(async_queue):
            assert os.path.isfile(aio_files[i])

            filecmp.clear_cache()
            assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False)
Example 9
    def __init__(self, ds_config, model_dtype):

        aio_op = AsyncIOBuilder().load(verbose=False)
        self.aio_handle = aio_op.aio_handle
        self.dtype = model_dtype

        # set swap buffers, create aio handles
        self._configure_aio(ds_config)

        # mapping from param_id to path
        self.id_to_path = {}

        # mapping from param_id to buffer id
        self.param_id_to_buffer_id = {}

        # mapping from param_id to swap buffer
        self.param_id_to_swap_buffer = {}

        # number of elements in the param
        self.param_id_to_numel = {}

        self.pending_writes = 0
        self.pending_reads = 0

        # keep track of async swap-in params and buffers
        self.inflight_params = []
        self.inflight_swap_in_buffers = []
        self.inflight_numel = 0

        # keep track of available params
        self.available_params = set()
        self.available_numel = 0

        # for swapping out from partitioned fp32 params
        self.partitioned_swap_buffer = None
        self.partitioned_swap_pool = None

        self.invalid_buffer = torch.tensor(1).half()

        if dist.get_rank() == 0:
            exclude_list = ['aio_read_handle', 'aio_write_handle', 'buffers']
            print_object(obj=self,
                         name='AsyncPartitionedParameterSwapper',
                         exclude_list=exclude_list)
Example 10
import deepspeed
from deepspeed.ops.aio import AsyncIOBuilder
assert AsyncIOBuilder().is_compatible()
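
Once the compatibility check passes, a handle can be exercised directly. A minimal round-trip sketch; the parameter values and temporary path are illustrative:

import os
import torch
from deepspeed.ops.aio import AsyncIOBuilder

BLOCK_SIZE = 1024 * 1024  # bytes per I/O request
QUEUE_DEPTH = 8
SINGLE_SUBMIT = False
OVERLAP_EVENTS = True
IO_PARALLEL = 1

h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, SINGLE_SUBMIT,
                                       OVERLAP_EVENTS, IO_PARALLEL)

# Blocking write of a pinned CPU buffer, followed by a blocking read-back.
src = torch.randint(0, 256, (BLOCK_SIZE,), dtype=torch.uint8).pin_memory()
dst = torch.empty(BLOCK_SIZE, dtype=torch.uint8).pin_memory()

h.sync_pwrite(src, '/tmp/aio_demo.bin')
h.sync_pread(dst, '/tmp/aio_demo.bin')
assert torch.equal(src, dst)
os.remove('/tmp/aio_demo.bin')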
Example 11
def async_io_setup():
    from deepspeed.ops.aio import AsyncIOBuilder
    return AsyncIOBuilder().is_compatible()
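
A typical call site gates aio-dependent tests on this check. A hypothetical pytest guard built on async_io_setup:

import pytest

aio_compatible = async_io_setup()

@pytest.mark.skipif(not aio_compatible,
                    reason='DeepSpeed async_io op is not compatible on this system')
def test_aio_round_trip():
    ...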