Example #1
    def __iter__(self):
        # define a thread for collation & memory pinning here
        if self.pin_memory:
            self._join_memory_thread_done_event = threading.Event()
            self._data_queue = queue.Queue()
            self.join_memory_thread = threading.Thread(
                target=self.join_streams_thread,
                args=(
                    self._data_queue,
                    torch.cuda.current_device(),
                    self._join_memory_thread_done_event,
                ),
            )
            self.join_memory_thread.daemon = True
            self.join_memory_thread.start()

            while not self._join_memory_thread_done_event.is_set():
                batch = self._data_queue.get(timeout=100000)
                batch = {'data': batch}
                yield batch
            self.join_memory_thread.join()
        else:
            # Single-Process
            for batch_parts in self.get_stream_loaders():
                data = list(chain(*batch_parts))
                batch = torch.cat([item[:, None] for item in data], dim=1)
                batch = {'data': batch}
                yield batch
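The pin_memory branch above is essentially a producer/consumer handoff: a background thread fills a queue.Queue while __iter__ drains it. Below is a minimal, self-contained sketch of that pattern using only the standard library; producer, iterate_batches and the batch contents are placeholder names, and a None sentinel replaces the timeout/event combination above for clean termination.

import queue
import threading

def producer(out_queue, num_batches):
    # stand-in for join_streams_thread: push batches, then a sentinel
    for i in range(num_batches):
        out_queue.put({'data': i})
    out_queue.put(None)  # signal that no more batches will arrive

def iterate_batches(num_batches=5):
    data_queue = queue.Queue()
    worker = threading.Thread(target=producer, args=(data_queue, num_batches))
    worker.daemon = True
    worker.start()
    while True:
        batch = data_queue.get()
        if batch is None:   # sentinel: the producer is done
            break
        yield batch
    worker.join()

for batch in iterate_batches():
    print(batch)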
Example #2
    def __iter__(self):
        """
        Main process: set up the workers and the joining thread, then collect
        max_iter batches from the main data queue.
        """
        procs = [
            mp.Process(target=self.worker_loop, args=(i, m), daemon=True)
            for i, m in enumerate(self.shared_arrays)
        ]

        for p in procs:
            p.start()

        if self.main_thread:
            self._join_memory_thread_done_event = threading.Event()
            self._data_queue = queue.Queue()
            self.join_memory_thread = threading.Thread(
                target=self.join_streams_thread,
                args=(
                    self._data_queue,
                    torch.cuda.current_device(),
                    self._join_memory_thread_done_event,
                ),
            )
            self.join_memory_thread.daemon = True
            self.join_memory_thread.start()

        for _ in range(self.max_iter):
            if self.main_thread:
                data = self._data_queue.get(timeout=100000)
            else:
                data = self.get_batch()
            yield data
            yield data

        if self.main_thread:
            # The done event only exists when it was created above under the
            # same guard, so signal the joining thread with the matching check.
            self._join_memory_thread_done_event.set()
        for p in procs:
            p.terminate()
        self.epoch += 1
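Each worker process in Example #2 receives one entry of self.shared_arrays, i.e. a pre-allocated shared buffer it can write into. A rough sketch of that idea with multiprocessing.Array is shown below; worker_loop here is a stand-in, not the method from the example, and the buffer contents are arbitrary.

import multiprocessing as mp

def worker_loop(worker_id, shared_array):
    # stand-in worker: write this worker's id into its shared buffer
    for j in range(len(shared_array)):
        shared_array[j] = worker_id

if __name__ == '__main__':
    shared_arrays = [mp.Array('i', 4) for _ in range(2)]
    procs = [
        mp.Process(target=worker_loop, args=(i, m), daemon=True)
        for i, m in enumerate(shared_arrays)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print([list(m) for m in shared_arrays])  # [[0, 0, 0, 0], [1, 1, 1, 1]]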
Example #3
    def __init__(self, loader):
        self.dataset = loader.dataset
        self.scale = loader.scale
        self.collate_fn = loader.collate_fn
        self.batch_sampler = loader.batch_sampler
        self.num_workers = loader.num_workers
        self.pin_memory = loader.pin_memory and torch.cuda.is_available()
        self.timeout = loader.timeout

        self.sample_iter = iter(self.batch_sampler)

        base_seed = torch.LongTensor(1).random_().item()
        if self.num_workers > 0:
            self.worker_init_fn = loader.worker_init_fn
            self.worker_queue_idx = 0
            self.worker_result_queue = multiprocessing.Queue()
            self.batches_outstanding = 0
            self.worker_pids_set = False
            self.shutdown = False
            self.send_idx = 0
            self.rcvd_idx = 0
            self.reorder_dict = {}
            self.done_event = multiprocessing.Event()

            self.index_queues = []
            self.workers = []
            for i in range(self.num_workers):
                index_queue = multiprocessing.Queue()
                index_queue.cancel_join_thread()
                w = multiprocessing.Process(
                    target=_ms_loop,
                    args=(self.dataset, index_queue, self.worker_result_queue,
                          self.done_event, self.collate_fn, self.scale,
                          base_seed + i, self.worker_init_fn, i))
                w.daemon = True
                w.start()
                self.index_queues.append(index_queue)
                self.workers.append(w)

            if self.pin_memory:
                self.data_queue = queue.Queue()
                pin_memory_thread = threading.Thread(
                    target=_utils.pin_memory._pin_memory_loop,
                    args=(self.worker_result_queue, self.data_queue,
                          torch.cuda.current_device(), self.done_event))
                pin_memory_thread.daemon = True
                pin_memory_thread.start()
                self.pin_memory_thread = pin_memory_thread
            else:
                self.data_queue = self.worker_result_queue

            _utils.signal_handling._set_worker_pids(
                id(self), tuple(w.pid for w in self.workers))
            _utils.signal_handling._set_SIGCHLD_handler()
            self.worker_pids_set = True
            for _ in range(2 * self.num_workers):
                self._put_indices()
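The _ms_loop target above follows the usual worker contract: read batch indices from a per-worker index queue, seed the worker with base_seed + i so augmentations differ across workers, fetch and collate the samples, and put (idx, batch) on the shared result queue. A simplified stand-in (not the real _ms_loop, and using plain queue.Queue in a single-threaded demo) might look like this:

import queue
import random

def simple_worker_loop(dataset, index_queue, result_queue, collate_fn, seed, worker_id):
    # seed this worker so random augmentations differ across workers
    random.seed(seed)
    while True:
        job = index_queue.get()
        if job is None:                      # sentinel: shut the worker down
            break
        idx, sample_indices = job
        batch = collate_fn([dataset[i] for i in sample_indices])
        result_queue.put((idx, batch))

# single-threaded demo: queue.Queue stands in for the multiprocessing queues above
index_q, result_q = queue.Queue(), queue.Queue()
index_q.put((0, [0, 1, 2]))
index_q.put(None)
simple_worker_loop(['a', 'b', 'c', 'd'], index_q, result_q, list, seed=42, worker_id=0)
print(result_q.get())   # (0, ['a', 'b', 'c'])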
Example #4
    def __init__(self,
                 fp16=False,
                 mean=(0., 0., 0.),
                 std=(1., 1., 1.),
                 pin_memory=True,
                 pca_jitter=False,
                 **kwargs):
        super().__init__(**kwargs)
        print('Using DALI CPU iterator')
        self.stream = torch.cuda.Stream()

        self.fp16 = fp16
        self.mean = torch.tensor(mean).cuda().view(1, 3, 1, 1)
        self.std = torch.tensor(std).cuda().view(1, 3, 1, 1)
        self.pin_memory = pin_memory
        self.pca_jitter = pca_jitter

        if self.fp16:
            self.mean = self.mean.half()
            self.std = self.std.half()

        self.proc_next_input = Event()
        self.done_event = Event()
        self.output_queue = queue.Queue(maxsize=5)
        self.preproc_thread = threading.Thread(
            target=_preproc_worker,
            kwargs={
                'dali_iterator': self._dali_iterator,
                'cuda_stream': self.stream,
                'fp16': self.fp16,
                'mean': self.mean,
                'std': self.std,
                'proc_next_input': self.proc_next_input,
                'done_event': self.done_event,
                'output_queue': self.output_queue,
                'pin_memory': self.pin_memory,
                'pca_jitter': self.pca_jitter
            })
        self.preproc_thread.daemon = True
        self.preproc_thread.start()

        self.proc_next_input.set()
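The preprocessing thread above is driven by a bounded queue and two events: proc_next_input asks the thread for the next batch, done_event tells it to stop. A stripped-down sketch of that handshake, with a dummy preproc_worker standing in for the real _preproc_worker, could be:

import queue
import threading

def preproc_worker(proc_next_input, done_event, output_queue):
    # dummy stand-in for _preproc_worker: produce one item per request
    item = 0
    while not done_event.is_set():
        if proc_next_input.wait(timeout=0.1):   # wait until asked for data
            proc_next_input.clear()
            output_queue.put(item)              # blocks if maxsize is reached
            item += 1

proc_next_input = threading.Event()
done_event = threading.Event()
output_queue = queue.Queue(maxsize=5)
t = threading.Thread(target=preproc_worker,
                     args=(proc_next_input, done_event, output_queue))
t.daemon = True
t.start()

proc_next_input.set()       # request the first batch
print(output_queue.get())   # 0
proc_next_input.set()       # request the next one
print(output_queue.get())   # 1
done_event.set()            # tell the thread to exit
t.join()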
Example #5
    def __init__(self, loader):
        super(_MultiProcessingDataLoaderIter, self).__init__(loader)

        assert self._num_workers > 0

        if loader.multiprocessing_context is None:
            multiprocessing_context = multiprocessing
        else:
            multiprocessing_context = loader.multiprocessing_context

        self._worker_init_fn = loader.worker_init_fn
        self._worker_queue_idx_cycle = itertools.cycle(range(
            self._num_workers))
        self._worker_result_queue = multiprocessing_context.Queue()
        self._worker_pids_set = False
        self._shutdown = False
        self._send_idx = 0  # idx of the next task to be sent to workers
        self._rcvd_idx = 0  # idx of the next task to be returned in __next__
        # information about data not yet yielded, i.e., tasks w/ indices in range [rcvd_idx, send_idx).
        # map: task idx => - (worker_id,)        if data isn't fetched (outstanding)
        #                  \ (worker_id, data)   if data is already fetched (out-of-order)
        self._task_info = {}
        self._tasks_outstanding = 0  # always equal to count(v for v in task_info.values() if len(v) == 1)
        self._workers_done_event = multiprocessing_context.Event()

        self._index_queues = []
        self._workers = []
        # A list of booleans representing whether each worker still has work to
        # do, i.e., not having exhausted its iterable dataset object. It always
        # contains all `True`s if not using an iterable-style dataset
        # (i.e., if kind != Iterable).
        self._workers_status = []
        for i in range(self._num_workers):
            index_queue = multiprocessing_context.Queue()
            # index_queue.cancel_join_thread()
            w = multiprocessing_context.Process(
                target=worker_loop,
                args=(self._dataset_kind, self._dataset, index_queue,
                      self._worker_result_queue, self._workers_done_event,
                      self._auto_collation, self._collate_fn, self._drop_last,
                      self._base_seed + i, self._worker_init_fn, i,
                      self._num_workers))
            w.daemon = True
            # NB: Process.start() actually takes some time as it needs to
            #     start a process and pass the arguments over via a pipe.
            #     Therefore, we only add a worker to self._workers list after
            #     it started, so that we do not call .join() if program dies
            #     before it starts, and __del__ tries to join but will get:
            #     AssertionError: can only join a started process.
            w.start()
            self._index_queues.append(index_queue)
            self._workers.append(w)
            self._workers_status.append(True)

        if self._pin_memory:
            self._pin_memory_thread_done_event = threading.Event()
            self._data_queue = queue.Queue()
            pin_memory_thread = threading.Thread(
                target=_utils.pin_memory._pin_memory_loop,
                args=(self._worker_result_queue, self._data_queue,
                      torch.cuda.current_device(),
                      self._pin_memory_thread_done_event))
            pin_memory_thread.daemon = True
            pin_memory_thread.start()
            # Similar to workers (see comment above), we only register
            # pin_memory_thread once it is started.
            self._pin_memory_thread = pin_memory_thread
        else:
            self._data_queue = self._worker_result_queue

        _utils.signal_handling._set_worker_pids(
            id(self), tuple(w.pid for w in self._workers))
        _utils.signal_handling._set_SIGCHLD_handler()
        self._worker_pids_set = True

        # prime the prefetch loop
        for _ in range(2 * self._num_workers):
            self._try_put_index()
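The _send_idx/_rcvd_idx/_task_info bookkeeping above exists because workers can return batches out of order, while __next__ must hand them back in order. The core reordering idea, reduced to plain Python with hypothetical names (results stands in for the data queue), looks roughly like this:

def in_order(results):
    # results: iterable of (task_idx, data) pairs, possibly out of order
    buffered = {}      # task_idx -> data that arrived early
    rcvd_idx = 0       # index of the next task to hand back
    for task_idx, data in results:
        buffered[task_idx] = data
        while rcvd_idx in buffered:
            yield buffered.pop(rcvd_idx)
            rcvd_idx += 1

print(list(in_order([(1, 'b'), (0, 'a'), (3, 'd'), (2, 'c')])))
# ['a', 'b', 'c', 'd']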
Example #6
    def __init__(self, loader):
        self.dataset = loader.dataset
        self.collate_fn = loader.collate_fn
        self.batch_sampler = loader.batch_sampler
        self.num_workers = loader.num_workers
        self.pin_memory = loader.pin_memory and torch.cuda.is_available()
        self.timeout = loader.timeout

        self.sample_iter = iter(self.batch_sampler)

        base_seed = torch.LongTensor(1).random_().item()

        if self.num_workers > 0:
            self.worker_init_fn = loader.worker_init_fn
            self.worker_queue_idx = 0
            self.worker_result_queue = multiprocessing.Queue()
            self.batches_outstanding = 0
            self.worker_pids_set = False
            self.shutdown = False
            self.send_idx = 0
            self.rcvd_idx = 0
            self.reorder_dict = {}
            self.done_event = multiprocessing.Event()

            self.index_queues = []
            self.workers = []
            for i in range(self.num_workers):
                index_queue = multiprocessing.Queue()
                index_queue.cancel_join_thread()
                w = multiprocessing.Process(
                    # target=_utils.worker._worker_loop,
                    target=_worker_loop,
                    args=(self.dataset, index_queue,
                          self.worker_result_queue, self.done_event,
                          self.collate_fn, base_seed + i,
                          self.worker_init_fn, i))
                w.daemon = True
                # NB: Process.start() actually takes some time as it needs to
                #     start a process and pass the arguments over via a pipe.
                #     Therefore, we only add a worker to self.workers list after
                #     it started, so that we do not call .join() if program dies
                #     before it starts, and __del__ tries to join but will get:
                #     AssertionError: can only join a started process.
                w.start()
                self.index_queues.append(index_queue)
                self.workers.append(w)

            if self.pin_memory:
                self.data_queue = queue.Queue()
                pin_memory_thread = threading.Thread(
                    target=_utils.pin_memory._pin_memory_loop,
                    args=(self.worker_result_queue, self.data_queue,
                          torch.cuda.current_device(), self.done_event))
                pin_memory_thread.daemon = True
                pin_memory_thread.start()
                # Similar to workers (see comment above), we only register
                # pin_memory_thread once it is started.
                self.pin_memory_thread = pin_memory_thread
            else:
                self.data_queue = self.worker_result_queue

            _utils.signal_handling._set_worker_pids(id(self), tuple(w.pid for w in self.workers))
            _utils.signal_handling._set_SIGCHLD_handler()
            self.worker_pids_set = True

            # prime the prefetch loop
            for _ in range(2 * self.num_workers):
                self._put_indices()
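What these __init__ methods do not show is the matching shutdown path. A typical sequence, sketched below under the assumption of a done event, per-worker index queues, and daemon worker processes as set up above (this is not the exact shutdown logic of any example here), sets the event, unblocks each worker with a sentinel, and then joins:

def shutdown_workers(done_event, index_queues, workers, timeout=5.0):
    # signal threads/processes watching the event to stop
    done_event.set()
    # unblock each worker that may be waiting on its index queue
    for q in index_queues:
        q.put(None)
    # wait for the workers to exit, then force-terminate stragglers
    for w in workers:
        w.join(timeout=timeout)
        if w.is_alive():
            w.terminate()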
Example #7
    def __init__(self,
                 pipelines,
                 output_map,
                 size,
                 global_size,
                 auto_reset=False,
                 fill_last_batch=True,
                 dynamic_shape=False,
                 last_batch_padded=False,
                 batch_maps=[],
                 counter_maps=[],
                 locks=[],
                 iter_id=0,
                 must_pin=True,
                 gpu_per_job=1,
                 must_save=False):
        if not isinstance(pipelines, list):
            pipelines = [pipelines]
        self._num_gpus = len(pipelines)
        print("INIT batch map len = {}".format(len(batch_maps)))
        assert pipelines is not None, "Number of provided pipelines has to be at least 1"
        self.batch_size = pipelines[0].batch_size
        self._size = int(size)
        self.pin_memory_thread = None
        self.batch_prep_thread = None
        self._global_size = int(global_size)
        self._iter_id = iter_id
        self._auto_reset = auto_reset
        self._dynamic_shape = dynamic_shape
        self._fill_last_batch = fill_last_batch
        self._last_batch_padded = last_batch_padded
        self._local_batch_id = 0
        self._global_batch_id = 0
        self._dl_to_fetch_from = 0
        self._outstanding_batches = 0
        self._must_pin = must_pin
        self.must_save = must_save
        self.gpu_per_job = gpu_per_job
        self._threads = []
        self.batch_maps = batch_maps
        self.counter_maps = counter_maps
        self.locks = locks
        self.done_event = multiprocessing.Event()
        #self._dl_to_fetch_from = self._iter_id
        self._total_dl = len(batch_maps)  #Count from 1
        self.total_batches_required = math.ceil(self._global_size /
                                                self.batch_size)
        self.total_batches_self = math.ceil(self._size / self.batch_size)
        print("PIPELINE : Total Batches required = {}, this dl batches={}".
              format(self.total_batches_required, self.total_batches_self))
        print(
            "PIPELINE - Num gpu={}, size={}, global_size={}, batch_size={}, iter ID={}, total_dl:{}"
            .format(self._num_gpus, self._size, self._global_size,
                    self.batch_size, self._iter_id, self._total_dl))
        assert self._size != 0, "Size cannot be 0"
        assert self._size > 0 or (
            self._size < 0 and len(pipelines)
            == 1), "Negative size is supported only for a single pipeline"
        if self._size < 0:
            self._auto_reset = False
            self._fill_last_batch = False
            self._last_batch_padded = False
        self._pipes = pipelines
        # Build all pipelines
        for p in self._pipes:
            with p._check_api_type_scope(types.PipelineAPIType.ITERATOR):
                p.build()
        # Use double-buffering of data batches
        self._data_batches = [None for i in range(self._num_gpus)]
        self._counter = 0
        self._global_counter = 0
        assert len(set(output_map)) == len(
            output_map), "output_map names should be distinct"
        self._output_categories = set(output_map)
        self.output_map = output_map

        # We need data about the batches (like shape information),
        # so we need to run a single batch as part of setup to get that info
        for p in self._pipes:
            with p._check_api_type_scope(types.PipelineAPIType.ITERATOR):
                p.schedule_run()

        self._last_batch_returned = 0
        self._pin_batch_counter = 0
        self._total_batches_other_dl = self.total_batches_self * (
            self._total_dl - 1)
        self._total_batches = self.total_batches_self * self._total_dl
        print("TOTAL BATCHES={}".format(self._total_batches))
        #Create pin memory thread for this DL
        if self._must_pin:
            self._final_data_queue = queue.Queue()
            print("Created data fetch q {}".format(self._total_batches))
            #Start a thread to populate batches
            batch_prep_thread = threading.Thread(
                target=batch_util.get_next_batch,
                args=(self.batch_maps, self.counter_maps, self.locks,
                      self._final_data_queue, self._iter_id,
                      self._total_batches, self._total_dl, self.done_event,
                      self._pipes, self._num_gpus, self.output_map,
                      self._data_batches, self._output_categories,
                      self._dynamic_shape, self._fill_last_batch, self._size,
                      self._global_size, self.batch_size, self.must_save,
                      self.gpu_per_job))
            batch_prep_thread.daemon = True
            batch_prep_thread.start()
            self.batch_prep_thread = batch_prep_thread

            #print("MUST PIN. Start thread")
            #pin_memory_thread = threading.Thread(
            #        target=pin_util.pin_memory_loop,
            #        args=(self.batch_maps, self.counter_maps, self.locks, self._final_data_queue, torch.cuda.current_device(), self._total_batches, self._pin_batch_counter, self._total_dl, self.done_event))
            #pin_memory_thread.daemon = True
            #pin_memory_thread.start()
            #self.pin_memory_thread = pin_memory_thread

        self._first_batch = None
        self._second_batch = None
        #Prefetch two batches
        self._first_batch = self.next()
        self._second_batch = self.next()
Example #8
    def __init__(self, loader):
        super(_MultiProcessingDataLoaderIter, self).__init__(loader)

        assert self._num_workers > 0

        if loader.multiprocessing_context is None:
            multiprocessing_context = multiprocessing
        else:
            multiprocessing_context = loader.multiprocessing_context

        self._worker_init_fn = loader.worker_init_fn
        self._worker_queue_idx_cycle = itertools.cycle(range(
            self._num_workers))
        self._worker_result_queue = multiprocessing_context.Queue()
        self._worker_pids_set = False
        self._shutdown = False
        self._workers_done_event = multiprocessing_context.Event()

        self._index_queues = []
        self._workers = []
        # A list of booleans representing whether each worker still has work to
        # do, i.e., not having exhausted its iterable dataset object. It always
        # contains all `True`s if not using an iterable-style dataset
        # (i.e., if kind != Iterable).
        self._workers_status = []
        for i in range(self._num_workers):
            index_queue = multiprocessing_context.Queue()
            # index_queue.cancel_join_thread()
            w = multiprocessing_context.Process(
                target=worker._worker_loop,
                args=(
                    self._dataset_kind,
                    self._dataset,
                    index_queue,
                    self._worker_result_queue,
                    self._workers_done_event,
                    self._auto_collation,
                    self._collate_fn,
                    self._drop_last,
                    self._base_seed + i,
                    self._worker_init_fn,
                    i,
                    self._num_workers,
                    self._persistent_workers,
                ),
            )
            w.daemon = True
            # NB: Process.start() actually takes some time as it needs to
            #     start a process and pass the arguments over via a pipe.
            #     Therefore, we only add a worker to self._workers list after
            #     it started, so that we do not call .join() if program dies
            #     before it starts, and __del__ tries to join but will get:
            #     AssertionError: can only join a started process.
            w.start()
            self._index_queues.append(index_queue)
            self._workers.append(w)
            self._workers_status.append(True)

        if self._pin_memory:
            self._pin_memory_thread_done_event = threading.Event()
            self._data_queue = queue.Queue()
            pin_memory_thread = threading.Thread(
                target=_utils.pin_memory._pin_memory_loop,
                args=(
                    self._worker_result_queue,
                    self._data_queue,
                    torch.cuda.current_device(),
                    self._pin_memory_thread_done_event,
                ),
            )
            pin_memory_thread.daemon = True
            pin_memory_thread.start()
            # Similar to workers (see comment above), we only register
            # pin_memory_thread once it is started.
            self._pin_memory_thread = pin_memory_thread
        else:
            self._data_queue = self._worker_result_queue

        _utils.signal_handling._set_worker_pids(
            id(self), tuple(w.pid for w in self._workers))
        _utils.signal_handling._set_SIGCHLD_handler()
        self._worker_pids_set = True
        self._reset(loader, first_iter=True)
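From the user's side, all of this machinery is reached through the DataLoader constructor. A minimal usage sketch follows; TensorDataset and the argument values are only for illustration, and pin_memory is guarded on CUDA availability just as the iterators above do.

import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == '__main__':
    dataset = TensorDataset(torch.randn(64, 3), torch.arange(64))
    loader = DataLoader(
        dataset,
        batch_size=8,
        num_workers=2,                         # spawns worker processes as in the examples above
        pin_memory=torch.cuda.is_available(),  # start the pin-memory thread only when a GPU exists
        persistent_workers=True,               # keep workers alive across epochs (PyTorch >= 1.7)
    )
    for features, labels in loader:
        pass  # training step would go here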