def __iter__(self):
    """Yield batches as ``{'data': ...}`` dictionaries.

    Without ``self.pin_memory`` the batches are assembled inline from
    ``self.get_stream_loaders()``.  With it, a daemon thread running
    ``self.join_streams_thread`` fills ``self._data_queue`` and this
    generator drains the queue until the thread's done-event is set.
    """
    if not self.pin_memory:
        # Single-process path: flatten the parts of each batch and join
        # them along a new axis 1.
        for stream_parts in self.get_stream_loaders():
            pieces = list(chain(*stream_parts))
            joined = torch.cat([piece[:, None] for piece in pieces], dim=1)
            yield {'data': joined}
        return

    # Threaded path: a background thread handles collation & memory pinning.
    self._join_memory_thread_done_event = threading.Event()
    self._data_queue = queue.Queue()
    thread_args = (
        self._data_queue,
        torch.cuda.current_device(),
        self._join_memory_thread_done_event,
    )
    self.join_memory_thread = threading.Thread(
        target=self.join_streams_thread,
        args=thread_args,
        daemon=True,
    )
    self.join_memory_thread.start()

    while True:
        if self._join_memory_thread_done_event.is_set():
            break
        payload = self._data_queue.get(timeout=100000)
        yield {'data': payload}

    self.join_memory_thread.join()
def __iter__(self):
    """Run one epoch: start workers, optionally start the joining thread,
    and yield ``self.max_iter`` batches.

    One daemon process running ``self.worker_loop`` is started per entry of
    ``self.shared_arrays``.  When ``self.main_thread`` is true, a daemon
    thread running ``self.join_streams_thread`` feeds ``self._data_queue``
    and batches are read from that queue; otherwise batches come from
    ``self.get_batch()``.

    Cleanup (signalling the thread's done-event and terminating the worker
    processes) runs in a ``finally`` block so it also happens when the
    consumer stops iterating early.  ``self.epoch`` is incremented only
    after all ``max_iter`` batches have been yielded.
    """
    procs = [
        mp.Process(target=self.worker_loop, args=(i, m), daemon=True)
        for i, m in enumerate(self.shared_arrays)
    ]
    for proc in procs:
        proc.start()

    # Remember the event locally so cleanup depends on whether the thread
    # was actually started, not on an unrelated flag.
    done_event = None
    try:
        if self.main_thread:
            done_event = threading.Event()
            self._join_memory_thread_done_event = done_event
            self._data_queue = queue.Queue()
            self.join_memory_thread = threading.Thread(
                target=self.join_streams_thread,
                args=(
                    self._data_queue,
                    torch.cuda.current_device(),
                    done_event,
                ),
            )
            self.join_memory_thread.daemon = True
            self.join_memory_thread.start()

        for _ in range(self.max_iter):
            if self.main_thread:
                yield self._data_queue.get(timeout=100000)
            else:
                yield self.get_batch()
    finally:
        # The original code set the event under ``self.pin_memory`` although
        # it was created under ``self.main_thread``; that raised
        # AttributeError when only pin_memory was set and left the thread
        # un-signalled when only main_thread was set.
        if done_event is not None:
            done_event.set()
        for proc in procs:
            proc.terminate()

    self.epoch += 1
def __init__(self, loader):
    """Copy settings from ``loader`` and, if requested, start the workers.

    Args:
        loader: object providing ``dataset``, ``scale``, ``collate_fn``,
            ``batch_sampler``, ``num_workers``, ``pin_memory``, ``timeout``
            and (when ``num_workers > 0``) ``worker_init_fn``.

    When ``num_workers > 0`` this starts one daemon process per worker
    running ``_ms_loop``, optionally a daemon pin-memory thread, registers
    the worker PIDs with the signal-handling utilities and primes the
    pipeline with ``2 * num_workers`` calls to ``self._put_indices()``.
    """
    self.dataset = loader.dataset
    self.scale = loader.scale
    self.collate_fn = loader.collate_fn
    self.batch_sampler = loader.batch_sampler
    self.num_workers = loader.num_workers
    # Pinning is only enabled when CUDA is available.
    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
    self.timeout = loader.timeout

    self.sample_iter = iter(self.batch_sampler)

    # Plain Python int; worker i receives ``base_seed + i``.
    base_seed = torch.LongTensor(1).random_().item()

    if self.num_workers > 0:
        self.worker_init_fn = loader.worker_init_fn
        self.worker_queue_idx = 0
        self.worker_result_queue = multiprocessing.Queue()
        self.batches_outstanding = 0
        self.worker_pids_set = False
        self.shutdown = False
        self.send_idx = 0
        self.rcvd_idx = 0
        self.reorder_dict = {}
        self.done_event = multiprocessing.Event()

        # NOTE: the original code re-drew ``base_seed`` here with
        # ``random_()[0]``, which replaced the int above with a 0-dim
        # tensor and sent tensors to the workers as seeds.  The redundant
        # redraw was removed so workers receive plain ints.

        self.index_queues = []
        self.workers = []
        for i in range(self.num_workers):
            index_queue = multiprocessing.Queue()
            index_queue.cancel_join_thread()
            w = multiprocessing.Process(
                target=_ms_loop,
                args=(
                    self.dataset,
                    index_queue,
                    self.worker_result_queue,
                    self.done_event,
                    self.collate_fn,
                    self.scale,
                    base_seed + i,
                    self.worker_init_fn,
                    i,
                ),
            )
            w.daemon = True
            w.start()
            # Register the worker only after it has started.
            self.index_queues.append(index_queue)
            self.workers.append(w)

        if self.pin_memory:
            self.data_queue = queue.Queue()
            pin_memory_thread = threading.Thread(
                target=_utils.pin_memory._pin_memory_loop,
                args=(
                    self.worker_result_queue,
                    self.data_queue,
                    torch.cuda.current_device(),
                    self.done_event,
                ),
            )
            pin_memory_thread.daemon = True
            pin_memory_thread.start()
            self.pin_memory_thread = pin_memory_thread
        else:
            # Without pinning, results are consumed straight from the workers.
            self.data_queue = self.worker_result_queue

        _utils.signal_handling._set_worker_pids(
            id(self), tuple(w.pid for w in self.workers))
        _utils.signal_handling._set_SIGCHLD_handler()
        self.worker_pids_set = True

        # Prime the prefetch loop.
        for _ in range(2 * self.num_workers):
            self._put_indices()
def __init__(self, fp16=False, mean=(0., 0., 0.), std=(1., 1., 1.),
             pin_memory=True, pca_jitter=False, **kwargs):
    """Set up normalisation constants and start the preprocessing thread.

    Args:
        fp16: when true, ``mean`` and ``std`` are converted to half precision.
        mean: three values, stored as a CUDA tensor viewed as (1, 3, 1, 1).
        std: three values, stored as a CUDA tensor viewed as (1, 3, 1, 1).
        pin_memory: forwarded to the preprocessing worker.
        pca_jitter: forwarded to the preprocessing worker.
        **kwargs: forwarded to the parent initialiser.
    """
    super().__init__(**kwargs)
    print('Using DALI CPU iterator')

    self.stream = torch.cuda.Stream()
    self.fp16 = fp16

    def _stat_tensor(values):
        # Build the (1, 3, 1, 1) CUDA tensor, in half precision if requested.
        stat = torch.tensor(values).cuda().view(1, 3, 1, 1)
        return stat.half() if self.fp16 else stat

    self.mean = _stat_tensor(mean)
    self.std = _stat_tensor(std)
    self.pin_memory = pin_memory
    self.pca_jitter = pca_jitter

    self.proc_next_input = Event()
    self.done_event = Event()
    self.output_queue = queue.Queue(maxsize=5)

    worker_kwargs = dict(
        dali_iterator=self._dali_iterator,
        cuda_stream=self.stream,
        fp16=self.fp16,
        mean=self.mean,
        std=self.std,
        proc_next_input=self.proc_next_input,
        done_event=self.done_event,
        output_queue=self.output_queue,
        pin_memory=self.pin_memory,
        pca_jitter=self.pca_jitter,
    )
    self.preproc_thread = threading.Thread(
        target=_preproc_worker, kwargs=worker_kwargs, daemon=True)
    self.preproc_thread.start()

    # Signal the worker that it may process the first input.
    self.proc_next_input.set()
def __init__(self, loader):
    """Start worker processes (and optionally a pin-memory thread).

    One daemon process running ``worker_loop`` is created per worker, each
    with its own index queue.  Results arrive on a shared result queue,
    which is either consumed directly or routed through a pin-memory
    thread.  Finally the prefetch loop is primed with
    ``2 * self._num_workers`` calls to ``self._try_put_index()``.
    """
    super(_MultiProcessingDataLoaderIter, self).__init__(loader)

    assert self._num_workers > 0

    mp_ctx = (multiprocessing
              if loader.multiprocessing_context is None
              else loader.multiprocessing_context)

    self._worker_init_fn = loader.worker_init_fn
    self._worker_queue_idx_cycle = itertools.cycle(
        range(self._num_workers))
    self._worker_result_queue = mp_ctx.Queue()
    self._worker_pids_set = False
    self._shutdown = False
    # Index of the next task to be sent to workers.
    self._send_idx = 0
    # Index of the next task to be returned in __next__.
    self._rcvd_idx = 0
    # Bookkeeping for data not yet yielded, i.e. tasks with indices in
    # [rcvd_idx, send_idx).
    # map: task idx => - (worker_id,)      if data isn't fetched (outstanding)
    #                  \ (worker_id, data) if data is already fetched (out-of-order)
    self._task_info = {}
    # Always equal to count(v for v in task_info.values() if len(v) == 1).
    self._tasks_outstanding = 0
    self._workers_done_event = mp_ctx.Event()

    self._index_queues = []
    self._workers = []
    # One boolean per worker: whether it still has work to do, i.e. has not
    # exhausted its iterable dataset object.  All entries stay True when
    # the dataset is not iterable-style (kind != Iterable).
    self._workers_status = []

    for worker_id in range(self._num_workers):
        idx_queue = mp_ctx.Queue()
        # NOTE: cancel_join_thread() is left disabled for the index queue.
        worker_args = (
            self._dataset_kind,
            self._dataset,
            idx_queue,
            self._worker_result_queue,
            self._workers_done_event,
            self._auto_collation,
            self._collate_fn,
            self._drop_last,
            self._base_seed + worker_id,
            self._worker_init_fn,
            worker_id,
            self._num_workers,
        )
        proc = mp_ctx.Process(target=worker_loop, args=worker_args)
        proc.daemon = True
        # NB: Process.start() takes some time, as it has to start a process
        #     and pass the arguments over via a pipe.  The worker is
        #     therefore added to self._workers only after it has started,
        #     so that .join() is never called on an unstarted process if
        #     the program dies early and __del__ tries to join, which
        #     would give:
        #       AssertionError: can only join a started process.
        proc.start()
        self._index_queues.append(idx_queue)
        self._workers.append(proc)
        self._workers_status.append(True)

    if not self._pin_memory:
        self._data_queue = self._worker_result_queue
    else:
        self._pin_memory_thread_done_event = threading.Event()
        self._data_queue = queue.Queue()
        pinner = threading.Thread(
            target=_utils.pin_memory._pin_memory_loop,
            args=(
                self._worker_result_queue,
                self._data_queue,
                torch.cuda.current_device(),
                self._pin_memory_thread_done_event,
            ),
        )
        pinner.daemon = True
        pinner.start()
        # As with the workers, the thread is registered only once started.
        self._pin_memory_thread = pinner

    worker_pids = tuple(w.pid for w in self._workers)
    _utils.signal_handling._set_worker_pids(id(self), worker_pids)
    _utils.signal_handling._set_SIGCHLD_handler()
    self._worker_pids_set = True

    # Prime the prefetch loop.
    for _ in range(2 * self._num_workers):
        self._try_put_index()
def __init__(self, loader):
    """Copy settings from ``loader`` and start workers when requested.

    With ``num_workers > 0``, one daemon process running ``_worker_loop``
    is started per worker, an optional pin-memory thread is started, the
    worker PIDs are registered with the signal-handling utilities, and
    ``self._put_indices()`` is called ``2 * num_workers`` times.
    """
    self.dataset = loader.dataset
    self.collate_fn = loader.collate_fn
    self.batch_sampler = loader.batch_sampler
    self.num_workers = loader.num_workers
    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
    self.timeout = loader.timeout

    self.sample_iter = iter(self.batch_sampler)

    # Drawn even when no workers are used, as in the original flow.
    seed0 = torch.LongTensor(1).random_().item()

    if self.num_workers <= 0:
        return

    self.worker_init_fn = loader.worker_init_fn
    self.worker_queue_idx = 0
    self.worker_result_queue = multiprocessing.Queue()
    self.batches_outstanding = 0
    self.worker_pids_set = False
    self.shutdown = False
    self.send_idx = 0
    self.rcvd_idx = 0
    self.reorder_dict = {}
    self.done_event = multiprocessing.Event()

    self.index_queues = []
    self.workers = []
    for worker_id in range(self.num_workers):
        idx_queue = multiprocessing.Queue()
        idx_queue.cancel_join_thread()
        # The local _worker_loop is used in place of
        # _utils.worker._worker_loop.
        proc = multiprocessing.Process(
            target=_worker_loop,
            args=(
                self.dataset,
                idx_queue,
                self.worker_result_queue,
                self.done_event,
                self.collate_fn,
                seed0 + worker_id,
                self.worker_init_fn,
                worker_id,
            ),
        )
        proc.daemon = True
        # NB: Process.start() takes some time, as it has to start a process
        #     and pass the arguments over via a pipe.  The worker is
        #     therefore added to self.workers only after it has started,
        #     so that .join() is never called on an unstarted process if
        #     the program dies early and __del__ tries to join, which
        #     would give:
        #       AssertionError: can only join a started process.
        proc.start()
        self.index_queues.append(idx_queue)
        self.workers.append(proc)

    if not self.pin_memory:
        self.data_queue = self.worker_result_queue
    else:
        self.data_queue = queue.Queue()
        pinner = threading.Thread(
            target=_utils.pin_memory._pin_memory_loop,
            args=(
                self.worker_result_queue,
                self.data_queue,
                torch.cuda.current_device(),
                self.done_event,
            ),
        )
        pinner.daemon = True
        pinner.start()
        # As with the workers, the thread is registered only once started.
        self.pin_memory_thread = pinner

    worker_pids = tuple(w.pid for w in self.workers)
    _utils.signal_handling._set_worker_pids(id(self), worker_pids)
    _utils.signal_handling._set_SIGCHLD_handler()
    self.worker_pids_set = True

    # Prime the prefetch loop.
    for _ in range(2 * self.num_workers):
        self._put_indices()
def __init__(self,
             pipelines,
             output_map,
             size,
             global_size,
             auto_reset=False,
             fill_last_batch=True,
             dynamic_shape=False,
             last_batch_padded=False,
             batch_maps=None,
             counter_maps=None,
             locks=None,
             iter_id=0,
             must_pin=True,
             gpu_per_job=1,
             must_save=False):
    """Build the pipelines, start the batch-preparation thread and prefetch.

    Args:
        pipelines: a pipeline or list of pipelines; a single pipeline is
            wrapped in a list.
        output_map: distinct names for the pipeline outputs.
        size: stored as ``int(size)``; must be non-zero, and negative only
            with a single pipeline.
        global_size: stored as ``int(global_size)``.
        auto_reset, fill_last_batch, last_batch_padded: stored as given,
            but forced to False when ``size`` is negative.
        dynamic_shape: stored as given.
        batch_maps, counter_maps, locks: stored and forwarded to the
            batch-preparation thread.  ``None`` (the default) means a fresh
            empty list per instance.
        iter_id: identifier of this iterator.
        must_pin: when true, a daemon thread running
            ``batch_util.get_next_batch`` is started.
        gpu_per_job, must_save: stored and forwarded to that thread.

    Raises:
        AssertionError: if ``pipelines`` is None or empty, ``size`` is 0,
            ``size`` is negative with more than one pipeline, or
            ``output_map`` contains duplicates.
    """
    # Fresh lists per instance: a mutable default would be shared by every
    # instance constructed without these arguments.
    batch_maps = [] if batch_maps is None else batch_maps
    counter_maps = [] if counter_maps is None else counter_maps
    locks = [] if locks is None else locks

    # Validate before wrapping; once wrapped in a list, a ``None`` check
    # could never fail.
    assert pipelines is not None, "Number of provided pipelines has to be at least 1"
    if not isinstance(pipelines, list):
        pipelines = [pipelines]
    assert len(pipelines) > 0, "Number of provided pipelines has to be at least 1"

    self._num_gpus = len(pipelines)
    print("INIT batch map len = {}".format(len(batch_maps)))
    self.batch_size = pipelines[0].batch_size
    self._size = int(size)
    self.pin_memory_thread = None
    self.batch_prep_thread = None
    self._global_size = int(global_size)
    self._iter_id = iter_id
    self._auto_reset = auto_reset
    self._dynamic_shape = dynamic_shape
    self._fill_last_batch = fill_last_batch
    self._last_batch_padded = last_batch_padded
    self._local_batch_id = 0
    self._global_batch_id = 0
    self._dl_to_fetch_from = 0
    self._outstanding_batches = 0
    self._must_pin = must_pin
    self.must_save = must_save
    self.gpu_per_job = gpu_per_job
    self._threads = []
    self.batch_maps = batch_maps
    self.counter_maps = counter_maps
    self.locks = locks
    self.done_event = multiprocessing.Event()
    #self._dl_to_fetch_from = self._iter_id
    self._total_dl = len(batch_maps)
    # Count from 1
    self.total_batches_required = math.ceil(self._global_size /
                                            self.batch_size)
    self.total_batches_self = math.ceil(self._size / self.batch_size)
    print("PIPELINE : Total Batches required = {}, this dl batches={}".
          format(self.total_batches_required, self.total_batches_self))
    print(
        "PIPELINE - Num gpu={}, size={}, global_size={}, batch_size={}, iter ID={}, total_dl:{}"
        .format(self._num_gpus, self._size, self._global_size,
                self.batch_size, self._iter_id, self._total_dl))

    assert self._size != 0, "Size cannot be 0"
    assert self._size > 0 or (
        self._size < 0 and len(pipelines) == 1), "Negative size is supported only for a single pipeline"
    if self._size < 0:
        self._auto_reset = False
        self._fill_last_batch = False
        self._last_batch_padded = False

    self._pipes = pipelines
    # Build all pipelines
    for p in self._pipes:
        with p._check_api_type_scope(types.PipelineAPIType.ITERATOR):
            p.build()

    # Use double-buffering of data batches
    self._data_batches = [None] * self._num_gpus
    self._counter = 0
    self._global_counter = 0
    assert len(set(output_map)) == len(
        output_map), "output_map names should be distinct"
    self._output_categories = set(output_map)
    self.output_map = output_map

    # We need data about the batches (like shape information),
    # so we need to run a single batch as part of setup to get that info
    for p in self._pipes:
        with p._check_api_type_scope(types.PipelineAPIType.ITERATOR):
            p.schedule_run()

    self._last_batch_returned = 0
    self._pin_batch_counter = 0
    self._total_batches_other_dl = self.total_batches_self * (
        self._total_dl - 1)
    self._total_batches = self.total_batches_self * self._total_dl
    print("TOTAL BATCHES={}".format(self._total_batches))

    # Create the batch-preparation thread for this DL
    if self._must_pin:
        self._final_data_queue = queue.Queue()
        print("Created data fetch q {}".format(self._total_batches))
        # Start a thread to populate batches
        batch_prep_thread = threading.Thread(
            target=batch_util.get_next_batch,
            args=(self.batch_maps, self.counter_maps, self.locks,
                  self._final_data_queue, self._iter_id,
                  self._total_batches, self._total_dl, self.done_event,
                  self._pipes, self._num_gpus, self.output_map,
                  self._data_batches, self._output_categories,
                  self._dynamic_shape, self._fill_last_batch, self._size,
                  self._global_size, self.batch_size, self.must_save,
                  self.gpu_per_job))
        batch_prep_thread.daemon = True
        batch_prep_thread.start()
        # Register the thread only once it has started.
        self.batch_prep_thread = batch_prep_thread

        #print("MUST PIN. Start thread")
        #pin_memory_thread = threading.Thread(
        #    target=pin_util.pin_memory_loop,
        #    args=(self.batch_maps, self.counter_maps, self.locks, self._final_data_queue, torch.cuda.current_device(), self._total_batches, self._pin_batch_counter, self._total_dl, self.done_event))
        #pin_memory_thread.daemon = True
        #pin_memory_thread.start()
        #self.pin_memory_thread = pin_memory_thread

    # Initialise both slots before calling next().
    # NOTE(review): presumably next() inspects these attributes - confirm.
    self._first_batch = None
    self._second_batch = None
    # Prefetch two batches
    self._first_batch = self.next()
    self._second_batch = self.next()
def __init__(self, loader):
    """Start worker processes (and optionally a pin-memory thread).

    One daemon process running ``worker._worker_loop`` is created per
    worker, each with its own index queue.  Results arrive on a shared
    result queue, which is either consumed directly or routed through a
    pin-memory thread.  Iteration state is then initialised through
    ``self._reset(loader, first_iter=True)``.
    """
    super(_MultiProcessingDataLoaderIter, self).__init__(loader)

    assert self._num_workers > 0

    mp_ctx = (multiprocessing
              if loader.multiprocessing_context is None
              else loader.multiprocessing_context)

    self._worker_init_fn = loader.worker_init_fn
    self._worker_queue_idx_cycle = itertools.cycle(
        range(self._num_workers))
    self._worker_result_queue = mp_ctx.Queue()
    self._worker_pids_set = False
    self._shutdown = False
    self._workers_done_event = mp_ctx.Event()

    self._index_queues = []
    self._workers = []
    # One boolean per worker: whether it still has work to do, i.e. has not
    # exhausted its iterable dataset object.  All entries stay True when
    # the dataset is not iterable-style (kind != Iterable).
    self._workers_status = []

    for worker_id in range(self._num_workers):
        idx_queue = mp_ctx.Queue()
        # NOTE: cancel_join_thread() is left disabled for the index queue.
        worker_args = (
            self._dataset_kind,
            self._dataset,
            idx_queue,
            self._worker_result_queue,
            self._workers_done_event,
            self._auto_collation,
            self._collate_fn,
            self._drop_last,
            self._base_seed + worker_id,
            self._worker_init_fn,
            worker_id,
            self._num_workers,
            self._persistent_workers,
        )
        proc = mp_ctx.Process(target=worker._worker_loop, args=worker_args)
        proc.daemon = True
        # NB: Process.start() takes some time, as it has to start a process
        #     and pass the arguments over via a pipe.  The worker is
        #     therefore added to self._workers only after it has started,
        #     so that .join() is never called on an unstarted process if
        #     the program dies early and __del__ tries to join, which
        #     would give:
        #       AssertionError: can only join a started process.
        proc.start()
        self._index_queues.append(idx_queue)
        self._workers.append(proc)
        self._workers_status.append(True)

    if not self._pin_memory:
        self._data_queue = self._worker_result_queue
    else:
        self._pin_memory_thread_done_event = threading.Event()
        self._data_queue = queue.Queue()
        pinner = threading.Thread(
            target=_utils.pin_memory._pin_memory_loop,
            args=(
                self._worker_result_queue,
                self._data_queue,
                torch.cuda.current_device(),
                self._pin_memory_thread_done_event,
            ),
        )
        pinner.daemon = True
        pinner.start()
        # As with the workers, the thread is registered only once started.
        self._pin_memory_thread = pinner

    worker_pids = tuple(w.pid for w in self._workers)
    _utils.signal_handling._set_worker_pids(id(self), worker_pids)
    _utils.signal_handling._set_SIGCHLD_handler()
    self._worker_pids_set = True

    self._reset(loader, first_iter=True)