def __init__(self, data, transform, cache_num=sys.maxsize, cache_rate=1.0, num_workers=0):
    """
    Args:
        data (Iterable): input data to load and transform to generate dataset for model.
        transform (Callable): transforms to execute operations on input data.
        cache_num (int): number of items to be cached. Default is `sys.maxsize`.
            will take the minimum of (cache_num, data_length x cache_rate, data_length).
        cache_rate (float): percentage of cached data in total, default is 1.0 (cache all).
            will take the minimum of (cache_num, data_length x cache_rate, data_length).
        num_workers (int): the number of worker threads to use.
            If 0 a single thread will be used. Default is 0.
    """
    if not isinstance(transform, Compose):
        transform = Compose(transform)
    super().__init__(data, transform)
    self.cache_num = min(cache_num, int(len(self) * cache_rate), len(self))
    # Always create the cache attribute, even when cache_num == 0, so that
    # later accesses to self._cache cannot raise AttributeError.
    self._cache = [None] * self.cache_num
    if self.cache_num > 0:
        print("Load and cache transformed data...")
        if num_workers > 0:
            # Shared progress counter for the worker threads; updates are
            # serialized by _thread_lock in _load_cache_item_thread.
            self._item_processed = 0
            self._thread_lock = threading.Lock()
            with ThreadPool(num_workers) as p:
                p.map(
                    self._load_cache_item_thread,
                    [(i, data[i], transform.transforms) for i in range(self.cache_num)],
                )
        else:
            for i in range(self.cache_num):
                self._cache[i] = self._load_cache_item(data[i], transform.transforms)
                process_bar(i + 1, self.cache_num)
def __init__(self, data, transform, cache_num=sys.maxsize, cache_rate=1.0):
    """
    Args:
        data (Iterable): input data to load and transform to generate dataset for model.
        transform (Callable): transforms to execute operations on input data.
        cache_num (int): number of items to be cached. Default is `sys.maxsize`.
            will take the minimum of (cache_num, data_length x cache_rate, data_length).
        cache_rate (float): percentage of cached data in total, default is 1.0 (cache all).
            will take the minimum of (cache_num, data_length x cache_rate, data_length).
    """
    if not isinstance(transform, Compose):
        transform = Compose(transform)
    super().__init__(data, transform)
    self.cache_num = min(cache_num, int(len(self) * cache_rate), len(self))
    self._cache = []
    print('Load and cache transformed data...')
    for idx in range(self.cache_num):
        process_bar(idx + 1, self.cache_num)
        cached = data[idx]
        # Run only the leading deterministic transforms; stop at the first
        # Randomizable one so random augmentation happens per access, not here.
        for tfm in transform.transforms:
            if isinstance(tfm, Randomizable):
                break
            cached = apply_transform(tfm, cached)
        self._cache.append(cached)
def _load_cache_item_thread(self, args):
    """Thread-pool entry point: cache one transformed item and report progress.

    Args:
        args (tuple): ``(index, item, transforms)`` as packed by ``__init__``.
    """
    index, item, transforms = args
    self._cache[index] = self._load_cache_item(item, transforms)
    # Guard the shared progress counter so concurrent workers do not
    # interleave increments or progress-bar output.
    with self._thread_lock:
        self._item_processed += 1
        process_bar(self._item_processed, self.cache_num)
def _process_hook(blocknum, blocksize, totalsize):
    """Download report hook (urlretrieve-style) that renders a progress bar.

    Args:
        blocknum (int): number of blocks transferred so far.
        blocksize (int): size of each block in bytes.
        totalsize (int): total file size in bytes; may be non-positive when
            the server does not report a size.
    """
    if totalsize > 0:
        # Clamp: on the final block, blocknum * blocksize can exceed the real
        # file size, which would report more than 100% progress.
        process_bar(min(blocknum * blocksize, totalsize), totalsize)
    else:
        # Size unknown — keep the original unclamped call.
        process_bar(blocknum * blocksize, totalsize)