def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """
    Assemble the ILSVRC12 dataflow for one dataset split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset location forwarded to the ILSVRC12 readers; must not
            be None.
        name: split to read, one of 'train', 'val' or 'test'.
        batch_size: batch size forwarded to ``BatchData``.
        augmentors: list of image augmentors.

    Returns:
        The composed dataflow. For 'train' it is a shuffled ``ILSVRC12``
        reader, augmented, prefetched by worker processes and batched with
        ``remainder=False``. For the other splits it is an unshuffled
        ``ILSVRC12Files`` reader whose files are decoded and augmented by a
        thread pool, batched with ``remainder=True`` and prefetched by one
        process.

    Raises:
        AssertionError: if ``name`` is not a known split, ``datadir`` is None,
            or ``augmentors`` is not a list.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)

    # Never use more than 30 workers, whatever the machine offers.
    num_workers = min(30, multiprocessing.cpu_count())
    metadata_dir = './ilsvrc_metadata'

    if name == 'train':
        flow = dataset.ILSVRC12(
            datadir, name, meta_dir=metadata_dir, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        flow = PrefetchDataZMQ(flow, num_workers)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation splits: the reader yields (filename, label) pairs, and the
    # images are decoded and augmented in a thread pool.
    flow = dataset.ILSVRC12Files(
        datadir, name, meta_dir=metadata_dir, shuffle=False)
    augmentor_list = imgaug.AugmentorList(augmentors)

    def _load_and_augment(datapoint):
        path, label = datapoint
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return augmentor_list.augment(image), label

    flow = MultiThreadMapData(
        flow, num_workers, _load_and_augment, buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_imagenet_dataflow(
        datadir, name, batch_size, augmentors, parallel=None):
    """
    Build the ILSVRC12 dataflow for one dataset split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset location forwarded to the ILSVRC12 readers; must not
            be None.
        name: split to read, one of 'train', 'val' or 'test'.
        batch_size: batch size forwarded to ``BatchData``.
        augmentors: list of image augmentors.
        parallel: number of worker processes (training) or threads
            (evaluation). When None, defaults to half the CPU count,
            capped at 40 and never less than 1.

    Returns:
        The composed dataflow for the requested split.

    Raises:
        AssertionError: if ``name`` is not a known split, ``datadir`` is None,
            or ``augmentors`` is not a list.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        # Assuming hyperthreading, use half of the logical CPUs. Clamp to at
        # least one worker: on a single-CPU host ``cpu_count() // 2`` is 0,
        # which would leave the pipeline with no workers at all.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        # The evaluation reader yields (filename, label) pairs; images are
        # decoded and augmented by a thread pool.
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Build the ILSVRC12 dataflow for one split, reading evaluation images
    from per-directory zip archives.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    For the evaluation splits each file path ``<root>/<dir>/<file>`` produced
    by ``ILSVRC12Files`` is resolved to the member ``<dir>/<file>`` inside the
    archive ``<root>/<dir>.zip``. The training split goes through
    ``dataset.ILSVRC12`` and does not use this zip lookup.

    Args:
        datadir: dataset location forwarded to the ILSVRC12 readers; must not
            be None.
        name: split to read, one of 'train', 'val' or 'test'.
        batch_size: batch size forwarded to ``BatchData``.
        augmentors: list of image augmentors.
        parallel: number of worker processes (training) or threads
            (evaluation). When None, defaults to the CPU count capped at 40.

    Returns:
        The composed dataflow for the requested split.

    Raises:
        AssertionError: if ``name`` is not a known split, ``datadir`` is None,
            or ``augmentors`` is not a list.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            archive_dir = os.path.dirname(fname)
            # Zip member names always use '/' regardless of the host OS, so
            # the name is joined explicitly instead of with os.path.join.
            member = '/'.join(
                (os.path.basename(archive_dir), os.path.basename(fname)))
            # The context manager closes the archive after each read; leaving
            # it open would leak one file handle per datapoint.
            with zipfile.ZipFile(archive_dir + '.zip', 'r') as archive:
                raw_jpeg = archive.read(member)
            # np.frombuffer replaces the deprecated binary np.fromstring.
            compress_jpeg = np.frombuffer(raw_jpeg, dtype=np.uint8)
            im = cv2.imdecode(compress_jpeg, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Build the ImageNet (ILSVRC12) dataflow for one dataset split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset location forwarded to the ILSVRC12 readers; must not
            be None.
        name: split to read, one of 'train', 'val' or 'test'.
        batch_size: batch size forwarded to ``BatchData``.
        augmentors: list of image augmentors.
        parallel: number of worker processes (training) or threads
            (evaluation). When None, defaults to half the CPU count,
            capped at 40 and never less than 1.

    Returns:
        The composed dataflow for the requested split.

    Raises:
        AssertionError: if ``name`` is not a known split, ``datadir`` is None,
            or ``augmentors`` is not a list.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'

    if parallel is None:
        # No worker count was given: assume hyperthreading and use half of
        # the logical CPUs. Clamp to at least 1 because ``cpu_count() // 2``
        # is 0 on a single-CPU host.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 2))

    if isTrain:
        # Training: shuffled reader, then the augmentors applied to the
        # image component.
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        # Run the pipeline in `parallel` worker processes, then batch.
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        # Evaluation: the reader yields (filename, label) pairs rather than
        # decoded images; decoding and augmentation happen in mapf below.
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # IMREAD_COLOR loads a colour image and ignores any alpha channel.
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        # Map the files through `parallel` threads, then batch and prefetch.
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_train_dataflow(datadir, batch, augmentors=None):
    """
    Build the batched, multi-process training dataflow for ILSVRC12.

    Sec 3, Remark 4:
    Use a single random shuffling of the training data (per epoch)
    that is divided amongst all k workers.

    NOTE: This implementation deliberately differs from the paper on that
    point: each machine shuffles independently.

    Args:
        datadir: dataset location forwarded to ``dataset.ILSVRC12``.
        batch: batch size forwarded to ``BatchData``.
        augmentors: image augmentors to apply; when None,
            ``fbresnet_augmentor(True)`` is used.

    Returns:
        The dataflow wrapped in ``MultiProcessRunnerZMQ``, using at most
        50 processes.
    """
    train_augmentors = (
        fbresnet_augmentor(True) if augmentors is None else augmentors
    )

    flow = dataset.ILSVRC12(datadir, 'train', shuffle=True)
    flow = AugmentImageComponent(flow, train_augmentors, copy=False)
    flow = BatchData(flow, batch, remainder=False)

    # Batching happens before the multi-process runner, so each worker
    # process produces whole batches.
    num_procs = min(50, mp.cpu_count())
    return MultiProcessRunnerZMQ(flow, num_procs)
def get_tp_loader(data_dir, name, batch_size, parallel=None):
    """
    Create the tensorpack loader for one ImageNet split.

    The backend is chosen from ``data_dir``: a path ending in 'lmdb' is read
    from ``<data_dir>/ILSVRC-<name>.lmdb`` through the sequential loader;
    any other path is read with the ILSVRC12 readers through the random
    loader.

    Args:
        data_dir: dataset location (string path).
        name: split name; 'train' selects training mode, anything else is
            treated as evaluation.
        batch_size: batch size forwarded to the loader builder.
        parallel: parallelism setting forwarded unchanged to the loader
            builder.

    Returns:
        Whatever ``get_sequential_loader`` / ``get_random_loader`` returns.
    """
    training = name == 'train'
    augmentors = get_tp_augmentor(training)

    if not data_dir.endswith('lmdb'):
        # Author's progress-bar note for this path: 500000[27:11:03, 5.11it/s]
        if training:
            source = dataset.ILSVRC12(data_dir, name, shuffle=True)
        else:
            source = dataset.ILSVRC12Files(data_dir, name, shuffle=False)
        return get_random_loader(
            source, training, batch_size, augmentors, parallel)

    # Author's progress-bar note for this path: 500000[70:87:20, 1.95it/s]
    lmdb_path = os.path.join(data_dir, 'ILSVRC-%s.lmdb' % name)
    source = LMDBSerializer.load(lmdb_path, shuffle=False)
    return get_sequential_loader(
        source, training, batch_size, augmentors, parallel)