def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """
    Assemble the batched ILSVRC12 dataflow for one dataset split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root passed to the ILSVRC12 readers; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.

    Returns:
        The final dataflow. For 'train' the images come from a shuffled
        ``ILSVRC12`` reader and are batched with ``remainder=False``; for the
        other splits file names are read in order, decoded and augmented in
        ``ThreadedMapData``, and batched with ``remainder=True``.

    Raises:
        AssertionError: if any argument check fails.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)

    # Worker count is capped at 30 regardless of how many CPUs are present.
    nr_worker = min(30, multiprocessing.cpu_count())

    if name == 'train':
        flow = dataset.ILSVRC12(datadir, name, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        flow = PrefetchDataZMQ(flow, nr_worker)
        return BatchData(flow, batch_size, remainder=False)

    flow = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        # dp is a (file name, class label) pair.
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return aug.augment(image), label

    flow = ThreadedMapData(
        flow, nr_worker, load_and_augment, buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_imagenet_dataflow(
        datadir, name, batch_size, augmentors, parallel=None):
    """
    Assemble the batched Imagenet5k dataflow for one dataset split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root; must not be None. The metadata directory is
            taken to be ``<datadir>/meta``.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.
        parallel (int, optional): number of workers. When None it is
            ``min(40, multiprocessing.cpu_count())``.

    Returns:
        The final dataflow. 'train' uses the shuffled ``Imagenet5k`` reader
        and ``remainder=False``; other splits read file names in order via
        ``Imagenet5kFiles``, decode/augment them in ``MultiThreadMapData``
        and batch with ``remainder=True``.

    Raises:
        AssertionError: if any argument check fails.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)

    meta_dir = os.path.join(datadir, "meta")
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if name == 'train':
        flow = Imagenet5k(datadir, name, meta_dir=meta_dir, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    flow = Imagenet5kFiles(datadir, name, meta_dir=meta_dir, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        # dp is a (file name, class label) pair.
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return aug.augment(image), label

    flow = MultiThreadMapData(
        flow, parallel, load_and_augment, buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_val_dataflow(
        datadir, batch_size, augmentors, parallel=None,
        num_splits=None, split_index=None):
    """
    Build the batched ILSVRC12 validation dataflow, optionally sharded.

    Args:
        datadir: dataset root; must not be None.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.
        parallel (int, optional): number of mapper threads. When None it is
            ``min(40, multiprocessing.cpu_count())``.
        num_splits (int, optional): when given, only one contiguous shard of
            the validation file list is used.
        split_index (int, optional): which shard to use; must be smaller
            than ``num_splits`` when sharding is enabled.

    Returns:
        A dataflow of batches with ``remainder=True``. No prefetch process is
        added (the original notes: do not fork() under MPI).

    Raises:
        AssertionError: if an argument check fails.

    Note:
        Shards have ``len(files) // num_splits`` entries each, so when the
        file count is not divisible by ``num_splits`` the trailing
        ``len(files) % num_splits`` files belong to no shard.
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if num_splits is not None:
        assert split_index < num_splits
        file_flow = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
        file_flow.reset_state()
        all_files = list(file_flow.get_data())
        total = len(all_files)
        logger.info("#ValidationData = {}".format(total))

        shard_len = total // num_splits
        start = shard_len * split_index
        end = min(shard_len * (split_index + 1), total)
        logger.info("#ValidationSplit = {} - {}".format(start, end))
        ds = DataFromList(all_files[start:end], shuffle=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)

    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        # dp is a (file name, class label) pair.
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return aug.augment(image), label

    ds = MultiThreadMapData(
        ds, parallel, load_and_augment, buffer_size=2000, strict=True)
    # Deliberately no PrefetchDataZMQ here: do not fork() under MPI.
    return BatchData(ds, batch_size, remainder=True)
def data_augmentation(im, augmentors):
    """
    Run one image through a list of augmentors.

    Args:
        im: the image handed to ``AugmentorList.augment``.
        augmentors (list): augmentors to combine into an ``AugmentorList``.

    Returns:
        Whatever ``AugmentorList.augment`` returns for ``im``.

    Raises:
        AssertionError: if ``augmentors`` is not a list.
    """
    assert isinstance(augmentors, list)
    return imgaug.AugmentorList(augmentors).augment(im)
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Assemble the batched ILSVRC12 dataflow; non-training images are read
    from per-directory zip archives.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.
        parallel (int, optional): number of workers. When None it is
            ``min(40, multiprocessing.cpu_count())``.

    Returns:
        The final dataflow. 'train' uses the shuffled ``ILSVRC12`` reader and
        ``remainder=False``. For other splits each file name
        ``<dir>/<image>`` yielded by ``ILSVRC12Files`` is resolved to member
        ``<basename(dir)>/<image>`` of the archive ``<dir>.zip``, decoded,
        augmented, and batched with ``remainder=True``.

    Raises:
        AssertionError: if any argument check fails.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            image_dir = os.path.dirname(fname)
            # Zip member names always use '/' regardless of the platform's
            # path separator, so do not build them with os.path.join.
            member = '/'.join(
                (os.path.basename(image_dir), os.path.basename(fname)))
            # Close the archive as soon as the bytes are read; this runs once
            # per image, so an unclosed handle would leak on every datapoint.
            with zipfile.ZipFile(image_dir + '.zip', 'r') as archive:
                raw = archive.read(member)
            # frombuffer replaces the deprecated binary-mode np.fromstring.
            compress_jpeg = np.frombuffer(raw, dtype=np.uint8)
            im = cv2.imdecode(compress_jpeg, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Assemble the batched ILSVRC12 dataflow for one dataset split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.
        parallel (int, optional): number of workers. When None it is
            ``min(40, multiprocessing.cpu_count() // 2)``.

    Returns:
        The final dataflow. 'train' uses the shuffled ``ILSVRC12`` reader and
        ``remainder=False``; other splits use ``ILSVRC12Files`` (which yields
        file names rather than arrays), decode/augment in
        ``MultiThreadMapData`` and batch with ``remainder=True``.

    Raises:
        AssertionError: if any argument check fails.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)

    if parallel is None:
        # Half of the reported CPU count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if name == 'train':
        flow = dataset.ILSVRC12(datadir, name, shuffle=True)
        # Apply the augmentors to the image component of each datapoint.
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: read file names in order and decode them in the mapper.
    flow = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        path, label = dp
        # IMREAD_COLOR loads a 3-channel colour image, ignoring transparency.
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return aug.augment(image), label

    flow = MultiThreadMapData(
        flow, parallel, load_and_augment, buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Assemble the batched ILSVRC12 dataflow, retrying failed image reads on
    the non-training path.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.
        parallel (int, optional): number of workers. When None it is
            ``min(40, multiprocessing.cpu_count() // 6)``.

    Returns:
        The final dataflow. 'train' uses the shuffled ``ILSVRC12`` reader and
        ``remainder=False``. For other splits every file is read and
        augmented with up to 30 attempts, one second apart; if all attempts
        fail, a 256x256x3 uint8 zero image is returned for that datapoint.
        Batches use ``remainder=True``.

    Raises:
        AssertionError: if any argument check fails.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 6)

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # Placeholder returned only when every attempt below fails.
            # NOTE(review): it bypasses the augmentors, so its shape may
            # differ from augmented images - confirm batching tolerates it.
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            for _ in range(30):
                try:
                    # Keep the raw read in its own name: a failed imread must
                    # not overwrite the placeholder before augment() raises.
                    loaded = cv2.imread(fname, cv2.IMREAD_COLOR)
                    im = aug.augment(loaded)
                    break
                except Exception as e:
                    # Best-effort retry; extra positional args are %-format
                    # arguments, so the message needs matching placeholders.
                    logger.warning("%s file=%s", e, fname)
                    time.sleep(1)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
def get_val_dataflow(datadir, batch_size, augmentors=None, parallel=None,
                     num_splits=None, split_index=None, dataname="val"):
    """
    Build an endlessly repeating, shuffled, batched dataflow of RGB images.

    Args:
        datadir: dataset root; must not be None.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list, optional): image augmentors; when None,
            ``fbresnet_augmentor(False)`` is used.
        parallel (int, optional): number of mapper threads. When None it is
            ``min(40, multiprocessing.cpu_count())``.
        num_splits (int, optional): sharding is currently disabled; passing
            a value trips ``assert False``.
        split_index (int, optional): shard index for the disabled path.
        dataname (str): split name passed to ``ILSVRC12Files``.

    Returns:
        A dataflow of batches (``remainder=False``) wrapped in
        ``RepeatedData(num=-1)``. No prefetch process is added (the original
        notes: do not fork() under MPI).

    Raises:
        AssertionError: if an argument check fails or sharding is requested.
    """
    if augmentors is None:
        augmentors = fbresnet_augmentor(False)
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if num_splits is not None:
        # Sharding of the validation data is intentionally switched off.
        assert False
        assert split_index < num_splits
        file_flow = dataset.ILSVRC12Files(datadir, dataname, shuffle=True)
        file_flow.reset_state()
        all_files = list(file_flow.get_data())
        total = len(all_files)
        logger.info("Number of validation data = {}".format(total))
        shard_len = total // num_splits
        start = shard_len * split_index
        end = min(shard_len * (split_index + 1), total)
        logger.info("Local validation split = {} - {}".format(start, end))
        ds = DataFromList(all_files[start:end], shuffle=True)
    else:
        ds = dataset.ILSVRC12Files(datadir, dataname, shuffle=True)

    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        path, label = dp
        bgr = cv2.imread(path, cv2.IMREAD_COLOR)
        # Convert from BGR to RGB before augmenting.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return aug.augment(rgb), label

    # The mapper buffer never exceeds the number of datapoints.
    buffered = min(2000, ds.size())
    ds = MultiThreadMapData(
        ds, parallel, load_and_augment, buffer_size=buffered, strict=True)
    ds = BatchData(ds, batch_size, remainder=False)
    return RepeatedData(ds, num=-1)
def imgaug_wrapper(imgaug_list):
    """
    Wrap a list of augmentors as a callable that returns a torch tensor.

    Args:
        imgaug_list: augmentors combined into an ``imgaug.AugmentorList``.

    Returns:
        A function ``real_augmentor(img_obj)`` that converts ``img_obj`` to
        a numpy array, applies the augmentor list's transform, moves axis 2
        to the front and returns the result via ``torch.from_numpy``.
    """
    augmentor_chain = imgaug.AugmentorList(imgaug_list)
    # torch.from_numpy is used instead of transforms.ToTensor() so pixel
    # values are not rescaled; per the original note, normalization happens
    # later, inside data_prefetcher.
    to_tensor = torch.from_numpy

    def real_augmentor(img_obj):
        # img_obj is expected to be a PIL Image (per the original note);
        # the returned tensor shares memory with a CPU numpy array.
        source = np.asarray(img_obj)
        transform = augmentor_chain.get_transform(source)
        augmented = transform.apply_image(source)
        channel_first = np.rollaxis(augmented, 2)
        return to_tensor(np.ascontiguousarray(channel_first))

    return real_augmentor
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """
    Wrap a dataflow of (encoded image, label) pairs into a batched loader.
    Intended for a single-file LMDB that is read sequentially.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html

    Args:
        ds: source dataflow whose first component is an encoded image
            buffer accepted by ``cv2.imdecode``.
        isTrain (bool): selects the training or the evaluation pipeline.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list[imgaug.Augmentor]): required; there is no default.
        parallel (int, optional): number of worker processes. When None it
            is ``min(40, multiprocessing.cpu_count() // 2)``.

    Returns:
        A dataflow of batches built with ``use_list=True``. Images are
        decoded with ``cv2.IMREAD_COLOR`` (OpenCV's BGR channel order).

    Raises:
        AssertionError: if ``augmentors`` is not a list.
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)
    if parallel is None:
        # Half of the reported CPU count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if not isTrain:
        def decode_and_augment(data):
            encoded, label = data
            image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
            return aug.augment(image), label

        ds = MultiProcessMapDataZMQ(
            ds, parallel, decode_and_augment, buffer_size=2000, strict=True)
        return BatchData(ds, batch_size, remainder=True, use_list=True)

    def decode(encoded):
        return cv2.imdecode(encoded, cv2.IMREAD_COLOR)

    # Shuffle within a 50000-element buffer because the source is sequential.
    ds = LocallyShuffleData(ds, 50000)
    ds = MapDataComponent(ds, decode, 0)
    ds = AugmentImageComponent(ds, aug, copy=False)
    if parallel < 16:
        logger.warn(
            "DataFlow may become the bottleneck when too few processes are used."
        )
    # Batching happens before the multi-process runner on this path.
    ds = BatchData(ds, batch_size, remainder=False, use_list=True)
    return MultiProcessRunnerZMQ(ds, parallel)
def get_random_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """
    Wrap a randomly-read dataflow into a batched loader.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html

    Args:
        ds: source dataflow. On the training path it is passed straight to
            ``AugmentImageComponent``; on the evaluation path each datapoint
            is unpacked as ``(file name, label)`` and read with
            ``cv2.imread``.
        isTrain (bool): selects the training or the evaluation pipeline.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list[imgaug.Augmentor]): required; there is no default.
        parallel (int, optional): number of workers. When None it is
            ``min(40, multiprocessing.cpu_count() // 2)``.

    Returns:
        A dataflow of batches: ``remainder=False`` for training,
        ``remainder=True`` for evaluation. Evaluation images are read with
        ``cv2.IMREAD_COLOR`` (OpenCV's BGR channel order).

    Raises:
        AssertionError: if ``augmentors`` is not a list.
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)
    if parallel is None:
        # Half of the reported CPU count, assuming hyperthreading.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if not isTrain:
        def load_and_augment(dp):
            path, label = dp
            image = cv2.imread(path, cv2.IMREAD_COLOR)
            return aug.augment(image), label

        ds = MultiThreadMapData(
            ds, parallel, load_and_augment, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        return MultiProcessRunnerZMQ(ds, 1)

    ds = AugmentImageComponent(ds, aug, copy=False)
    if parallel < 16:
        logger.warn(
            "DataFlow may become the bottleneck when too few processes are used."
        )
    ds = MultiProcessRunnerZMQ(ds, parallel)
    return BatchData(ds, batch_size, remainder=False)
def get_data_tmp(name, base_dir, meta_dir, gpu_nums):
    """
    Build the PascalVOC12 image/label dataflow for training or evaluation.

    Reads the module-level ``args`` for ``crop_size`` and ``batch_size``.

    Args:
        name: split name; the training pipeline is used when it contains
            'train'.
        base_dir: dataset directory passed to the PascalVOC12 readers.
        meta_dir: metadata directory passed to the PascalVOC12 readers.
        gpu_nums (int): multiplier on ``args.batch_size`` for the training
            batch size.

    Returns:
        Training: shuffled files, cropped/flipped with the same parameters
        for image and label, with a per-channel constant subtracted from the
        image, batched by ``args.batch_size * gpu_nums``.
        Evaluation: images and labels read in order, batch size 1.
    """
    isTrain = 'train' in name

    # Per-channel constant (104, 116, 122) broadcast to a full
    # (crop_h, crop_w, 3) array, i.e. height x width x channel.
    channel_offsets = np.array([104, 116, 122])
    const_arr = (
        np.zeros((args.crop_size[0], args.crop_size[1], 3))
        + np.resize(channel_offsets, (1, 1, 3))
    )

    def read_pair(dp):
        img_path, label_path = dp
        image = cv2.imread(img_path, cv2.IMREAD_COLOR)
        mask = cv2.imread(label_path, cv2.IMREAD_GRAYSCALE)
        return image, mask

    if not isTrain:
        flow = PascalVOC12(base_dir, meta_dir, name, shuffle=False)
        flow = MapData(flow, read_pair)
        return BatchData(flow, 1)

    flow = PascalVOC12Files(base_dir, meta_dir, name, shuffle=True)
    parallel = min(6, multiprocessing.cpu_count())
    aug = imgaug.AugmentorList([
        RandomCropWithPadding(args.crop_size),
        Flip(horiz=True),
    ])

    def read_augment_pair(dp):
        img_path, label_path = dp
        image = cv2.imread(img_path, cv2.IMREAD_COLOR)
        mask = cv2.imread(label_path, cv2.IMREAD_GRAYSCALE)
        # Reuse the image's sampled parameters so the label gets the same
        # crop and flip.
        image, params = aug.augment_return_params(image)
        mask = aug._augment(mask, params)
        image = image - const_arr  # very time-consuming
        return image, mask

    flow = MultiThreadMapData(
        flow, parallel, read_augment_pair, buffer_size=500, strict=True)
    flow = BatchData(flow, args.batch_size * gpu_nums)
    return PrefetchDataZMQ(flow, 1)
def get_AVA2012_dataflow(datadir, name, batch_size, augmentors,
                         CAM_dir_pkl=None, CAMCropR=False, repeat_times=1,
                         strict_order=False, remainder_TR=False):
    """
    Assemble the batched AVA2012 dataflow for training or validation.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root; must not be None.
        name (str): 'train' or 'val'.
        batch_size (int): batch size handed to ``BatchData``.
        augmentors (list): image augmentors applied to every image.
        CAM_dir_pkl: forwarded to ``dataset.AVA2012`` (training only).
        CAMCropR: forwarded to ``dataset.AVA2012`` (training only).
        repeat_times (int): passed to ``RepeatedDataPoint`` (validation only).
        strict_order: when it compares equal to False, training data is
            shuffled and up to 2 workers are used; otherwise shuffling is
            off and a single worker is used.
        remainder_TR: ``remainder`` flag of the training ``BatchData``.

    Returns:
        The final dataflow; validation batches use ``remainder=True``.

    Raises:
        AssertionError: if any argument check fails.
    """
    assert name in ['train', 'val']
    assert datadir is not None
    assert isinstance(augmentors, list)

    cpu = min(2, multiprocessing.cpu_count())
    # Kept as an equality test on purpose: `not strict_order` would treat
    # values such as None differently.
    unordered = (strict_order == False)  # noqa: E712
    nr_worker = cpu if unordered else 1

    if name == 'train':
        ds = dataset.AVA2012(datadir, name,
                             CAM_dir_pkl=CAM_dir_pkl,
                             CAMCropR=CAMCropR,
                             shuffle=unordered)
        print(
            '--> information about the original dataFlow from AVA2012 [TRAIN]:'
        )
        print(ds)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        # Original note: with a single process the dataflow yields the same
        # data as ``ds`` in the same order.
        ds = PrefetchDataZMQ(ds, nr_worker)
        return BatchData(ds, batch_size, remainder=remainder_TR)

    ds = dataset.AVA2012(datadir, name, shuffle=False)
    print(
        '--> information about the original dataFlow from AVA2012 [VALIDATION]:'
    )
    print(ds)
    ds = RepeatedDataPoint(ds, repeat_times)
    aug = imgaug.AugmentorList(augmentors)

    def augment_only(dp):
        # Datapoints here already carry the image, not a file name.
        image, label = dp
        return aug.augment(image), label

    # Original notes: the order of datapoints coming out of
    # MultiThreadMapData is not the same as that of ``ds``; buffer_size is
    # the minimum number of images loaded from a given folder.
    ds = MultiThreadMapData(
        ds, nr_worker, augment_only, buffer_size=500, strict=True)
    ds = BatchData(ds, batch_size, remainder=True)
    return PrefetchDataZMQ(ds, 1)