def get_cifar_iter_dali(type, image_dir, batch_size, num_threads, local_rank=0,
                        world_size=1, val_size=32, cutout=0):
    if type == 'train':
        pip_train = HybridTrainPipe_CIFAR(batch_size=batch_size,
                                          num_threads=num_threads,
                                          device_id=local_rank,
                                          data_dir=image_dir, crop=32,
                                          world_size=world_size,
                                          local_rank=local_rank, cutout=cutout)
        pip_train.build()
        dali_iter_train = DALIClassificationIterator(pip_train,
                                                     size=50000 // world_size)
        return dali_iter_train
    elif type == 'val':
        pip_val = HybridValPipe_CIFAR(batch_size=batch_size,
                                      num_threads=num_threads,
                                      device_id=local_rank,
                                      data_dir=image_dir, crop=32,
                                      size=val_size, world_size=world_size,
                                      local_rank=local_rank)
        pip_val.build()
        dali_iter_val = DALIClassificationIterator(pip_val,
                                                   size=10000 // world_size)
        return dali_iter_val

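# A minimal consumption sketch (an assumption, not part of the loaders above):
# DALIClassificationIterator yields a list with one dict per pipeline, holding
# "data" and "label" tensors. `model` and the loss handling are hypothetical.
import torch

def train_one_epoch(model, dali_iter):
    for batch in dali_iter:
        images = batch[0]["data"]                             # NCHW tensor, already on GPU
        labels = batch[0]["label"].squeeze(-1).long().cuda()  # [N, 1] -> [N]
        logits = model(images)
        # ... compute loss, backward, optimizer step ...
    dali_iter.reset()  # required between epochs unless auto_reset=True
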
def get_data_loader(args):
    assert osp.isdir(args.data), '{} does not exist'.format(args.data)
    crop_size = 224
    val_size = 256

    # train loader
    pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers,
                           data_dir=args.data, crop=crop_size,
                           dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader")))

    # val loader
    pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers,
                         data_dir=args.data, crop=crop_size, size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader")))

    return train_loader, val_loader

def speedtest(pipeclass, batch, n_threads, data='./', dali_cpu=False,
              memcpy=False, cuda=False):
    pipe = pipeclass(batch, n_threads, 0, data_dir=data, dali_cpu=dali_cpu)
    pipe.build()
    if not memcpy:
        # warm up the pipeline before timing
        for i in range(5):
            pipe.run()
    n_test = 5000
    num_processed = 0
    t_start = timer()
    if not memcpy:
        for i in range(n_test):
            images, label = pipe.run()
            num_processed += len(images)
    else:
        loader = DALIClassificationIterator(pipe, size=1000000)
        for i in range(n_test):
            out = loader.next()
            images = out[0]["data"]
            if cuda:
                images = images.cuda()
    t = timer() - t_start
    print("Speed {}: {:.2f} imgs/s, time={:.2f}s, Images={}".format(
        type(pipe).__name__, (n_test * batch) / t, t, num_processed))
    print("---" * 20)

class DaliIterator(object):
    """Wrapper class to decode the DALI iterator output & provide an iterator
    that functions the same as torchvision's.

    pipelines (Pipeline): DALI pipelines
    size (int): Number of examples in set

    Note: allows extra inputs to keep compatibility with the CPU iterator.
    """

    def __init__(self, pipelines, size, **kwargs):
        self._dali_iterator = DALIClassificationIterator(pipelines=pipelines,
                                                         size=size)

    @staticmethod
    def gen_wrapper(dalipipeline):
        for data in dalipipeline:
            input = data[0]["data"]
            target = torch.reshape(data[0]["label"], [-1]).cuda().long()
            yield input, target
        dalipipeline.reset()

    def __iter__(self):
        return DaliIterator.gen_wrapper(self._dali_iterator)

    def __len__(self):
        return int(math.ceil(self._dali_iterator._size /
                             self._dali_iterator.batch_size))

    def reset(self):
        self._dali_iterator.reset()

def __init__(self, session, dataset, num_samples, batch_size, steps=None,
             num_threads=0, fill_last_batch=True, is_random_flip=True,
             preprocess=None):
    self.steps = steps
    self.dataset = dataset
    self.batch_size = batch_size
    self.num_samples = num_samples
    pipe = ExternalSourcePipeline(session, self.dataset, batch_size=batch_size,
                                  num_threads=num_threads,
                                  num_samples=self.num_samples, device_id=0,
                                  is_random_flip=is_random_flip,
                                  preprocess=preprocess,
                                  fill_last_batch=fill_last_batch)
    pipe.build()
    self.dataloader = DALIClassificationIterator(
        pipe, self.num_samples, auto_reset=True,
        fill_last_batch=fill_last_batch, last_batch_padded=True)

def get_imagenet_iter_dali(type, image_dir, batch_size, num_threads, device_id,
                           num_gpus, crop, val_size=256, world_size=1,
                           local_rank=0):
    if type == 'train':
        pip_train = HybridTrainPipe(batch_size=batch_size,
                                    num_threads=num_threads,
                                    device_id=local_rank,
                                    data_dir=image_dir + '/train', crop=crop,
                                    world_size=world_size,
                                    local_rank=local_rank)
        pip_train.build()
        dali_iter_train = DALIClassificationIterator(
            pip_train, size=pip_train.epoch_size("Reader") // world_size)
        return dali_iter_train
    elif type == 'val':
        pip_val = HybridValPipe(batch_size=batch_size, num_threads=num_threads,
                                device_id=local_rank,
                                data_dir=image_dir + '/val', crop=crop,
                                size=val_size, world_size=world_size,
                                local_rank=local_rank)
        pip_val.build()
        dali_iter_val = DALIClassificationIterator(
            pip_val, size=pip_val.epoch_size("Reader") // world_size)
        return dali_iter_val

def dataloader(batch_size, data_dir, augment=True, cpu=True):
    # os.path.join avoids depending on a trailing slash in data_dir
    traindir = os.path.join(data_dir, 'train')
    testdir = os.path.join(data_dir, 'val')

    pipe = HybridTrainPipe(batch_size=batch_size, num_threads=8, device_id=0,
                           data_dir=traindir, crop=224, augment=augment,
                           dali_cpu=cpu)
    pipe.build()
    trainloader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader")))
    train_iter = math.ceil(pipe.epoch_size("Reader") / batch_size)

    pipe = HybridValPipe(batch_size=batch_size, num_threads=8, device_id=0,
                         data_dir=testdir, crop=224, size=256)
    pipe.build()
    testloader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader")))
    test_iter = math.ceil(pipe.epoch_size("Reader") / batch_size)

    return trainloader, train_iter, testloader, test_iter

def __init__(self, dataset, size, batch_size, num_workers):
    self.batchsize = batch_size
    self.pip_train = HybridTrainPipe(batch_size=batch_size,
                                     num_threads=num_workers, device_id=0,
                                     data_dir=dataset, size=size)
    self.pip_train.build()
    self.dataloader = DALIClassificationIterator(
        self.pip_train, size=self.pip_train.epoch_size("Reader"))

def get_dali_iterator(args):
    train_pipe = TrainPipe(args.batch_size, args.threads_num, args.gpu_id,
                           args.gpus_num, args.db_dir)
    train_pipe.build()
    train_loader = DALIClassificationIterator(
        [train_pipe], size=train_pipe.epoch_size("Reader"))

    val_pipe = ValPipe(args.batch_size, args.threads_num, args.gpu_id,
                       args.gpus_num, args.db_dir)
    val_pipe.build()
    # size must come from the validation pipe, not the training pipe
    val_loader = DALIClassificationIterator(
        [val_pipe], size=val_pipe.epoch_size("Reader"))

    return train_loader, val_loader

class DALILoader(object):

    @staticmethod
    def gen_loader(loader, steps):
        for i, data in enumerate(loader):
            input = data[0]['data'].cuda()
            target = data[0]["label"].cuda()
            yield input, target

    def __init__(self, session, dataset, num_samples, batch_size, steps=None,
                 num_threads=0, fill_last_batch=True, is_random_flip=True,
                 preprocess=None):
        self.steps = steps
        self.dataset = dataset
        self.batch_size = batch_size
        self.num_samples = num_samples
        pipe = ExternalSourcePipeline(session, self.dataset,
                                      batch_size=batch_size,
                                      num_threads=num_threads,
                                      num_samples=self.num_samples,
                                      device_id=0,
                                      is_random_flip=is_random_flip,
                                      preprocess=preprocess,
                                      fill_last_batch=fill_last_batch)
        pipe.build()
        self.dataloader = DALIClassificationIterator(
            pipe, self.num_samples, auto_reset=True,
            fill_last_batch=fill_last_batch, last_batch_padded=True)

    def __len__(self):
        if self.steps is None:
            steps = self.num_samples // self.batch_size
            if self.num_samples % self.batch_size != 0:
                steps += 1
            return steps
        return self.steps

    def __iter__(self):
        return DALILoader.gen_loader(self.dataloader, self.steps)

    def reset(self):
        self.dataloader.reset()

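# ExternalSourcePipeline above is not shown in these excerpts. A minimal sketch
# of the usual pattern (class name, constructor arguments, and the batch
# iterator protocol are assumptions) using DALI's legacy ExternalSource op:
import nvidia.dali.ops as ops
from nvidia.dali.pipeline import Pipeline

class ExternalSourcePipelineSketch(Pipeline):
    def __init__(self, batch_iterator, batch_size, num_threads, device_id):
        super().__init__(batch_size, num_threads, device_id)
        self.source = batch_iterator        # yields (images, labels) numpy batches
        self.images = ops.ExternalSource()
        self.labels = ops.ExternalSource()

    def define_graph(self):
        self.image_node = self.images()
        self.label_node = self.labels()
        return self.image_node, self.label_node

    def iter_setup(self):
        # feed one host-side batch into the graph per iteration
        images, labels = next(self.source)
        self.feed_input(self.image_node, images)
        self.feed_input(self.label_node, labels)
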
def get_dali_imagenet(conf):
    from nvidia.dali.plugin.pytorch import DALIClassificationIterator
    from .dali import HybridTrainPipe, HybridValPipe

    dali_cpu = False if conf.get()['cuda']['avail'] else True

    pipe = HybridTrainPipe(batch_size=conf.get()['model']['batch'],
                           num_threads=4, device_id=0,
                           data_dir=conf.get()['data']['tr']['path'],
                           crop=224, dali_cpu=dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader")))

    pipe = HybridValPipe(batch_size=conf.get()['model']['batch'],
                         num_threads=4, device_id=0,
                         data_dir=conf.get()['data']['test']['path'],
                         crop=224, size=256)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader")))

    return train_loader, None, val_loader

def get_pipeline(folder="train", custom_reader=None):
    pipe = Pipeline(batch_size=64, num_threads=1, device_id=1)
    with pipe:
        if custom_reader:
            raw_files, labels = custom_reader
        else:
            raw_files, labels = fn.readers.file(file_root=folder,
                                                random_shuffle=True)
        decode = fn.decoders.image(raw_files, device="mixed",
                                   output_type=types.GRAY)
        resize = fn.resize(decode, device="gpu",
                           interp_type=types.INTERP_LINEAR,
                           resize_x=WIDTH, resize_y=HEIGHT)
        hsv = fn.hsv(resize, hue=fn.random.uniform(range=(-10, 10)),
                     saturation=fn.random.uniform(range=(-.5, .5)),
                     value=fn.random.uniform(range=(0.9, 1.2)),
                     device="gpu", dtype=types.UINT8)
        bc = fn.brightness_contrast(hsv, device="gpu",
                                    brightness=fn.random.uniform(range=(.9, 1.1)))
        cmn = fn.crop_mirror_normalize(bc, device="gpu", dtype=types.FLOAT,
                                       output_layout="HWC",
                                       mean=[255 // 2], std=[255 // 2])
        rot = fn.rotate(cmn, angle=fn.random.uniform(range=(-40, 40)),
                        device="gpu", keep_size=True)
        # transpose HWC -> CHW, the layout PyTorch expects
        tpose = fn.transpose(rot, perm=(2, 0, 1), device="gpu")
        pipe.set_outputs(tpose, labels)
    pipe.build()
    # size=-1: the iterator has no fixed epoch length; the caller bounds the loop
    dali_iter = DALIClassificationIterator([pipe], -1)
    return dali_iter

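# With size=-1 the iterator above cannot know when an epoch ends, so a
# hypothetical consumer bounds the loop itself (the step count is an
# assumption for illustration):
loader = get_pipeline(folder="train")
for step in range(1000):
    batch = next(loader)
    images, labels = batch[0]["data"], batch[0]["label"]
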
def gdtl(data_path, batch_size, workers=5, _worker_init_fn=None,
         dali_cpu=False):
    """DALI train loader function.

    :param data_path: image data path
    :param batch_size: batch size in the training phase
    :param workers: number of worker threads
    :param _worker_init_fn: worker initialization function
    :param dali_cpu: run the decode stage on the CPU instead of the GPU
    :return: tuple of DALIWrapper(train_loader) and the number of batches,
        int(pipe.epoch_size("Reader") / (world_size * batch_size))
    """
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    traindir = os.path.join(data_path, 'train')
    pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
                           device_id=local_rank, data_dir=traindir, crop=224,
                           dali_cpu=dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size))

    return DALIWrapper(train_loader), \
        int(pipe.epoch_size("Reader") / (world_size * batch_size))

def get_imgs_iter_dali(image_dir, batch_size, num_threads, num_gpus, crop,
                       shuffle, scale, ratio, seed, da=False):
    classes = [d for d in os.listdir(image_dir)
               if os.path.isdir(os.path.join(image_dir, d))]
    classes.sort()
    class_to_idx = {classes[i]: i for i in range(len(classes))}

    pip_train = HybridTrainPipe(batch_size=batch_size, num_threads=num_threads,
                                data_dir=image_dir, crop=crop, shuffle=shuffle,
                                scale=scale, ratio=ratio, world_size=num_gpus,
                                seed=seed, da=da)
    pip_train.build()
    dali_iter_train = DALIClassificationIterator(
        pip_train, size=pip_train.epoch_size("Reader") // num_gpus,
        auto_reset=True)
    return dali_iter_train, class_to_idx

def gdvl(data_path, batch_size, workers=5, _worker_init_fn=None):
    """DALI validation loader function.

    :param data_path: image data path
    :param batch_size: batch size in the validation phase
    :param workers: number of worker threads
    :param _worker_init_fn: worker initialization function
    :return: tuple of DALIWrapper(val_loader) and the number of batches,
        int(pipe.epoch_size("Reader") / (world_size * batch_size))
    """
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    valdir = os.path.join(data_path, 'val')
    pipe = HybridValPipe(batch_size=batch_size, num_threads=workers,
                         device_id=local_rank, data_dir=valdir, crop=224,
                         size=256)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size),
        fill_last_batch=False)

    return DALIWrapper(val_loader), \
        int(pipe.epoch_size("Reader") / (world_size * batch_size))

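# DALIWrapper is used throughout these loaders but never defined in the
# excerpts. A common implementation (this sketch assumes the pattern used in
# NVIDIA's DeepLearningExamples; the gdtl/gdvl variants further below also
# pass num_classes and one_hot, which this sketch omits) unpacks the DALI
# output into (input, target) pairs:
import torch

class DALIWrapper(object):
    @staticmethod
    def gen_wrapper(dalipipeline):
        for data in dalipipeline:
            input = data[0]["data"]
            target = torch.reshape(data[0]["label"], [-1]).cuda().long()
            yield input, target
        dalipipeline.reset()

    def __init__(self, dalipipeline):
        self.dalipipeline = dalipipeline

    def __iter__(self):
        return DALIWrapper.gen_wrapper(self.dalipipeline)
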
def get_dali_imageNet_val_loader(data_path, batch_size, seed, num_threads=4):
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    val_dir = os.path.join(data_path, 'ILSVRC2012_img_val')
    pipe = ImageNetHybridValPipe(batch_size=batch_size,
                                 num_threads=num_threads,
                                 device_id=local_rank, data_dir=val_dir,
                                 crop=224, size=256, seed=seed)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size('Reader') / world_size),
        fill_last_batch=False, last_batch_padded=True, auto_reset=True)

    return DALIWrapper(val_loader), ceil(
        pipe.epoch_size('Reader') / (world_size * batch_size))

def gdtl(data_path, batch_size, num_classes, one_hot, workers=5,
         _worker_init_fn=None, fp16=False, dali_cpu=False):
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    traindir = os.path.join(data_path, 'train')
    pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
                           device_id=local_rank, data_dir=traindir, crop=224,
                           dali_cpu=dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size))

    return DALIWrapper(train_loader, num_classes, one_hot), int(
        pipe.epoch_size("Reader") / (world_size * batch_size))

def gdvl(data_path, batch_size, num_classes, one_hot, workers=5,
         _worker_init_fn=None, fp16=False):
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        rank = 0
        world_size = 1

    valdir = os.path.join(data_path, 'val')
    pipe = HybridValPipe(batch_size=batch_size, num_threads=workers,
                         device_id=rank % torch.cuda.device_count(),
                         data_dir=valdir, crop=224, size=256)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size))

    return DALIWrapper(val_loader, num_classes, one_hot), int(
        pipe.epoch_size("Reader") / (world_size * batch_size))

def get_dali_tinyImageNet_train_loader(data_path, batch_size, seed,
                                       num_threads=4, dali_cpu=False):
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    train_dir = os.path.join(data_path, 'train')
    pipe = TinyImageNetHybridTrainPipe(batch_size=batch_size,
                                       num_threads=num_threads,
                                       device_id=local_rank,
                                       data_dir=train_dir, crop=56, seed=seed,
                                       dali_cpu=dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size('Reader') / world_size),
        fill_last_batch=False, last_batch_padded=True, auto_reset=True)

    return DALIWrapper(train_loader), ceil(
        pipe.epoch_size('Reader') / (world_size * batch_size))

def get_dali_val_loader(options, size, rank, gpu):

    def start_index(pipe, id):
        # Distribute epoch_size across `size` shards: the first `remainder`
        # shards each get one extra sample.
        epoch_size = pipe.epoch_size("Reader")
        remainder = epoch_size % size
        if id < remainder:
            return epoch_size // size * id + id
        else:
            return epoch_size // size * id + remainder

    if options.tiny_imagenet:
        img_size1, img_size2 = 64, 64
    else:
        img_size1, img_size2 = 256, 224

    val_pipe = dali_dataloader.ValPipeline(gpu, rank, size, img_size1,
                                           img_size2, options)
    val_pipe.build()

    global_epoch_size = val_pipe.epoch_size("Reader")
    if rank == size - 1:
        epoch_size = global_epoch_size - start_index(val_pipe, rank)
    else:
        epoch_size = start_index(val_pipe, rank + 1) - start_index(val_pipe,
                                                                   rank)

    dali_iter = DALIClassificationIterator(val_pipe, dynamic_shape=True,
                                           size=epoch_size,
                                           last_batch_padded=False,
                                           fill_last_batch=False)
    return DALIDataLoaderWrapper(dali_iter), global_epoch_size

def main_worker(local_rank, args):
    # Parameters
    input_size = 5
    output_size = 2
    batch_size_per_gpu = 8
    data_size = 1000
    world_size = args.world_size

    # prepare dist environment
    dist.init_process_group(backend='nccl', rank=local_rank,
                            world_size=world_size)
    torch.cuda.set_device(local_rank)

    # model
    model = Model(input_size, output_size)
    model = model.cuda()
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[local_rank])

    # DALI pipeline (replaces a PyTorch DataLoader + DistributedSampler setup)
    pipe = HybridTrainPipe(batch_size=cfg.batch_size,
                           num_threads=cfg.n_workers, device_id=local_rank,
                           data_dir=args.data_path, crop=cfg.image_size,
                           local_rank=local_rank, world_size=world_size)
    pipe.build()
    dataloader = DALIClassificationIterator(pipe, reader_name="Reader")

    # run
    for data in dataloader:
        input = data[0]["data"].cuda()
        output = model(input)

def gdtl(data_path, batch_size, workers=5, input_size=224,
         _worker_init_fn=None, dali_cpu=False):
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    traindir = os.path.join(data_path, 'train')
    pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
                           device_id=local_rank, data_dir=traindir,
                           crop=input_size, dali_cpu=dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size))

    return DALIWrapper(train_loader), int(
        pipe.epoch_size("Reader") / (world_size * batch_size))

def gdvl(data_path, batch_size, workers=5, _worker_init_fn=None):
    if torch.distributed.is_initialized():
        local_rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        local_rank = 0
        world_size = 1

    valdir = os.path.join(data_path, 'val')
    pipe = HybridValPipe(batch_size=batch_size, num_threads=workers,
                         device_id=local_rank, data_dir=valdir, crop=224,
                         size=256)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size),
        fill_last_batch=False)

    return DALIWrapper(val_loader), int(
        pipe.epoch_size("Reader") / (world_size * batch_size))

def dali_loader(split, args=None, cfg=None):
    pipe = HybridPipe(split, args=args)
    pipe.build()
    loader = DALIClassificationIterator(pipe,
                                        size=int(pipe.epoch_size("Reader")))
    return loader

def dali_data_iter(batch_size: int, rec_file: str, idx_file: str,
                   num_threads: int, initial_fill=32768, random_shuffle=True,
                   prefetch_queue_depth=1, local_rank=0, name="reader",
                   mean=(127.5, 127.5, 127.5), std=(127.5, 127.5, 127.5)):
    """
    Parameters
    ----------
    initial_fill : int
        Size of the buffer that is used for shuffling. If random_shuffle is
        False, this parameter is ignored.
    """
    rank: int = distributed.get_rank()
    world_size: int = distributed.get_world_size()

    import nvidia.dali.fn as fn
    import nvidia.dali.types as types
    from nvidia.dali.pipeline import Pipeline
    from nvidia.dali.plugin.pytorch import DALIClassificationIterator

    pipe = Pipeline(batch_size=batch_size, num_threads=num_threads,
                    device_id=local_rank,
                    prefetch_queue_depth=prefetch_queue_depth)
    condition_flip = fn.random.coin_flip(probability=0.5)
    with pipe:
        jpegs, labels = fn.readers.mxnet(path=rec_file, index_path=idx_file,
                                         initial_fill=initial_fill,
                                         num_shards=world_size, shard_id=rank,
                                         random_shuffle=random_shuffle,
                                         pad_last_batch=False, name=name)
        images = fn.decoders.image(jpegs, device="mixed",
                                   output_type=types.RGB)
        images = fn.crop_mirror_normalize(images, dtype=types.FLOAT,
                                          mean=mean, std=std,
                                          mirror=condition_flip)
        pipe.set_outputs(images, labels)
    pipe.build()
    return DALIWarper(DALIClassificationIterator(pipelines=[pipe],
                                                 reader_name=name))

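# DALIWarper above is not defined in these excerpts. A plausible minimal
# version (an assumption; the real class may differ) adapts the DALI iterator
# to yield (image, label) tensor pairs:
import torch

class DALIWarper(object):
    def __init__(self, dali_iter):
        self.iter = dali_iter

    def __next__(self):
        data_dict = next(self.iter)[0]
        tensor_data = data_dict["data"].cuda()
        tensor_label = data_dict["label"].cuda().long().flatten()
        return tensor_data, tensor_label

    def __iter__(self):
        return self

    def reset(self):
        self.iter.reset()
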
def build_dataloader(batch_size, workers, local_rank, world_size, data_dir,
                     crop_size, dali_cpu=False):
    pipe = HybridTrainPipe(batch_size=batch_size, num_threads=workers,
                           device_id=local_rank, data_dir=data_dir,
                           crop=crop_size, local_rank=local_rank,
                           world_size=world_size, dali_cpu=dali_cpu)
    pipe.build()
    return DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / world_size))

def val_dataloader(self):
    val_dir = os.path.join(self.hparams.data_path, "val")
    pipe = HybridValPipe(batch_size=self.hparams.batch_size, num_threads=2,
                         local_rank=self.trainer.proc_rank,
                         world_size=self.trainer.world_size, data_dir=val_dir)
    pipe.build()
    pipe_size = int(pipe.epoch_size("Reader") / self.trainer.world_size)
    return DALIClassificationIterator(pipe, size=pipe_size, auto_reset=True)

def get_dali_train_loader(options, size, rank, team, gpu, seed_rank):
    train_pipe = dali_dataloader.TrainPipeline(gpu, 224, size, rank, seed_rank,
                                               options)
    train_pipe.build()
    global_epoch_size = train_pipe.epoch_size('Reader')
    # round the per-shard epoch down to a whole number of batches
    epoch_size = (global_epoch_size // (size * options.batchsize)
                  * options.batchsize)
    dali_iter = DALIClassificationIterator(train_pipe, size=epoch_size,
                                           auto_reset=True,
                                           last_batch_padded=False,
                                           fill_last_batch=True)
    return DALIDataLoaderWrapper(dali_iter), global_epoch_size

def __init__(self, cfg: LoaderConfig):
    """Returns a train or val iterator over ImageNet data."""
    pipeline = train_pipeline if cfg._is_train else val_pipeline
    pipe = pipeline(batch_size=cfg.batch_size, num_threads=cfg.workers,
                    device_id=env_rank(), cfg=cfg)
    pipe.build()
    self.loader = DALIClassificationIterator(
        pipe, reader_name="Reader", auto_reset=True,
        last_batch_policy=LastBatchPolicy.DROP)

def get_dali_data_loader(args):
    crop_size = 224
    val_size = 256

    data_folder = get_data_folder(args.dataset)
    train_folder = os.path.join(data_folder, 'train')
    val_folder = os.path.join(data_folder, 'val')

    pipe = HybridTrainPipe(batch_size=args.batch_size,
                           num_threads=args.num_workers, device_id=args.rank,
                           data_dir=train_folder, crop=crop_size,
                           dali_cpu=args.dali == 'cpu', shard_id=args.rank,
                           num_shards=args.world_size)
    pipe.build()
    train_loader = DALIClassificationIterator(pipe, reader_name="Reader",
                                              fill_last_batch=True,
                                              last_batch_padded=False)

    pipe = HybridValPipe(batch_size=args.batch_size,
                         num_threads=args.num_workers, device_id=args.rank,
                         data_dir=val_folder, crop=crop_size, size=val_size,
                         shard_id=args.rank, num_shards=args.world_size)
    pipe.build()
    val_loader = DALIClassificationIterator(pipe, reader_name="Reader",
                                            fill_last_batch=False,
                                            last_batch_padded=False)

    return train_loader, val_loader

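# HybridTrainPipe / HybridValPipe are assumed by almost every loader above but
# never shown. A minimal sketch in the same legacy class-based style (the
# argument names and augmentation choices here are assumptions, not any
# particular repo's implementation):
import nvidia.dali.ops as ops
import nvidia.dali.types as types
from nvidia.dali.pipeline import Pipeline

class HybridTrainPipeSketch(Pipeline):
    def __init__(self, batch_size, num_threads, device_id, data_dir, crop,
                 shard_id=0, num_shards=1, dali_cpu=False):
        super().__init__(batch_size, num_threads, device_id,
                         seed=12 + device_id)
        # "Reader" is the name the snippets query via pipe.epoch_size("Reader")
        self.input = ops.FileReader(file_root=data_dir, shard_id=shard_id,
                                    num_shards=num_shards, random_shuffle=True)
        decoder_device = "cpu" if dali_cpu else "mixed"
        self.decode = ops.ImageDecoderRandomCrop(device=decoder_device,
                                                 output_type=types.RGB)
        self.resize = ops.Resize(device="cpu" if dali_cpu else "gpu",
                                 resize_x=crop, resize_y=crop)
        self.cmnp = ops.CropMirrorNormalize(
            device="gpu", dtype=types.FLOAT, output_layout="CHW",
            mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
            std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
        self.coin = ops.CoinFlip(probability=0.5)

    def define_graph(self):
        jpegs, labels = self.input(name="Reader")
        images = self.decode(jpegs)
        images = self.resize(images)
        images = self.cmnp(images.gpu(), mirror=self.coin())
        return images, labels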