def test_paddle_iterator_not_fill_last_batch_pad_last_batch():
    from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator

    num_gpus = 1
    batch_size = 100

    pipes, data_size = create_pipeline(
        lambda gpu: COCOReaderPipeline(batch_size=batch_size, num_threads=4, shard_id=gpu,
                                       num_gpus=num_gpus, data_paths=data_sets[0],
                                       random_shuffle=False, stick_to_shard=False,
                                       shuffle_after_epoch=False, pad_last_batch=True),
        batch_size, num_gpus)

    dali_train_iter = PaddleIterator(pipes, output_map=["data"],
                                     size=pipes[0].epoch_size("Reader"),
                                     fill_last_batch=False, last_batch_padded=True)

    img_ids_list, img_ids_list_set, mirrored_data, _, _ = \
        gather_ids(dali_train_iter, lambda x: np.array(x["data"]).squeeze(),
                   lambda x: 0, data_size)

    assert len(img_ids_list) == data_size
    assert len(img_ids_list_set) == data_size
    assert len(set(mirrored_data)) != 1

    dali_train_iter.reset()
    next_img_ids_list, next_img_ids_list_set, next_mirrored_data, _, _ = \
        gather_ids(dali_train_iter, lambda x: np.array(x["data"]).squeeze(),
                   lambda x: 0, data_size)

    # there is no mirroring here: the surplus of the last batch is simply cut
    # off, so mirrored_data contains real (distinct) samples
    assert len(next_img_ids_list) == data_size
    assert len(next_img_ids_list_set) == data_size
    assert len(set(next_mirrored_data)) != 1
def main():
    seg_num = 8
    target_size = 224
    video_files = [FLAGS.data + '/' + f for f in os.listdir(FLAGS.data)]

    pipeline = VideoPipe(video_files, seg_num, target_size, FLAGS.stride)
    video_loader = DALIGenericIterator(pipeline, ['image'],
                                       len(video_files), dynamic_shape=True)

    exe = fluid.Executor(fluid.CUDAPlace(0))
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            fetch_list = build(seg_num, target_size)

    exe.run(startup_prog)
    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    load_weights(exe, eval_prog, PRETRAIN_WEIGHTS)

    labels = json.load(open("kinetics_labels.json"))

    for idx, batch in enumerate(video_loader):
        fetches = exe.run(compiled_eval_prog, feed=batch, fetch_list=fetch_list)
        pred = fetches[0][0]
        topk_indices = pred.argsort()[0 - FLAGS.topk:]
        topk_labels = [labels[i] for i in topk_indices]
        filename = video_files[idx]
        print("prediction for {} is: {}".format(filename, topk_labels))
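# `main()` above assumes a `VideoPipe` class that is not part of this snippet.
# A minimal sketch using the legacy DALI ops API: the constructor signature
# matches the call above, but the internals (reader settings, resize) are
# illustrative assumptions rather than the original pipeline definition.
from nvidia.dali import ops
from nvidia.dali.pipeline import Pipeline


class VideoPipe(Pipeline):
    def __init__(self, video_files, seg_num, target_size, stride,
                 batch_size=1, num_threads=2, device_id=0):
        super(VideoPipe, self).__init__(batch_size, num_threads, device_id,
                                        seed=42)
        # decode `seg_num` frames per sample directly on the GPU
        self.reader = ops.VideoReader(device="gpu",
                                      filenames=video_files,
                                      sequence_length=seg_num,
                                      stride=stride,
                                      shard_id=0,
                                      num_shards=1,
                                      random_shuffle=False)
        # resize the shorter side, keeping the aspect ratio
        self.resize = ops.Resize(device="gpu", resize_shorter=target_size)

    def define_graph(self):
        frames = self.reader()
        frames = self.resize(frames)
        return frames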
def test_paddle_iterator_last_batch_pad_last_batch():
    num_gpus = 1
    batch_size = 100

    pipes, data_size = create_pipeline(
        lambda gpu: COCOReaderPipeline(batch_size=batch_size, num_threads=4, shard_id=gpu,
                                       num_gpus=num_gpus, data_paths=data_sets[0],
                                       random_shuffle=True, stick_to_shard=False,
                                       shuffle_after_epoch=False, pad_last_batch=True),
        batch_size, num_gpus)

    dali_train_iter = PaddleIterator(pipes, output_map=["data"],
                                     size=pipes[0].epoch_size("Reader"),
                                     fill_last_batch=True)

    img_ids_list, img_ids_list_set, mirrored_data, _, _ = \
        gather_ids(dali_train_iter, lambda x: np.array(x["data"]).squeeze(),
                   lambda x: 0, data_size)

    assert len(img_ids_list) > data_size
    assert len(img_ids_list_set) == data_size
    assert len(set(mirrored_data)) == 1

    dali_train_iter.reset()
    next_img_ids_list, next_img_ids_list_set, next_mirrored_data, _, _ = \
        gather_ids(dali_train_iter, lambda x: np.array(x["data"]).squeeze(),
                   lambda x: 0, data_size)

    assert len(next_img_ids_list) > data_size
    assert len(next_img_ids_list_set) == data_size
    assert len(set(next_mirrored_data)) == 1
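# The two paddle iterator tests in this section rely on a `gather_ids` helper
# that is not shown here. A minimal sketch of what such a helper could look
# like, assuming it drains one pass over the iterator, flattens the collected
# ids and returns the tail of the stream covering the padded region so the
# asserts can check whether padding duplicated the last sample. This is an
# illustrative reconstruction, not the actual DALI test utility.
import math

import numpy as np


def gather_ids(dali_iter, data_fn, pad_fn, data_size):
    batch_size = dali_iter.batch_size
    img_ids_list = []
    pad = 0
    for batch in dali_iter:
        # `batch` is a list with one dict per pipeline/GPU
        img_ids_list.append(data_fn(batch[0]).copy())
        pad += pad_fn(batch[0])
    img_ids_list = list(np.concatenate(img_ids_list))
    img_ids_list_set = set(img_ids_list)
    # how many samples the last batch is short of a full batch
    remainder = int(math.ceil(data_size / batch_size)) * batch_size - data_size
    # tail covering the padded region plus one real sample: if padding repeats
    # the last sample, this collapses to a single distinct value
    mirrored_data = img_ids_list[-(remainder + 1):]
    return img_ids_list, img_ids_list_set, mirrored_data, pad, remainder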
def main():
    places = []
    for p in fluid.framework.cuda_places():
        place = fluid.core.Place()
        place.set_place(p)
        places.append(place)

    file_root = os.path.join(FLAGS.data, 'train2017')
    annotations_file = os.path.join(FLAGS.data,
                                    'annotations/instances_train2017.json')

    world_size = len(places)
    pipelines = [
        HybridTrainPipe(file_root, annotations_file, FLAGS.batch_size,
                        p.gpu_device_id(), FLAGS.num_threads,
                        local_rank=idx, world_size=world_size)
        for idx, p in enumerate(places)
    ]

    train_loader = DALIGenericIterator(
        pipelines, ['image', ('gt_box', 1), ('gt_label', 1)],
        reader_name="Reader",
        last_batch_policy=LastBatchPolicy.PARTIAL,
        auto_reset=True,
        dynamic_shape=True)

    FLAGS.whole_batch_size = FLAGS.batch_size * world_size
    total_steps = 400000
    if FLAGS.check_loss_steps > 0:
        total_steps = FLAGS.check_loss_steps
    milestones = [280000, 360000]
    values = [FLAGS.lr * (0.1**i) for i in range(len(milestones) + 1)]

    exe = fluid.Executor(fluid.CUDAPlace(0))
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_fetch_list = build()
            learning_rate = fluid.layers.piecewise_decay(
                boundaries=milestones, values=values)
            learning_rate = fluid.layers.linear_lr_warmup(
                learning_rate=learning_rate,
                warmup_steps=500,
                start_lr=FLAGS.lr / 3,
                end_lr=FLAGS.lr)
            decay = FLAGS.weight_decay
            optimizer = fluid.optimizer.Momentum(
                momentum=FLAGS.momentum,
                learning_rate=learning_rate,
                regularization=fluid.regularizer.L2Decay(decay))
            avg_loss = train_fetch_list[0]
            optimizer.minimize(avg_loss)

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name)

    load_weights(exe, train_prog, PRETRAIN_WEIGHTS)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    end = time.time()

    def forever():
        # with auto_reset=True the iterator restarts itself after raising
        # StopIteration, so this generator yields batches indefinitely
        while True:
            try:
                yield next(train_loader)
            except StopIteration:
                pass

    for idx, batch in enumerate(forever()):
        if idx > total_steps:
            break
        data_time.update(time.time() - end)

        fetches = exe.run(compiled_train_prog,
                          feed=batch, fetch_list=train_fetch_list)
        loss = np.mean(fetches[0])
        losses.update(loss, FLAGS.whole_batch_size)

        if FLAGS.check_loss_steps > 0:
            if idx == 0:
                loss_start = loss
            else:
                loss_end = loss

        if idx % FLAGS.print_freq == 0 and idx > 1:
            print('Epoch: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Speed {2:.3f} ({3:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      idx, total_steps,
                      FLAGS.whole_batch_size / batch_time.val,
                      FLAGS.whole_batch_size / batch_time.avg,
                      batch_time=batch_time, data_time=data_time, loss=losses))

        if idx % FLAGS.ckpt_freq == 0 and idx > 1:
            ckpt_path = os.path.join('checkpoint', "{:02d}".format(idx))
            if os.path.isdir(ckpt_path):
                shutil.rmtree(ckpt_path)
            print('Save model to {}.'.format(ckpt_path))
            fluid.io.save_persistables(exe, ckpt_path, train_prog)

        batch_time.update(time.time() - end)
        end = time.time()

    if FLAGS.check_loss_steps > 0:
        assert loss_start > loss_end, \
            'loss should decrease after training for {} steps'.format(
                FLAGS.check_loss_steps)
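# The training loop above tracks timings and loss with an `AverageMeter`.
# A minimal sketch with the val/avg semantics the log format string relies on;
# this is an assumed implementation, not necessarily the one shipped with the
# example.
class AverageMeter(object):
    """Keeps the latest value and a running average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `n` weights the value, e.g. by the number of samples in the batch
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count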
def build(config, mode='train'):
    env = os.environ
    assert config.get('use_gpu', True) == True, "gpu training is required for DALI"
    assert not config.get(
        'use_aa'), "auto augment is not supported by DALI reader"
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
        "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"

    dataset_config = config[mode.upper()]

    # use the local device count when running single-node, otherwise the
    # trainer count exported by the distributed launcher
    gpu_num = paddle.fluid.core.get_cuda_device_count() if (
        'PADDLE_TRAINERS_NUM' not in env and 'PADDLE_TRAINER_ID' not in env
    ) else int(env.get('PADDLE_TRAINERS_NUM', 0))

    batch_size = dataset_config.batch_size
    assert batch_size % gpu_num == 0, \
        "batch size must be multiple of number of devices"
    batch_size = batch_size // gpu_num

    file_root = dataset_config.data_dir
    file_list = dataset_config.file_list

    interp = 1  # settings.interpolation or 1 # default to linear
    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }

    output_dtype = (types.FLOAT16 if 'AMP' in config and
                    config.AMP.get("use_pure_fp16", False) else types.FLOAT)

    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]

    pad_output = False
    image_shape = config.get("image_shape", None)
    if image_shape and image_shape[0] == 4:
        pad_output = True

    transforms = {
        k: v
        for d in dataset_config["transforms"] for k, v in d.items()
    }

    scale = transforms["NormalizeImage"].get("scale", 1.0 / 255)
    if isinstance(scale, str):
        scale = eval(scale)
    mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406])
    std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225])
    mean = [v / scale for v in mean]
    std = [v / scale for v in std]

    if mode == "train":
        resize_shorter = 256
        crop = transforms["RandCropImage"]["size"]
        scale = transforms["RandCropImage"].get("scale", [0.08, 1.])
        ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3])
        min_area = scale[0]
        lower = ratio[0]
        upper = ratio[1]

        if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
            shard_id = int(env['PADDLE_TRAINER_ID'])
            num_shards = int(env['PADDLE_TRAINERS_NUM'])
            device_id = int(env['FLAGS_selected_gpus'])
            pipe = HybridTrainPipe(file_root, file_list, batch_size,
                                   resize_shorter, crop, min_area, lower,
                                   upper, interp, mean, std, device_id,
                                   shard_id, num_shards,
                                   seed=42 + shard_id,
                                   pad_output=pad_output,
                                   output_dtype=output_dtype)
            pipe.build()
            pipelines = [pipe]
            sample_per_shard = len(pipe) // num_shards
        else:
            pipelines = []
            places = fluid.framework.cuda_places()
            num_shards = len(places)
            for idx, p in enumerate(places):
                place = fluid.core.Place()
                place.set_place(p)
                device_id = place.gpu_device_id()
                pipe = HybridTrainPipe(file_root, file_list, batch_size,
                                       resize_shorter, crop, min_area, lower,
                                       upper, interp, mean, std, device_id,
                                       idx, num_shards,
                                       seed=42 + idx,
                                       pad_output=pad_output,
                                       output_dtype=output_dtype)
                pipe.build()
                pipelines.append(pipe)
            sample_per_shard = len(pipelines[0])

        return DALIGenericIterator(
            pipelines, ['feed_image', 'feed_label'], size=sample_per_shard)
    else:
        resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
        crop = transforms["CropImage"]["size"]

        p = fluid.framework.cuda_places()[0]
        place = fluid.core.Place()
        place.set_place(p)
        device_id = place.gpu_device_id()
        pipe = HybridValPipe(file_root, file_list, batch_size, resize_shorter,
                             crop, interp, mean, std, device_id=device_id,
                             pad_output=pad_output, output_dtype=output_dtype)
        pipe.build()
        return DALIGenericIterator(
            pipe, ['feed_image', 'feed_label'],
            size=len(pipe),
            dynamic_shape=True,
            fill_last_batch=True,
            last_batch_padded=True)
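# Hypothetical usage of the iterator returned by `build()` above in a static
# graph evaluation loop; the executor, compiled program, fetch list and the
# 'valid' mode key are assumptions for illustration, not part of the original
# example.
val_loader = build(config, mode='valid')
for batch in val_loader:
    # each element of `batch` is a dict per GPU keyed by the output map
    # ('feed_image', 'feed_label'), so it can be passed to exe.run directly
    fetches = exe.run(compiled_eval_prog, feed=batch, fetch_list=fetch_list)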
def build(settings, mode='train'):
    env = os.environ
    assert settings.use_gpu, "gpu training is required for DALI"
    assert not settings.use_mixup, "mixup is not supported by DALI reader"
    assert not settings.use_aa, "auto augment is not supported by DALI reader"
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
        "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"

    file_root = settings.data_dir
    bs = settings.batch_size
    assert bs % paddle.fluid.core.get_cuda_device_count() == 0, \
        "batch size must be multiple of number of devices"
    batch_size = bs // paddle.fluid.core.get_cuda_device_count()

    mean = [v * 255 for v in settings.image_mean]
    std = [v * 255 for v in settings.image_std]
    crop = settings.image_shape[1]
    resize_shorter = settings.resize_short_size
    min_area = settings.lower_scale
    lower = settings.lower_ratio
    upper = settings.upper_ratio
    output_dtype = types.FLOAT16 if settings.use_pure_fp16 else types.FLOAT

    interp = settings.interpolation or 1  # default to linear
    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }
    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]

    pad_output = False
    if settings.image_shape[0] == 4:
        pad_output = True

    if mode != 'train':
        p = fluid.framework.cuda_places()[0]
        place = fluid.core.Place()
        place.set_place(p)
        device_id = place.gpu_device_id()
        file_list = os.path.join(file_root, 'val_list.txt')
        if not os.path.exists(file_list):
            file_list = None
            file_root = os.path.join(file_root, 'val')
        pipe = HybridValPipe(file_root, file_list, batch_size, resize_shorter,
                             crop, interp, mean, std, device_id=device_id,
                             pad_output=pad_output, output_dtype=output_dtype)
        pipe.build()
        return DALIGenericIterator(pipe, ['feed_image', 'feed_label'],
                                   size=len(pipe),
                                   dynamic_shape=True,
                                   fill_last_batch=True,
                                   last_batch_padded=True)

    file_list = os.path.join(file_root, 'train_list.txt')
    if not os.path.exists(file_list):
        file_list = None
        file_root = os.path.join(file_root, 'train')

    if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
        shard_id = int(env['PADDLE_TRAINER_ID'])
        num_shards = int(env['PADDLE_TRAINERS_NUM'])
        device_id = int(env['FLAGS_selected_gpus'])
        pipe = HybridTrainPipe(file_root, file_list, batch_size, resize_shorter,
                               crop, min_area, lower, upper, interp, mean, std,
                               device_id, shard_id, num_shards,
                               seed=42 + shard_id,
                               pad_output=pad_output,
                               output_dtype=output_dtype)
        pipe.build()
        pipelines = [pipe]
        sample_per_shard = len(pipe) // num_shards
    else:
        pipelines = []
        places = fluid.framework.cuda_places()
        num_shards = len(places)
        for idx, p in enumerate(places):
            place = fluid.core.Place()
            place.set_place(p)
            device_id = place.gpu_device_id()
            pipe = HybridTrainPipe(file_root, file_list, batch_size,
                                   resize_shorter, crop, min_area, lower,
                                   upper, interp, mean, std,
                                   device_id, idx, num_shards,
                                   seed=42 + idx,
                                   pad_output=pad_output,
                                   output_dtype=output_dtype)
            pipe.build()
            pipelines.append(pipe)
        sample_per_shard = len(pipelines[0])

    return DALIGenericIterator(pipelines, ['feed_image', 'feed_label'],
                               size=sample_per_shard)
def reader_():
    with open(self.filelist) as flist:
        full_lines = [line for line in flist]
        if self.mode == 'train':
            if (not hasattr(reader_, 'seed')):
                reader_.seed = 0
            random.Random(reader_.seed).shuffle(full_lines)
            print("reader shuffle seed", reader_.seed)
            if reader_.seed is not None:
                reader_.seed += 1

        per_node_lines = int(
            math.ceil(len(full_lines) * 1.0 / self.num_trainers))
        total_lines = per_node_lines * self.num_trainers

        # pad full_lines so that it is evenly divisible among trainers
        full_lines += full_lines[:(total_lines - len(full_lines))]
        assert len(full_lines) == total_lines

        # each trainer picks its own subset of samples
        lines = full_lines[self.trainer_id:total_lines:self.num_trainers]
        assert len(lines) == per_node_lines

        logger.info("trainerid %d, trainer_count %d" %
                    (self.trainer_id, self.num_trainers))
        logger.info(
            "read images from %d, length: %d, lines length: %d, total: %d" %
            (self.trainer_id * per_node_lines, per_node_lines, len(lines),
             len(full_lines)))

    video_files = ''
    for item in lines:
        video_files += item

    tf = tempfile.NamedTemporaryFile()
    tf.write(str.encode(video_files))
    tf.flush()
    video_files = tf.name

    device_id = int(os.getenv('FLAGS_selected_gpus', 0))
    print('---------- device id -----------', device_id)

    if self.mode == 'train':
        pipe = VideoPipe(batch_size=self.batch_size,
                         num_threads=1,
                         device_id=device_id,
                         file_list=video_files,
                         sequence_length=self.seg_num * self.seglen,
                         seg_num=self.seg_num,
                         seg_length=self.seglen,
                         resize_shorter_scale=self.short_size,
                         crop_target_size=self.target_size,
                         is_training=(self.mode == 'train'),
                         dali_mean=self.dali_mean,
                         dali_std=self.dali_std)
    else:
        pipe = VideoTestPipe(batch_size=self.batch_size,
                             num_threads=1,
                             device_id=device_id,
                             file_list=video_files,
                             sequence_length=self.seg_num * self.seglen,
                             seg_num=self.seg_num,
                             seg_length=self.seglen,
                             resize_shorter_scale=self.short_size,
                             crop_target_size=self.target_size,
                             is_training=(self.mode == 'train'),
                             dali_mean=self.dali_mean,
                             dali_std=self.dali_std)

    logger.info(
        'initializing dataset, it will take several minutes if it is too large ....'
    )
    video_loader = DALIGenericIterator([pipe], ['image', 'label'],
                                       len(lines),
                                       dynamic_shape=True,
                                       auto_reset=True)
    return video_loader
def build(settings,
          mode='train',
          trainer_id=None,
          trainers_num=None,
          gpu_id=0,
          data_layout='NCHW'):
    env = os.environ
    assert settings.use_gpu, "gpu training is required for DALI"
    assert not settings.use_mixup, "mixup is not supported by DALI reader"
    # assert not settings.use_aa, "auto augment is not supported by DALI reader"
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
        "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"

    file_root = settings.data_dir
    batch_size = settings.batch_size
    print("batch_size:", batch_size)

    mean = [v * 255 for v in settings.image_mean]
    std = [v * 255 for v in settings.image_std]
    image_shape = [int(m) for m in settings.image_shape.split(",")]
    crop = image_shape[1]
    resize_shorter = settings.resize_short_size
    min_area = settings.lower_scale
    lower = settings.lower_ratio
    upper = settings.upper_ratio

    interp = settings.interpolation or 1  # default to linear
    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }
    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]

    if mode != 'train':
        file_list = os.path.join(file_root, 'val_list.txt')
        if not os.path.exists(file_list):
            file_list = None
            file_root = os.path.join(file_root, 'val')
        pipe = HybridValPipe(file_root, file_list, batch_size, resize_shorter,
                             crop, interp, mean, std, device_id=gpu_id,
                             data_layout=data_layout)
        pipe.build()
        return DALIGenericIterator(pipe, ['feed_image', 'feed_label'],
                                   size=len(pipe),
                                   dynamic_shape=True,
                                   fill_last_batch=False,
                                   last_batch_padded=True)

    file_list = os.path.join(file_root, 'train_list.txt')
    if not os.path.exists(file_list):
        file_list = None
        file_root = os.path.join(file_root, 'train')

    if trainer_id is not None and trainers_num is not None:
        print("dali gpu_id:", gpu_id, "shard_id:", trainer_id,
              "num_shard:", trainers_num)
        shard_id = trainer_id
        num_shards = trainers_num
        pipe = HybridTrainPipe(file_root, file_list, batch_size, resize_shorter,
                               crop, min_area, lower, upper, interp, mean, std,
                               device_id=gpu_id, shard_id=shard_id,
                               num_shards=num_shards, seed=42 + shard_id,
                               data_layout=data_layout, num_threads=4)
        pipe.build()
        pipelines = [pipe]
        sample_per_shard = len(pipe) // num_shards
    else:
        pipelines = []
        places = fluid.framework.cuda_places()
        num_shards = len(places)
        for idx, p in enumerate(places):
            place = fluid.core.Place()
            place.set_place(p)
            device_id = place.gpu_device_id()
            pipe = HybridTrainPipe(file_root, file_list, batch_size,
                                   resize_shorter, crop, min_area, lower,
                                   upper, interp, mean, std, device_id, idx,
                                   num_shards, seed=42 + idx,
                                   data_layout=data_layout, num_threads=4)
            pipe.build()
            pipelines.append(pipe)
        sample_per_shard = len(pipelines[0])

    return DALIGenericIterator(pipelines, ['feed_image', 'feed_label'],
                               size=sample_per_shard)
def reader_():
    with open(self.file_path) as flist:
        full_lines = [line for line in flist]
        if (not hasattr(reader_, 'seed')):
            reader_.seed = 0
        random.Random(reader_.seed).shuffle(full_lines)
        logger.info(f"reader shuffle seed: {reader_.seed}.")
        if reader_.seed is not None:
            reader_.seed += 1

        per_node_lines = int(
            math.ceil(len(full_lines) * 1.0 / self.num_shards))
        total_lines = per_node_lines * self.num_shards

        # pad full_lines so that it is evenly divisible among shards
        full_lines += full_lines[:(total_lines - len(full_lines))]
        assert len(full_lines) == total_lines

        # each trainer picks its own subset of samples
        lines = full_lines[self.shard_id:total_lines:self.num_shards]
        assert len(lines) == per_node_lines

        logger.info(
            f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}")
        logger.info(f"read videos from {self.shard_id * per_node_lines}, "
                    f"length: {per_node_lines}, "
                    f"lines length: {len(lines)}, "
                    f"total: {len(full_lines)}")

    video_files = ''
    for item in lines:
        video_files += item

    tf = tempfile.NamedTemporaryFile()
    tf.write(str.encode(video_files))
    tf.flush()
    video_files = tf.name

    device_id = ParallelEnv().local_rank
    logger.info(f'---------- device_id: {device_id} -----------')

    pipe = VideoPipe(batch_size=self.batch_size,
                     num_threads=1,
                     device_id=device_id,
                     file_list=video_files,
                     sequence_length=self.seg_num * self.seglen,
                     seg_num=self.seg_num,
                     seg_length=self.seglen,
                     resize_shorter_scale=self.short_size,
                     crop_target_size=self.target_size,
                     is_training=True,
                     num_shards=self.num_shards,
                     shard_id=self.shard_id,
                     dali_mean=self.dali_mean,
                     dali_std=self.dali_std)

    logger.info(
        'initializing dataset, it will take several minutes if it is too large ....'
    )
    video_loader = DALIGenericIterator([pipe], ['image', 'label'],
                                       len(lines),
                                       dynamic_shape=True,
                                       auto_reset=True)
    return video_loader
def dali_dataloader(config, mode, device, seed=None):
    assert "gpu" in device, "gpu training is required for DALI"
    device_id = int(device.split(':')[1])
    config_dataloader = config[mode]
    seed = 42 if seed is None else seed
    ops = [
        list(x.keys())[0]
        for x in config_dataloader["dataset"]["transform_ops"]
    ]
    support_ops_train = [
        "DecodeImage", "NormalizeImage", "RandFlipImage", "RandCropImage"
    ]
    support_ops_eval = [
        "DecodeImage", "ResizeImage", "CropImage", "NormalizeImage"
    ]

    if mode.lower() == 'train':
        assert set(ops) == set(
            support_ops_train
        ), "The supported transform_ops for train_dataset in dali is : {}".format(
            ",".join(support_ops_train))
    else:
        assert set(ops) == set(
            support_ops_eval
        ), "The supported transform_ops for eval_dataset in dali is : {}".format(
            ",".join(support_ops_eval))

    normalize_ops = [
        op for op in config_dataloader["dataset"]["transform_ops"]
        if "NormalizeImage" in op
    ][0]["NormalizeImage"]
    channel_num = normalize_ops.get("channel_num", 3)
    output_dtype = types.FLOAT16 if normalize_ops.get("output_fp16",
                                                      False) else types.FLOAT

    env = os.environ
    # assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
    #     "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
    #     " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"

    gpu_num = paddle.distributed.get_world_size()

    batch_size = config_dataloader["sampler"]["batch_size"]

    file_root = config_dataloader["dataset"]["image_root"]
    file_list = config_dataloader["dataset"]["cls_label_path"]

    interp = 1  # settings.interpolation or 1 # default to linear
    interp_map = {
        0: types.DALIInterpType.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.DALIInterpType.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.DALIInterpType.INTERP_CUBIC,  # cv2.INTER_CUBIC
        3: types.DALIInterpType.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }
    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]
    pad_output = channel_num == 4

    transforms = {
        k: v
        for d in config_dataloader["dataset"]["transform_ops"]
        for k, v in d.items()
    }
    scale = transforms["NormalizeImage"].get("scale", 1.0 / 255)
    scale = eval(scale) if isinstance(scale, str) else scale
    mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406])
    std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225])
    mean = [v / scale for v in mean]
    std = [v / scale for v in std]

    sampler_name = config_dataloader["sampler"].get("name",
                                                    "DistributedBatchSampler")
    assert sampler_name in ["DistributedBatchSampler", "BatchSampler"]

    if mode.lower() == "train":
        resize_shorter = 256
        crop = transforms["RandCropImage"]["size"]
        scale = transforms["RandCropImage"].get("scale", [0.08, 1.])
        ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3])
        min_area = scale[0]
        lower = ratio[0]
        upper = ratio[1]

        if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
            shard_id = int(env['PADDLE_TRAINER_ID'])
            num_shards = int(env['PADDLE_TRAINERS_NUM'])
            device_id = int(env['FLAGS_selected_gpus'])

            pipe = HybridTrainPipe(file_root, file_list, batch_size,
                                   resize_shorter, crop, min_area, lower,
                                   upper, interp, mean, std, device_id,
                                   shard_id, num_shards,
                                   seed=seed + shard_id,
                                   pad_output=pad_output,
                                   output_dtype=output_dtype)
            pipe.build()
            pipelines = [pipe]
            # sample_per_shard = len(pipe) // num_shards
        else:
            pipe = HybridTrainPipe(file_root, file_list, batch_size,
                                   resize_shorter, crop, min_area, lower,
                                   upper, interp, mean, std,
                                   device_id=device_id, shard_id=0,
                                   num_shards=1, seed=seed,
                                   pad_output=pad_output,
                                   output_dtype=output_dtype)
            pipe.build()
            pipelines = [pipe]
            # sample_per_shard = len(pipelines[0])
        return DALIGenericIterator(pipelines, ['data', 'label'],
                                   reader_name='Reader')
    else:
        resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
        crop = transforms["CropImage"]["size"]

        if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env \
                and sampler_name == "DistributedBatchSampler":
            shard_id = int(env['PADDLE_TRAINER_ID'])
            num_shards = int(env['PADDLE_TRAINERS_NUM'])
            device_id = int(env['FLAGS_selected_gpus'])

            pipe = HybridValPipe(file_root, file_list, batch_size,
                                 resize_shorter, crop, interp, mean, std,
                                 device_id=device_id, shard_id=shard_id,
                                 num_shards=num_shards,
                                 pad_output=pad_output,
                                 output_dtype=output_dtype)
        else:
            pipe = HybridValPipe(file_root, file_list, batch_size,
                                 resize_shorter, crop, interp, mean, std,
                                 device_id=device_id,
                                 pad_output=pad_output,
                                 output_dtype=output_dtype)
        pipe.build()
        return DALIGenericIterator([pipe], ['data', 'label'],
                                   reader_name="Reader")
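# Hypothetical usage of `dali_dataloader` above; the config structure, the
# 'Train' mode key and the `model` object are assumptions for illustration,
# not part of the original code.
train_loader = dali_dataloader(config, 'Train', "gpu:0")
for batch in train_loader:
    # `batch` holds one dict per pipeline, keyed by the output map above
    images = batch[0]['data']
    labels = batch[0]['label']
    # images/labels can now be consumed like the output of any Paddle loader
    logits = model(images)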