Beispiel #1
0
def test_paddle_iterator_not_fill_last_batch_pad_last_batch():
    """Iterate two epochs with ``fill_last_batch=False`` over a padded reader.

    The reader pipeline is created with ``pad_last_batch=True`` and the
    iterator with ``last_batch_padded=True``.  For each epoch the first two
    values returned by ``gather_ids`` must both have length ``data_size`` and
    the third one (``mirrored_data``) must contain more than one distinct
    value.
    """
    from nvidia.dali.plugin.paddle import DALIGenericIterator as PaddleIterator
    num_gpus = 1
    batch_size = 100

    def make_pipeline(gpu):
        return COCOReaderPipeline(batch_size=batch_size, num_threads=4,
                                  shard_id=gpu, num_gpus=num_gpus,
                                  data_paths=data_sets[0],
                                  random_shuffle=False, stick_to_shard=False,
                                  shuffle_after_epoch=False,
                                  pad_last_batch=True)

    pipes, data_size = create_pipeline(make_pipeline, batch_size, num_gpus)

    dali_train_iter = PaddleIterator(pipes, output_map=["data"],
                                     size=pipes[0].epoch_size("Reader"),
                                     fill_last_batch=False,
                                     last_batch_padded=True)

    def check_epoch():
        # The same expectations hold for every epoch, so they live in one
        # place instead of being copy-pasted around the reset() call.
        ids_list, ids_set, mirrored_data, _, _ = gather_ids(
            dali_train_iter, lambda x: np.array(x["data"]).squeeze(),
            lambda x: 0, data_size)

        # there is no mirroring as data in the output is just cut off,
        # in the mirrored_data there is real data
        assert len(ids_list) == data_size
        assert len(ids_set) == data_size
        assert len(set(mirrored_data)) != 1

    check_epoch()
    dali_train_iter.reset()
    check_epoch()
Beispiel #2
0
def main():
    """Run the evaluation program on every file in ``FLAGS.data``.

    A ``VideoPipe`` is built over all directory entries of ``FLAGS.data`` and
    wrapped in a ``DALIGenericIterator``.  Each batch is fed to the compiled
    evaluation program and the ``FLAGS.topk`` best-scoring labels (looked up
    in ``kinetics_labels.json``) are printed next to the file name.
    """
    seg_num = 8
    target_size = 224

    video_files = [os.path.join(FLAGS.data, f) for f in os.listdir(FLAGS.data)]
    pipeline = VideoPipe(video_files, seg_num, target_size, FLAGS.stride)

    video_loader = DALIGenericIterator(pipeline, ['image'],
                                       len(video_files),
                                       dynamic_shape=True)

    exe = fluid.Executor(fluid.CUDAPlace(0))
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()

    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            fetch_list = build(seg_num, target_size)

    exe.run(startup_prog)
    compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    load_weights(exe, eval_prog, PRETRAIN_WEIGHTS)

    # Use a context manager so the label file is closed deterministically
    # instead of leaking the handle until garbage collection.
    with open("kinetics_labels.json") as label_file:
        labels = json.load(label_file)

    for idx, batch in enumerate(video_loader):
        fetches = exe.run(compiled_eval_prog,
                          feed=batch,
                          fetch_list=fetch_list)
        pred = fetches[0][0]
        # Keep the last ``topk`` entries of the argsort result.
        # NOTE(review): presumably ``pred`` is a NumPy array, in which case
        # argsort is ascending and the best label is listed last - confirm.
        topk_indices = pred.argsort()[-FLAGS.topk:]
        topk_labels = [labels[i] for i in topk_indices]
        # NOTE(review): indexing by batch number assumes one video per batch
        # delivered in ``video_files`` order - confirm against VideoPipe.
        filename = video_files[idx]
        print("prediction for {} is: {}".format(filename, topk_labels))
Beispiel #3
0
def test_paddle_iterator_last_batch_pad_last_batch():
    """Iterate two epochs with ``fill_last_batch=True`` over a padded reader.

    The reader pipeline is created with ``pad_last_batch=True`` and shuffling
    enabled.  For each epoch the first value returned by ``gather_ids`` must
    be longer than ``data_size``, the second must have length ``data_size``
    and the third (``mirrored_data``) must contain exactly one distinct value.
    """
    num_gpus = 1
    batch_size = 100

    def make_pipeline(gpu):
        return COCOReaderPipeline(batch_size=batch_size,
                                  num_threads=4,
                                  shard_id=gpu,
                                  num_gpus=num_gpus,
                                  data_paths=data_sets[0],
                                  random_shuffle=True,
                                  stick_to_shard=False,
                                  shuffle_after_epoch=False,
                                  pad_last_batch=True)

    pipes, data_size = create_pipeline(make_pipeline, batch_size, num_gpus)

    dali_train_iter = PaddleIterator(pipes,
                                     output_map=["data"],
                                     size=pipes[0].epoch_size("Reader"),
                                     fill_last_batch=True)

    def check_epoch():
        # The same expectations hold for every epoch, so they live in one
        # place instead of being copy-pasted around the reset() call.
        ids_list, ids_set, mirrored_data, _, _ = gather_ids(
            dali_train_iter, lambda x: np.array(x["data"]).squeeze(),
            lambda x: 0, data_size)

        assert len(ids_list) > data_size
        assert len(ids_set) == data_size
        assert len(set(mirrored_data)) == 1

    check_epoch()
    dali_train_iter.reset()
    check_epoch()
Beispiel #4
0
def main():
    """Train the program produced by ``build()`` with DALI-fed COCO data.

    One ``HybridTrainPipe`` is created per CUDA place and all of them are
    wrapped in a single ``DALIGenericIterator``.  Optimisation uses Momentum
    with L2 decay under a piecewise-decayed learning rate with linear warmup.
    Timing and loss are printed every ``FLAGS.print_freq`` steps and
    persistables are saved every ``FLAGS.ckpt_freq`` steps.  When
    ``FLAGS.check_loss_steps`` is positive it replaces the default step budget
    and the last observed loss must be lower than the first one.
    """

    def to_place(cuda_place):
        wrapped = fluid.core.Place()
        wrapped.set_place(cuda_place)
        return wrapped

    places = [to_place(p) for p in fluid.framework.cuda_places()]

    file_root = os.path.join(FLAGS.data, 'train2017')
    annotations_file = os.path.join(FLAGS.data,
                                    'annotations/instances_train2017.json')
    world_size = len(places)

    pipelines = []
    for rank, place in enumerate(places):
        pipelines.append(
            HybridTrainPipe(file_root,
                            annotations_file,
                            FLAGS.batch_size,
                            place.gpu_device_id(),
                            FLAGS.num_threads,
                            local_rank=rank,
                            world_size=world_size))

    train_loader = DALIGenericIterator(
        pipelines, ['image', ('gt_box', 1), ('gt_label', 1)],
        reader_name="Reader",
        last_batch_policy=LastBatchPolicy.PARTIAL,
        auto_reset=True,
        dynamic_shape=True)

    FLAGS.whole_batch_size = FLAGS.batch_size * world_size
    # A positive check_loss_steps overrides the default step budget.
    total_steps = FLAGS.check_loss_steps if FLAGS.check_loss_steps > 0 else 400000
    milestones = [280000, 360000]
    values = []
    for stage in range(len(milestones) + 1):
        values.append(FLAGS.lr * (0.1**stage))

    exe = fluid.Executor(fluid.CUDAPlace(0))
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_fetch_list = build()
            stepped_lr = fluid.layers.piecewise_decay(boundaries=milestones,
                                                      values=values)
            warmed_lr = fluid.layers.linear_lr_warmup(
                learning_rate=stepped_lr,
                warmup_steps=500,
                start_lr=FLAGS.lr / 3,
                end_lr=FLAGS.lr)
            optimizer = fluid.optimizer.Momentum(
                momentum=FLAGS.momentum,
                learning_rate=warmed_lr,
                regularization=fluid.regularizer.L2Decay(FLAGS.weight_decay))
            avg_loss = train_fetch_list[0]
            optimizer.minimize(avg_loss)

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=avg_loss.name)

    load_weights(exe, train_prog, PRETRAIN_WEIGHTS)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    tick = time.time()

    def forever():
        # StopIteration from the loader is swallowed and the next batch is
        # requested again; the loader was created with auto_reset=True.
        while True:
            try:
                batch = next(train_loader)
            except StopIteration:
                continue
            yield batch

    report_format = ('Epoch: [{0}/{1}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Speed {2:.3f} ({3:.3f})\t'
                     'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t')

    for step, batch in enumerate(forever()):
        if step > total_steps:
            break
        data_time.update(time.time() - tick)

        fetches = exe.run(compiled_train_prog,
                          feed=batch,
                          fetch_list=train_fetch_list)
        loss = np.mean(fetches[0])

        losses.update(loss, FLAGS.whole_batch_size)

        if FLAGS.check_loss_steps > 0:
            # Remember the first loss, and keep overwriting the latest one.
            if step == 0:
                loss_start = loss
            else:
                loss_end = loss

        if step % FLAGS.print_freq == 0 and step > 1:
            print(report_format.format(
                step,
                total_steps,
                FLAGS.whole_batch_size / batch_time.val,
                FLAGS.whole_batch_size / batch_time.avg,
                batch_time=batch_time,
                data_time=data_time,
                loss=losses))

        if step % FLAGS.ckpt_freq == 0 and step > 1:
            ckpt_path = os.path.join('checkpoint', "{:02d}".format(step))
            # Replace any checkpoint already stored under the same step name.
            if os.path.isdir(ckpt_path):
                shutil.rmtree(ckpt_path)

            print('Save model to {}.'.format(ckpt_path))
            fluid.io.save_persistables(exe, ckpt_path, train_prog)

        batch_time.update(time.time() - tick)
        tick = time.time()

    if FLAGS.check_loss_steps > 0:
        assert loss_start > loss_end, \
            'loss should decrease after training for {} steps'.format(
                FLAGS.check_loss_steps)
Beispiel #5
0
def build(config, mode='train'):
    """Create a ``DALIGenericIterator`` for training or evaluation.

    Args:
        config: Configuration object supporting ``get`` and item access.
            Reads ``use_gpu``, ``use_aa``, ``image_shape``, the optional
            ``AMP`` section (``use_pure_fp16``) and the dataset section
            ``config[mode.upper()]`` with ``batch_size``, ``data_dir``,
            ``file_list`` and ``transforms``.
        mode: ``'train'`` builds sharded ``HybridTrainPipe`` pipelines; any
            other value builds one ``HybridValPipe`` on the first CUDA place.

    Returns:
        A ``DALIGenericIterator`` with outputs ``feed_image`` and
        ``feed_label``.

    Raises:
        AssertionError: if GPU use is disabled, auto augment is requested,
            ``FLAGS_fraction_of_gpu_memory_to_use`` is not below 0.9, the
            batch size is not a multiple of the device count, or the
            interpolation method is unsupported.
    """
    env = os.environ
    assert config.get('use_gpu',
                      True) == True, "gpu training is required for DALI"
    assert not config.get(
        'use_aa'), "auto augment is not supported by DALI reader"
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \
        "Please leave enough GPU memory for DALI workspace, e.g., by setting" \
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`"

    dataset_config = config[mode.upper()]

    # The previous expression `('PADDLE_TRAINERS_NUM') and ('PADDLE_TRAINER_ID')
    # not in env` only tested PADDLE_TRAINER_ID (a non-empty string literal is
    # always truthy), so a missing PADDLE_TRAINERS_NUM produced gpu_num == 0
    # and a ZeroDivisionError below.  Use the same "both variables present"
    # test that selects the sharded pipeline further down.
    distributed = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if distributed:
        gpu_num = int(env['PADDLE_TRAINERS_NUM'])
    else:
        gpu_num = paddle.fluid.core.get_cuda_device_count()

    batch_size = dataset_config.batch_size
    assert batch_size % gpu_num == 0, \
        "batch size must be multiple of number of devices"
    batch_size = batch_size // gpu_num

    file_root = dataset_config.data_dir
    file_list = dataset_config.file_list

    interp = 1  # settings.interpolation or 1  # default to linear
    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }

    output_dtype = (types.FLOAT16 if 'AMP' in config and
                    config.AMP.get("use_pure_fp16", False)
                    else types.FLOAT)

    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]
    pad_output = False
    image_shape = config.get("image_shape", None)
    if image_shape and image_shape[0] == 4:
        pad_output = True

    # Flatten the list of single-entry transform dicts into one mapping.
    transforms = {
        k: v
        for d in dataset_config["transforms"] for k, v in d.items()
    }

    scale = transforms["NormalizeImage"].get("scale", 1.0 / 255)
    if isinstance(scale, str):
        # SECURITY NOTE(review): eval() executes arbitrary code taken from the
        # configuration; only acceptable while config files are trusted.
        scale = eval(scale)
    mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406])
    std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225])
    mean = [v / scale for v in mean]
    std = [v / scale for v in std]

    if mode == "train":
        resize_shorter = 256
        crop = transforms["RandCropImage"]["size"]
        # Kept separate from the normalisation `scale` above; this one is the
        # random-crop area range.
        crop_scale = transforms["RandCropImage"].get("scale", [0.08, 1.])
        ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3])
        min_area = crop_scale[0]
        lower = ratio[0]
        upper = ratio[1]

        if distributed:
            shard_id = int(env['PADDLE_TRAINER_ID'])
            num_shards = int(env['PADDLE_TRAINERS_NUM'])
            device_id = int(env['FLAGS_selected_gpus'])
            pipe = HybridTrainPipe(
                file_root,
                file_list,
                batch_size,
                resize_shorter,
                crop,
                min_area,
                lower,
                upper,
                interp,
                mean,
                std,
                device_id,
                shard_id,
                num_shards,
                seed=42 + shard_id,
                pad_output=pad_output,
                output_dtype=output_dtype)
            pipe.build()
            pipelines = [pipe]
            sample_per_shard = len(pipe) // num_shards
        else:
            pipelines = []
            places = fluid.framework.cuda_places()
            num_shards = len(places)
            for idx, p in enumerate(places):
                place = fluid.core.Place()
                place.set_place(p)
                device_id = place.gpu_device_id()
                pipe = HybridTrainPipe(
                    file_root,
                    file_list,
                    batch_size,
                    resize_shorter,
                    crop,
                    min_area,
                    lower,
                    upper,
                    interp,
                    mean,
                    std,
                    device_id,
                    idx,
                    num_shards,
                    seed=42 + idx,
                    pad_output=pad_output,
                    output_dtype=output_dtype)
                pipe.build()
                pipelines.append(pipe)
            sample_per_shard = len(pipelines[0])
        return DALIGenericIterator(
            pipelines, ['feed_image', 'feed_label'], size=sample_per_shard)
    else:
        resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
        crop = transforms["CropImage"]["size"]

        p = fluid.framework.cuda_places()[0]
        place = fluid.core.Place()
        place.set_place(p)
        device_id = place.gpu_device_id()
        pipe = HybridValPipe(
            file_root,
            file_list,
            batch_size,
            resize_shorter,
            crop,
            interp,
            mean,
            std,
            device_id=device_id,
            pad_output=pad_output,
            output_dtype=output_dtype)
        pipe.build()
        return DALIGenericIterator(
            pipe, ['feed_image', 'feed_label'],
            size=len(pipe),
            dynamic_shape=True,
            fill_last_batch=True,
            last_batch_padded=True)
Beispiel #6
0
def build(settings, mode='train'):
    """Assemble the DALI iterator described by ``settings``.

    Args:
        settings: Object exposing ``use_gpu``, ``use_mixup``, ``use_aa``,
            ``data_dir``, ``batch_size``, ``image_mean``, ``image_std``,
            ``image_shape``, ``resize_short_size``, ``lower_scale``,
            ``lower_ratio``, ``upper_ratio``, ``use_pure_fp16`` and
            ``interpolation``.
        mode: ``'train'`` yields sharded training pipelines, anything else a
            single validation pipeline on the first CUDA place.

    Returns:
        A ``DALIGenericIterator`` with outputs ``feed_image`` and
        ``feed_label``.
    """
    env = os.environ
    assert settings.use_gpu, "gpu training is required for DALI"
    assert not settings.use_mixup, "mixup is not supported by DALI reader"
    assert not settings.use_aa, "auto augment is not supported by DALI reader"
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, (
        "Please leave enough GPU memory for DALI workspace, e.g., by setting"
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`")

    # The global batch is divided evenly across all visible CUDA devices.
    device_count = paddle.fluid.core.get_cuda_device_count()
    global_batch = settings.batch_size
    assert global_batch % device_count == 0, \
        "batch size must be multiple of number of devices"
    batch_size = global_batch // device_count

    file_root = settings.data_dir
    mean = [channel * 255 for channel in settings.image_mean]
    std = [channel * 255 for channel in settings.image_std]
    crop = settings.image_shape[1]
    resize_shorter = settings.resize_short_size
    min_area = settings.lower_scale
    lower, upper = settings.lower_ratio, settings.upper_ratio
    if settings.use_pure_fp16:
        output_dtype = types.FLOAT16
    else:
        output_dtype = types.FLOAT

    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }
    interp = settings.interpolation or 1  # default to linear
    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]
    pad_output = bool(settings.image_shape[0] == 4)

    output_names = ['feed_image', 'feed_label']
    pipe_options = dict(pad_output=pad_output, output_dtype=output_dtype)

    def gpu_id_of(cuda_place):
        wrapped = fluid.core.Place()
        wrapped.set_place(cuda_place)
        return wrapped.gpu_device_id()

    if mode != 'train':
        device_id = gpu_id_of(fluid.framework.cuda_places()[0])
        # Prefer an explicit list file; otherwise read the 'val' directory.
        file_list = os.path.join(file_root, 'val_list.txt')
        if not os.path.exists(file_list):
            file_list = None
            file_root = os.path.join(file_root, 'val')
        val_pipe = HybridValPipe(file_root, file_list, batch_size,
                                 resize_shorter, crop, interp, mean, std,
                                 device_id=device_id, **pipe_options)
        val_pipe.build()
        return DALIGenericIterator(val_pipe, output_names,
                                   size=len(val_pipe),
                                   dynamic_shape=True,
                                   fill_last_batch=True,
                                   last_batch_padded=True)

    # Prefer an explicit list file; otherwise read the 'train' directory.
    file_list = os.path.join(file_root, 'train_list.txt')
    if not os.path.exists(file_list):
        file_list = None
        file_root = os.path.join(file_root, 'train')

    # Leading positional arguments shared by every training pipeline.
    shared_args = (file_root, file_list, batch_size, resize_shorter, crop,
                   min_area, lower, upper, interp, mean, std)

    if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env:
        # One process per shard: a single pipeline on the selected GPU.
        shard_id = int(env['PADDLE_TRAINER_ID'])
        num_shards = int(env['PADDLE_TRAINERS_NUM'])
        device_id = int(env['FLAGS_selected_gpus'])
        train_pipe = HybridTrainPipe(*shared_args, device_id, shard_id,
                                     num_shards, seed=42 + shard_id,
                                     **pipe_options)
        train_pipe.build()
        pipelines = [train_pipe]
        sample_per_shard = len(train_pipe) // num_shards
    else:
        # Single process: one pipeline (and shard) per visible CUDA place.
        places = fluid.framework.cuda_places()
        num_shards = len(places)
        pipelines = []
        for idx, cuda_place in enumerate(places):
            train_pipe = HybridTrainPipe(*shared_args, gpu_id_of(cuda_place),
                                         idx, num_shards, seed=42 + idx,
                                         **pipe_options)
            train_pipe.build()
            pipelines.append(train_pipe)
        sample_per_shard = len(pipelines[0])

    return DALIGenericIterator(pipelines, output_names, size=sample_per_shard)
Beispiel #7
0
        def reader_():
            """Build a DALI video loader over this trainer's share of the list.

            Reads ``self.filelist``, shuffles it in train mode with a seed
            stored on the function object (incremented on every call), pads
            the list to a multiple of ``self.num_trainers`` and keeps every
            ``num_trainers``-th line starting at ``self.trainer_id``.  The
            selected lines are written to a temporary file that is handed to
            ``VideoPipe`` (train) or ``VideoTestPipe`` (otherwise).

            Returns:
                A ``DALIGenericIterator`` with outputs ``image`` and
                ``label``, created with ``auto_reset=True``.
            """
            with open(self.filelist) as flist:
                full_lines = list(flist)

            if self.mode == 'train':
                if not hasattr(reader_, 'seed'):
                    reader_.seed = 0
                random.Random(reader_.seed).shuffle(full_lines)
                print("reader shuffle seed", reader_.seed)
                if reader_.seed is not None:
                    reader_.seed += 1

            per_node_lines = int(
                math.ceil(len(full_lines) * 1.0 / self.num_trainers))
            total_lines = per_node_lines * self.num_trainers

            # Pad by wrapping around so the list divides evenly.  A single
            # slice falls short when fewer lines exist than are missing
            # (e.g. 1 line, 4 trainers), so repeat until the target is met.
            while len(full_lines) < total_lines:
                full_lines += full_lines[:total_lines - len(full_lines)]
            assert len(full_lines) == total_lines

            # trainer get own sample
            lines = full_lines[self.trainer_id:total_lines:self.num_trainers]
            assert len(lines) == per_node_lines

            logger.info("trainerid %d, trainer_count %d" %
                        (self.trainer_id, self.num_trainers))
            logger.info(
                "read images from %d, length: %d, lines length: %d, total: %d"
                % (self.trainer_id * per_node_lines, per_node_lines,
                   len(lines), len(full_lines)))

            # NOTE(review): `tf` is only referenced locally and
            # NamedTemporaryFile removes the file once the object is closed or
            # collected; this relies on DALI having consumed the list before
            # this function returns - confirm.
            tf = tempfile.NamedTemporaryFile()
            tf.write(str.encode(''.join(lines)))
            tf.flush()
            video_files = tf.name

            device_id = int(os.getenv('FLAGS_selected_gpus', 0))
            print('---------- device id -----------', device_id)

            # Both pipelines take the same arguments; only the class differs.
            is_training = self.mode == 'train'
            pipe_cls = VideoPipe if is_training else VideoTestPipe
            pipe = pipe_cls(batch_size=self.batch_size,
                            num_threads=1,
                            device_id=device_id,
                            file_list=video_files,
                            sequence_length=self.seg_num * self.seglen,
                            seg_num=self.seg_num,
                            seg_length=self.seglen,
                            resize_shorter_scale=self.short_size,
                            crop_target_size=self.target_size,
                            is_training=is_training,
                            dali_mean=self.dali_mean,
                            dali_std=self.dali_std)
            logger.info(
                'initializing dataset, it will take several minutes if it is too large .... '
            )
            video_loader = DALIGenericIterator([pipe], ['image', 'label'],
                                               len(lines),
                                               dynamic_shape=True,
                                               auto_reset=True)

            return video_loader
Beispiel #8
0
def build(settings,
          mode='train',
          trainer_id=None,
          trainers_num=None,
          gpu_id=0,
          data_layout='NCHW'):
    """Assemble the DALI iterator described by ``settings``.

    Args:
        settings: Object exposing ``use_gpu``, ``use_mixup``, ``data_dir``,
            ``batch_size``, ``image_mean``, ``image_std``, ``image_shape``
            (comma-separated string), ``resize_short_size``, ``lower_scale``,
            ``lower_ratio``, ``upper_ratio`` and ``interpolation``.
        mode: ``'train'`` yields training pipelines, anything else a single
            validation pipeline on ``gpu_id``.
        trainer_id: Shard index; together with ``trainers_num`` selects the
            one-pipeline-per-process path.
        trainers_num: Total number of shards.
        gpu_id: Device used by the validation and per-process pipelines.
        data_layout: Forwarded to the pipelines unchanged.

    Returns:
        A ``DALIGenericIterator`` with outputs ``feed_image`` and
        ``feed_label``.
    """
    env = os.environ
    assert settings.use_gpu, "gpu training is required for DALI"
    assert not settings.use_mixup, "mixup is not supported by DALI reader"
    # The auto-augment (settings.use_aa) check is intentionally disabled here.
    assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, (
        "Please leave enough GPU memory for DALI workspace, e.g., by setting"
        " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`")

    file_root = settings.data_dir
    batch_size = settings.batch_size
    print("batch_size:", batch_size)

    mean = [channel * 255 for channel in settings.image_mean]
    std = [channel * 255 for channel in settings.image_std]
    image_shape = list(map(int, settings.image_shape.split(",")))
    crop = image_shape[1]
    resize_shorter = settings.resize_short_size
    min_area = settings.lower_scale
    lower, upper = settings.lower_ratio, settings.upper_ratio

    interp_map = {
        0: types.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.INTERP_CUBIC,  # cv2.INTER_CUBIC
        4: types.INTERP_LANCZOS3,  # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
    }
    interp = settings.interpolation or 1  # default to linear
    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]

    output_names = ['feed_image', 'feed_label']

    if mode != 'train':
        # Prefer an explicit list file; otherwise read the 'val' directory.
        file_list = os.path.join(file_root, 'val_list.txt')
        if not os.path.exists(file_list):
            file_list = None
            file_root = os.path.join(file_root, 'val')
        val_pipe = HybridValPipe(file_root, file_list, batch_size,
                                 resize_shorter, crop, interp, mean, std,
                                 device_id=gpu_id,
                                 data_layout=data_layout)
        val_pipe.build()
        return DALIGenericIterator(val_pipe, output_names,
                                   size=len(val_pipe),
                                   dynamic_shape=True,
                                   fill_last_batch=False,
                                   last_batch_padded=True)

    # Prefer an explicit list file; otherwise read the 'train' directory.
    file_list = os.path.join(file_root, 'train_list.txt')
    if not os.path.exists(file_list):
        file_list = None
        file_root = os.path.join(file_root, 'train')

    # Leading positional arguments and options shared by training pipelines.
    shared_args = (file_root, file_list, batch_size, resize_shorter, crop,
                   min_area, lower, upper, interp, mean, std)
    train_options = dict(data_layout=data_layout, num_threads=4)

    if trainer_id is not None and trainers_num is not None:
        print("dali gpu_id:", gpu_id, "shard_id:", trainer_id, "num_shard:",
              trainers_num)
        shard_id, num_shards = trainer_id, trainers_num
        train_pipe = HybridTrainPipe(*shared_args,
                                     device_id=gpu_id,
                                     shard_id=shard_id,
                                     num_shards=num_shards,
                                     seed=42 + shard_id,
                                     **train_options)
        train_pipe.build()
        pipelines = [train_pipe]
        sample_per_shard = len(train_pipe) // num_shards
    else:
        # Single process: one pipeline (and shard) per visible CUDA place.
        places = fluid.framework.cuda_places()
        num_shards = len(places)
        pipelines = []
        for idx, cuda_place in enumerate(places):
            wrapped = fluid.core.Place()
            wrapped.set_place(cuda_place)
            train_pipe = HybridTrainPipe(*shared_args,
                                         wrapped.gpu_device_id(),
                                         idx,
                                         num_shards,
                                         seed=42 + idx,
                                         **train_options)
            train_pipe.build()
            pipelines.append(train_pipe)
        sample_per_shard = len(pipelines[0])

    return DALIGenericIterator(pipelines, output_names, size=sample_per_shard)
        def reader_():
            """Build a DALI video loader over this shard's share of the list.

            Reads ``self.file_path``, shuffles it with a seed stored on the
            function object (incremented on every call), pads the list to a
            multiple of ``self.num_shards`` and keeps every ``num_shards``-th
            line starting at ``self.shard_id``.  The selected lines are
            written to a temporary file that is handed to ``VideoPipe``.

            Returns:
                A ``DALIGenericIterator`` with outputs ``image`` and
                ``label``, created with ``auto_reset=True``.
            """
            with open(self.file_path) as flist:
                full_lines = list(flist)

            if not hasattr(reader_, 'seed'):
                reader_.seed = 0
            random.Random(reader_.seed).shuffle(full_lines)
            logger.info(f"reader shuffle seed: {reader_.seed}.")
            if reader_.seed is not None:
                reader_.seed += 1

            per_node_lines = int(
                math.ceil(len(full_lines) * 1.0 / self.num_shards))
            total_lines = per_node_lines * self.num_shards

            # Pad by wrapping around so the list divides evenly.  A single
            # slice falls short when fewer lines exist than are missing
            # (e.g. 1 line, 4 shards), so repeat until the target is met.
            while len(full_lines) < total_lines:
                full_lines += full_lines[:total_lines - len(full_lines)]
            assert len(full_lines) == total_lines

            # trainer get own sample
            lines = full_lines[self.shard_id:total_lines:self.num_shards]
            assert len(lines) == per_node_lines

            logger.info(
                f"shard_id: {self.shard_id}, trainer_count: {self.num_shards}"
            )
            logger.info(
                f"read videos from {self.shard_id * per_node_lines}, "
                f"length: {per_node_lines}, "
                f"lines length: {len(lines)}, "
                f"total: {len(full_lines)}")

            # NOTE(review): `tf` is only referenced locally and
            # NamedTemporaryFile removes the file once the object is closed or
            # collected; this relies on DALI having consumed the list before
            # this function returns - confirm.
            tf = tempfile.NamedTemporaryFile()
            tf.write(str.encode(''.join(lines)))
            tf.flush()
            video_files = tf.name

            device_id = ParallelEnv().local_rank
            logger.info(f'---------- device_id: {device_id} -----------')

            # NOTE(review): `lines` is already this shard's slice, yet
            # num_shards/shard_id are also passed on - confirm VideoPipe does
            # not shard the list a second time.
            pipe = VideoPipe(batch_size=self.batch_size,
                             num_threads=1,
                             device_id=device_id,
                             file_list=video_files,
                             sequence_length=self.seg_num * self.seglen,
                             seg_num=self.seg_num,
                             seg_length=self.seglen,
                             resize_shorter_scale=self.short_size,
                             crop_target_size=self.target_size,
                             is_training=True,
                             num_shards=self.num_shards,
                             shard_id=self.shard_id,
                             dali_mean=self.dali_mean,
                             dali_std=self.dali_std)

            logger.info(
                'initializing dataset, it will take several minutes if it is too large .... '
            )
            video_loader = DALIGenericIterator([pipe], ['image', 'label'],
                                               len(lines),
                                               dynamic_shape=True,
                                               auto_reset=True)

            return video_loader
Beispiel #10
0
def _dali_launcher_shard(env, fallback_device_id):
    """Read the sharding info exported by the distributed launcher, if any.

    Args:
        env: Mapping of environment variables (normally ``os.environ``).
        fallback_device_id: Device index to use when ``FLAGS_selected_gpus``
            is not present in ``env``.

    Returns:
        ``(shard_id, num_shards, device_id)`` when both ``PADDLE_TRAINER_ID``
        and ``PADDLE_TRAINERS_NUM`` are set, otherwise ``None``.
    """
    if 'PADDLE_TRAINER_ID' not in env or 'PADDLE_TRAINERS_NUM' not in env:
        return None
    shard_id = int(env['PADDLE_TRAINER_ID'])
    num_shards = int(env['PADDLE_TRAINERS_NUM'])
    # FLAGS_selected_gpus wins when set; otherwise keep the device index that
    # was parsed from the caller's `device` string instead of raising KeyError.
    device_id = int(env.get('FLAGS_selected_gpus', fallback_device_id))
    return shard_id, num_shards, device_id


def dali_dataloader(config, mode, device, seed=None):
    """Build a DALI iterator yielding ``data``/``label`` batches.

    Args:
        config: Mapping indexed by ``mode``. ``config[mode]`` must provide
            ``["dataset"]`` with ``transform_ops`` (a list of single-key
            dicts), ``image_root`` and ``cls_label_path``, and ``["sampler"]``
            with ``batch_size`` and optionally ``name``.
        mode: Key into ``config``. When ``mode.lower() == "train"`` the
            training pipeline is built, otherwise the evaluation pipeline.
        device: Device string containing ``"gpu"`` and a ``:<index>`` suffix,
            e.g. ``"gpu:0"``.
        seed: Base random seed for the training pipeline; defaults to 42.
            In distributed training the shard id is added to it.

    Returns:
        A ``DALIGenericIterator`` over one built pipeline, with outputs named
        ``data`` and ``label`` and sized by the reader named ``Reader``.

    Raises:
        AssertionError: If ``device`` is not a GPU device, the configured
            transform ops are not exactly the supported set for the mode, or
            the sampler name is not supported.
    """
    assert "gpu" in device, "gpu training is required for DALI"
    device_id = int(device.split(':')[1])
    config_dataloader = config[mode]
    seed = 42 if seed is None else seed
    is_train = mode.lower() == 'train'

    transform_ops = config_dataloader["dataset"]["transform_ops"]
    ops = [list(x.keys())[0] for x in transform_ops]
    if is_train:
        dataset_label = "train_dataset"
        supported_ops = [
            "DecodeImage", "NormalizeImage", "RandFlipImage", "RandCropImage"
        ]
    else:
        dataset_label = "eval_dataset"
        supported_ops = [
            "DecodeImage", "ResizeImage", "CropImage", "NormalizeImage"
        ]
    # The configured ops must match the supported set exactly (order-free).
    assert set(ops) == set(supported_ops), (
        "The supported transform_ops for {} in dali is : {}".format(
            dataset_label, ",".join(supported_ops)))

    # First NormalizeImage entry drives the channel count and output dtype.
    normalize_ops = next(op["NormalizeImage"] for op in transform_ops
                         if "NormalizeImage" in op)
    channel_num = normalize_ops.get("channel_num", 3)
    output_dtype = (types.FLOAT16
                    if normalize_ops.get("output_fp16", False) else
                    types.FLOAT)

    env = os.environ
    # NOTE: a former guard requiring FLAGS_fraction_of_gpu_memory_to_use < 0.9
    # (to leave GPU memory for the DALI workspace) is intentionally disabled.

    batch_size = config_dataloader["sampler"]["batch_size"]

    file_root = config_dataloader["dataset"]["image_root"]
    file_list = config_dataloader["dataset"]["cls_label_path"]

    interp = 1  # settings.interpolation or 1  # default to linear
    interp_map = {
        0: types.DALIInterpType.INTERP_NN,  # cv2.INTER_NEAREST
        1: types.DALIInterpType.INTERP_LINEAR,  # cv2.INTER_LINEAR
        2: types.DALIInterpType.INTERP_CUBIC,  # cv2.INTER_CUBIC
        # XXX use LANCZOS3 for cv2.INTER_LANCZOS4
        3: types.DALIInterpType.INTERP_LANCZOS3,
    }

    assert interp in interp_map, "interpolation method not supported by DALI"
    interp = interp_map[interp]
    pad_output = channel_num == 4

    # Flatten the list of single-op dicts into one {op_name: op_config} map.
    transforms = {k: v for d in transform_ops for k, v in d.items()}

    normalize_cfg = transforms["NormalizeImage"]
    norm_scale = normalize_cfg.get("scale", 1.0 / 255)
    # NOTE(security): a string scale such as "1.0/255" is evaluated with
    # eval(), which executes arbitrary code. Only load trusted config files.
    norm_scale = eval(norm_scale) if isinstance(norm_scale,
                                                str) else norm_scale
    mean = normalize_cfg.get("mean", [0.485, 0.456, 0.406])
    std = normalize_cfg.get("std", [0.229, 0.224, 0.225])
    # Divide out the scale factor; presumably the pipelines normalize
    # un-scaled pixel values -- confirm against HybridTrainPipe/HybridValPipe.
    mean = [v / norm_scale for v in mean]
    std = [v / norm_scale for v in std]

    sampler_name = config_dataloader["sampler"].get("name",
                                                    "DistributedBatchSampler")
    assert sampler_name in ["DistributedBatchSampler", "BatchSampler"]

    if is_train:
        resize_shorter = 256
        crop_cfg = transforms["RandCropImage"]
        crop = crop_cfg["size"]
        crop_scale = crop_cfg.get("scale", [0.08, 1.])
        ratio = crop_cfg.get("ratio", [3.0 / 4, 4.0 / 3])
        min_area = crop_scale[0]
        lower = ratio[0]
        upper = ratio[1]

        launcher_shard = _dali_launcher_shard(env, device_id)
        if launcher_shard is not None:
            shard_id, num_shards, device_id = launcher_shard
            # Offset the seed per shard so each trainer gets its own stream.
            seed = seed + shard_id
        else:
            shard_id, num_shards = 0, 1

        pipe = HybridTrainPipe(file_root,
                               file_list,
                               batch_size,
                               resize_shorter,
                               crop,
                               min_area,
                               lower,
                               upper,
                               interp,
                               mean,
                               std,
                               device_id=device_id,
                               shard_id=shard_id,
                               num_shards=num_shards,
                               seed=seed,
                               pad_output=pad_output,
                               output_dtype=output_dtype)
    else:
        resize_shorter = transforms["ResizeImage"].get("resize_short", 256)
        crop = transforms["CropImage"]["size"]

        # Evaluation is sharded only when the distributed sampler is selected;
        # otherwise HybridValPipe's own shard defaults apply.
        shard_kwargs = {}
        if sampler_name == "DistributedBatchSampler":
            launcher_shard = _dali_launcher_shard(env, device_id)
            if launcher_shard is not None:
                shard_id, num_shards, device_id = launcher_shard
                shard_kwargs = {
                    "shard_id": shard_id,
                    "num_shards": num_shards,
                }

        pipe = HybridValPipe(file_root,
                             file_list,
                             batch_size,
                             resize_shorter,
                             crop,
                             interp,
                             mean,
                             std,
                             device_id=device_id,
                             pad_output=pad_output,
                             output_dtype=output_dtype,
                             **shard_kwargs)

    pipe.build()
    return DALIGenericIterator([pipe], ['data', 'label'],
                               reader_name='Reader')