Example #1
def create_dataset(args, data_path, batch_size):
    if args.mode == 'train' and args.use_kungfu:
        rank = kfops.kungfu_current_rank()
        size = kfops.kungfu_current_cluster_size()
        ds = de.Cifar10Dataset(
            data_path,
            num_parallel_workers=8,
            shuffle=False,
            num_shards=size,
            shard_id=rank,
        )
        print('using shard %d of %d' % (rank, size))
    else:
        ds = de.Cifar10Dataset(
            data_path,
            num_parallel_workers=8,
            shuffle=False,
        )

    # define map operations
    trans = []
    # if do_train:
    #     trans += [
    #         # C.RandomCrop((32, 32), (4, 4, 4, 4)),
    #         # C.RandomHorizontalFlip(prob=0.5)
    #     ]

    trans += [
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    return ds
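
A minimal usage sketch for the sharded loader above. The only attributes the function reads from args are mode and use_kungfu; the data path and batch size below are placeholders.

from argparse import Namespace

args = Namespace(mode='train', use_kungfu=False)  # single-process run, no sharding
train_ds = create_dataset(args, data_path='./cifar-10-batches-bin', batch_size=32)
print('batches per epoch:', train_ds.get_dataset_size())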
Example #2
def create_dataset(repeat_num=1,
                   training=True,
                   batch_size=32,
                   num_samples=1600):
    data_home = "/home/workspace/mindspore_dataset"
    data_dir = data_home + "/cifar-10-batches-bin"
    if not training:
        data_dir = data_home + "/cifar-10-verify-bin"
    data_set = ds.Cifar10Dataset(data_dir, num_samples=num_samples)

    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = vision.RandomCrop(
        (32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = vision.RandomHorizontalFlip()
    # interpolation default BILINEAR
    resize_op = vision.Resize((resize_height, resize_width))
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914),
                                    (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=1000)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
Example #3
def create_dataset(repeat_num=1, training=True):
    """
    create data for later use, such as training or inference
    """
    cifar_ds = ds.Cifar10Dataset(data_home)

    if args_opt.run_distribute:
        rank_id = int(os.getenv('RANK_ID'))
        rank_size = int(os.getenv('RANK_SIZE'))
        cifar_ds = ds.Cifar10Dataset(data_home, num_shards=rank_size, shard_id=rank_id)

    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4)) # padding_mode default CONSTANT
    random_horizontal_op = C.RandomHorizontalFlip()
    resize_op = C.Resize((resize_height, resize_width)) # interpolation default BILINEAR
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    changeswap_op = C.HWC2CHW()
    type_cast_op = C2.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op,
                changeswap_op]

    # apply map operations on images
    cifar_ds = cifar_ds.map(operations=type_cast_op, input_columns="label")
    cifar_ds = cifar_ds.map(operations=c_trans, input_columns="image")

    # apply shuffle operations
    cifar_ds = cifar_ds.shuffle(buffer_size=10)

    # apply batch operations
    cifar_ds = cifar_ds.batch(batch_size=args_opt.batch_size, drop_remainder=True)

    # apply repeat operations
    cifar_ds = cifar_ds.repeat(repeat_num)

    return cifar_ds
Example #4
def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        rank (int): The shard ID within num_shards (default=None).
        group_size (int): Number of shards that the dataset should be divided into (default=None).
        repeat_num(int): the repeat times of dataset. Default: 1.

    Returns:
        dataset
    """
    if group_size == 1:
        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
    else:
        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
                                         num_shards=group_size, shard_id=rank)
    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(299),
            C.CenterCrop(299)
        ]
    trans += [
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        C.HWC2CHW()
    ]
    type_cast_op = C2.TypeCast(mstype.int32)
    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
    # apply batch operations
    data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)
    return data_set
Example #5
def dataset_cifar(dataset_path=None,
                  batch_size=32,
                  repeat_num=1,
                  num_rows=9600,
                  distribution_num=None,
                  shard_id=None,
                  drop_remainder=True,
                  usage=None,
                  shuffle=False,
                  num_workers=8,
                  resize_size=32,
                  pad_info=None):
    if dataset_path is None:
        dataset_path = DATA_DIR

    ds = de.Cifar10Dataset(dataset_path,
                           num_samples=num_rows,
                           num_shards=distribution_num,
                           shard_id=shard_id,
                           shuffle=shuffle,
                           usage=usage,
                           num_parallel_workers=num_workers)

    typecast_op = c_trans.TypeCast(mstype.int32)
    ds = ds.map(input_columns="label",
                operations=typecast_op,
                num_parallel_workers=num_workers)

    image_op_list = [
        c_vision.Resize(resize_size),
        c_vision.Rescale(1.0 / 255.0, 0.0),
        c_vision.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        c_vision.HWC2CHW()
    ]
    ds = ds.map(input_columns="image",
                operations=image_op_list,
                num_parallel_workers=num_workers)

    ds = ds.batch(batch_size,
                  drop_remainder=drop_remainder,
                  num_parallel_workers=num_workers,
                  pad_info=pad_info)
    ds = ds.repeat(repeat_num)

    return ds
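
A hedged usage sketch for dataset_cifar above, building a small evaluation split; the path is a placeholder and the usage value depends on the MindSpore version in use.

eval_ds = dataset_cifar(dataset_path='./cifar-10-verify-bin',
                        batch_size=64,
                        usage='test',
                        shuffle=False)
for batch in eval_ds.create_dict_iterator(num_epochs=1, output_numpy=True):
    # with the default resize_size=32 and HWC2CHW, each batch should be (64, 3, 32, 32)
    print(batch['image'].shape, batch['label'].shape)
    break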
Example #6
def create_dataset(data_path,
                   repeat_num=1,
                   batch_size=32,
                   rank_id=0,
                   rank_size=1):
    """create dataset"""
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # get rank_id and rank_size
    rank_id = get_rank()
    rank_size = get_group_size()
    data_set = ds.Cifar10Dataset(data_path,
                                 num_shards=rank_size,
                                 shard_id=rank_id)

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914),
                                    (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=10)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
Example #7
def create_dataset(dataset_path, config, repeat_num=1, batch_size=32):
    """
    create a train dataset

    Args:
        dataset_path(string): the path of dataset.
        config(EasyDict):the basic config for training
        repeat_num(int): the repeat times of dataset. Default: 1.
        batch_size(int): the batch size of dataset. Default: 32.

    Returns:
        dataset
    """

    load_func = partial(ds.Cifar10Dataset, dataset_path)
    cifar_ds = load_func(num_parallel_workers=8, shuffle=False)

    resize_height = config.image_height
    resize_width = config.image_width
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    # interpolation default BILINEAR
    resize_op = C.Resize((resize_height, resize_width))
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize((0.4914, 0.4822, 0.4465),
                               (0.2023, 0.1994, 0.2010))
    changeswap_op = C.HWC2CHW()
    type_cast_op = C2.TypeCast(mstype.int32)

    c_trans = [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    cifar_ds = cifar_ds.map(input_columns="label", operations=type_cast_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=c_trans)

    # apply batch operations
    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    cifar_ds = cifar_ds.repeat(repeat_num)

    return cifar_ds
Example #8
def test_cutmix_batch_success2(plot=False):
    """
    Test CutMixBatch op with default values for alpha and prob on a batch of rescaled HWC images
    """
    logger.info("test_cutmix_batch_success2")

    # Original Images
    ds_original = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    ds_original = ds_original.batch(5, drop_remainder=True)

    images_original = None
    for idx, (image, _) in enumerate(ds_original):
        if idx == 0:
            images_original = image.asnumpy()
        else:
            images_original = np.append(images_original,
                                        image.asnumpy(),
                                        axis=0)

    # CutMix Images
    data1 = ds.Cifar10Dataset(DATA_DIR, num_samples=10, shuffle=False)
    one_hot_op = data_trans.OneHot(num_classes=10)
    data1 = data1.map(operations=one_hot_op, input_columns=["label"])
    rescale_op = vision.Rescale((1.0 / 255.0), 0.0)
    data1 = data1.map(operations=rescale_op, input_columns=["image"])
    cutmix_batch_op = vision.CutMixBatch(mode.ImageBatchFormat.NHWC)
    data1 = data1.batch(5, drop_remainder=True)
    data1 = data1.map(operations=cutmix_batch_op,
                      input_columns=["image", "label"])

    images_cutmix = None
    for idx, (image, _) in enumerate(data1):
        if idx == 0:
            images_cutmix = image.asnumpy()
        else:
            images_cutmix = np.append(images_cutmix, image.asnumpy(), axis=0)
    if plot:
        visualize_list(images_original, images_cutmix)

    num_samples = images_original.shape[0]
    mse = np.zeros(num_samples)
    for i in range(num_samples):
        mse[i] = diff_mse(images_cutmix[i], images_original[i])
    logger.info("MSE= {}".format(str(np.mean(mse))))
Example #9
def create_cifar10_dataset(cifar_dir, num_parallel_workers=1):
    """
    Create the cifar10 dataset.
    """
    ds = de.Cifar10Dataset(cifar_dir)

    training = True
    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0
    repeat_num = 10
    batch_size = 32

    # define map operations
    random_crop_op = vision.RandomCrop(
        (32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize(
        (resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = vision.Rescale(rescale, shift)
    normalize_op = vision.Normalize((0.4465, 0.4822, 0.4914),
                                    (0.2010, 0.1994, 0.2023))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = []  # start empty so the eval path (training=False) still works
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    ds = ds.map(operations=type_cast_op, input_columns="label")
    ds = ds.map(operations=c_trans, input_columns="image")

    # apply repeat operations
    ds = ds.repeat(repeat_num)

    # apply shuffle operations
    ds = ds.shuffle(buffer_size=10)

    # apply batch operations
    ds = ds.batch(batch_size=batch_size, drop_remainder=True)

    return ds
Example #10
def test_one_hot_post_aug():
    """
    Test One Hot Encoding after Multiple Data Augmentation Operators
    """
    logger.info("test_one_hot_post_aug")
    data1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, shuffle=False)

    # Define data augmentation parameters
    rescale = 1.0 / 255.0
    shift = 0.0
    resize_height, resize_width = 224, 224

    # Define map operations
    decode_op = c_vision.Decode()
    rescale_op = c_vision.Rescale(rescale, shift)
    resize_op = c_vision.Resize((resize_height, resize_width))

    # Apply map operations on images
    data1 = data1.map(operations=decode_op, input_columns=["image"])
    data1 = data1.map(operations=rescale_op, input_columns=["image"])
    data1 = data1.map(operations=resize_op, input_columns=["image"])

    # Apply one-hot encoding on labels
    depth = 4
    one_hot_encode = data_trans.OneHot(depth)
    data1 = data1.map(operations=one_hot_encode, input_columns=["label"])

    # Apply datasets ops
    buffer_size = 100
    seed = 10
    batch_size = 2
    ds.config.set_seed(seed)
    data1 = data1.shuffle(buffer_size=buffer_size)
    data1 = data1.batch(batch_size, drop_remainder=True)

    num_iter = 0
    for item in data1.create_dict_iterator(num_epochs=1):
        logger.info("image is: {}".format(item["image"]))
        logger.info("label is: {}".format(item["label"]))
        num_iter += 1

    assert num_iter == 1
Example #11
def generate_mnist_dataset(data_path, batch_size=32, repeat_size=1,
                           num_samples=None, num_parallel_workers=1, sparse=True):
    """
    create dataset for training or testing
    """
    # define dataset
    ds1 = ds.MnistDataset(data_path, num_samples=num_samples)

    # define operation parameters
    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    resize_op = CV.Resize((resize_height, resize_width),
                          interpolation=Inter.LINEAR)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    if not sparse:
        one_hot_enco = C.OneHot(10)
        ds1 = ds1.map(input_columns="label", operations=one_hot_enco,
                      num_parallel_workers=num_parallel_workers)
        type_cast_op = C.TypeCast(mstype.float32)
    ds1 = ds1.map(input_columns="label", operations=type_cast_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image", operations=resize_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image", operations=rescale_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image", operations=hwc2chw_op,
                  num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    ds1 = ds1.shuffle(buffer_size=buffer_size)
    ds1 = ds1.batch(batch_size, drop_remainder=True)
    ds1 = ds1.repeat(repeat_size)

    return ds1
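
A sketch of the label formats produced by generate_mnist_dataset above (the data path is assumed): with sparse=True labels stay int32 class indices, while sparse=False yields float32 one-hot vectors of depth 10.

dense_ds = generate_mnist_dataset('./MNIST/train', batch_size=8, sparse=False)
for item in dense_ds.create_dict_iterator(num_epochs=1, output_numpy=True):
    print(item['label'].shape, item['label'].dtype)  # expected: (8, 10) float32
    break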
Example #12
def test_rescale_with_random_posterize():
    """
    Test RandomPosterize: only CV_8S/CV_8U inputs are supported
    """
    logger.info("test_rescale_with_random_posterize")

    DATA_DIR_10 = "../data/dataset/testCifar10Data"
    dataset = ds.Cifar10Dataset(DATA_DIR_10)

    rescale_op = c_vision.Rescale((1.0 / 255.0), 0.0)
    dataset = dataset.map(operations=rescale_op, input_columns=["image"])

    random_posterize_op = c_vision.RandomPosterize((4, 8))
    dataset = dataset.map(operations=random_posterize_op, input_columns=["image"], num_parallel_workers=1)

    try:
        _ = dataset.output_shapes()
    except RuntimeError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "input image data type can not be float" in str(e)
Example #13
def create_dataset(data_dir, p=16, k=8):
    """
    create a train or eval dataset

    Args:
        data_dir(string): the path of dataset.
        p(int): randomly choose p classes from all classes.
        k(int): randomly choose k images from each of the chosen p classes.
                p * k is the batchsize.

    Returns:
        dataset
    """
    dataset = MGDataset(data_dir)
    sampler = DistributedPKSampler(dataset, p=p, k=k)
    de_dataset = de.GeneratorDataset(dataset, ["image", "label1", "label2"], sampler=sampler)

    resize_height = config.image_height
    resize_width = config.image_width
    rescale = 1.0 / 255.0
    shift = 0.0

    resize_op = CV.Resize((resize_height, resize_width))
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize([0.486, 0.459, 0.408], [0.229, 0.224, 0.225])

    change_swap_op = CV.HWC2CHW()

    trans = []

    trans += [resize_op, rescale_op, normalize_op, change_swap_op]

    type_cast_op_label1 = C.TypeCast(mstype.int32)
    type_cast_op_label2 = C.TypeCast(mstype.float32)

    de_dataset = de_dataset.map(input_columns="label1", operations=type_cast_op_label1)
    de_dataset = de_dataset.map(input_columns="label2", operations=type_cast_op_label2)
    de_dataset = de_dataset.map(input_columns="image", operations=trans)
    de_dataset = de_dataset.batch(p*k, drop_remainder=True)

    return de_dataset
Example #14
def test_serdes_cifar10_dataset(remove_json_files=True):
    """
    Test serdes on Cifar10 dataset pipeline
    """
    data_dir = "../data/dataset/testCifar10Data"
    original_seed = config_get_set_seed(1)
    original_num_parallel_workers = config_get_set_num_parallel_workers(1)

    data1 = ds.Cifar10Dataset(data_dir, num_samples=10, shuffle=False)
    data1 = data1.take(6)

    trans = [
        vision.RandomCrop((32, 32), (4, 4, 4, 4)),
        vision.Resize((224, 224)),
        vision.Rescale(1.0 / 255.0, 0.0),
        vision.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        vision.HWC2CHW()
    ]

    type_cast_op = c.TypeCast(mstype.int32)
    data1 = data1.map(operations=type_cast_op, input_columns="label")
    data1 = data1.map(operations=trans, input_columns="image")
    data1 = data1.batch(3, drop_remainder=True)
    data1 = data1.repeat(1)
    data2 = util_check_serialize_deserialize_file(data1,
                                                  "cifar10_dataset_pipeline",
                                                  remove_json_files)

    num_samples = 0
    # Iterate and compare the data in the original pipeline (data1) against the deserialized pipeline (data2)
    for item1, item2 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item2['image'])
        num_samples += 1

    assert num_samples == 2

    # Restore configuration num_parallel_workers
    ds.config.set_seed(original_seed)
    ds.config.set_num_parallel_workers(original_num_parallel_workers)
Example #15
def create_dataset(dataset_path,
                   batch_size=1,
                   num_shards=1,
                   shard_id=0,
                   device_target='Ascend'):
    """
     create train or evaluation dataset for warpctc

     Args:
        dataset_path(str): dataset path
        batch_size(int): batch size of generated dataset, default is 1
        num_shards(int): number of devices
        shard_id(int): rank id
        device_target(str): platform of training, support Ascend and GPU
     """

    dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits,
                              device_target)
    ds = de.GeneratorDataset(dataset, ["image", "label"],
                             shuffle=True,
                             num_shards=num_shards,
                             shard_id=shard_id)
    image_trans = [
        vc.Rescale(1.0 / 255.0, 0.0),
        vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
        vc.Resize((m.ceil(cf.captcha_height / 16) * 16, cf.captcha_width)),
        c.TypeCast(mstype.float16)
    ]
    label_trans = [c.TypeCast(mstype.int32)]
    ds = ds.map(operations=image_trans,
                input_columns=["image"],
                num_parallel_workers=8)
    ds = ds.map(operations=transpose_hwc2whc,
                input_columns=["image"],
                num_parallel_workers=8)
    ds = ds.map(operations=label_trans,
                input_columns=["label"],
                num_parallel_workers=8)

    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Example #16
def test_rescale_md5():
    """
    Test Rescale with md5 comparison
    """
    logger.info("Test Rescale with md5 comparison")

    # generate dataset
    data = ds.TFRecordDataset(DATA_DIR,
                              SCHEMA_DIR,
                              columns_list=["image"],
                              shuffle=False)
    decode_op = vision.Decode()
    rescale_op = vision.Rescale(1.0 / 255.0, -1.0)

    # apply map operations on images
    data = data.map(operations=decode_op, input_columns=["image"])
    data = data.map(operations=rescale_op, input_columns=["image"])

    # check results with md5 comparison
    filename = "rescale_01_result.npz"
    save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)
Example #17
def create_dataset_cifar10(data_home, repeat_num=1, training=True, cifar_cfg=None):
    """Data operations."""
    data_dir = os.path.join(data_home, "cifar-10-batches-bin")
    if not training:
        data_dir = os.path.join(data_home, "cifar-10-verify-bin")

    rank_size, rank_id = _get_rank_info()
    if training:
        data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id, shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id, shuffle=False)

    resize_height = cifar_cfg.image_height
    resize_width = cifar_cfg.image_width

    # define map operations
    random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = vision.RandomHorizontalFlip()
    resize_op = vision.Resize((resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = vision.Rescale(1.0 / 255.0, 0.0)
    normalize_op = vision.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    changeswap_op = vision.HWC2CHW()
    type_cast_op = c_transforms.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(operations=type_cast_op, input_columns="label")
    data_set = data_set.map(operations=c_trans, input_columns="image")

    # apply batch operations
    data_set = data_set.batch(batch_size=cifar_cfg.batch_size, drop_remainder=True)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    return data_set
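
A hedged usage sketch for create_dataset_cifar10 above: the config only needs image_height, image_width and batch_size here, so a small EasyDict can stand in (values and path are placeholders, and _get_rank_info is assumed to be importable from the source module).

from easydict import EasyDict

cfg = EasyDict({'image_height': 224, 'image_width': 224, 'batch_size': 32})
eval_set = create_dataset_cifar10('./cifar10', training=False, cifar_cfg=cfg)
print('eval batches:', eval_set.get_dataset_size())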
Example #18
def create_dataset(data_path,
                   batch_size=32,
                   repeat_size=1,
                   num_parallel_workers=1):
    """Create dataset for train or test."""
    # define dataset
    mnist_ds = dataset.MnistDataset(data_path)

    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    resize_op = transforms.Resize((resize_height, resize_width),
                                  interpolation=Inter.LINEAR)  # Bilinear mode
    rescale_op = transforms.Rescale(rescale, shift)
    hwc2chw_op = transforms.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    mnist_ds = mnist_ds.map(operations=type_cast_op,
                            input_columns="label",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=resize_op,
                            input_columns="image",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_op,
                            input_columns="image",
                            num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=hwc2chw_op,
                            input_columns="image",
                            num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)

    return mnist_ds
Example #19
def create_dataset(indices, repeat_num=1, training=True):
    """
    create data for later use, such as training or inference
    """
    cifar_ds = ds.Cifar10Dataset(data_home,
                                 sampler=FixedIndeciesSampler(indices))

    resize_height = 224
    resize_width = 224
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop(
        (32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = C.RandomHorizontalFlip()
    resize_op = C.Resize(
        (resize_height, resize_width))  # interpolation default BILINEAR
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize((0.4914, 0.4822, 0.4465),
                               (0.2023, 0.1994, 0.2010))
    changeswap_op = C.HWC2CHW()
    type_cast_op = C2.TypeCast(mstype.int32)

    c_trans = []
    # if training:
    #     c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    cifar_ds = cifar_ds.map(operations=type_cast_op, input_columns="label")
    cifar_ds = cifar_ds.map(operations=c_trans, input_columns="image")

    # apply batch operations
    cifar_ds = cifar_ds.batch(batch_size=batch_size, drop_remainder=True)

    # apply repeat operations
    cifar_ds = cifar_ds.repeat(repeat_num)

    return cifar_ds
Example #20
def create_dataset(data_dir,
                   training=True,
                   batch_size=32,
                   resize=(32, 32),
                   rescale=1 / (255 * 0.3081),
                   shift=-0.1307 / 0.3081,
                   buffer_size=64):
    data_train = os.path.join(data_dir, 'train')  # train set
    data_test = os.path.join(data_dir, 'test')  # test set
    ds = ms.dataset.MnistDataset(data_train if training else data_test)

    ds = ds.map(input_columns=["image"],
                operations=[
                    CV.Resize(resize),
                    CV.Rescale(rescale, shift),
                    CV.HWC2CHW()
                ])
    ds = ds.map(input_columns=["label"], operations=C.TypeCast(ms.int32))
    ds = ds.shuffle(buffer_size=buffer_size).batch(batch_size,
                                                   drop_remainder=True)

    return ds
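
The rescale/shift defaults above are not arbitrary: assuming Rescale computes output = input * rescale + shift, they fold the usual MNIST normalization (x / 255 - 0.1307) / 0.3081 into a single op, as the sketch below checks.

import numpy as np

pixels = np.array([0.0, 127.0, 255.0])
folded = pixels * (1 / (255 * 0.3081)) + (-0.1307 / 0.3081)
reference = (pixels / 255.0 - 0.1307) / 0.3081
print(np.allclose(folded, reference))  # True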
Example #21
def create_dataset1(dataset_path,
                    do_train,
                    repeat_num=1,
                    batch_size=32,
                    target="Ascend",
                    distribute=False):
    """
    create a train or evaluation cifar10 dataset for resnet50
    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        if distribute:
            init()
            rank_id = get_rank()
            device_num = get_group_size()
        else:
            device_num = 1
    if device_num == 1:
        data_set = ds.Cifar10Dataset(dataset_path,
                                     num_parallel_workers=8,
                                     shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(dataset_path,
                                     num_parallel_workers=8,
                                     shuffle=True,
                                     num_shards=device_num,
                                     shard_id=rank_id)

    # define map operations
    trans = []
    if do_train:
        trans += [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]

    trans += [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op,
                            input_columns="label",
                            num_parallel_workers=8)
    data_set = data_set.map(operations=trans,
                            input_columns="image",
                            num_parallel_workers=8)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
Example #22
def test_serdes_imagefolder_dataset(remove_json_files=True):
    """
    Test simulating resnet50 dataset pipeline.
    """
    data_dir = "../data/dataset/testPK/data"
    ds.config.set_seed(1)

    # define data augmentation parameters
    rescale = 1.0 / 255.0
    shift = 0.0
    resize_height, resize_width = 224, 224
    weights = [
        1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1
    ]

    # Constructing DE pipeline
    sampler = ds.WeightedRandomSampler(weights, 11)
    child_sampler = ds.SequentialSampler()
    sampler.add_child(child_sampler)
    data1 = ds.ImageFolderDataset(data_dir, sampler=sampler)
    data1 = data1.repeat(1)
    data1 = data1.map(operations=[vision.Decode(True)],
                      input_columns=["image"])
    rescale_op = vision.Rescale(rescale, shift)

    resize_op = vision.Resize((resize_height, resize_width), Inter.LINEAR)
    data1 = data1.map(operations=[rescale_op, resize_op],
                      input_columns=["image"])
    data1 = data1.batch(2)

    # Serialize the dataset pre-processing pipeline.
    # data1 should still work after saving.
    ds.serialize(data1, "imagenet_dataset_pipeline.json")
    ds1_dict = ds.serialize(data1)
    assert validate_jsonfile("imagenet_dataset_pipeline.json") is True

    # Print the serialized pipeline to stdout
    ds.show(data1)

    # Deserialize the serialized json file
    data2 = ds.deserialize(json_filepath="imagenet_dataset_pipeline.json")

    # Serialize the pipeline we just deserialized.
    # The content of the json file should be the same to the previous serialize.
    ds.serialize(data2, "imagenet_dataset_pipeline_1.json")
    assert validate_jsonfile("imagenet_dataset_pipeline_1.json") is True
    assert filecmp.cmp('imagenet_dataset_pipeline.json',
                       'imagenet_dataset_pipeline_1.json')

    # Deserialize the latest json file again
    data3 = ds.deserialize(json_filepath="imagenet_dataset_pipeline_1.json")
    data4 = ds.deserialize(input_dict=ds1_dict)
    num_samples = 0
    # Iterate and compare the data in the original pipeline (data1) against the deserialized pipeline (data2)
    for item1, item2, item3, item4 in zip(
            data1.create_dict_iterator(num_epochs=1, output_numpy=True),
            data2.create_dict_iterator(num_epochs=1, output_numpy=True),
            data3.create_dict_iterator(num_epochs=1, output_numpy=True),
            data4.create_dict_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(item1['image'], item2['image'])
        np.testing.assert_array_equal(item1['image'], item3['image'])
        np.testing.assert_array_equal(item1['label'], item2['label'])
        np.testing.assert_array_equal(item1['label'], item3['label'])
        np.testing.assert_array_equal(item3['image'], item4['image'])
        np.testing.assert_array_equal(item3['label'], item4['label'])
        num_samples += 1

    logger.info("Number of data in data1: {}".format(num_samples))
    assert num_samples == 6

    # Remove the generated json file
    if remove_json_files:
        delete_json_files()
Example #23
def create_dataset(dataset_path, do_train, config, repeat_num=1):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (string): The path of dataset.
        do_train (bool): Whether dataset is used for train or eval.
        config: configuration
        repeat_num (int): The repeat times of dataset. Default: 1.
    Returns:
        Dataset.
    """
    if do_train:
        dataset_path = os.path.join(dataset_path, 'train')
        do_shuffle = True
    else:
        dataset_path = os.path.join(dataset_path, 'eval')
        do_shuffle = False

    device_id = 0
    device_num = 1
    if config.platform == "GPU":
        if config.run_distribute:
            from mindspore.communication.management import get_rank, get_group_size
            device_id = get_rank()
            device_num = get_group_size()
    elif config.platform == "Ascend":
        device_id = int(os.getenv('DEVICE_ID'))
        device_num = int(os.getenv('RANK_SIZE'))

    if device_num == 1 or not do_train:
        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=4, shuffle=do_shuffle)
    else:
        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=4, shuffle=do_shuffle,
                               num_shards=device_num, shard_id=device_id)

    resize_height = config.image_height
    resize_width = config.image_width
    buffer_size = 100
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_flip_op = C.RandomHorizontalFlip(device_id / (device_id + 1))

    resize_op = C.Resize((resize_height, resize_width))
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])

    change_swap_op = C.HWC2CHW()

    trans = []
    if do_train:
        trans += [random_crop_op, random_horizontal_flip_op]

    trans += [resize_op, rescale_op, normalize_op, change_swap_op]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="label", operations=type_cast_op)
    ds = ds.map(input_columns="image", operations=trans)

    # apply shuffle operations
    ds = ds.shuffle(buffer_size=buffer_size)

    # apply batch operations
    ds = ds.batch(config.batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
Example #24
def create_dataset_cifar(dataset_path,
                         do_train,
                         repeat_num=1,
                         batch_size=32,
                         target="Ascend"):
    """
    create a train or evaluation cifar10 dataset
    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        init()
        rank_id = get_rank()
        device_num = get_group_size()

    if device_num == 1:
        ds = de.Cifar10Dataset(dataset_path,
                               num_parallel_workers=8,
                               shuffle=True)
    else:
        ds = de.Cifar10Dataset(dataset_path,
                               num_parallel_workers=8,
                               shuffle=True,
                               num_shards=device_num,
                               shard_id=rank_id)

    # define map operations
    if do_train:
        trans = [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4),
            C.Resize((227, 227)),
            C.Rescale(1.0 / 255.0, 0.0),
            C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
            C.CutOut(112),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Resize((227, 227)),
            C.Rescale(1.0 / 255.0, 0.0),
            C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
Example #25
def create_dataset(dataset_path,
                   do_train,
                   repeat_num=1,
                   batch_size=32,
                   shard_id=0):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (str): The path of dataset.
        do_train (bool): Whether dataset is used for train or eval.
        repeat_num (int): The repeat times of dataset. Default: 1.
        batch_size (int): The batch size of dataset. Default: 32.

    Returns:
        Dataset.
    """

    do_shuffle = bool(do_train)

    if device_num == 1 or not do_train:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=config.work_nums,
                                   shuffle=do_shuffle)
    else:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=config.work_nums,
                                   shuffle=do_shuffle,
                                   num_shards=device_num,
                                   shard_id=shard_id)

    image_length = 299
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_length,
                                     scale=(0.08, 1.0),
                                     ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(image_length),
            C.CenterCrop(image_length)
        ]
    trans += [
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="label",
                operations=type_cast_op,
                num_parallel_workers=config.work_nums)
    ds = ds.map(input_columns="image",
                operations=trans,
                num_parallel_workers=config.work_nums)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)
    return ds
Example #26
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (str): The path of dataset.
        do_train (bool): Whether dataset is used for train or eval.
        repeat_num (int): The repeat times of dataset. Default: 1.
        batch_size (int): The batch size of dataset. Default: 32.

    Returns:
        Dataset.
    """
    if do_train:
        dataset_path = os.path.join(dataset_path, 'train')
        do_shuffle = True
    else:
        dataset_path = os.path.join(dataset_path, 'eval')
        do_shuffle = False

    if device_num == 1 or not do_train:
        ds = de.Cifar10Dataset(dataset_path,
                               num_parallel_workers=8,
                               shuffle=do_shuffle)
    else:
        ds = de.Cifar10Dataset(dataset_path,
                               num_parallel_workers=8,
                               shuffle=do_shuffle,
                               num_shards=device_num,
                               shard_id=device_id)

    resize_height = 224
    resize_width = 224
    buffer_size = 100
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_flip_op = C.RandomHorizontalFlip(device_id /
                                                       (device_id + 1))

    resize_op = C.Resize((resize_height, resize_width))
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize([0.4914, 0.4822, 0.4465],
                               [0.2023, 0.1994, 0.2010])

    change_swap_op = C.HWC2CHW()

    trans = []
    if do_train:
        trans += [random_crop_op, random_horizontal_flip_op]

    trans += [resize_op, rescale_op, normalize_op, change_swap_op]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
Example #27
def create_dataset(dataset_path,
                   do_train,
                   config,
                   platform,
                   repeat_num=1,
                   batch_size=100):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset
    """
    if platform == "Ascend":
        rank_size = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
        if rank_size == 1:
            data_set = ds.MindDataset(dataset_path,
                                      num_parallel_workers=8,
                                      shuffle=True)
        else:
            data_set = ds.MindDataset(dataset_path,
                                      num_parallel_workers=8,
                                      shuffle=True,
                                      num_shards=rank_size,
                                      shard_id=rank_id)
    elif platform == "GPU":
        if do_train:
            from mindspore.communication.management import get_rank, get_group_size
            data_set = ds.MindDataset(dataset_path,
                                      num_parallel_workers=8,
                                      shuffle=True,
                                      num_shards=get_group_size(),
                                      shard_id=get_rank())
        else:
            data_set = ds.MindDataset(dataset_path,
                                      num_parallel_workers=8,
                                      shuffle=False)
    else:
        raise ValueError("Unsupported platform.")

    resize_height = config.image_height
    buffer_size = 1000

    # define map operations
    resize_crop_op = C.RandomCropDecodeResize(resize_height,
                                              scale=(0.08, 1.0),
                                              ratio=(0.75, 1.333))
    horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)

    color_op = C.RandomColorAdjust(brightness=0.4,
                                   contrast=0.4,
                                   saturation=0.4)
    rescale_op = C.Rescale(1 / 255.0, 0)
    normalize_op = C.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
    change_swap_op = C.HWC2CHW()

    # define python operations
    decode_p = P.Decode()
    resize_p = P.Resize(256, interpolation=Inter.BILINEAR)
    center_crop_p = P.CenterCrop(224)
    totensor = P.ToTensor()
    normalize_p = P.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    composeop = P2.Compose(
        [decode_p, resize_p, center_crop_p, totensor, normalize_p])
    if do_train:
        trans = [
            resize_crop_op, horizontal_flip_op, color_op, rescale_op,
            normalize_op, change_swap_op
        ]
    else:
        trans = composeop
    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(input_columns="image",
                            operations=trans,
                            num_parallel_workers=8)
    data_set = data_set.map(input_columns="label_list",
                            operations=type_cast_op,
                            num_parallel_workers=8)

    # apply shuffle operations
    data_set = data_set.shuffle(buffer_size=buffer_size)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
Example #28
def keypoint_dataset(config,
                     ann_file=None,
                     image_path=None,
                     bbox_file=None,
                     rank=0,
                     group_size=1,
                     train_mode=True,
                     num_parallel_workers=8,
                     transform=None,
                     shuffle=None):
    """
    A function that returns a COCO keypoint dataset built from a GeneratorDataset.

    Args:
        rank (int): The shard ID within num_shards. Default: 0.
        group_size (int): Number of shards that the dataset should be divided
            into. Default: 1.
        train_mode (bool): Whether to build the train or test split. Default: True.
        num_parallel_workers (int): Number of workers to read the data. Default: 8.
    """
    # config
    per_batch_size = config.TRAIN.BATCH_SIZE if train_mode else config.TEST.BATCH_SIZE
    image_path = image_path if image_path else os.path.join(
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET if train_mode else config.DATASET.TEST_SET)
    print('loading dataset from {}'.format(image_path))
    ann_file = ann_file if ann_file else os.path.join(
        config.DATASET.ROOT, 'annotations/person_keypoints_{}2017.json'.format(
            'train' if train_mode else 'val'))
    shuffle = shuffle if shuffle is not None else train_mode
    # gen dataset db
    dataset_generator = KeypointDatasetGenerator(config, is_train=train_mode)

    if not train_mode and not config.TEST.USE_GT_BBOX:
        print('loading bbox file from {}'.format(bbox_file))
        dataset_generator.load_detect_dataset(image_path, ann_file, bbox_file)
    else:
        dataset_generator.load_gt_dataset(image_path, ann_file)

    # construct dataset
    de_dataset = de.GeneratorDataset(dataset_generator,
                                     column_names=[
                                         "image", "target", "weight", "scale",
                                         "center", "score", "id"
                                     ],
                                     num_parallel_workers=num_parallel_workers,
                                     num_shards=group_size,
                                     shard_id=rank,
                                     shuffle=shuffle)

    # inputs map functions
    if transform is None:
        transform_img = [
            V_C.Rescale(1.0 / 255.0, 0.0),
            V_C.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
            V_C.HWC2CHW()
        ]
    else:
        transform_img = transform
    de_dataset = de_dataset.map(input_columns="image",
                                num_parallel_workers=num_parallel_workers,
                                operations=transform_img)

    # batch
    de_dataset = de_dataset.batch(per_batch_size, drop_remainder=train_mode)

    return de_dataset, dataset_generator
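
A sketch of consuming the two return values above (the config object is assumed to provide the TRAIN/TEST batch sizes and DATASET paths the function reads): the GeneratorDataset drives training, while the generator keeps the metadata needed later for evaluation.

train_ds, train_gen = keypoint_dataset(config, train_mode=True, num_parallel_workers=4)
print('steps per epoch:', train_ds.get_dataset_size())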
Example #29
def vgg_create_dataset100(data_home,
                          image_size,
                          batch_size,
                          rank_id=0,
                          rank_size=1,
                          repeat_num=1,
                          training=True,
                          num_samples=None,
                          shuffle=True):
    """Data operations."""
    ds.config.set_seed(1)
    data_dir = os.path.join(data_home, "train")
    if not training:
        data_dir = os.path.join(data_home, "test")

    if num_samples is not None:
        data_set = ds.Cifar100Dataset(data_dir,
                                      num_shards=rank_size,
                                      shard_id=rank_id,
                                      num_samples=num_samples,
                                      shuffle=shuffle)
    else:
        data_set = ds.Cifar100Dataset(data_dir,
                                      num_shards=rank_size,
                                      shard_id=rank_id)

    input_columns = ["fine_label"]
    output_columns = ["label"]
    data_set = data_set.rename(input_columns=input_columns,
                               output_columns=output_columns)
    data_set = data_set.project(["image", "label"])

    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = CV.RandomCrop(
        (32, 32), (4, 4, 4, 4))  # padding_mode default CONSTANT
    random_horizontal_op = CV.RandomHorizontalFlip()
    resize_op = CV.Resize(image_size)  # interpolation default BILINEAR
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize((0.4465, 0.4822, 0.4914),
                                (0.2010, 0.1994, 0.2023))
    changeswap_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    c_trans = []
    if training:
        c_trans = [random_crop_op, random_horizontal_op]
    c_trans += [resize_op, rescale_op, normalize_op, changeswap_op]

    # apply map operations on images
    data_set = data_set.map(input_columns="label", operations=type_cast_op)
    data_set = data_set.map(input_columns="image", operations=c_trans)

    # apply repeat operations
    data_set = data_set.repeat(repeat_num)

    # apply shuffle operations
    # data_set = data_set.shuffle(buffer_size=1000)

    # apply batch operations
    data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

    return data_set
Example #30
def create_dataset(data_dir,
                   repeat=400,
                   train_batch_size=16,
                   augment=False,
                   cross_val_ind=1,
                   run_distribute=False,
                   do_crop=None,
                   img_size=None):

    images = _load_multipage_tiff(os.path.join(data_dir, 'train-volume.tif'))
    masks = _load_multipage_tiff(os.path.join(data_dir, 'train-labels.tif'))

    train_indices, val_indices = _get_val_train_indices(
        len(images), cross_val_ind)
    train_images = images[train_indices]
    train_masks = masks[train_indices]
    train_images = np.repeat(train_images, repeat, axis=0)
    train_masks = np.repeat(train_masks, repeat, axis=0)
    val_images = images[val_indices]
    val_masks = masks[val_indices]

    train_image_data = {"image": train_images}
    train_mask_data = {"mask": train_masks}
    valid_image_data = {"image": val_images}
    valid_mask_data = {"mask": val_masks}

    ds_train_images = ds.NumpySlicesDataset(data=train_image_data,
                                            sampler=None,
                                            shuffle=False)
    ds_train_masks = ds.NumpySlicesDataset(data=train_mask_data,
                                           sampler=None,
                                           shuffle=False)

    if run_distribute:
        rank_id = get_rank()
        rank_size = get_group_size()
        ds_train_images = ds.NumpySlicesDataset(data=train_image_data,
                                                sampler=None,
                                                shuffle=False,
                                                num_shards=rank_size,
                                                shard_id=rank_id)
        ds_train_masks = ds.NumpySlicesDataset(data=train_mask_data,
                                               sampler=None,
                                               shuffle=False,
                                               num_shards=rank_size,
                                               shard_id=rank_id)

    ds_valid_images = ds.NumpySlicesDataset(data=valid_image_data,
                                            sampler=None,
                                            shuffle=False)
    ds_valid_masks = ds.NumpySlicesDataset(data=valid_mask_data,
                                           sampler=None,
                                           shuffle=False)

    if do_crop:
        resize_size = [
            int(img_size[x] * do_crop[x]) for x in range(len(img_size))
        ]
    else:
        resize_size = img_size
    c_resize_op = c_vision.Resize(size=(resize_size[0], resize_size[1]),
                                  interpolation=Inter.BILINEAR)
    c_pad = c_vision.Pad(padding=(img_size[0] - resize_size[0]) // 2)
    c_rescale_image = c_vision.Rescale(1.0 / 127.5, -1)
    c_rescale_mask = c_vision.Rescale(1.0 / 255.0, 0)

    c_trans_normalize_img = [c_rescale_image, c_resize_op, c_pad]
    c_trans_normalize_mask = [c_rescale_mask, c_resize_op, c_pad]
    c_center_crop = c_vision.CenterCrop(size=388)

    train_image_ds = ds_train_images.map(input_columns="image",
                                         operations=c_trans_normalize_img)
    train_mask_ds = ds_train_masks.map(input_columns="mask",
                                       operations=c_trans_normalize_mask)
    train_ds = ds.zip((train_image_ds, train_mask_ds))
    train_ds = train_ds.project(columns=["image", "mask"])
    if augment:
        augment_process = train_data_augmentation
        c_resize_op = c_vision.Resize(size=(img_size[0], img_size[1]),
                                      interpolation=Inter.BILINEAR)
        train_ds = train_ds.map(input_columns=["image", "mask"],
                                operations=augment_process)
        train_ds = train_ds.map(input_columns="image", operations=c_resize_op)
        train_ds = train_ds.map(input_columns="mask", operations=c_resize_op)

    if do_crop:
        train_ds = train_ds.map(input_columns="mask", operations=c_center_crop)
    post_process = data_post_process
    train_ds = train_ds.map(input_columns=["image", "mask"],
                            operations=post_process)
    train_ds = train_ds.shuffle(repeat * 24)
    train_ds = train_ds.batch(batch_size=train_batch_size, drop_remainder=True)

    valid_image_ds = ds_valid_images.map(input_columns="image",
                                         operations=c_trans_normalize_img)
    valid_mask_ds = ds_valid_masks.map(input_columns="mask",
                                       operations=c_trans_normalize_mask)
    valid_ds = ds.zip((valid_image_ds, valid_mask_ds))
    valid_ds = valid_ds.project(columns=["image", "mask"])
    if do_crop:
        valid_ds = valid_ds.map(input_columns="mask", operations=c_center_crop)
    post_process = data_post_process
    valid_ds = valid_ds.map(input_columns=["image", "mask"],
                            operations=post_process)
    valid_ds = valid_ds.batch(batch_size=1, drop_remainder=True)

    return train_ds, valid_ds
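
A minimal usage sketch for the U-Net style loader above; every value is a placeholder, do_crop holds per-axis crop fractions as read in the function, and helpers such as _load_multipage_tiff and data_post_process are assumed to be importable from the source module.

train_ds, valid_ds = create_dataset('./unet_data',
                                    repeat=1,
                                    train_batch_size=4,
                                    augment=False,
                                    do_crop=[0.5, 0.5],
                                    img_size=[572, 572])
print(train_ds.get_dataset_size(), valid_ds.get_dataset_size())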