Example #1
def test_cache_map_basic3():
    """
    Test a repeat under mappable cache

        Cache
          |
      Map(decode)
          |
        Repeat
          |
      ImageFolder
    """

    logger.info("Test cache basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache)
    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 8
    logger.info('test_cache_map_basic3 Ended.\n')
Example #2
def test_cache_nomap_basic3():
    """
    A TF reader dataset (a non mappable dataset) with a cache over it just after the leaf

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    ds1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12
    logger.info("test_cache_nomap_basic3 Ended.\n")
Example #3
def test_cache_nomap_basic5():
    """
    A TF reader dataset (a non mappable dataset) with a cache over it just after the leaf
    Same as test 3, but this one does not pass the shuffle arg, so tf defaults to a global
    shuffle, which attempts to inject a shuffle operator.  However, since there is a cache,
    global shuffle is not needed and the shuffle op will not be built.  The result is
    identical to test basic 3, but we arrive at the same tree through a different code path
    (if there were no cache, then the shuffle IS built).

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 5")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    ds1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12
    logger.info("test_cache_nomap_basic5 Ended.\n")
Example #4
def test_cache_nomap_basic1():
    """
    A random dataset (a non mappable dataset) with a cache over it just after the leaf
    """

    logger.info("Test cache nomap basic 1")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # create a cache.  arbitrary session_id for now
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # No sampler arg is given here
    ds1 = ds.RandomDataset(schema=schema, total_rows=10, num_parallel_workers=4, cache=some_cache)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 40
    logger.info("test_cache_nomap_basic1 Ended.\n")
Example #5
def test_cache_nomap_allowed_share1():
    """
    It is allowed to share the cache between the following two trees:

       Repeat     Shuffle
         |           |
       Cache       Cache
         |           |
      TFReader    TFReader
    """

    logger.info("Test cache nomap allowed share 1")

    ds.config.set_seed(1)
    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    ds1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=some_cache)
    ds1 = ds1.repeat(4)

    ds2 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=False, cache=some_cache)
    ds2 = ds2.shuffle(buffer_size=2)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1
    assert num_iter == 12
    logger.info("Number of data in ds1: {} ".format(num_iter))

    num_iter = 0
    for _ in ds2.create_dict_iterator():
        num_iter += 1
    assert num_iter == 3
    logger.info("test_cache_nomap_allowed_share1 Ended.\n")
Example #6
def test_cache_map_basic2():
    """
    Test mappable leaf with the cache op later in the tree above the map(decode)

       Repeat
         |
       Cache
         |
     Map(decode)
         |
     ImageFolder
    """

    logger.info("Test cache map basic 2")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache)
    ds1 = ds1.repeat(4)

    filename = "cache_map_02_result.npz"
    save_and_check_md5(ds1, filename, generate_golden=GENERATE_GOLDEN)

    logger.info("test_cache_map_basic2 Ended.\n")
def test_cache_map_failure1():
    """
    Test nested cache (failure)

        Repeat
          |
        Cache
          |
      Map(decode)
          |
        Cache
          |
      ImageFolder

    """
    logger.info("Test cache failure 1")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op, cache=some_cache)
    ds1 = ds1.repeat(4)

    try:
        num_iter = 0
        for _ in ds1.create_dict_iterator():
            num_iter += 1
    except RuntimeError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Nested cache operations is not supported!" in str(e)

    assert num_iter == 0
    logger.info('test_cache_failure1 Ended.\n')
Example #8
def test_cache_nomap_allowed_share3():
    """
    It is allowed to share the cache between the following two trees (different shard ids):

       Repeat                     Repeat
         |                          |
       Cache                      Cache
         |                          |
      TFReader(shard_id = 0)     TFReader(shard_id = 1)
    """

    logger.info("Test cache nomap allowed share 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    tf_files = ["../data/dataset/tf_file_dataset/test1.data", "../data/dataset/tf_file_dataset/test2.data"]
    ds1 = ds.TFRecordDataset(tf_files, num_shards=2, shard_id=0, num_samples=3, shuffle=False, cache=some_cache)
    ds1 = ds1.repeat(4)

    ds2 = ds.TFRecordDataset(tf_files, num_shards=2, shard_id=1, num_samples=3, shuffle=False, cache=some_cache)
    ds2 = ds2.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1
    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12

    num_iter = 0
    for _ in ds2.create_dict_iterator():
        num_iter += 1
    assert num_iter == 12
    logger.info("test_cache_nomap_allowed_share3 Ended.\n")
Example #9
def test_cache_nomap_basic2():
    """
    A random dataset (a non mappable dataset) with a cache over it just after the leaf
    """

    logger.info("Test cache nomap basic 2")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8,
                      shape=[640, 480, 3])  # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # create a cache.  arbitrary session_id for now
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # sampler arg not given directly, however any of these args will auto-generate an appropriate sampler:
    # num_samples, shuffle, num_shards, shard_id
    # In this case, the presence of num_samples chooses a sampler.
    ds1 = ds.RandomDataset(schema=schema, total_rows=20, num_samples=20, num_parallel_workers=4, cache=some_cache)
    ds1 = ds1.repeat(2)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        logger.info("printing the label: {}".format(data["label"]))
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 40
    logger.info("test_cache_nomap_basic2 Ended.\n")
Example #10
def test_cache_nomap_basic7():
    """
    A TF reader dataset (a non mappable dataset) that uses global shuffle and is cached, followed by
    a map.
    Here the tf dataset with global shuffle would normally inject a shuffle op on top of the
    tf reader, but since a cache is given, it chooses not to.

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 7")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    ds1 = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["image"], shuffle=ds.Shuffle.GLOBAL, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12
    logger.info("test_cache_nomap_basic7 Ended.\n")
Example #11
def test_cache_nomap_allowed_share2():
    """
    It is allowed to share the cache between the following two trees (with map decode):

       Repeat     Shuffle
         |           |
       Cache       Cache
         |           |
     Map(decode) Map(decode)
         |           |
      TFReader    TFReader
    """

    logger.info("Test cache nomap allowed share 2")

    ds.config.set_seed(1)
    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=2, size=0, spilling=True)
    decode_op = c_vision.Decode()

    ds1 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False)
    ds1 = ds1.map(operations=decode_op,
                  input_columns=["image"],
                  cache=some_cache)
    ds1 = ds1.repeat(4)

    ds2 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False)
    ds2 = ds2.map(operations=decode_op,
                  input_columns=["image"],
                  cache=some_cache)
    ds2 = ds2.shuffle(buffer_size=2)

    num_iter = 0
    for _ in ds1.create_dict_iterator(num_epochs=1):
        num_iter += 1
    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12

    num_iter = 0
    for _ in ds2.create_dict_iterator(num_epochs=1):
        num_iter += 1
    assert num_iter == 3
    logger.info("test_cache_nomap_allowed_share2 Ended.\n")
def test_cache_nomap_allowed_share4():
    """
    It is allowed to share the cache between the following two trees:

       Cache                                  Cache
         |                                      |
     Map(decode, num_parallel_workers=1)    Map(decode, num_parallel_workers=2)
         |                                      |
      TFReader                              TFReader
    """

    logger.info("Test cache nomap allowed share 4")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=2, size=0, spilling=True)
    decode_op = c_vision.Decode()

    ds1 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False)
    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache,
                  num_parallel_workers=1)

    ds2 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False)
    ds2 = ds2.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache,
                  num_parallel_workers=2)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1
    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 3

    num_iter = 0
    for _ in ds2.create_dict_iterator():
        num_iter += 1
    logger.info("Number of data in ds2: {} ".format(num_iter))
    assert num_iter == 3

    logger.info("test_cache_nomap_allowed_share4 Ended.\n")
def test_cache_nomap_disallowed_share1():
    """
    It is not allowed to share the cache between the following two trees:

       Cache       Cache
         |           |
     Map(decode) Map(rescale)
         |           |
      TFReader    TFReader
    """

    logger.info("Test cache nomap disallowed share1")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    decode_op = c_vision.Decode()
    rescale_op = c_vision.Rescale(1.0 / 255.0, -1.0)

    ds1 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False)
    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache)

    ds2 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False)
    ds2 = ds2.map(input_columns=["image"],
                  operations=rescale_op,
                  cache=some_cache)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1
    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 3

    try:
        sum([1 for _ in ds2])
    except RuntimeError as e:
        logger.info("Got an exception in DE: {}".format(str(e)))
        assert "Attempt to re-use a cache for a different tree!" in str(e)

    logger.info("test_cache_nomap_disallowed_share1 Ended.\n")
Example #14
def test_cache_nomap_basic3():
    """
    A TF reader dataset (a non mappable dataset) with a cache over it just after the leaf

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 3")

    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    ds1 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=False,
                             cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(operations=decode_op, input_columns=["image"])
    ds1 = ds1.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator(num_epochs=1):
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12

    # Contact the server to get the statistics
    stat = some_cache.GetStat()
    cache_sz = stat.avg_cache_sz
    num_mem_cached = stat.num_mem_cached
    num_disk_cached = stat.num_disk_cached

    logger.info("Number of rows cached in memory: {}".format(num_mem_cached))
    logger.info("Number of rows spilled to disk: {}".format(num_disk_cached))
    logger.info("Average row cache size: {}".format(cache_sz))

    logger.info("test_cache_nomap_basic3 Ended.\n")
def test_cache_nomap_basic4():
    """
    A TF reader dataset (a non mappable dataset) with a map decode and cache after it
    Since a global shuffle is requested for the tf reader, a shuffle op would normally be injected
    over it. But with a cache later in the tree, that shuffle becomes redundant and should be removed.

       Repeat
         |
       Cache
         |
     Map(decode)
         |
      TFReader
    """

    logger.info("Test cache nomap basic 4")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)
    # With shuffle not set, TF defaults to a "global" shuffle when there is no cache
    # in the picture.  This causes a shuffle injection over the TF reader.  For clarity, this test
    # explicitly gives the global option, even though it's the default in python.
    # But when caching is added in the tree above TF, global shuffling is done
    # through the sampler over the cache, not by the shuffle op.  In that case, tree prepare
    # will remove the shuffle op that got injected by the initial tree creation.
    ds1 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             shuffle=ds.Shuffle.GLOBAL)
    decode_op = c_vision.Decode()

    ds1 = ds1.map(input_columns=["image"],
                  operations=decode_op,
                  cache=some_cache)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 12
    logger.info("test_cache_nomap_basic4 Ended.\n")
def test_cache_nomap_basic6():
    """
    A TF reader dataset (a non mappable dataset) with a cache over it just after the leaf
    Here the tf dataset is given a sharding configuration; however, since a cache is
    used, tree prepare should undo that configuration and instead choose a distributed
    sampler with the same shard config.

       Repeat
         |
     Map(decode)
         |
       Cache
         |
      TFReader
    """

    logger.info("Test cache nomap basic 6")

    # This dataset has 3 records in it only
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # With only 3 records sharded into 3, we expect only 1 record to be returned for this shard.
    # However, the sharding will be done by the sampler, not by the tf record leaf node.
    # In this case, it is a row-based sharding, not the file-based sharding that would happen if
    # there were no cache.
    ds1 = ds.TFRecordDataset(DATA_DIR,
                             SCHEMA_DIR,
                             columns_list=["image"],
                             num_shards=3,
                             shard_id=1,
                             cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for _ in ds1.create_dict_iterator():
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 4
    logger.info("test_cache_nomap_basic6 Ended.\n")
Example #17
def test_cache_map_basic4():
    """
    Test that rows of different sizes do not result in a core dump
    """
    logger.info("Test cache basic 4")
    some_cache = ds.DatasetCache(session_id=1, size=0, spilling=True)

    # This DATA_DIR only has 2 images in it
    ds1 = ds.ImageFolderDatasetV2(dataset_dir=DATA_DIR, cache=some_cache)
    decode_op = c_vision.Decode()
    ds1 = ds1.repeat(4)
    ds1 = ds1.map(input_columns=["image"], operations=decode_op)
    logger.info("ds1.dataset_size is ", ds1.get_dataset_size())
    shape = ds1.output_shapes()
    logger.info(shape)
    num_iter = 0
    for _ in ds1.create_dict_iterator():
        logger.info("get data from dataset")
        num_iter += 1

    logger.info("Number of data in ds1: {} ".format(num_iter))
    assert num_iter == 8
    logger.info('test_cache_map_basic4 Ended.\n')
Example #18
def create_dataset1(dataset_path,
                    do_train,
                    repeat_num=1,
                    batch_size=32,
                    target="Ascend",
                    distribute=False,
                    enable_cache=False,
                    cache_session_id=None):
    """
    create a train or evaluate cifar10 dataset for resnet50
    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
        enable_cache(bool): whether tensor caching service is used for eval. Default: False
        cache_session_id(int): if enable_cache is True, a cache session_id needs to be provided. Default: None

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        if distribute:
            init()
            rank_id = get_rank()
            device_num = get_group_size()
        else:
            device_num = 1
    if device_num == 1:
        data_set = ds.Cifar10Dataset(dataset_path,
                                     num_parallel_workers=8,
                                     shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(dataset_path,
                                     num_parallel_workers=8,
                                     shuffle=True,
                                     num_shards=device_num,
                                     shard_id=rank_id)

    # define map operations
    trans = []
    if do_train:
        trans += [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]

    trans += [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op,
                            input_columns="label",
                            num_parallel_workers=8)
    # only enable cache for eval
    if do_train:
        enable_cache = False
    if enable_cache:
        if not cache_session_id:
            raise ValueError(
                "A cache session_id must be provided to use cache.")
        eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0)
        data_set = data_set.map(operations=trans,
                                input_columns="image",
                                num_parallel_workers=8,
                                cache=eval_cache)
    else:
        data_set = data_set.map(operations=trans,
                                input_columns="image",
                                num_parallel_workers=8)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
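A minimal usage sketch (not part of the original source): the dataset path and session id below are placeholders, and a cache server is assumed to be already running.

# Hypothetical call: build an eval pipeline with the tensor cache enabled.
eval_ds = create_dataset1(dataset_path="/path/to/cifar-10-batches-bin",
                          do_train=False,
                          batch_size=32,
                          target="GPU",
                          enable_cache=True,
                          cache_session_id=1)
for batch in eval_ds.create_dict_iterator():
    pass  # run evaluation on each cached, preprocessed batch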
Example #19
"""
Training script
"""
import argparse
import mindspore.dataset as ds

parser = argparse.ArgumentParser(description='Cache Example')
parser.add_argument('--num_devices', type=int, default=1, help='Device num.')
parser.add_argument('--device', type=int, default=0, help='Device id.')
parser.add_argument('--session_id', type=int, default=1, help='Session id.')
parser.add_argument('--dataset_path',
                    type=str,
                    default=None,
                    help='Dataset path')
args_opt = parser.parse_args()

# apply cache to dataset
test_cache = ds.DatasetCache(session_id=args_opt.session_id,
                             size=0,
                             spilling=False)
dataset = ds.Cifar10Dataset(dataset_dir=args_opt.dataset_path,
                            num_samples=4,
                            shuffle=False,
                            num_parallel_workers=1,
                            num_shards=args_opt.num_devices,
                            shard_id=args_opt.device,
                            cache=test_cache)
num_iter = 0
for _ in dataset.create_dict_iterator():
    num_iter += 1
print("Got {} samples on device {}".format(num_iter, args_opt.device))
Example #20
def create_dataset4(dataset_path,
                    do_train,
                    repeat_num=1,
                    batch_size=32,
                    target="Ascend",
                    distribute=False,
                    enable_cache=False,
                    cache_session_id=None):
    """
    create a train or eval imagenet2012 dataset for se-resnet50

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False
        enable_cache(bool): whether tensor caching service is used for eval. Default: False
        cache_session_id(int): if enable_cache is True, a cache session_id needs to be provided. Default: None

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        if distribute:
            init()
            rank_id = get_rank()
            device_num = get_group_size()
        else:
            device_num = 1
    if device_num == 1:
        data_set = ds.ImageFolderDataset(dataset_path,
                                         num_parallel_workers=12,
                                         shuffle=True)
    else:
        data_set = ds.ImageFolderDataset(dataset_path,
                                         num_parallel_workers=12,
                                         shuffle=True,
                                         num_shards=device_num,
                                         shard_id=rank_id)
    image_size = 224
    mean = [123.68, 116.78, 103.94]
    std = [1.0, 1.0, 1.0]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size,
                                     scale=(0.08, 1.0),
                                     ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(292),
            C.CenterCrop(256),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)
    data_set = data_set.map(operations=trans,
                            input_columns="image",
                            num_parallel_workers=12)
    # only enable cache for eval
    if do_train:
        enable_cache = False
    if enable_cache:
        if not cache_session_id:
            raise ValueError(
                "A cache session_id must be provided to use cache.")
        eval_cache = ds.DatasetCache(session_id=int(cache_session_id), size=0)
        data_set = data_set.map(operations=type_cast_op,
                                input_columns="label",
                                num_parallel_workers=12,
                                cache=eval_cache)
    else:
        data_set = data_set.map(operations=type_cast_op,
                                input_columns="label",
                                num_parallel_workers=12)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set