def test_bath_afterPadded():
    data1 = [{
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }]
    data2 = [{
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(1, np.uint8)
    }]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2

    testsampler = ds.DistributedSampler(num_shards=2,
                                        shard_id=0,
                                        shuffle=False,
                                        num_samples=None)
    ds3.use_sampler(testsampler)

    ds4 = ds3.batch(2)
    assert sum([1 for _ in ds4]) == 2
def test_raise_error():
    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds4 = ds1.batch(2)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds4 + ds2

    with pytest.raises(TypeError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=None)
        ds3.use_sampler(testsampler)
        assert excinfo.type == 'TypeError'

    with pytest.raises(TypeError) as excinfo:
        otherSampler = ds.SequentialSampler()
        ds3.use_sampler(otherSampler)
        assert excinfo.type == 'TypeError'

    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=True, num_samples=None)
        ds3.use_sampler(testsampler)
        assert excinfo.type == 'ValueError'

    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=5)
        ds3.use_sampler(testsampler)
        assert excinfo.type == 'ValueError'
def test_Mindrecord_Padded(remove_mindrecord_file):
    result_list = []
    verify_list = [[1, 2], [3, 4], [5, 11], [6, 12], [7, 13], [8, 14], [9],
                   [10]]
    num_readers = 4
    data_set = ds.MindDataset(CV_FILE_NAME + "0", ['file_name'],
                              num_readers,
                              shuffle=False)
    data1 = [{
        'file_name': np.array(b'image_00011.jpg', dtype='|S15')
    }, {
        'file_name': np.array(b'image_00012.jpg', dtype='|S15')
    }, {
        'file_name': np.array(b'image_00013.jpg', dtype='|S15')
    }, {
        'file_name': np.array(b'image_00014.jpg', dtype='|S15')
    }]
    ds1 = ds.PaddedDataset(data1)
    ds2 = data_set + ds1
    shard_num = 8
    for i in range(shard_num):
        testsampler = ds.DistributedSampler(num_shards=shard_num,
                                            shard_id=i,
                                            shuffle=False,
                                            num_samples=None)
        ds2.use_sampler(testsampler)
        tem_list = []
        for ele in ds2.create_dict_iterator():
            tem_list.append(
                int(ele['file_name'].tostring().decode().lstrip(
                    'image_').rstrip('.jpg')))
        result_list.append(tem_list)
    assert result_list == verify_list
def test_imagefolder_padded_with_decode_and_get_dataset_size():
    num_shards = 5
    count = 0
    for shard_id in range(num_shards):
        DATA_DIR = "../data/dataset/testPK/data"
        data = ds.ImageFolderDatasetV2(DATA_DIR)

        white_io = BytesIO()
        Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG')
        padded_sample = {}
        padded_sample['image'] = np.array(bytearray(white_io.getvalue()),
                                          dtype='uint8')
        padded_sample['label'] = np.array(-1, np.int32)

        white_samples = [
            padded_sample, padded_sample, padded_sample, padded_sample
        ]
        data2 = ds.PaddedDataset(white_samples)
        data3 = data + data2

        testsampler = ds.DistributedSampler(num_shards=num_shards,
                                            shard_id=shard_id,
                                            shuffle=False,
                                            num_samples=None)
        data3.use_sampler(testsampler)
        shard_dataset_size = data3.get_dataset_size()
        data3 = data3.map(input_columns="image", operations=V_C.Decode())
        shard_sample_count = 0
        for ele in data3.create_dict_iterator():
            print("label: {}".format(ele['label']))
            count += 1
            shard_sample_count += 1
        assert shard_sample_count in (9, 10)
        assert shard_dataset_size == shard_sample_count
    assert count == 48
Beispiel #5
0
def test_imagefolder_error():
    DATA_DIR = "../data/dataset/testPK/data"
    data = ds.ImageFolderDataset(DATA_DIR, num_samples=14)

    data1 = [{
        'image': np.zeros(1, np.uint8),
        'label': np.array(0, np.int32)
    }, {
        'image': np.zeros(2, np.uint8),
        'label': np.array(1, np.int32)
    }, {
        'image': np.zeros(3, np.uint8),
        'label': np.array(0, np.int32)
    }, {
        'image': np.zeros(4, np.uint8),
        'label': np.array(1, np.int32)
    }, {
        'image': np.zeros(5, np.uint8),
        'label': np.array(0, np.int32)
    }, {
        'image': np.zeros(6, np.uint8),
        'label': np.array(1, np.int32)
    }]

    data2 = ds.PaddedDataset(data1)
    data3 = data + data2
    with pytest.raises(ValueError) as excinfo:
        testsampler = ds.DistributedSampler(num_shards=5,
                                            shard_id=4,
                                            shuffle=False,
                                            num_samples=None)
        data3.use_sampler(testsampler)
        assert excinfo.type == 'ValueError'
def test_more_shard_padded():
    result_list = []
    for i in range(8):
        result_list.append(1)
    result_list.append(0)

    data1 = ds.GeneratorDataset(generator_5, ["col1"])
    data2 = ds.GeneratorDataset(generator_8, ["col1"])
    data3 = data1 + data2
    vertifyList = []
    numShard = 9
    for i in range(numShard):
        tem_list = []
        testsampler = ds.DistributedSampler(num_shards=numShard, shard_id=i, shuffle=False, num_samples=None)
        data3.use_sampler(testsampler)
        for item in data3.create_dict_iterator():
            tem_list.append(item['col1'])
        vertifyList.append(tem_list)

    assert [len(ele) for ele in vertifyList] == result_list

    vertifyList1 = []
    result_list1 = []
    for i in range(8):
        result_list1.append([i+1])
    result_list1.append([])

    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2

    for i in range(numShard):
        tem_list = []
        testsampler = ds.DistributedSampler(num_shards=numShard, shard_id=i, shuffle=False, num_samples=None)
        ds3.use_sampler(testsampler)
        for item in ds3.create_dict_iterator():
            tem_list.append(len(item['image']))
        vertifyList1.append(tem_list)

    assert vertifyList1 == result_list1
def test_Unevenly_distributed():
    result_list = [[1, 4, 7], [2, 5, 8], [3, 6]]
    verify_list = []

    data1 = [{
        'image': np.zeros(1, np.uint8)
    }, {
        'image': np.zeros(2, np.uint8)
    }, {
        'image': np.zeros(3, np.uint8)
    }, {
        'image': np.zeros(4, np.uint8)
    }, {
        'image': np.zeros(5, np.uint8)
    }]
    data2 = [{
        'image': np.zeros(6, np.uint8)
    }, {
        'image': np.zeros(7, np.uint8)
    }, {
        'image': np.zeros(8, np.uint8)
    }]

    testsampler = ds.DistributedSampler(num_shards=4,
                                        shard_id=0,
                                        shuffle=False,
                                        num_samples=None,
                                        offset=1)

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2
    numShard = 3
    for i in range(numShard):
        tem_list = []
        testsampler = ds.DistributedSampler(num_shards=numShard,
                                            shard_id=i,
                                            shuffle=False,
                                            num_samples=None)
        ds3.use_sampler(testsampler)
        for item in ds3.create_dict_iterator():
            tem_list.append(len(item['image']))
        verify_list.append(tem_list)
    assert verify_list == result_list
def test_Reapeat_afterPadded():
    result_list = [1, 3, 5, 7]
    verify_list = []

    data1 = [{'image': np.zeros(1, np.uint8)}, {'image': np.zeros(2, np.uint8)},
             {'image': np.zeros(3, np.uint8)}, {'image': np.zeros(4, np.uint8)},
             {'image': np.zeros(5, np.uint8)}]
    data2 = [{'image': np.zeros(6, np.uint8)}, {'image': np.zeros(7, np.uint8)},
             {'image': np.zeros(8, np.uint8)}]

    ds1 = ds.PaddedDataset(data1)
    ds2 = ds.PaddedDataset(data2)
    ds3 = ds1 + ds2

    testsampler = ds.DistributedSampler(num_shards=2, shard_id=0, shuffle=False, num_samples=None)
    ds3.use_sampler(testsampler)
    repeat_num = 2
    ds3 = ds3.repeat(repeat_num)
    for item in ds3.create_dict_iterator():
        verify_list.append(len(item['image']))

    assert verify_list == result_list * repeat_num
def test_celeba_padded():
    data = ds.CelebADataset("../data/dataset/testCelebAData/")

    padded_samples = [{'image': np.zeros(1, np.uint8), 'attr': np.zeros(1, np.uint32)}]
    padded_ds = ds.PaddedDataset(padded_samples)
    data = data + padded_ds
    dis_sampler = ds.DistributedSampler(num_shards=2, shard_id=1, shuffle=False, num_samples=None)
    data.use_sampler(dis_sampler)
    data = data.repeat(2)

    count = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        count = count + 1
    assert count == 4
Beispiel #10
0
def test_manifest_dataset_get_num_class():
    data = ds.ManifestDataset(DATA_FILE, decode=True, shuffle=False)
    assert data.num_classes() == 3

    padded_samples = [{
        'image': np.zeros(1, np.uint8),
        'label': np.array(1, np.int32)
    }]
    padded_ds = ds.PaddedDataset(padded_samples)

    data = data.repeat(2)
    padded_ds = padded_ds.repeat(2)

    data1 = data + padded_ds
    assert data1.num_classes() == 3
Beispiel #11
0
def test_clue_padded_and_skip_with_0_samples():
    """
    Test num_samples param of CLUE dataset
    """
    TRAIN_FILE = '../data/dataset/testCLUE/afqmc/train.json'

    data = ds.CLUEDataset(TRAIN_FILE, task='AFQMC', usage='train')
    count = 0
    for _ in data.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 3

    data_copy1 = copy.deepcopy(data)

    sample = {
        "label": np.array(1, np.string_),
        "sentence1": np.array(1, np.string_),
        "sentence2": np.array(1, np.string_)
    }
    samples = [sample]
    padded_ds = ds.PaddedDataset(samples)
    dataset = data + padded_ds
    testsampler = ds.DistributedSampler(num_shards=2,
                                        shard_id=1,
                                        shuffle=False,
                                        num_samples=None)
    dataset.use_sampler(testsampler)
    assert dataset.get_dataset_size() == 2
    count = 0
    for data in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 2

    dataset = dataset.skip(count=2)  # dataset2 has none samples
    count = 0
    for data in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        count += 1
    assert count == 0

    with pytest.raises(ValueError, match="There is no samples in the "):
        dataset = dataset.concat(data_copy1)
        count = 0
        for data in dataset.create_dict_iterator(num_epochs=1,
                                                 output_numpy=True):
            count += 1
        assert count == 2
Beispiel #12
0
def test_TFRecord_Padded():
    DATA_DIR = [
        "../data/dataset/test_tf_file_3_images/train-0000-of-0001.data"
    ]
    SCHEMA_DIR = "../data/dataset/test_tf_file_3_images/datasetSchema.json"
    result_list = [[159109, 2], [192607, 3], [179251, 4], [1, 5]]
    verify_list = []
    shard_num = 4
    for i in range(shard_num):
        data = ds.TFRecordDataset(DATA_DIR,
                                  SCHEMA_DIR,
                                  columns_list=["image"],
                                  shuffle=False,
                                  shard_equal_rows=True)

        padded_samples = [{
            'image': np.zeros(1, np.uint8)
        }, {
            'image': np.zeros(2, np.uint8)
        }, {
            'image': np.zeros(3, np.uint8)
        }, {
            'image': np.zeros(4, np.uint8)
        }, {
            'image': np.zeros(5, np.uint8)
        }]

        padded_ds = ds.PaddedDataset(padded_samples)
        concat_ds = data + padded_ds
        testsampler = ds.DistributedSampler(num_shards=shard_num,
                                            shard_id=i,
                                            shuffle=False,
                                            num_samples=None)
        concat_ds.use_sampler(testsampler)
        shard_list = []
        for item in concat_ds.create_dict_iterator(num_epochs=1,
                                                   output_numpy=True):
            shard_list.append(len(item['image']))
        verify_list.append(shard_list)
    assert verify_list == result_list
def test_imagefolder_padded():
    DATA_DIR = "../data/dataset/testPK/data"
    data = ds.ImageFolderDatasetV2(DATA_DIR)

    data1 = [{'image': np.zeros(1, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(2, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(3, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(4, np.uint8), 'label': np.array(1, np.int32)},
             {'image': np.zeros(5, np.uint8), 'label': np.array(0, np.int32)},
             {'image': np.zeros(6, np.uint8), 'label': np.array(1, np.int32)}]

    data2 = ds.PaddedDataset(data1)
    data3 = data + data2
    testsampler = ds.DistributedSampler(num_shards=5, shard_id=4, shuffle=False, num_samples=None)
    data3.use_sampler(testsampler)
    assert sum([1 for _ in data3]) == 10
    verify_list = []

    for ele in data3.create_dict_iterator():
        verify_list.append(len(ele['image']))
    assert verify_list[8] == 1
    assert verify_list[9] == 6
Beispiel #14
0
def test_padded_dataset_size():
    dataset = ds.PaddedDataset([{"data": [1, 2, 3]}, {"data": [1, 0, 1]}])
    assert dataset.get_dataset_size() == 2