Beispiel #1
0
 def test_lots_of_data_with_multiple_backend(self, repo_300_filled_samples):
     """Iterate a batched dataset over the ``'aset'`` column.

     Builds a dataset with ``batch_size=10`` and ``drop_last=True`` and
     checks that every yielded batch is an ``np.ndarray`` of shape
     ``(10, 5, 7)``.
     """
     repo = repo_300_filled_samples
     co = repo.checkout()
     try:
         aset = co.columns['aset']
         np_dset = make_numpy_dataset([aset], batch_size=10, drop_last=True)
         for data in np_dset:
             assert isinstance(data, np.ndarray)
             assert data.shape == (10, 5, 7)
     finally:
         # Release the checkout even when an assertion above fails.
         co.close()
Beispiel #2
0
    def test_nested_column(self, repo_20_filled_subsamples):
        """Check the containers yielded for two columns holding subsamples.

        Three configurations are exercised:

        * no batching: each column yields a ``dict`` per sample, and both
          columns expose the same keys;
        * ``batch_size=1``: each column yields a 1-tuple of such dicts;
        * ``batch_size=2``: each column yields a container of length 2.
        """
        co = repo_20_filled_subsamples.checkout()
        try:
            col1 = co['writtenaset']
            col2 = co['second_aset']

            # Unbatched: one dict per column for every sample.
            dset = make_numpy_dataset([col1, col2])
            for data1, data2 in dset:
                assert isinstance(data1, dict)
                assert isinstance(data2, dict)
                assert tuple(data1.keys()) == tuple(data2.keys())

            # Batched with a single element: dicts are wrapped in tuples.
            dset = make_numpy_dataset([col1, col2], batch_size=1, drop_last=True)
            for data1, data2 in dset:
                assert type(data1) is type(data2) is tuple
                assert len(data1) == len(data2) == 1
                assert tuple(data1[0].keys()) == tuple(data2[0].keys())

            # Batched with two elements per batch.
            dset = make_numpy_dataset([col1, col2], batch_size=2, drop_last=True)
            for data1, data2 in dset:
                assert len(data1) == len(data2) == 2
        finally:
            # Release the checkout even when an assertion above fails.
            co.close()
Beispiel #3
0
    def test_shuffle(self, repo_20_filled_samples):
        """Check that ``shuffle`` controls the order in which samples arrive.

        With ``shuffle=False`` the samples keyed ``'0'``..``'14'`` must arrive
        in key order (the first element of sample ``i`` is expected to equal
        ``i``). With ``shuffle=True`` the same samples must arrive in a
        different order.
        """
        repo = repo_20_filled_samples
        co = repo.checkout()
        try:
            first_aset = co.columns['writtenaset']
            expected_unshuffled_content = list(range(15))

            unshuffled_dataset = make_numpy_dataset(
                (first_aset, ), keys=[str(i) for i in range(15)], shuffle=False)
            received_unshuffled_content = [
                int(data[0][0]) for data in unshuffled_dataset]
            assert expected_unshuffled_content == received_unshuffled_content

            shuffled_dataset = make_numpy_dataset(
                (first_aset, ), keys=[str(i) for i in range(15)], shuffle=True)
            received_shuffled_content = [
                int(data[0][0]) for data in shuffled_dataset]
            # Shuffling may only reorder samples, never drop or alter them.
            assert sorted(received_shuffled_content) == expected_unshuffled_content
            # NOTE: a random shuffle can reproduce the original order, but
            # with 15 samples the chance (1 in 15!) is negligible.
            assert received_shuffled_content != expected_unshuffled_content
        finally:
            # Release the checkout even when an assertion above fails.
            co.close()
Beispiel #4
0
    def test_multiple_dataset_batched_loader(self, repo_20_filled_samples):
        """Check batching and ``drop_last`` handling across two columns.

        Covers four scenarios:

        * ``batch_size=6, drop_last=True``: only full ``(6, 5, 7)`` batches
          are yielded, for 18 samples in total;
        * ``batch_size=1, drop_last=True``: all 20 samples are yielded as
          ``(1, 5, 7)`` batches;
        * ``batch_size=0, drop_last=True``: rejected with ``RuntimeError``;
        * ``batch_size=0``: 20 unbatched ``(5, 7)`` samples are yielded.
        """
        co = repo_20_filled_samples.checkout()
        try:
            first_aset = co.columns['writtenaset']
            second_aset = co.columns['second_aset']
            dset = make_numpy_dataset([first_aset, second_aset],
                                      batch_size=6,
                                      drop_last=True)
            total_samples = 0
            for dset1, dset2 in dset:
                total_samples += dset1.shape[0]
                assert dset1.shape == (6, 5, 7)
                assert dset2.shape == (6, 5, 7)
            # drop_last=True discards the trailing partial batch.
            assert total_samples == 18

            # testing with batch_size = 1
            dset = make_numpy_dataset([first_aset, second_aset],
                                      batch_size=1,
                                      drop_last=True)
            total_samples = 0
            for dset1, dset2 in dset:
                total_samples += dset1.shape[0]
                assert dset1.shape == (1, 5, 7)
                assert dset2.shape == (1, 5, 7)
            # With single-element batches there is never a partial batch,
            # so drop_last=True has no effect.
            assert total_samples == 20

            with pytest.raises(RuntimeError,
                               match="Setting `drop_last` is a no-op when "
                               "batching is not enabled"):
                # Setting drop_last without batching
                make_numpy_dataset([first_aset, second_aset],
                                   batch_size=0,
                                   drop_last=True)

            # Batching disabled: samples are yielded one by one, unbatched.
            dset = make_numpy_dataset([first_aset, second_aset], batch_size=0)
            total_samples = 0
            for dset1, dset2 in dset:
                total_samples += 1
                assert dset1.shape == (5, 7)
                assert dset2.shape == (5, 7)
            assert total_samples == 20
        finally:
            # Release the checkout even when an assertion above fails.
            co.close()
Beispiel #5
0
    def test_collate_fn(self, repo_20_filled_subsamples):
        """Check default batching and a user-supplied ``collate_fn``.

        Each entry of ``keys`` pairs a key for ``col1`` with a key for
        ``col2``. The ``col1`` keys use ``...`` in the subsample position
        (presumably selecting every subsample of that sample; confirm
        against the library documentation), while the ``col2`` keys name a
        single subsample.

        Without a ``collate_fn`` the ``col1`` batch is a tuple of dicts and
        the ``col2`` batch is a stacked ``np.ndarray``. With the custom
        ``collate_fn`` both columns are returned as stacked arrays.
        """
        co = repo_20_filled_subsamples.checkout()
        try:
            col1 = co['writtenaset']
            col2 = co['second_aset']
            keys = (((0, ...), (0, 1)), ((1, ...), (1, 4)))

            dataset = make_numpy_dataset([col1, col2],
                                         keys=keys,
                                         shuffle=False,
                                         batch_size=2)
            col1data, col2data = next(iter(dataset))
            assert isinstance(col1data, tuple)
            assert isinstance(col2data, np.ndarray)
            assert list(col1data[0].keys()) == [1, 2, 3]
            assert list(col1data[1].keys()) == [4, 5, 6]
            assert np.allclose(col2data, np.stack((col2[0][1], col2[1][4])))

            def collate_fn(data_arr):
                """Stack one subsample per ``col1`` element with the ``col2`` data."""
                arr1 = []
                arr2 = []
                for elem in data_arr:
                    # picking one arbitrary subsample (the third key of the dict)
                    k = list(elem[0].keys())[2]
                    data1 = elem[0][k]
                    data2 = elem[1]
                    arr1.append(data1)
                    arr2.append(data2)
                return np.stack(arr1), np.stack(arr2)

            dataset = make_numpy_dataset([col1, col2],
                                         keys=keys,
                                         shuffle=False,
                                         batch_size=2,
                                         collate_fn=collate_fn)
            col1data, col2data = next(iter(dataset))
            # The third subsample keys are 3 and 6 (see the key lists above).
            assert np.allclose(col1data, np.stack((col1[0][3], col1[1][6])))
            assert np.allclose(col2data, np.stack((col2[0][1], col2[1][4])))
        finally:
            # Release the checkout even when an assertion above fails.
            co.close()