Example #1
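All of the examples below use Hangar's make_torch_dataset together with PyTorch's DataLoader. Run on their own, the test snippets would need roughly the imports sketched here; make_torch_dataset and Repository are taken from Hangar's public API, while fixtures such as repo_with_20_samples are assumed to come from the test suite's conftest:

import numpy as np
import pytest
import torch
from os import mkdir
from os.path import join as pjoin
from torch.utils.data import DataLoader

from hangar import Repository, make_torch_dataset
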
    def test_multiple_dataset_loader(self, repo_with_20_samples):
        repo = repo_with_20_samples
        co = repo.checkout(write=True)
        second_aset = co.arraysets['second_aset']
        del second_aset['10']
        co.commit('deleting')
        co.close()

        co = repo.checkout()
        first_aset = co.arraysets['writtenaset']
        second_aset = co.arraysets['second_aset']
        with pytest.raises(ValueError):
            # empty list
            make_torch_dataset([])
        with pytest.raises(TypeError):
            # multiple datasets must be passed as a list/tuple
            make_torch_dataset(first_aset, first_aset)

        with pytest.warns(UserWarning, match='Arraysets do not contain equal number of samples'):
            torch_dset = make_torch_dataset([first_aset, second_aset])
        loader = DataLoader(torch_dset, batch_size=6, drop_last=True)
        total_samples = 0
        for dset1, dset2 in loader:
            total_samples += dset1.shape[0]
            assert dset1.shape == (6, 5, 7)
            assert dset2.shape == (6, 5, 7)
        assert total_samples == 18  # drop last is True
        co.close()
Example #2
 def test_field_names(self, repo_with_20_samples):
     repo = repo_with_20_samples
     co = repo.checkout()
     first_aset = co.arraysets['writtenaset']
     second_aset = co.arraysets['second_aset']
     with pytest.raises(ValueError):  # number of dsets and field_names differ
         make_torch_dataset([first_aset, second_aset], field_names=('input',))
     with pytest.raises(TypeError):  # field_names has the wrong type
         make_torch_dataset([first_aset, second_aset],
                            field_names={'input': '', 'target': ''})
     torch_dset = make_torch_dataset([first_aset, second_aset],
                                     field_names=('input', 'target'))
     assert hasattr(torch_dset[1], 'input')
     assert hasattr(torch_dset[1], 'target')
     if torch.__version__ > '1.0.1':
         loader = DataLoader(torch_dset, batch_size=5)
         for sample in loader:
             assert hasattr(sample, 'input')
             assert hasattr(sample, 'target')
     co.close()
Example #3
 def test_dataset_loader_fails_with_write_enabled_checkout(self, repo_with_20_samples):
     repo = repo_with_20_samples
     co = repo.checkout(write=True)
     first_aset = co.arraysets['writtenaset']
     second_aset = co.arraysets['second_aset']
     with pytest.raises(TypeError):
         make_torch_dataset([first_aset, second_aset])
     co.close()
Example #4
 def test_warns_experimental(self, repo_with_20_samples):
     repo = repo_with_20_samples
     co = repo.checkout()
     first_aset = co.arraysets['writtenaset']
     second_aset = co.arraysets['second_aset']
     with pytest.warns(UserWarning, match='Dataloaders are experimental'):
         make_torch_dataset([first_aset, second_aset])
     co.close()
Example #5
    def test_warns_arrayset_sample_size_mismatch(self, repo_with_20_samples):
        repo = repo_with_20_samples
        co = repo.checkout(write=True)
        second_aset = co.arraysets['second_aset']
        del second_aset['10']
        co.commit('deleting')
        co.close()

        co = repo.checkout()
        first_aset = co.arraysets['writtenaset']
        second_aset = co.arraysets['second_aset']
        with pytest.warns(UserWarning, match='Arraysets do not contain equal number of samples'):
            make_torch_dataset([first_aset, second_aset])
        co.close()
Example #6
    def test_with_keys(self, repo_with_20_samples):
        repo = repo_with_20_samples
        co = repo.checkout()
        aset = co.arraysets['writtenaset']

        # with keys
        keys = ['2', '4', '5', '6', '7', '9', '15', '18', '19']
        bad_tensor0 = aset['0']
        bad_tensor1 = aset['1']
        bad_tensor3 = aset['3']
        bad_tensor8 = aset['8']

        torch_dset = make_torch_dataset(aset, keys=keys)
        loader = DataLoader(torch_dset, batch_size=3)
        total_batches = 0
        for batch in loader:
            assert batch[0].size(0) == 3
            total_batches += 1
            for sample in batch:
                assert not np.allclose(sample, bad_tensor0)
                assert not np.allclose(sample, bad_tensor1)
                assert not np.allclose(sample, bad_tensor3)
                assert not np.allclose(sample, bad_tensor8)
        assert total_batches == 3
        co.close()
Example #7
 def test_field_names(self, repo_with_20_samples):
     repo = repo_with_20_samples
     co = repo.checkout()
     first_aset = co.arraysets['writtenaset']
     second_aset = co.arraysets['second_aset']
     with pytest.raises(ValueError):  # number of dsets and field_names differ
         make_torch_dataset([first_aset, second_aset], field_names=('input',))
     with pytest.raises(TypeError):  # field_names has the wrong type
         make_torch_dataset([first_aset, second_aset], field_names={'input': '', 'target': ''})
     torch_dset = make_torch_dataset([first_aset, second_aset], field_names=('input', 'target'))
     assert len(torch_dset) == 20
     loader = DataLoader(torch_dset, batch_size=5)
     for sample in loader:
         assert type(sample).__name__ == 'BatchTuple_input_target'
         assert sample._fields == ('input', 'target')
     co.close()
Example #8
 def test_lots_of_data_with_multiple_backend(self, repo_with_10000_samples):
     repo = repo_with_10000_samples
     co = repo.checkout()
     aset = co.arraysets['aset']
     torch_dset = make_torch_dataset([aset])
     loader = DataLoader(torch_dset, batch_size=1000, drop_last=True)
     for data in loader:
         assert data.aset.shape == (1000, 5, 7)
     co.close()
Example #9
 def test_lots_of_data_with_multiple_backend_multiple_worker_dataloader(self, repo_with_10000_samples):
     repo = repo_with_10000_samples
     co = repo.checkout()
     aset = co.arraysets['aset']
     torch_dset = make_torch_dataset([aset])
     loader = DataLoader(torch_dset, batch_size=1000, drop_last=True, num_workers=2)
     for data in loader:
         assert type(data).__name__ == 'BatchTuple_aset'
         assert data.aset.shape == (1000, 5, 7)
     co.close()
Example #10
 def test_lots_of_data_with_multiple_backend(self, repo_300_filled_samples):
     repo = repo_300_filled_samples
     co = repo.checkout()
     aset = co.columns['aset']
     torch_dset = make_torch_dataset([aset])
     loader = DataLoader(torch_dset, batch_size=10, drop_last=True)
     for data in loader:
         assert type(data).__name__ == 'BatchTuple_aset'
         assert data.aset.shape == (10, 5, 7)
     co.close()
Example #11
 def test_local_without_data_fails_data_unavailable(self, written_two_cmt_server_repo, managed_tmpdir):
     new_tmpdir = pjoin(managed_tmpdir, 'new')
     mkdir(new_tmpdir)
     server, _ = written_two_cmt_server_repo
     repo = Repository(path=new_tmpdir, exists=False)
     repo.clone('name', '[email protected]', server, remove_old=True)
     co = repo.checkout()
     aset = co.arraysets['writtenaset']
     with pytest.raises(FileNotFoundError):
         torch_dset = make_torch_dataset(aset, keys=['1', '2'])
     co.close()
     repo._env._close_environments()
Example #12
 def test_local_without_data_fails_no_common_no_local(
         self, written_two_cmt_server_repo, managed_tmpdir):
     new_tmpdir = pjoin(managed_tmpdir, 'new')
     mkdir(new_tmpdir)
     server, _ = written_two_cmt_server_repo
     repo = Repository(path=new_tmpdir, exists=False)
     repo.clone('name', '[email protected]', server, remove_old=True)
     co = repo.checkout()
     aset = co.columns['writtenaset']
     with pytest.raises(ValueError):
         torch_dset = make_torch_dataset(aset)
     co.close()
     repo._env._close_environments()
Example #13
 def test_two_aset_loader_two_worker_dataloader(self, repo_with_20_samples):
     repo = repo_with_20_samples
     co = repo.checkout()
     first_aset = co.arraysets['writtenaset']
     second_aset = co.arraysets['second_aset']
     torch_dset = make_torch_dataset([first_aset, second_aset])
     loader = DataLoader(torch_dset, batch_size=2, drop_last=True, num_workers=2)
     count = 0
     for asets_batch in loader:
         assert type(asets_batch).__name__ == 'BatchTuple_writtenaset_second_aset'
         assert isinstance(asets_batch, tuple)
         assert len(asets_batch) == 2
         assert asets_batch._fields == ('writtenaset', 'second_aset')
         assert asets_batch.writtenaset.shape == (2, 5, 7)
         assert asets_batch.second_aset.shape == (2, 5, 7)
         assert np.allclose(asets_batch.writtenaset, -asets_batch.second_aset)
         count += 1
     assert count == 10
     co.close()
Example #14
    def test_with_index_range(self, repo_with_20_samples):
        repo = repo_with_20_samples
        co = repo.checkout()
        aset = co.arraysets['writtenaset']

        # samples '0' and '1' fall outside the index range used below
        bad_tensor0 = aset['0']
        bad_tensor1 = aset['1']

        # with index range
        index_range = slice(2, 20)
        torch_dset = make_torch_dataset(aset, index_range=index_range)
        loader = DataLoader(torch_dset, batch_size=3)
        total_batches = 0
        for batch in loader:
            assert batch[0].size(0) == 3
            total_batches += 1
            for sample in batch:
                assert not np.allclose(sample, bad_tensor0)
                assert not np.allclose(sample, bad_tensor1)
        assert total_batches == 6
        co.close()
Example #15
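Unlike the tests above, this last example is an excerpt from a benchmarking script; the 4-space indent suggests it sits inside a main function or __main__ guard that is not shown. A sketch of the imports it would need, assuming standard stdlib and torchvision modules (treating this as the script's exact header is an assumption):

import os
import torch
from argparse import ArgumentParser
from pathlib import Path

from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

from hangar import Repository, make_torch_dataset
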
    parser = ArgumentParser()
    parser.add_argument('--gpus', type=int, default=None)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--max_epochs', type=int, default=1)
    parser.add_argument('--max_elems', type=int, default=60000)
    parser.add_argument('--hangar', action='store_true')
    args = parser.parse_args()

    repo = Repository(path=Path(__file__).parent / "hangar")
    co = repo.checkout()

    if args.hangar:
        dataset = make_torch_dataset(
            [co.columns['digits'], co.columns['label']],
            index_range=slice(0, args.max_elems))
    else:
        dataset = MNIST(os.getcwd(),
                        download=True,
                        transform=transforms.ToTensor())
    print(len(dataset))
    datapoint, label = dataset[0]
    print(type(datapoint), type(label))
    print("making a loader!")
    train_loader = DataLoader(dataset,
                              batch_size=args.batch_size,
                              num_workers=16,
                              shuffle=False)

    # init model
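The excerpt stops right after the init-model comment. Purely to illustrate where the loader plugs in, a continuation could look like the sketch below; the tiny linear classifier, optimizer, and dtype handling are assumptions for illustration, not part of the original script:

    # hypothetical model: flatten each digit image, classify into 10 classes
    model = torch.nn.Sequential(torch.nn.Flatten(),
                                torch.nn.Linear(28 * 28, 10))
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    loss_fn = torch.nn.CrossEntropyLoss()
    for _ in range(args.max_epochs):
        for datapoint, label in train_loader:
            optimizer.zero_grad()
            # CrossEntropyLoss expects float inputs and int64 class targets
            loss = loss_fn(model(datapoint.float()), label.long())
            loss.backward()
            optimizer.step()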