Example #1
0
    def test_dataset_download(self, tmp_path, gmb_schema):
        """Test that a ``Dataset`` created in ``DOWNLOAD_ONLY`` mode populates its data directory.

        Also verifies that calling ``download(check=True)`` afterwards raises ``RuntimeError``.
        """

        target_dir = tmp_path / 'gmb'
        dataset = Dataset(gmb_schema, data_dir=target_dir, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)

        # Exactly two entries are expected: 'groningen_meaning_bank_modified' and '.nourish.dataset'
        top_level_entries = list(target_dir.iterdir())
        assert len(top_level_entries) == 2

        expected_names = ['gmb_subset_full.txt', 'LICENSE.txt', 'README.txt']
        extracted_dir = target_dir / 'groningen_meaning_bank_modified'
        assert extracted_dir.is_dir()

        # Same count and every name known: the directory holds the expected files and nothing else
        extracted_names = [entry.name for entry in extracted_dir.iterdir()]
        assert len(extracted_names) == len(expected_names)
        for name in extracted_names:
            assert name in expected_names

        # A checked download on top of a previous download must be refused
        expected_message = (
            'Dataset.download() was previously called. To overwrite existing data files, rerun '
            'Dataset.download() with ``check`` set to ``False``.')
        with pytest.raises(RuntimeError) as exc_info:
            dataset.download(check=True)
        assert str(exc_info.value) == expected_message
Example #2
0
    def test_is_downloaded(self, tmp_path, gmb_schema):
        """Test ``is_downloaded`` using a ``.tar.gz`` archive.

        Covers the state before any download, the state after ``download()``, and a file list that is not valid JSON.
        """

        missing_dir = tmp_path / 'non-existing-dir'
        # Sanity check: the data directory must not exist yet
        assert not missing_dir.exists()

        dataset = Dataset(gmb_schema, data_dir=missing_dir, mode=Dataset.InitializationMode.LAZY)
        assert dataset.is_downloaded() is False

        dataset.download()
        assert dataset.is_downloaded() is True

        # Replace the file list content with text that is not valid JSON
        dataset._file_list_file.write_text("nonsense\n", encoding='utf-8')
        # The exception value is not inspected: only the fact that the file cannot be decoded matters here.
        with pytest.raises(JSONDecodeError):
            dataset.is_downloaded()
Example #3
0
    def test_unloaded_access_to_data(self, tmp_path, gmb_schema):
        """Test that reading ``Dataset.data`` raises ``RuntimeError`` when no data has been loaded."""

        expected_message = (
            'Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
            'data, call Dataset.load() to load data.')
        lazy_dataset = Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.LAZY)

        # Before anything has been downloaded or loaded
        with pytest.raises(RuntimeError) as exc_info:
            lazy_dataset.data
        assert str(exc_info.value) == expected_message

        # After downloading (but still without loading) the very same error is expected
        lazy_dataset.download()
        with pytest.raises(RuntimeError) as exc_info:
            lazy_dataset.data
        assert str(exc_info.value) == expected_message
Example #4
0
    def test_deleting_data_dir(self, tmp_path, gmb_schema):
        """Test ``Dataset.delete()`` both before and after the data directory has been populated."""

        # The tmp_sub_dir fixture is deliberately not used: the data directory must not exist when the test starts.
        target_dir = tmp_path / 'data-dir'
        dataset = Dataset(gmb_schema, data_dir=target_dir, mode=Dataset.InitializationMode.LAZY)

        # Deleting while the directory does not exist must not raise, and must not create it
        assert not target_dir.exists()
        dataset.delete()
        assert not target_dir.exists()

        # Populate the directory and confirm that files are in place
        dataset.download()
        assert dataset.is_downloaded()
        assert os.listdir(target_dir)

        # Deleting now must remove the directory itself
        dataset.delete()
        assert not target_dir.exists()