def test_dataset_download(self, tmp_path, schema, request):
    """Check that a ``DOWNLOAD_ONLY`` dataset lands on disk and rejects a checked re-download.

    ``schema`` holds the *name* of a fixture; the fixture value itself is looked up through
    ``request.getfixturevalue``.
    """
    download_root = tmp_path / 'gmb'
    dataset = Dataset(request.getfixturevalue(schema),
                      data_dir=download_root,
                      mode=Dataset.InitializationMode.DOWNLOAD_ONLY)

    # Exactly two top-level entries are expected: 'groningen_meaning_bank_modified' and '.pydax.dataset'
    top_level_entries = list(download_root.iterdir())
    assert len(top_level_entries) == 2

    extracted_dir = download_root / 'groningen_meaning_bank_modified'
    expected_names = ['gmb_subset_full.txt', 'LICENSE.txt', 'README.txt']
    assert extracted_dir.is_dir()

    # Same count and every name known => the directory holds exactly the expected files.
    assert len(list(extracted_dir.iterdir())) == len(expected_names)
    for entry in extracted_dir.iterdir():
        assert entry.name in expected_names

    # Downloading again with ``check=True`` must fail because the data is already in place.
    expected_message = ('Dataset.download() was previously called. To overwrite existing data files, rerun '
                        'Dataset.download() with ``check`` set to ``False``.')
    with pytest.raises(RuntimeError) as exc_info:
        dataset.download(check=True)
    assert str(exc_info.value) == expected_message
def test_is_downloaded(self, tmp_path, gmb_schema):
    """Check ``Dataset.is_downloaded()`` before a download, after it, and against tampered file lists."""
    missing_dir = tmp_path / 'non-existing-dir'
    assert not missing_dir.exists()  # Sanity check: the directory must not exist yet

    gmb = Dataset(gmb_schema, data_dir=missing_dir, mode=Dataset.InitializationMode.LAZY)
    assert gmb.is_downloaded() is False
    gmb.download()
    assert gmb.is_downloaded() is True

    # Keep the pristine file list in memory; every tampered variant below is derived from it.
    with open(gmb._file_list_file, mode='r') as list_file:
        file_list = json.load(list_file)

    def assert_rejected(change: dict):
        """Overlay ``change`` on the pristine file list, write it back, and expect a negative answer."""
        tampered = copy.deepcopy(file_list)
        tampered.update(change)
        with open(gmb._file_list_file, mode='w') as list_file:
            json.dump(tampered, list_file)
        assert gmb.is_downloaded() is False

    readme_key = 'groningen_meaning_bank_modified/README.txt'

    # (path, recorded tar type) pairs: the first names a file that does not exist on disk, the
    # remaining ones record a type that disagrees with what is on disk.
    wrong_type_cases = [
        ('non-existing-file', tarfile.REGTYPE),
        ('groningen_meaning_bank_modified', tarfile.REGTYPE),
        ('groningen_meaning_bank_modified/LICENSE.txt', tarfile.DIRTYPE),
        (readme_key, tarfile.SYMTYPE),
    ]
    for path, tar_type in wrong_type_cases:
        assert_rejected({path: {'type': int(tar_type)}})

    # Recorded size disagrees with the file on disk
    readme_entry = file_list[readme_key]
    assert_rejected({readme_key: {**readme_entry, 'size': readme_entry['size'] + 100}})

    # A file list that is not valid JSON must surface the decoding error
    gmb._file_list_file.write_text("nonsense\n", encoding='utf-8')
    with pytest.raises(JSONDecodeError):
        # The exception value is not inspected: we are only interested in ensuring that the file
        # isn't decodable.
        gmb.is_downloaded()
def test_unloaded_access_to_data(self, tmp_path, gmb_schema):
    """Check that reading ``Dataset.data`` raises until data is loaded, even once it is downloaded."""
    expected_message = ('Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
                        'data, call Dataset.load() to load data.')
    lazy_dataset = Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.LAZY)

    def assert_data_unavailable():
        """Touch ``data`` and verify the resulting ``RuntimeError`` message."""
        with pytest.raises(RuntimeError) as exc_info:
            lazy_dataset.data  # bare attribute access is the operation under test
        assert str(exc_info.value) == expected_message

    # Nothing downloaded, nothing loaded
    assert_data_unavailable()

    # Downloading alone does not load anything, so the same error is expected
    lazy_dataset.download()
    assert_data_unavailable()
def test_deleting_data_dir(self, tmp_path, gmb_schema):
    """Exercise ``Dataset.delete()`` both before anything exists and after a download."""
    # The tmp_sub_dir fixture is deliberately not used: the data directory has to be absent when
    # the test starts.
    target_dir = tmp_path / 'data-dir'
    dataset = Dataset(gmb_schema, data_dir=target_dir, mode=Dataset.InitializationMode.LAZY)

    # Deleting when there is nothing on disk must not raise and must not create the directory.
    assert not target_dir.exists()
    dataset.delete()
    assert not target_dir.exists()

    # Sanity check after downloading: the files are in place
    dataset.download()
    assert dataset.is_downloaded()
    assert os.listdir(target_dir)

    # Deleting now removes the whole directory
    dataset.delete()
    assert not target_dir.exists()