def test_dataset_download(self, tmp_path, gmb_schema):
    "Test Dataset class downloads a dataset properly."
    data_dir = tmp_path / 'gmb'
    gmb_dataset = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    # 'groningen_meaning_bank_modified' and '.nourish.dataset'
    assert len(list(data_dir.iterdir())) == 2
    unarchived_data_dir = data_dir / 'groningen_meaning_bank_modified'
    unarchived_data_dir_files = ['gmb_subset_full.txt', 'LICENSE.txt', 'README.txt']
    assert unarchived_data_dir.is_dir()
    assert len(list(unarchived_data_dir.iterdir())) == len(unarchived_data_dir_files)
    assert all(f.name in unarchived_data_dir_files for f in unarchived_data_dir.iterdir())

    # Force check previously downloaded dataset should error
    with pytest.raises(RuntimeError) as e:
        gmb_dataset.download(check=True)
    assert str(e.value) == (
        'Dataset.download() was previously called. To overwrite existing data files, rerun '
        'Dataset.download() with ``check`` set to ``False``.')

def test_nourish_dir(self, tmp_path, gmb_schema):
    "Test ``Dataset._nourish_dir``."
    # Automatic creation
    nourish_dir = tmp_path / 'data_dir' / '.nourish.dataset'
    dataset = Dataset(gmb_schema, data_dir=tmp_path / 'data_dir', mode=Dataset.InitializationMode.LAZY)
    assert dataset._nourish_dir == nourish_dir

    # Non-directory present
    nourish_dir.rmdir()
    assert nourish_dir.exists() is False
    nourish_dir.touch()
    with pytest.raises(NotADirectoryError) as e:
        dataset._nourish_dir
    assert str(e.value) == f'"{nourish_dir}" exists and is not a directory.'

    # Non-directory parent present
    dataset = Dataset(gmb_schema, data_dir='setup.py', mode=Dataset.InitializationMode.LAZY)
    # These are raised by pathlib.Path.mkdir. Also see https://bugs.python.org/issue42872
    ExceptionClass = FileExistsError if os.name == 'nt' else NotADirectoryError
    with pytest.raises(ExceptionClass) as e:
        dataset._nourish_dir
    # This error message may be generated by pathlib.Path.mkdir() (as in DirectoryLock.lock()). We only make sure
    # the path is in the string.
    # On Windows, backslashes in the error message are doubled:
    # [WinError 183] Cannot create a file when that file already exists: 'D:\\a\\nourish\\nourish\\setup.py'
    assert str(pathlib.Path.cwd() / "setup.py").replace('\\', '\\\\') in str(e.value)

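# For orientation: the on-disk layout the two tests above assume, inferred from
# their assertions (the internal contents of .nourish.dataset are not spelled out
# in this excerpt and are illustrative only):
#
#     data_dir/
#     ├── groningen_meaning_bank_modified/   # extracted data files
#     └── .nourish.dataset/                  # nourish metadata directory (_nourish_dir)
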
def test_zip_extractor(self, dataset_base_url, dataset_dir, gmb_schema, tmp_path):
    "Test _ZipExtractor to make sure zip datasets are properly extracted and verified."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/test.zip'
    fake_schema['sha512sum'] = hashlib.sha512((dataset_dir / 'extractables/test.zip').read_bytes()).hexdigest()
    zip_dataset = Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert zip_dataset.is_downloaded() is True

    # Content of the file list
    with open(zip_dataset._file_list_file, mode='r') as f:
        file_list = json.load(f)

    def test_incorrect_file_list(change: dict):
        "Test a single case where something in the file list is wrong."
        wrong_file_list = copy.deepcopy(file_list)
        wrong_file_list['contents'].update(change)
        with open(zip_dataset._file_list_file, mode='w') as f:
            json.dump(wrong_file_list, f)
        assert zip_dataset.is_downloaded() is False

    # Can't find a file
    test_incorrect_file_list({'non-existing-file': {'isdir': False}})
    # File type incorrect
    test_incorrect_file_list({'test-dir/test.csv': {'isdir': True}})
    # Size incorrect
    changed = copy.deepcopy(file_list['contents']['test-dir/test.txt'])
    changed['size'] += 100
    test_incorrect_file_list({'test-dir/test.txt': changed})

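# For reference, a sketch of the file-list JSON shape exercised above, inferred
# from the keys this test manipulates; the sizes shown are hypothetical:
#
#     {
#         "type": "zip",
#         "contents": {
#             "test-dir/test.txt": {"isdir": false, "size": 1234},
#             "test-dir/test.csv": {"isdir": false, "size": 56}
#         }
#     }
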
def test_compression_extractors(self, compressed_file, dataset_base_url, dataset_dir, gmb_schema, tmp_path):
    "Test compression extractors (gzip, bzip2, and lzma) to make sure datasets are properly extracted and verified."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/' + compressed_file
    compressed_fp = dataset_dir / ('extractables/' + compressed_file)
    fake_schema['sha512sum'] = hashlib.sha512(compressed_fp.read_bytes()).hexdigest()
    dataset = Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert dataset.is_downloaded() is True

    # Content of the file list
    with open(dataset._file_list_file, mode='r') as f:
        file_list = json.load(f)

    def test_incorrect_file_list(change: dict):
        "Test a single case where something in the file list is wrong."
        wrong_file_list = copy.deepcopy(file_list)
        wrong_file_list['contents'].update(change)
        with open(dataset._file_list_file, mode='w') as f:
            json.dump(wrong_file_list, f)
        assert dataset.is_downloaded() is False

    # Can't find the file
    test_incorrect_file_list({'filename': 'non-existing-file'})
    # Size incorrect
    changed = copy.deepcopy(file_list['contents'])
    changed['size'] += 100
    test_incorrect_file_list(changed)

def test_csv_pandas_loader(self, tmp_path, noaa_jfk_schema):
    "Test the basic functioning of CSVPandasLoader."
    dataset = Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    data = dataset.data['jfk_weather_cleaned']
    assert isinstance(data, pd.DataFrame)
    assert data.shape == (75119, 16)
    dataset.delete()

def test_symlink_data_dir(self, tmp_symlink_dir, gmb_schema):
    "Test when ``data_dir`` is a symlink. The symlink should not be resolved."
    dataset = Dataset(gmb_schema, data_dir=tmp_symlink_dir, mode=Dataset.InitializationMode.LAZY)
    assert dataset._data_dir == tmp_symlink_dir

def test_csv_pandas_column_unsupported_data_types(self, tmp_path, noaa_jfk_schema, err_column, other_columns):
    "Test column data types when they are unsupported."
    # Clear columns
    column_dict = noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['columns'] = {}
    # Update column dictionary as specified
    for col in other_columns:
        if col.dtype is not None:
            column_dict[col.name] = col.dtype
    column_dict[err_column.name] = err_column.dtype
    with pytest.raises(ValueError) as e:
        Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    # Pandas is a 3rd-party library. We don't check for the exact wording but only some keywords.
    # Examples:
    #   ValueError: cannot safely convert passed user dtype of int64 for float64 dtyped data in column 1
    #   ValueError: could not convert string to float: '2010-01-01 01:00:00'
    assert 'convert' in str(e.value)
    for t in (err_column.dtype, err_column.check):
        assert re.search(rf"{t}(\d*|ing)\b", str(e.value))  # "ing" is for "str'ing'"

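# For context: the ``err_column``, ``other_columns``, and ``columns`` parameters
# appear to be parametrized fixtures yielding records with ``name``, ``dtype``,
# and ``check`` attributes; ``check`` is matched against pandas' error text above,
# and called as a dtype predicate in test_csv_pandas_column_data_types below.
# A hypothetical shape, for illustration only:
#
#     Column = collections.namedtuple('Column', ['name', 'dtype', 'check'])
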
def test_data_dir(self, tmp_path, gmb_schema):
    "Test ``Dataset._data_dir``."
    # Automatic creation
    dataset = Dataset(gmb_schema, data_dir=tmp_path / 'data_dir', mode=Dataset.InitializationMode.LAZY)
    assert dataset._data_dir == tmp_path / 'data_dir'

    # Non-directory present
    dataset = Dataset(gmb_schema, data_dir='setup.py', mode=Dataset.InitializationMode.LAZY)
    with pytest.raises(NotADirectoryError) as e:
        dataset._data_dir
    assert str(e.value) == f'"{pathlib.Path.cwd() / "setup.py"}" exists and is not a directory.'

def test_relative_data_dir(self, gmb_schema, chdir_tmp_path, tmp_sub_dir, tmp_relative_sub_dir):
    "Test when ``data_dir`` is relative."
    dataset = Dataset(gmb_schema, data_dir=tmp_relative_sub_dir, mode=Dataset.InitializationMode.LAZY)
    assert dataset._data_dir == tmp_sub_dir
    assert dataset._data_dir.is_absolute()

def test_supported_file_extensions(self, dataset_base_url, dataset_dir, extractable, extractable_type, gmb_schema,
                                   tmp_path):
    "Test extract_data_files and verify_data_files to make sure proper extractors are used for various datasets."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/' + extractable
    fake_schema['sha512sum'] = hashlib.sha512((dataset_dir / 'extractables' / extractable).read_bytes()).hexdigest()
    dataset = Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert dataset.is_downloaded() is True
    with open(dataset._file_list_file, mode='r') as f:
        file_list = json.load(f)
    assert file_list['type'] == extractable_type

def test_unloaded_access_to_data(self, tmp_path, gmb_schema):
    "Test access to ``Dataset.data`` when no data has been loaded."
    dataset = Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.LAZY)
    with pytest.raises(RuntimeError) as e:
        dataset.data
    assert str(e.value) == (
        'Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
        'data, call Dataset.load() to load data.')

    # Same after downloading
    dataset.download()
    with pytest.raises(RuntimeError) as e:
        dataset.data
    assert str(e.value) == (
        'Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
        'data, call Dataset.load() to load data.')

def test_invalid_sha512(self, tmp_path, gmb_schema):
    "Test if Dataset class catches an invalid hash."
    gmb_schema['sha512sum'] = 'invalid hash example'
    with pytest.raises(IOError) as e:
        Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert 'the file may by corrupted' in str(e.value)

def test_download_false(self, tmp_path, gmb_schema):
    "Test that load_dataset loads properly when download=False and the dataset was previously downloaded."
    init(DATADIR=tmp_path)
    data_dir = tmp_path / 'dax' / 'gmb' / '1.0.2'
    gmb = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    gmb_data = load_dataset('gmb', version='1.0.2', download=False)
    assert gmb.data == gmb_data

def test_csv_pandas_no_delimiter(self, tmp_path, noaa_jfk_schema):
    "Test when no delimiter is given."
    # Remove the delimiter option
    del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['delimiter']
    data = Dataset(noaa_jfk_schema, tmp_path,
                   mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD).data['jfk_weather_cleaned']
    assert len(data.columns) == 16  # the number of columns remains the same

def test_cache_dir_is_not_a_dir(self, tmp_path, gmb_schema):
    "Test when ``nourish_dir`` (i.e., ``data_dir/.nourish.dataset``) exists and is not a dir."
    (tmp_path / '.nourish.dataset').touch()  # Occupy this path with a regular file
    with pytest.raises(NotADirectoryError) as e:
        Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert str(e.value) == f"\"{tmp_path / '.nourish.dataset'}\" exists and is not a directory."

def test_csv_pandas_loader_non_option(self, tmp_path, noaa_jfk_schema):
    "Test CSVPandasLoader when no options are given."
    del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']
    dataset = Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    data = dataset.data['jfk_weather_cleaned']
    assert isinstance(data, pd.DataFrame)
    assert len(data) == 75119

def test_custom_data_dir(self, tmp_path, wikitext103_schema):
    "Test to make sure Dataset constructor uses the new global data dir if one was supplied earlier to nourish.init."
    init(DATADIR=tmp_path)
    assert get_config().DATADIR == tmp_path
    assert isinstance(get_config().DATADIR, pathlib.Path)
    wikitext = Dataset(wikitext103_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.LAZY)
    assert wikitext._data_dir == tmp_path
    assert isinstance(wikitext._data_dir, pathlib.Path)

def test_default_dataset_schema_name(self, tmp_path, gmb_schema):
    "Test the default schemata name."
    init(DATADIR=tmp_path)
    data_dir = tmp_path / 'default' / 'gmb' / '1.0.2'
    gmb = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    _get_schemata_manager().dataset_schemata._schemata.pop('name')  # Remove the "name" key
    gmb_data = load_dataset('gmb', version='1.0.2', download=False)
    assert gmb.data == gmb_data

def test_loading_undownloaded(self, tmp_path, gmb_schema):
    "Test loading before ``Dataset.download()`` has been called."
    dataset = Dataset(gmb_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.LAZY)
    with pytest.raises(FileNotFoundError) as e:
        dataset.load(check=False)
    assert ('Failed to load subdataset "gmb_subset_full" because some files are not found. '
            'Did you forget to call Dataset.download()?\nCaused by:\n') in str(e.value)
    # Half-loaded data objects should get reset to None
    assert dataset._data is None
    with pytest.raises(RuntimeError) as e:
        dataset.data
    assert str(e.value) == (
        'Data has not been downloaded and/or loaded yet. Call Dataset.download() to download '
        'data, call Dataset.load() to load data.')

    # Force-checking an undownloaded dataset should error
    with pytest.raises(RuntimeError) as e:
        dataset.load(check=True)
    assert str(e.value) == (
        f'Downloaded data files are not present in {dataset._data_dir} or are corrupted.')

def test_csv_pandas_header(self, tmp_path, noaa_jfk_schema):
    "Test CSVPandasLoader header options."
    noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = True
    noaa_dataset = Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    # Pandas should error from trying to read a string as another dtype
    with pytest.raises(ValueError) as exinfo:
        noaa_dataset.load()
    assert 'could not convert string to float' in str(exinfo.value)
    noaa_dataset.delete()

    false_test_cases = [False, '', None]  # These should all be treated as False
    for case in false_test_cases:
        noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header'] = case
        self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)

    del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['no_header']
    self.test_csv_pandas_loader(tmp_path, noaa_jfk_schema)

def test_csv_pandas_delimiter(self, tmp_path, noaa_jfk_schema, delimiter):
    "Test common delimiter settings. Note that the case of comma has been tested in ``test_csv_pandas_loader``."
    del noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['columns']
    # Change the delimiter to tab, |, ;, or space
    noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['delimiter'] = delimiter
    data = Dataset(noaa_jfk_schema, tmp_path,
                   mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD).data['jfk_weather_cleaned']
    # None of these delimiters exist in the file, so the number of columns should be 1.
    assert len(data.columns) == 1

def test_deleting_data_dir(self, tmp_path, gmb_schema):
    "Test ``Dataset.delete()``."
    # Note we don't use the tmp_sub_dir fixture because we want data_dir to be non-existing at the beginning of
    # the test.
    data_dir = tmp_path / 'data-dir'
    dataset = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.LAZY)
    assert not data_dir.exists()  # sanity check: data_dir doesn't exist
    dataset.delete()  # no exception should be raised here
    assert not data_dir.exists()  # sanity check: data_dir still doesn't exist
    dataset.download()
    # Sanity check: files are in place
    assert dataset.is_downloaded()
    assert len(os.listdir(data_dir)) > 0
    # Delete the dir
    dataset.delete()
    assert not data_dir.exists()

def test_zerobyte_files(self, dataset_base_url, dataset_dir, gmb_schema, tmp_path, zerobyte_file):
    "Test compression extractors to make sure they handle zero-byte files."
    fake_schema = gmb_schema
    fake_schema['download_url'] = dataset_base_url + '/extractables/' + zerobyte_file
    zerobyte_fp = dataset_dir / ('extractables/' + zerobyte_file)
    fake_schema['sha512sum'] = hashlib.sha512(zerobyte_fp.read_bytes()).hexdigest()
    with pytest.raises(OSError) as e:
        Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert str(e.value) == 'The extracted file test-zerobyte.csv is empty.'

def test_unsupported_file_extensions(self, tmp_path, gmb_schema, schemata_file_https_url,
                                     schemata_file_relative_dir):
    "Test if Dataset class catches an unsupported filetype (flat files like ``.yaml`` currently unsupported)."
    fake_schema = gmb_schema
    fake_schema['download_url'] = schemata_file_https_url + '/datasets.yaml'
    fake_schema['sha512sum'] = hashlib.sha512(
        (schemata_file_relative_dir / 'datasets.yaml').read_bytes()).hexdigest()
    with pytest.raises(RuntimeError) as e:
        Dataset(fake_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    assert str(e.value) == 'Filetype not (yet) supported'

def test_download_data_dir_is_not_a_dir(self, gmb_schema):
    "Test downloading when ``data_dir`` exists and is not a dir."
    # These are raised by pathlib.Path.mkdir. Also see https://bugs.python.org/issue42872
    ExceptionClass = FileExistsError if os.name == 'nt' else NotADirectoryError
    with pytest.raises(ExceptionClass) as e:
        Dataset(gmb_schema, data_dir='./setup.py', mode=Dataset.InitializationMode.DOWNLOAD_ONLY)
    # This error message may be generated by pathlib.Path.mkdir() (as in DirectoryLock.lock()). We only make sure
    # the path is in the string.
    # On Windows, backslashes in the error message are doubled:
    # [WinError 183] Cannot create a file when that file already exists: 'D:\\a\\nourish\\nourish\\setup.py'
    assert str(pathlib.Path.cwd() / "setup.py").replace('\\', '\\\\') in str(e.value)

def test_is_downloaded(self, tmp_path, gmb_schema):
    "Test is_downloaded method using a ``.tar.gz`` archive."
    data_dir = tmp_path / 'non-existing-dir'
    assert not data_dir.exists()  # Sanity check: data_dir must not exist
    gmb = Dataset(gmb_schema, data_dir=data_dir, mode=Dataset.InitializationMode.LAZY)
    assert gmb.is_downloaded() is False
    gmb.download()
    assert gmb.is_downloaded() is True

    # JSON decoding error
    gmb._file_list_file.write_text("nonsense\n", encoding='utf-8')
    with pytest.raises(JSONDecodeError):
        # We don't check the value of the exception because we are only interested in ensuring that the file
        # isn't decodable
        gmb.is_downloaded()

def test_constructor_download_and_load(self, tmp_path, wikitext103_schema):
    "Test the full power of Dataset.__init__() (mode being ``InitializationMode.DOWNLOAD_AND_LOAD``)."
    dataset = Dataset(wikitext103_schema, data_dir=tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    assert hashlib.sha512(dataset.data['train'].encode()).hexdigest() == (
        'df7615f77cb9dd19975881f271e3e3525bee38c08a67fea36a51c96be69a3ecabc9e05c02cbaf'
        '6fc63a0082efb44156f61c81061d3b0272bbccd7657c682e791')
    assert hashlib.sha512(dataset.data['valid'].encode()).hexdigest() == (
        'e4834d365d5f8313503895fd8304d29a566ff4a2df77efb32457fdc353304fb61460511f89bb9'
        '0f14a47132c1539aaa324d3e71f5f56045a61a7292ad25a3c02')
    assert hashlib.sha512(dataset.data['test'].encode()).hexdigest() == (
        '6fe665d33c0f788eba76da50539f0ca02432c70c94b788a493da491215e86043fc732dbeef9bb'
        '49a72341c7283ea55f59d10941ac41f7ac58aea3bdcd72f5cd8')

def test_csv_pandas_column_data_types(self, tmp_path, noaa_jfk_schema, columns):
    "Test the column data types."
    assert len(columns) > 0  # Sanity check: make sure columns are there
    # Clear columns
    column_dict = noaa_jfk_schema['subdatasets']['jfk_weather_cleaned']['format']['options']['columns'] = {}
    # Update column dictionary as specified
    for col in columns:
        if col.dtype is not None:
            column_dict[col.name] = col.dtype
    dataset = Dataset(noaa_jfk_schema, tmp_path, mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)
    data = dataset.data['jfk_weather_cleaned']
    for col in columns:
        assert col.check(data.dtypes[col.name])

def test_mode(self, tmp_path, gmb_schema):
    "Test if Dataset class catches an invalid mode."
    with pytest.raises(ValueError) as e:
        Dataset(gmb_schema, data_dir=tmp_path, mode='DOWNLOAD_ONLY')
    assert str(e.value) == 'DOWNLOAD_ONLY not a valid mode'

# NOTE: assumed to be a pytest fixture based on its generator form and usage-style
# naming; the decorator does not appear in this excerpt.
@pytest.fixture
def downloaded_noaa_jfk_dataset(noaa_jfk_schema) -> Dataset:
    with TemporaryDirectory() as tmp_data_dir:
        yield Dataset(noaa_jfk_schema, data_dir=tmp_data_dir, mode=Dataset.InitializationMode.DOWNLOAD_ONLY)

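# A minimal sketch of a test consuming the fixture above (hypothetical test,
# relying only on behavior demonstrated elsewhere in this suite: DOWNLOAD_ONLY
# mode leaves the dataset downloaded but not loaded):
def test_downloaded_noaa_jfk_dataset_sketch(downloaded_noaa_jfk_dataset):
    "Hypothetical example: the fixture should yield an already-downloaded dataset."
    assert downloaded_noaa_jfk_dataset.is_downloaded() is True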