Esempio n. 1
0
    def test_custom_relative_data_dir(self, chdir_tmp_path, tmp_sub_dir,
                                      tmp_relative_sub_dir):
        """Check that a relative ``DATADIR`` ends up stored as an absolute path.

        ``init`` is given ``tmp_relative_sub_dir``; the configured ``DATADIR`` must then equal ``tmp_sub_dir`` and
        report itself as absolute.
        """
        init(DATADIR=tmp_relative_sub_dir)

        configured_dir = get_config().DATADIR
        assert configured_dir == tmp_sub_dir
        assert configured_dir.is_absolute()
Esempio n. 2
0
    def test_version_param(self, tmp_path):
        """Verify how ``load_dataset`` handles its ``version`` argument.

        Three situations are covered: a non-``str`` version (``TypeError``), version strings that are not listed for
        the dataset (``KeyError``), and an omitted version, which must load the same data as the highest listed
        version.
        """
        init(DATADIR=tmp_path)
        name = 'gmb'

        # A float is not an acceptable version value
        with pytest.raises(TypeError) as exc_info:
            load_dataset(name, version=1.0)
        assert str(exc_info.value) == 'The version parameter must be supplied a str.'

        # Both an empty string and an unknown version string must be reported the same way
        for version in ('', 'fake_version'):
            with pytest.raises(KeyError) as exc_info:
                load_dataset(name, version=version)
            expected_message = (
                f'\'"{version}" is not a valid Nourish version for the dataset "{name}". '
                'You can view all valid datasets and their versions by running the function '
                'nourish.list_all_datasets().\'')
            assert str(exc_info.value) == expected_message

        # With no version given, the latest listed version is the one that gets loaded
        listed_versions = list_all_datasets()[name]
        ordered_versions = sorted(version_parser(v) for v in listed_versions)
        latest_version = str(ordered_versions[-1])
        assert load_dataset(name) == load_dataset(name, version=latest_version)
Esempio n. 3
0
    def test_load_schemata_manager(self, loaded_schemata_manager,
                                   schemata_file_absolute_dir):
        """Exercise the high-level ``load_schemata_manager`` function.

        First a forced reload is checked against the URLs recorded in ``loaded_schemata_manager``. Then only the
        dataset schemata URL is changed and a non-forced reload is requested: the format and license schemata must
        keep their earlier sources while the dataset schemata reports the new one.
        """
        reference_urls = {
            'DATASET_SCHEMATA_URL': loaded_schemata_manager.dataset_schemata.retrieved_url_or_path,
            'FORMAT_SCHEMATA_URL': loaded_schemata_manager.format_schemata.retrieved_url_or_path,
            'LICENSE_SCHEMATA_URL': loaded_schemata_manager.license_schemata.retrieved_url_or_path,
        }
        init(update_only=False, **reference_urls)
        load_schemata_manager(force_reload=True)
        for kind in ('datasets', 'formats', 'licenses'):
            actual = _get_schemata_manager().schemata[kind].retrieved_url_or_path
            expected = loaded_schemata_manager.schemata[kind].retrieved_url_or_path
            assert actual == expected

        # Different from the previous relative path used in loaded_schemata_manager
        new_dataset_url = schemata_file_absolute_dir / 'datasets.yaml'
        init(update_only=True, DATASET_SCHEMATA_URL=new_dataset_url)
        load_schemata_manager(force_reload=False)
        for kind in ('formats', 'licenses'):
            actual = _get_schemata_manager().schemata[kind].retrieved_url_or_path
            expected = loaded_schemata_manager.schemata[kind].retrieved_url_or_path
            assert actual == expected
        assert _get_schemata_manager().dataset_schemata.retrieved_url_or_path == new_dataset_url
Esempio n. 4
0
    def test_download_true(self, tmp_path, downloaded_gmb_dataset):
        """Check that ``load_dataset(..., download=True)`` yields the same data as the pre-downloaded fixture."""
        init(DATADIR=tmp_path)

        expected_data = downloaded_gmb_dataset.load()
        loaded_data = load_dataset('gmb', version='1.0.2', download=True)

        assert expected_data == loaded_data
Esempio n. 5
0
    def test_loading_undownloaded(self, tmp_path):
        """Check that loading with ``download=False`` into a fresh data directory raises a helpful ``RuntimeError``."""
        init(DATADIR=tmp_path)

        with pytest.raises(RuntimeError) as exc_info:
            load_dataset('wikitext103', version='1.0.1', download=False)

        # The error text must point the user at the ``download=True`` option
        hint = 'Did you forget to download the dataset (by specifying `download=True`)?'
        assert hint in str(exc_info.value)
Esempio n. 6
0
    def test_non_path_data_dir(self):
        """Test that passing a non-path value as ``DATADIR`` raises ``ValidationError``.

        The error text is matched with a regular expression because the whitespace between the message parts spans
        line breaks and indentation.
        """

        with pytest.raises(ValidationError) as e:
            init(DATADIR=10)

        # The dot in ``type_error.path`` is escaped so that it only matches a literal '.', not any character.
        assert re.search((r'1 validation error for Config\s+DATADIR\s+value'
                          r' is not a valid path \(type=type_error\.path\)'),
                         str(e.value))
Esempio n. 7
0
    def test_download_false(self, tmp_path, gmb_schema):
        """Check that ``download=False`` loads a dataset that is already present on disk.

        A ``Dataset`` built in ``DOWNLOAD_AND_LOAD`` mode populates the data directory first; ``load_dataset`` is then
        expected to return the same data without downloading.
        """
        init(DATADIR=tmp_path)

        gmb_dir = tmp_path / 'dax' / 'gmb' / '1.0.2'
        reference = Dataset(gmb_schema,
                            data_dir=gmb_dir,
                            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)

        loaded = load_dataset('gmb', version='1.0.2', download=False)
        assert reference.data == loaded
 def test_secure_connections_succeed_load_schemata_manager(
         self, dataset_schemata_url_or_path):
     """Check that ``load_schemata_manager`` succeeds with TLS verification on for the given source.

     After a forced reload with ``tls_verification=True``, the exported dataset schemata must report the very URL or
     path that was configured.
     """
     # NOTE(review): the values of ``dataset_schemata_url_or_path`` are presumably built with '/' rather than
     # os.path.sep, because URLs only accept / not \ as separators while Windows paths accept both. This is not an
     # issue for the purpose of this test.
     init(update_only=True,
          DATASET_SCHEMATA_URL=dataset_schemata_url_or_path)
     load_schemata_manager(force_reload=True, tls_verification=True)

     retrieved = export_schemata_manager().dataset_schemata.retrieved_url_or_path
     assert retrieved == dataset_schemata_url_or_path
Esempio n. 9
0
    def test_custom_data_dir(self, tmp_path, wikitext103_schema):
        """Check that a custom data directory is kept as a ``pathlib.Path`` by both the config and ``Dataset``.

        ``tmp_path`` is supplied to ``nourish.init`` and also passed to the ``Dataset`` constructor; both places must
        report exactly that path.
        """
        init(DATADIR=tmp_path)

        configured_dir = get_config().DATADIR
        assert configured_dir == tmp_path
        assert isinstance(configured_dir, pathlib.Path)

        # LAZY mode is used so that constructing the dataset triggers neither download nor load
        dataset = Dataset(wikitext103_schema,
                          data_dir=tmp_path,
                          mode=Dataset.InitializationMode.LAZY)
        dataset_dir = dataset._data_dir
        assert dataset_dir == tmp_path
        assert isinstance(dataset_dir, pathlib.Path)
Esempio n. 10
0
    def test_default_dataset_schema_name(self, tmp_path, gmb_schema):
        """Check loading when the dataset schemata has no ``name`` entry.

        The dataset is first downloaded under a ``default`` directory; the ``name`` key is then removed from the loaded
        dataset schemata, and ``load_dataset`` with ``download=False`` must still return the same data.
        """
        init(DATADIR=tmp_path)

        gmb_dir = tmp_path / 'default' / 'gmb' / '1.0.2'
        reference = Dataset(gmb_schema,
                            data_dir=gmb_dir,
                            mode=Dataset.InitializationMode.DOWNLOAD_AND_LOAD)

        # Remove the "name" key
        raw_schemata = _get_schemata_manager().dataset_schemata._schemata
        raw_schemata.pop('name')

        loaded = load_dataset('gmb', version='1.0.2', download=False)
        assert reference.data == loaded
    def test_insecure_connections_load_schemata_manager(
            self, remote_dataset_schemata_url, untrust_self_signed_cert):
        """Check ``load_schemata_manager`` against an insecure remote source.

        With ``tls_verification=True`` the load must raise ``InsecureConnectionError`` naming the URL; with
        ``tls_verification=False`` the same load must succeed and record that URL.
        """
        init(update_only=True,
             DATASET_SCHEMATA_URL=remote_dataset_schemata_url)

        with pytest.raises(InsecureConnectionError) as exc_info:
            load_schemata_manager(force_reload=True, tls_verification=True)
        assert remote_dataset_schemata_url in str(exc_info.value)

        # Insecure load succeeds, no exception raised
        load_schemata_manager(force_reload=True, tls_verification=False)
        retrieved = export_schemata_manager().dataset_schemata.retrieved_url_or_path
        assert retrieved == remote_dataset_schemata_url
Esempio n. 12
0
    def test_name_param(self, tmp_path):
        """Verify how ``load_dataset`` handles its ``name`` argument.

        A non-``str`` name must raise ``TypeError``; an unknown dataset name must raise ``KeyError`` with a message
        pointing to ``nourish.list_all_datasets()``.
        """
        init(DATADIR=tmp_path)

        with pytest.raises(TypeError) as exc_info:
            load_dataset(123)
        assert str(exc_info.value) == 'The name parameter must be supplied a str.'

        name = 'fake_dataset'
        with pytest.raises(KeyError) as exc_info:
            load_dataset(name)
        expected_message = (
            f'\'"{name}" is not a valid Nourish dataset. You can view all valid datasets and their '
            'versions by running the function nourish.list_all_datasets().\'')
        assert str(exc_info.value) == expected_message
Esempio n. 13
0
def nourish_initialization(schemata_file_https_url, schema_localized_url):
    """Set up the common initialization shared by all tests.

    The three schemata URLs are pointed at files under ``schemata_file_https_url`` instead of the library's real
    defaults, which keeps the tests decoupled from the development of the default schemata files. Every dataset schema
    in the loaded schemata is then replaced by the result of ``schema_localized_url(name, version)`` so that download
    URLs are localized.
    """
    schemata_urls = {
        'DATASET_SCHEMATA_URL': f'{schemata_file_https_url}/datasets.yaml',
        'FORMAT_SCHEMATA_URL': f'{schemata_file_https_url}/formats.yaml',
        'LICENSE_SCHEMATA_URL': f'{schemata_file_https_url}/licenses.yaml',
    }
    init(update_only=False, **schemata_urls)

    # Use local dataset locations by default in our tests
    all_datasets = _get_schemata_manager().dataset_schemata._schemata['datasets']
    for dataset_name, dataset_versions in all_datasets.items():
        for dataset_version in dataset_versions:
            # Only existing keys are reassigned, so the mapping does not change size while it is iterated
            dataset_versions[dataset_version] = schema_localized_url(dataset_name, dataset_version)
Esempio n. 14
0
    def test_custom_configs(self):
        """Check that ``init`` resets to defaults and that ``update_only=True`` changes only the given options."""
        # set back everything to default
        init(update_only=False)
        current = dataclasses.asdict(get_config())
        defaults = dataclasses.asdict(Config())
        assert current == defaults

        new_urls = dict(
            DATASET_SCHEMATA_URL='some/local/file',
            FORMAT_SCHEMATA_URL='file://c:/some/other/local/file',
            LICENSE_SCHEMATA_URL='http://some/remote/file',
        )
        init(update_only=True, **new_urls)

        for option, expected in new_urls.items():
            assert getattr(get_config(), option) == expected
        # DATADIR was not among the updated options, so it must still hold its default
        assert get_config().DATADIR == Config.DATADIR
Esempio n. 15
0
    def test_subdatasets_param(self, tmp_path):
        """Verify how ``load_dataset`` handles its ``subdatasets`` argument.

        A non-iterable value must raise ``TypeError``; a list of subdataset names must restrict the keys of the
        returned data to exactly that list.
        """
        init(DATADIR=tmp_path)
        common_kwargs = {'version': '1.0.1', 'download': True}

        with pytest.raises(TypeError) as exc_info:
            load_dataset('wikitext103', subdatasets=123, **common_kwargs)
        assert str(exc_info.value) == '\'int\' object is not iterable'

        subdatasets = ['train']
        loaded = load_dataset('wikitext103', subdatasets=subdatasets, **common_kwargs)
        assert list(loaded.keys()) == subdatasets
Esempio n. 16
0
    def test_export_schemata_manager(self, schemata_file_absolute_dir,
                                     schemata_file_https_url):
        """Exercise the high-level ``export_schemata_manager`` function.

        The exported manager must be a distinct object from the internal one while carrying equal dataset schemata,
        and it must reflect schemata URLs changed through ``init(update_only=True, ...)``.
        """

        def dump_dataset_schemata(manager):
            "Serialize the manager's exported dataset schemata to a canonical JSON string."
            return json.dumps(manager.dataset_schemata.export_schema(),
                              sort_keys=True,
                              indent=2,
                              default=str)

        assert export_schemata_manager() is not _get_schemata_manager()
        # The two returned schemata should equal
        assert dump_dataset_schemata(export_schemata_manager()) == dump_dataset_schemata(_get_schemata_manager())

        # Different from https url used by nourish_initialization autouse fixture
        dataset_url = schemata_file_absolute_dir / 'datasets.yaml'
        license_url = schemata_file_absolute_dir / 'licenses.yaml'
        init(update_only=True,
             DATASET_SCHEMATA_URL=dataset_url,
             LICENSE_SCHEMATA_URL=license_url)

        # The format schemata was not updated, so it must still report the https URL
        expectations = (
            ('format_schemata', f'{schemata_file_https_url}/formats.yaml'),
            ('dataset_schemata', dataset_url),
            ('license_schemata', license_url),
        )
        for attribute, expected in expectations:
            assert getattr(export_schemata_manager(), attribute).retrieved_url_or_path == expected

        new_format_schemata_url = schemata_file_absolute_dir / 'formats.yaml'
        init(update_only=True, FORMAT_SCHEMATA_URL=new_format_schemata_url)
        retrieved = export_schemata_manager().format_schemata.retrieved_url_or_path
        assert retrieved == new_format_schemata_url
Esempio n. 17
0
    def test_custom_symlink_data_dir(self, tmp_symlink_dir):
        """Check that a symlink given as ``DATADIR`` is stored as-is rather than resolved to its target."""
        init(DATADIR=tmp_symlink_dir)

        configured_dir = get_config().DATADIR
        assert configured_dir == tmp_symlink_dir