def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    """Check that _open_openml_url fills the local cache and serves from it.

    The first call must create a file at the expected local path; a second
    call must return byte-identical content (served from the cache).
    """
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    # First request fills the cache. Close the response promptly via the
    # context manager instead of leaking the open file handle.
    with _open_openml_url(openml_path, cache_directory) as response1:
        cached_payload = response1.read()
    # The downloaded data must now exist on disk at the cache location.
    location = _get_local_path(openml_path, cache_directory)
    assert os.path.isfile(location)
    # Second request should be answered from the cache with identical bytes.
    with _open_openml_url(openml_path, cache_directory) as response2:
        assert cached_payload == response2.read()
def test_open_openml_url_unlinks_local_path(monkeypatch, gzip_response, tmpdir,
                                            write_to_disk):
    """A failing download must not leave a (possibly partial) cache file.

    The patched urlopen optionally writes an empty file at the cache
    location before raising, simulating an interrupted transfer; the
    helper is expected to remove it either way.
    """
    data_id = 61
    openml_path = sklearn.datasets._openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)

    def _failing_urlopen(request):
        # Optionally simulate a partially written cache entry, then fail.
        if write_to_disk:
            with open(location, "w") as f:
                f.write("")
        raise ValueError("Invalid request")

    monkeypatch.setattr(sklearn.datasets._openml, 'urlopen', _failing_urlopen)

    with pytest.raises(ValueError, match="Invalid request"):
        _open_openml_url(openml_path, cache_directory)

    # The cache entry must have been unlinked despite the failure.
    assert not os.path.exists(location)
def _test_features_list(data_id):
    # XXX Test is intended to verify/ensure correct decoding behavior
    # Not usable with sparse data or datasets that have columns marked as
    # {row_identifier, ignore}
    def decode_column(data_bunch, col_idx):
        # Map encoded category indices back to their string labels; pass
        # non-nominal columns through untouched.
        col_name = data_bunch.feature_names[col_idx]
        if col_name not in data_bunch.categories:
            # non-nominal attribute
            return data_bunch.data[:, col_idx]
        # XXX: This would be faster with np.take, although it does not
        # handle missing values fast (also not with mode='wrap')
        cat = data_bunch.categories[col_name]
        decoded = [None if is_scalar_nan(idx) else cat[int(idx)]
                   for idx in data_bunch.data[:, col_idx]]
        return np.array(decoded, dtype='O')

    data_bunch = fetch_openml(data_id=data_id, cache=False, target_column=None)

    # also obtain decoded arff
    data_description = _get_data_description_by_id(data_id, None)
    sparse = data_description['format'].lower() == 'sparse_arff'
    if sparse:
        raise ValueError('This test is not intended for sparse data, to keep '
                         'code relatively simple')
    url = _DATA_FILE.format(data_description['file_id'])
    with _open_openml_url(url, data_home=None) as f:
        decoded_lines = (line.decode('utf-8') for line in f)
        data_arff = _arff.load(
            decoded_lines,
            return_type=(_arff.COO if sparse else _arff.DENSE_GEN),
            encode_nominal=False)

    data_downloaded = np.array(list(data_arff['data']), dtype='O')

    # XXX: Test per column, as this makes it easier to avoid problems with
    # missing values
    for col_idx in range(len(data_bunch.feature_names)):
        np.testing.assert_array_equal(data_downloaded[:, col_idx],
                                      decode_column(data_bunch, col_idx))