def _run_weka(name: str, extra_args: List[str], dataset: Bunch,
              log_results: Callable, cache_dir: str) -> None:
    """Run the weka classifier ``name`` on an OpenML dataset and log results.

    Parameters
    ----------
    name : str
        Weka classifier name, appended to ``WEKA_CMD`` and used as the
        ``weka.<name>`` algorithm label in logging.
    extra_args : List[str]
        Additional command-line arguments passed through to weka.
    dataset : Bunch
        OpenML dataset bunch; only ``dataset.details['file_id']`` and
        ``dataset.details['id']`` are read here.
    log_results : Callable
        Callback invoked with keyword arguments (algorithm, runtime_single,
        runtime_cv, n_rules, metrics) on a successful weka run.
    cache_dir : str
        Root of the local scikit-learn/OpenML download cache.

    Raises
    ------
    ValueError
        If the dataset features declare no target column (weka needs one).
    """
    logger.info('weka: prepare dataset')
    arffgz_path = _get_local_path(
        _DATA_FILE.format(dataset.details['file_id']),  # file_id, maybe != ds_id
        data_home=cache_dir + '/openml/')
    with tempfile.TemporaryDirectory() as tmpdirname:
        # symlink under a '*.arff.gz' name, otherwise weka doesn't
        # recognize that the file is gzipped
        arffgz_linked = (tmpdirname + '/'
                         + os.path.basename(arffgz_path) + '.arff.gz')
        os.symlink(arffgz_path, arffgz_linked)

        # collect the 0-based indices of all columns flagged as target
        target_column_ids = [
            int(feature['index'])
            for feature in _get_data_features(dataset.details['id'],
                                              data_home=cache_dir)
            if feature['is_target'] == 'true'
        ]
        if not target_column_ids:
            # fail loudly instead of crashing with IndexError below
            raise ValueError(
                'no target column identified for dataset {}'.format(
                    dataset.details['id']))
        if len(target_column_ids) != 1:
            logger.warning(
                "identified {} target columns [{}], using only {}".format(
                    len(target_column_ids),
                    ','.join(map(str, target_column_ids)),
                    target_column_ids[0]))

        logger.info('run weka.' + name)
        _cmd = WEKA_CMD + [
            name,
            '-t', arffgz_linked,  # "train" ARFF file
            '-c', str(target_column_ids[0] + 1),  # weka uses 1-based columns
            # todo: hardcode metrics
        ] + extra_args
        logger.debug(_cmd)
        weka_process = subprocess.run(_cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      universal_newlines=True)
        logger.info('weka.{} returned with {}'.format(
            name, weka_process.returncode))
        if weka_process.stdout:
            weka_stdout_logger.debug(weka_process.stdout)
        if weka_process.stderr:
            weka_stderr_logger.debug(weka_process.stderr)

        weka_out: str = weka_process.stdout
        if WEKA_REGEX.FULL_MODEL.search(weka_out):
            # full-model marker found -> the run completed successfully
            runtime_single, runtime_cv, jrip_n_rules, metrics = \
                _process_weka_output(weka_out)
            log_results(algorithm='weka.' + name,
                        runtime_single=runtime_single,
                        runtime_cv=runtime_cv,
                        n_rules=jrip_n_rules,
                        metrics=metrics)
        else:
            # bug fix: original logged the literal '{}' (missing .format(name))
            logger.warning('weka.{}: no successful output found'.format(name))
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    """Fetching the same OpenML URL twice yields identical content, with the
    second fetch served from the on-disk cache."""
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))

    # first request populates the cache
    first = _open_openml_url(openml_path, cache_directory)
    # the download must have materialized a file at the cache location
    cached_file = _get_local_path(openml_path, cache_directory)
    assert os.path.isfile(cached_file)

    # second request should be answered from the cache with the same bytes
    second = _open_openml_url(openml_path, cache_directory)
    assert first.read() == second.read()
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    """Cache round-trip: a repeated fetch of an OpenML URL returns the same
    content as the original download.

    NOTE(review): this definition appears to duplicate an earlier
    ``test_open_openml_url_cache`` in the same file; if both live in one
    module, only the later one is collected by pytest — confirm and drop one.
    """
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))

    response1 = _open_openml_url(openml_path, cache_directory)

    # downloading must leave the payload at the expected local path
    assert os.path.isfile(_get_local_path(openml_path, cache_directory))

    # re-request: served from cache, byte-identical to the first response
    response2 = _open_openml_url(openml_path, cache_directory)
    assert response1.read() == response2.read()
def test_open_openml_url_unlinks_local_path(
        monkeypatch, gzip_response, tmpdir, write_to_disk):
    """A failed download must not leave a (possibly partial) file behind:
    ``_open_openml_url`` unlinks the local cache path on error."""
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    cached_path = _get_local_path(openml_path, cache_directory)

    def _failing_urlopen(request):
        # optionally simulate a partial write happening before the failure
        if write_to_disk:
            with open(cached_path, "w") as fh:
                fh.write("")
        raise ValueError("Invalid request")

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen', _failing_urlopen)

    with pytest.raises(ValueError, match="Invalid request"):
        _open_openml_url(openml_path, cache_directory)

    # whatever was written must have been cleaned up after the failure
    assert not os.path.exists(cached_path)
def test_retry_with_clean_cache(tmpdir):
    """``_retry_with_clean_cache`` warns, wipes the stale cache file, and
    retries the wrapped function, which then succeeds."""
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    stale_path = _get_local_path(openml_path, cache_directory)

    # plant an (empty) stale cache file at the expected location
    os.makedirs(os.path.dirname(stale_path))
    with open(stale_path, 'w') as fh:
        fh.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # first invocation fails because the stale file still exists;
        # after the decorator clears the cache, the retry returns 1
        if os.path.exists(stale_path):
            raise Exception("File exist!")
        return 1

    warn_msg = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=warn_msg):
        result = _load_data()
    assert result == 1