Beispiel #1
0
def _run_weka(name: str, extra_args: List[str], dataset: Bunch,
              log_results: Callable, cache_dir: str):
    """Run a Weka algorithm on a cached OpenML dataset and log its results.

    The locally cached ARFF file of ``dataset`` is symlinked into a temporary
    directory under a name ending in ``.arff.gz``, Weka is invoked on it as a
    subprocess, and the captured stdout is parsed. If the output contains a
    full model, the parsed numbers are passed to ``log_results``; otherwise
    a warning is logged.

    Parameters
    ----------
    name : str
        Weka class name appended to ``WEKA_CMD``; also used (prefixed with
        ``'weka.'``) as the algorithm label in logs and results.
    extra_args : list of str
        Additional command line arguments appended after the standard ones.
    dataset : Bunch
        Dataset whose ``details`` mapping provides the ``'file_id'`` and
        ``'id'`` entries.
    log_results : callable
        Called with the keyword arguments ``algorithm``, ``runtime_single``,
        ``runtime_cv``, ``n_rules`` and ``metrics`` after a successful run.
    cache_dir : str
        Base directory of the local dataset cache.

    Raises
    ------
    IndexError
        If none of the dataset's features is flagged as target column.
    """
    logger.info('weka: prepare dataset')
    # NOTE(review): the data file is looked up below ``cache_dir + '/openml/'``
    # while the feature list further down uses ``cache_dir`` directly --
    # confirm that both cache locations are intended.
    arffgz_path = _get_local_path(
        _DATA_FILE.format(dataset.details['file_id']),  # maybe != ds_id
        data_home=cache_dir + '/openml/')
    with tempfile.TemporaryDirectory() as tmpdirname:
        # link to a file named '*.arff.gz', otherwise weka doesn't
        # recognize it's gzipped
        arffgz_linked = os.path.join(
            tmpdirname, os.path.basename(arffgz_path) + '.arff.gz')
        os.symlink(arffgz_path, arffgz_linked)

        # get target column
        target_column_ids = [
            int(feature['index'])
            for feature in _get_data_features(dataset.details['id'],
                                              data_home=cache_dir)
            if feature['is_target'] == 'true'
        ]
        if not target_column_ids:
            # Without this guard the lookup below fails with a bare
            # "list index out of range"; the exception type is kept, only
            # the message is made explicit.
            raise IndexError(
                'weka.{}: no target column found for dataset {}'.format(
                    name, dataset.details['id']))
        if len(target_column_ids) > 1:
            logger.warning(
                "identified {} target columns [{}], using only {}".format(
                    len(target_column_ids),
                    ','.join(map(str, target_column_ids)),
                    target_column_ids[0]))

        logger.info('run weka.' + name)
        _cmd = WEKA_CMD + [
            name,
            '-t',
            arffgz_linked,  # "train" ARFF file
            '-c',
            # the feature index is incremented by one for weka's -c option
            str(target_column_ids[0] + 1),
            # todo: hardcode metrics
        ] + extra_args
        logger.debug(_cmd)
        # Run inside the ``with`` block: the symlink only exists as long as
        # the temporary directory does.
        weka_process = subprocess.run(_cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      universal_newlines=True)
    logger.info('weka.{} returned with {}'.format(name,
                                                  weka_process.returncode))
    if weka_process.stdout:
        weka_stdout_logger.debug(weka_process.stdout)
    if weka_process.stderr:
        weka_stderr_logger.debug(weka_process.stderr)

    weka_out: str = weka_process.stdout
    if WEKA_REGEX.FULL_MODEL.search(weka_out):  # if has successful run
        runtime_single, runtime_cv, jrip_n_rules, metrics = \
            _process_weka_output(weka_out)

        log_results(algorithm='weka.' + name,
                    runtime_single=runtime_single,
                    runtime_cv=runtime_cv,
                    n_rules=jrip_n_rules,
                    metrics=metrics)
    else:
        # The placeholder was previously never filled in, so the literal
        # 'weka.{}' ended up in the log.
        logger.warning(
            'weka.{}: no successful output found'.format(name))
Beispiel #2
0
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    """Open the same OpenML data file twice and compare the contents.

    After the first open, a local file must exist at the path reported by
    ``_get_local_path``; the second open must yield identical bytes.
    """
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))

    # The initial request is what populates the cache directory.
    first_response = _open_openml_url(openml_path, cache_directory)
    cached_file = _get_local_path(openml_path, cache_directory)
    assert os.path.isfile(cached_file)

    # Request the same resource again; it is expected to come from the cache.
    second_response = _open_openml_url(openml_path, cache_directory)
    first_payload = first_response.read()
    second_payload = second_response.read()
    assert first_payload == second_payload
Beispiel #3
0
def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
    """Check that two consecutive opens of one OpenML file agree.

    The first open has to leave a file at the local cache path; the second
    open must return a response with the same content as the first.
    """
    data_id = 61
    _monkey_patch_webbased_functions(monkeypatch, data_id, gzip_response)

    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))

    # Opening the URL once fills the cache.
    initial = _open_openml_url(openml_path, cache_directory)
    assert os.path.isfile(_get_local_path(openml_path, cache_directory))

    # Opening it a second time is expected to make use of the cached file.
    repeated = _open_openml_url(openml_path, cache_directory)
    initial_bytes = initial.read()
    repeated_bytes = repeated.read()
    assert initial_bytes == repeated_bytes
Beispiel #4
0
def test_open_openml_url_unlinks_local_path(
        monkeypatch, gzip_response, tmpdir, write_to_disk):
    """A failing download must not leave a file at the local cache path.

    ``urlopen`` is replaced by a stub that always raises ``ValueError`` and,
    when ``write_to_disk`` is set, first writes an empty file to the cache
    location. Either way the location must not exist afterwards.
    """
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)

    def _failing_urlopen(request):
        # Optionally create an empty file at the cache location before
        # failing.
        if write_to_disk:
            with open(location, "w") as partial_file:
                partial_file.write("")
        raise ValueError("Invalid request")

    monkeypatch.setattr(
        sklearn.datasets.openml, 'urlopen', _failing_urlopen)

    with pytest.raises(ValueError, match="Invalid request"):
        _open_openml_url(openml_path, cache_directory)

    still_present = os.path.exists(location)
    assert not still_present
Beispiel #5
0
def test_open_openml_url_unlinks_local_path(
        monkeypatch, gzip_response, tmpdir, write_to_disk):
    """Verify the cache location is absent after ``urlopen`` raises.

    The patched ``urlopen`` raises ``ValueError("Invalid request")``; with
    ``write_to_disk`` enabled it writes an empty file to the cache location
    beforehand. The error must propagate and the file must be gone.
    """
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)

    def _raise_on_open(request):
        # When requested, put an empty file in place before raising.
        if write_to_disk:
            with open(location, "w") as stub:
                stub.write("")
        raise ValueError("Invalid request")

    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen', _raise_on_open)

    with pytest.raises(ValueError, match="Invalid request"):
        _open_openml_url(openml_path, cache_directory)

    leftover = os.path.exists(location)
    assert not leftover
Beispiel #6
0
def test_retry_with_clean_cache(tmpdir):
    """A decorated loader that fails on an existing cache file is retried.

    An empty file is placed at the cache location, so the first call of the
    decorated loader raises. The test expects a ``RuntimeWarning`` about an
    invalid cache and a final return value of ``1``.
    """
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)

    # Create the cache file (and its parent directories) up front.
    os.makedirs(os.path.dirname(location))
    with open(location, 'w') as cache_file:
        cache_file.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # Succeeds only once the cache file is no longer present.
        if not os.path.exists(location):
            return 1
        raise Exception("File exist!")

    with pytest.warns(RuntimeWarning,
                      match="Invalid cache, redownloading file"):
        loaded = _load_data()
    assert loaded == 1
Beispiel #7
0
def test_retry_with_clean_cache(tmpdir):
    """Check the retry decorator's handling of a pre-existing cache file.

    The loader raises while the cache file exists. The decorated call is
    expected to emit a ``RuntimeWarning`` matching "Invalid cache,
    redownloading file" and to return ``1`` in the end.
    """
    data_id = 61
    openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
    location = _get_local_path(openml_path, cache_directory)

    # Seed the cache with an empty file.
    parent_dir = os.path.dirname(location)
    os.makedirs(parent_dir)
    with open(location, 'w') as seed:
        seed.write("")

    @_retry_with_clean_cache(openml_path, cache_directory)
    def _load_data():
        # Raises on the first call because the seeded file is still there.
        file_present = os.path.exists(location)
        if file_present:
            raise Exception("File exist!")
        return 1

    expected_warning = "Invalid cache, redownloading file"
    with pytest.warns(RuntimeWarning, match=expected_warning):
        outcome = _load_data()
    assert outcome == 1