def test_framework_definition_raises_error_if_no_matching_framework():
    res = ns(config=ns(frameworks=ns(definition_file="none")),
             _frameworks={default_tag: ns(present=ns(name="present"))})
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition("present")
    with pytest.raises(ValueError, match=r"Incorrect framework `missing`"):
        res.framework_definition("missing")
Example 2
def oml_config():
    return from_config(
        ns(input_dir="my_input",
           output_dir="my_output",
           user_dir="my_user_dir",
           root_dir="my_root_dir",
           openml=ns(apikey="c1994bdb7ecb3c6f3c8f3b35f4b47f1f",
                     infer_dtypes=False))).config
def test_framework_definition_raises_error_if_the_framework_is_abstract():
    res = ns(config=ns(frameworks=ns(definition_file="none")),
             _frameworks={
                 default_tag: ns(present=ns(name="present", abstract=True))
             })
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    with pytest.raises(
            ValueError,
            match=r"Framework definition `present` is abstract and cannot be run directly"
    ):
        res.framework_definition("present")
def test_framework_definition_lookup_is_case_insensitive(
        frameworks, lookup, expected):
    res = ns(_frameworks={default_tag: frameworks})
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition(lookup) == (frameworks[expected],
                                                frameworks[expected].name)
Example 5
def run(dataset, config):
    log.info("\n**** Random Forest (sklearn %s) ****\n", sklearn.__version__)

    is_classification = config.type == 'classification'

    # Impute any missing data (can test using -t 146606)
    X_train, X_test = impute(dataset.train.X_enc, dataset.test.X_enc)
    y_train, y_test = dataset.train.y, dataset.test.y

    log.info(
        "Running RandomForest with a maximum time of {}s on {} cores.".format(
            config.max_runtime_seconds, config.cores))
    log.warning(
        "We completely ignore the requirement to stay within the time limit.")
    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))

    estimator = RandomForestClassifier if is_classification else RandomForestRegressor
    rfc = estimator(n_jobs=config.cores, **config.framework_params)

    rfc.fit(X_train, y_train)

    predictions = rfc.predict(X_test)
    probabilities = rfc.predict_proba(X_test) if is_classification else None

    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)
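
A minimal sketch of how this `run` function could be exercised on its own; the dataset and config shapes below are assumptions inferred from the attributes the function reads, not output of the benchmark's own loaders:

import numpy as np

# Hypothetical smoke test for run(); field names mirror what the function accesses.
toy_dataset = ns(
    train=ns(X_enc=np.random.rand(20, 4), y=np.random.randint(0, 2, 20)),
    test=ns(X_enc=np.random.rand(5, 4), y=np.random.randint(0, 2, 5)),
)
toy_config = ns(
    type='classification',
    max_runtime_seconds=60,
    cores=1,
    metric='acc',
    framework_params={},  # forwarded verbatim to the sklearn estimator
    output_predictions_file='/tmp/predictions.csv',
)
result = run(toy_dataset, toy_config)
assert result.predictions.shape == (5,)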
Example 6
def file_config():
    return from_config(
        ns(
            input_dir="my_input",
            output_dir="my_output",
            user_dir="my_user_dir",
            root_dir="my_root_dir",
        )
    ).config
Example 7
def test_load_multiclass_task_arff(file_loader):
    ds_def = ns(
        train=os.path.join(res, "iris_train.arff"),
        test=os.path.join(res, "iris_test.arff"),
        target="class"
    )
    ds = file_loader.load(ds_def)
    assert ds.type is DatasetType.multiclass
    _assert_X_y_types(ds.train)
    _assert_data_consistency(ds)
    _assert_data_paths(ds, ds_def)
    _assert_iris_features(ds, ds_def)
Example 8
def test_load_binary_task_arff(file_loader):
    ds_def = ns(
        train=os.path.join(res, "kc2_train.arff"),
        test=os.path.join(res, "kc2_test.arff"),
        target="problems"
    )
    ds = file_loader.load(ds_def)
    assert ds.type is DatasetType.binary
    _assert_X_y_types(ds.train)
    _assert_data_consistency(ds)
    _assert_data_paths(ds, ds_def)
    _assert_kc2_features(ds, ds_def)
Example 9
def run(dataset: Dataset, config: TaskConfig):
    with TmpDir() as tmpdir:
        ds = ns(
            train=ns(
                X_enc=os.path.join(tmpdir, 'train.X_enc'),
                y=os.path.join(tmpdir, 'train.y')
            ),
            test=ns(
                X_enc=os.path.join(tmpdir, 'test.X_enc'),
                y=os.path.join(tmpdir, 'test.y')
            )
        )
        write_csv(dataset.train.X_enc, ds.train.X_enc)
        write_csv(dataset.train.y.reshape(-1, 1), ds.train.y)
        write_csv(dataset.test.X_enc, ds.test.X_enc)
        write_csv(dataset.test.y.reshape(-1, 1), ds.test.y)
        dataset.release()
        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir
        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        output, err = run_cmd('{python} {here}/exec_proc.py'.format(python=PYTHON, here=dir_of(__file__)), _input_str_=params)
        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        def load_data(path):
            return read_csv(path, as_data_frame=False, header=False)

        log.debug("Result from subprocess:\n%s", res)
        save_predictions_to_file(dataset=dataset,
                                 output_file=res.output_file,
                                 probabilities=load_data(res.probabilities) if res.probabilities is not None else None,
                                 predictions=load_data(res.predictions).squeeze(),
                                 truth=load_data(res.truth).squeeze(),
                                 target_is_encoded=res.target_is_encoded)
Example 10
def test_load_regression_task_arff(file_loader):
    ds_def = ns(
        train=os.path.join(res, "cholesterol_train.arff"),
        test=os.path.join(res, "cholesterol_test.arff"),
        target="chol"
    )
    ds = file_loader.load(ds_def)
    assert ds.type is DatasetType.regression
    print(ds.train.X.dtypes)
    _assert_X_y_types(ds.train)
    _assert_data_consistency(ds)
    _assert_data_paths(ds, ds_def)
    _assert_cholesterol_features(ds, ds_def, 'arff')
Example 11
def run_cmd_in_venv(caller_file, cmd, *args, **kwargs):
    params = ns(python_exec='python')
    for k, v in params:
        kk = '_' + k + '_'
        if kk in kwargs:
            params[k] = kwargs[kk]
            del kwargs[kk]

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if os.path.isdir(venv_bin_path):
        py = os.path.join(venv_bin_path, 'python -W ignore')
        pip = os.path.join(venv_bin_path, 'python -m pip')
    else:
        py = f"{params.python_exec} -W ignore"
        pip = f"{params.python_exec} -m pip"

    cmd = cmd.format(py=py, pip=pip)
    return run_cmd(cmd, *args, **kwargs)
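
A possible call site for the helper above; the `_python_exec_` keyword follows the `_<name>_` override convention read from `params`, and the concrete pip command is only illustrative:

# Hypothetical usage: install a framework's requirements through its venv pip
# (falls back to the given interpreter when no venv/bin directory exists).
requirements = os.path.join(dir_of(__file__), 'requirements.txt')
run_cmd_in_venv(__file__, '{pip} install --no-cache-dir -r ' + requirements,
                _python_exec_='python3')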
Example 12
config_user = config_load(os.path.join(args.userdir if args.userdir is not None else config.user_dir, "config.yaml"))
# config listing properties set by command line
config_args = ns.parse(
    {'results.save': args.keep_scores},
    input_dir=args.indir,
    output_dir=args.outdir,
    user_dir=args.userdir,
    root_dir=root_dir,
    script=os.path.basename(__file__),
    run_mode=args.mode,
    parallel_jobs=args.parallel,
    sid=sid,
) + ns.parse(extras)
if args.mode != 'local':
    config_args += ns.parse({'monitoring.frequency_seconds': 0})
config_args = ns({k: v for k, v in config_args if v is not None})
log.debug("Config args: %s.", config_args)
# merging all configuration files
amlb.resources.from_configs(config, config_user, config_args)

try:
    if args.mode == 'local':
        bench = amlb.Benchmark(args.framework, args.benchmark, args.constraint)
    elif args.mode == 'docker':
        bench = amlb.DockerBenchmark(args.framework, args.benchmark, args.constraint)
    elif args.mode == 'singularity':
        bench = amlb.SingularityBenchmark(args.framework, args.benchmark, args.constraint)
    elif args.mode == 'aws':
        bench = amlb.AWSBenchmark(args.framework, args.benchmark, args.constraint)
        # bench = amlb.AWSBenchmark(args.framework, args.benchmark, args.constraint, region=args.region)
    # elif args.mode == "aws-remote":
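
The dotted keys passed to `ns.parse` above appear to be the way nested configuration entries are overridden from the command line; a hedged illustration of the shape this implies (not verified against the `Namespace.parse` implementation):

# If ns.parse expands 'a.b' style keys into nested namespaces, as the usage above
# suggests, the CLI overrides take the same shape as the YAML config they are merged into:
overrides = ns.parse({'results.save': True}, run_mode='local', parallel_jobs=1)
# expected shape: ns(results=ns(save=True), run_mode='local', parallel_jobs=1)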
Example 13
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(
                    PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                    PYTHONPATH=os.pathsep.join([rconfig().root_dir]),
                    AMLB_PATH=os.path.join(rconfig().root_dir, "amlb"),
                ),
            )

        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = (np.load(res[name], allow_pickle=True)
                         if res[name] is not None else None)

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
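
The `process_results` hook receives the result namespace after the prediction arrays have been loaded from disk; a minimal sketch of such a callback, with the post-processing itself being purely illustrative:

# Hypothetical process_results callback: adjust fields of `res` before predictions are saved.
def example_process_results(res):
    if res.predictions is None and res.probabilities is not None:
        # illustrative only: derive hard predictions from the probability matrix;
        # a real integration would map these indices back to the class labels.
        res.predictions = res.probabilities.argmax(axis=1)
    return res

# passed as run_in_venv(..., process_results=example_process_results)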
Example 14
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):

    here = dir_of(caller_file)
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(here, 'venv/bin/python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TmpDir() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_token = str(uuid.uuid1())
        config.result_dir = tmpdir

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd,
                                  *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _env_=dict(PYTHONPATH=os.pathsep.join([
                                      rconfig().root_dir,
                                      os.path.join(rconfig().root_dir, "amlb"),
                                  ])))

        out = io.StringIO(output)
        res = ns()
        for line in out:
            li = line.rstrip()
            if li == config.result_token:
                res = json_loads(out.readline(), as_namespace=True)
                break

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = (np.load(res[name], allow_pickle=True)
                         if res[name] is not None else None)

        log.debug("Result from subprocess:\n%s", res)
        if callable(process_results):
            res = process_results(res)

        save_predictions_to_file(
            dataset=dataset,
            output_file=res.output_file,
            predictions=res.predictions.reshape(-1)
            if res.predictions is not None else None,
            truth=res.truth.reshape(-1) if res.truth is not None else None,
            probabilities=res.probabilities,
            target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration)
import pytest

from amlb.frameworks import default_tag
from amlb.resources import Resources
from amlb.utils import Namespace as ns


@pytest.mark.parametrize("frameworks, lookup, expected", [
    (ns(MixedCase=ns(name="MixedCase")), "MixedCase", "MixedCase"),
    (ns(MixedCase=ns(name="MixedCase")), "mixedcase", "MixedCase"),
    (ns(MixedCase=ns(name="MixedCase")), "MIXEDCASE", "MixedCase"),
    (ns(MixedCase=ns(name="MixedCase")), "mIxEdCasE", "MixedCase"),
])
def test_framework_definition_lookup_is_case_insensitive(
        frameworks, lookup, expected):
    res = ns(_frameworks={default_tag: frameworks})
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition(lookup) == (frameworks[expected],
                                                frameworks[expected].name)


def test_framework_definition_raises_error_if_no_matching_framework():
    res = ns(config=ns(frameworks=ns(definition_file="none")),
             _frameworks={default_tag: ns(present=ns(name="present"))})
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition("present")
    with pytest.raises(ValueError, match=r"Incorrect framework `missing`"):
        res.framework_definition("missing")
Example 16
    return ns(output_file=config.output_predictions_file,
              probabilities=probabilities,
              predictions=predictions,
              truth=y_test,
              target_is_encoded=False)


if __name__ == '__main__':
    params = json_loads(sys.stdin.read(), as_namespace=True)

    def load_data(path):
        return read_csv(path, as_data_frame=False, header=False)

    ds = ns(train=ns(X_enc=load_data(params.dataset.train.X_enc),
                     y=load_data(params.dataset.train.y).squeeze()),
            test=ns(
                X_enc=load_data(params.dataset.test.X_enc),
                y=load_data(params.dataset.test.y).squeeze(),
            ))
    config = params.config
    config.framework_params = ns.dict(config.framework_params)
    result = run(ds, config)

    res = copy.copy(result)
    res.predictions = os.path.join(config.result_dir, 'predictions')
    res.truth = os.path.join(config.result_dir, 'truth')
    write_csv(result.predictions.reshape(-1, 1), res.predictions)
    write_csv(result.truth.reshape(-1, 1), res.truth)
    if result.probabilities is not None:
        res.probabilities = os.path.join(config.result_dir, 'probabilities')
        write_csv(result.probabilities, res.probabilities)
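
    # Hedged completion, not shown above: the parent process in Example 9 scans this
    # script's stdout for config.result_token and then reads one JSON line, so a
    # script of this kind would typically end the handshake with:
    print(config.result_token)
    print(json_dumps(res, style='compact'))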
Example 17
import argparse
import os
import re
import shutil
import sys

# import this as early as possible to prevent other modules from defining the root logger using basicConfig
import amlb.logger

from openml.config import get_cache_directory

import amlb
from amlb.utils import Namespace as ns, config_load, datetime_iso, str2bool, str_sanitize, zip_path
from amlb import log, AutoMLError

default_dirs = ns(input_dir=get_cache_directory(),
                  output_dir="./results",
                  user_dir="~/.config/automlbenchmark",
                  root_dir=os.path.dirname(__file__))

parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
    'framework',
    type=str,
    help="The framework to evaluate as defined by default in resources/frameworks.yaml."
    "\nTo use a labelled framework (i.e. a framework defined in resources/frameworks-{label}.yaml),"
    "\nuse the syntax {framework}:{label}.")
parser.add_argument(
    'benchmark',
    type=str,
    nargs='?',
    default='test',