Example #1
def load_oml_benchmark(
        benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    path = None  # benchmark file does not exist on disk
    name = benchmark  # the name is later passed as CLI input again for containers, so it must remain parsable
    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # We first have to retrieve the task because we don't know the dataset id
        t = openml.tasks.get_task(oml_id, download_data=False)
        data = openml.datasets.get_dataset(t.dataset_id, download_data=False)
        tasks = [
            Namespace(name=str_sanitize(data.name),
                      description=data.description,
                      openml_task_id=t.id)
        ]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)

        # Here we know the (task, dataset) pairs, so downloading only the dataset meta-data is sufficient
        tasks = []
        for tid, did in zip(suite.tasks, suite.data):
            data = openml.datasets.get_dataset(did, download_data=False)
            tasks.append(
                Namespace(name=str_sanitize(data.name),
                          description=data.description,
                          openml_task_id=tid))
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
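A quick usage sketch (the task identifier is hypothetical; assumes the openml package is installed and the OpenML server is reachable):

# Hypothetical usage; 'openml/t/59' is an illustrative task identifier.
name, path, tasks = load_oml_benchmark("openml/t/59")
assert path is None                    # OpenML benchmarks are not backed by a file on disk
print(name, [t.name for t in tasks])   # the original spec string and the sanitized dataset name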
def test_framework_definition_lookup_is_case_insensitive(
        frameworks, lookup, expected):
    res = NS(_frameworks=frameworks)
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition(lookup) == (frameworks[expected],
                                                frameworks[expected].name)
def test_find_all_parents_returns_empty_list_if_framework_has_no_parent():
    frameworks = Namespace(
        gama=Namespace(name="gama", version="latest", description="flexible automl"),
        h2o_automl=Namespace(name="h2o", version="1.3"),
    )
    parents = _find_all_parents(frameworks.gama, frameworks)
    assert parents == []
def test_update_frameworks_with_parent_definitions_does_not_overwrite_child(field, p_value, c_value):
    frameworks = Namespace(
        gama=Namespace(name="gama", **{field: p_value}),
        gama_old=Namespace(name="gama_20.1", **{field: c_value}, extends="gama"),
    )
    _update_frameworks_with_parent_definitions(frameworks)
    assert frameworks.gama_old[field] == c_value
def test_sanitize_and_add_defaults_child_inherits_module(simple_resource):
    frameworks = Namespace(
        auto_sklearn=Namespace(),
        auto_sklearn_old=Namespace(extends="auto_sklearn")
    )
    _sanitize_and_add_defaults(frameworks, simple_resource.config)
    assert frameworks.auto_sklearn_old.module == "frameworks.auto_sklearn"
def test_update_frameworks_with_parent_definitions_add_missing_field_from_parent(field, value):
    frameworks = Namespace(
        gama=Namespace(name="gama", **{field: value}),
        gama_old=Namespace(name="gama_20.1.0", extends="gama"),
    )
    _update_frameworks_with_parent_definitions(frameworks)
    assert frameworks.gama_old[field] == value
Example #7
def call_run(run_fn):
    import numpy as np

    params = NS.from_dict(json.loads(sys.stdin.read()))

    def load_data(name, path, **ignored):
        if isinstance(path, str) and data_keys.match(name):
            return name, np.load(path, allow_pickle=True)
        return name, path

    print(params.dataset)
    ds = NS.walk(params.dataset, load_data)

    config = params.config
    config.framework_params = NS.dict(config.framework_params)

    try:
        result = run_fn(ds, config)
        res = dict(result)
        # Persist arrays as .npy files and replace them by their paths in the result
        for name in ['predictions', 'truth', 'probabilities']:
            arr = result[name]
            if arr is not None:
                res[name] = os.path.join(config.result_dir, '.'.join([name, 'npy']))
                np.save(res[name], arr, allow_pickle=True)
    except Exception as e:
        log.exception(e)
        res = dict(
            error_message=str(e),
            models_count=0
        )

    print(config.result_token)
    print(json.dumps(res, separators=(',', ':')))
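For context: call_run reads a JSON payload from stdin, rebuilds numpy arrays from the .npy paths it references, and delegates to run_fn. A minimal run_fn it could drive might look like the sketch below; the names and the dataset layout (test.y) are illustrative assumptions, not the benchmark's real framework integration.

def run(dataset, config):
    # `dataset` and `config` are the namespaces deserialized by call_run.
    import numpy as np
    truth = dataset.test.y          # assumes the payload exposes test labels under test.y
    predictions = np.copy(truth)    # placeholder "model": echo the truth back
    return dict(
        predictions=predictions,
        truth=truth,
        probabilities=None,         # call_run skips None arrays when saving
        models_count=1,
    )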
def test_update_frameworks_with_parent_definitions_parent_overwrites_grandparent_yaml(field, g_value, p_value):
    frameworks = Namespace(
        gama=Namespace(name="gama", **{field: g_value}),
        gama_old=Namespace(name="gama_2", **{field: p_value}, extends="gama"),
        gama_oldest=Namespace(name="gama_1", extends="gama_old"),
    )
    _update_frameworks_with_parent_definitions(frameworks)
    assert frameworks.gama_oldest[field] == p_value
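The tests above pin down the merge semantics: missing fields are inherited, fields the child defines win, and the nearest ancestor takes precedence. A hypothetical reconstruction consistent with those tests (not necessarily the actual implementation; assumes Namespace iterates as key/value pairs and supports item assignment, as other examples in this listing suggest):

import copy

def _update_frameworks_with_parent_definitions_sketch(frameworks):
    # Copy each missing field from the closest ancestor defining it,
    # never overwriting a value the child already sets.
    for _, framework in frameworks:
        for parent in _find_all_parents(framework, frameworks):
            for field, value in parent:
                if field not in framework:
                    framework[field] = copy.deepcopy(value)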
def test_remove_frameworks_with_unknown_parent_keeps_children_with_known_parents():
    f = Namespace(
        dummy=Namespace(name="dummy", extends="does_exist"),
        does_exist=Namespace(name="does_exist"),
    )
    _remove_frameworks_with_unknown_parent(f)
    assert "dummy" in f
    assert "does_exist" in f
def test_framework_definition_raises_error_if_no_matching_framework():
    res = NS(config=NS(frameworks=NS(definition_file="none")),
             _frameworks=NS(present=NS(name="present")))
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition("present")
    with pytest.raises(ValueError, match=r"Incorrect framework `missing`"):
        res.framework_definition("missing")
def test_find_all_parents_returns_parent_of_framework_with_single_parent(framework):
    frameworks = Namespace(
        gama=Namespace(name="gama", version="latest", description="flexible automl"),
        gama_old=Namespace(name="gama_20.1.0", version="20.1.0", extends="gama"),
        h2o_automl=Namespace(name="h2o", version="latest"),
        h2o_automl_old=Namespace(name="h2o_1.2", version="1.2", extends="h2o_automl"),
    )
    parents = _find_all_parents(frameworks[f"{framework}_old"], frameworks)
    assert parents == [frameworks[framework]]
def test_setup_args_set_to_version_if_undefined():
    f_my_version = Namespace(version="my_version")
    f_my_other_version = Namespace(version="my_other_version")

    _add_default_setup_args(f_my_version)
    _add_default_setup_args(f_my_other_version)

    assert f_my_version.setup_args == ["my_version"]
    assert f_my_other_version.setup_args == ["my_other_version"]
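Together with the setup_args tests further down (kept as-is if defined, version plus repo otherwise), this behavior suggests a defaulting rule along the lines of the sketch below, inferred from the tests rather than the verified source:

def _add_default_setup_args_sketch(framework):
    if "setup_args" in framework:
        framework.setup_args = [framework.setup_args]  # normalize a scalar to a list
    elif "repo" in framework:
        framework.setup_args = [framework.version, framework.repo]
    else:
        framework.setup_args = [framework.version]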
Example #13
def simple_resource():
    return Resources(
        Namespace(input_dir="my_input",
                  output_dir="my_output",
                  user_dir="my_user_dir",
                  root_dir="my_root_dir",
                  docker=Namespace(image_defaults=Namespace(
                      author="author",
                      image=None,
                      tag=None,
                  )),
                  frameworks=Namespace(root_module="frameworks",
                                       definition_file=[])))
def _add_default_image(framework: Namespace, config: Namespace, props: Optional[List[str]] = None):
    if "image" not in framework:
        framework.image = copy.deepcopy(config.docker.image_defaults)
    else:
        framework.image = Namespace.merge(config.docker.image_defaults, framework.image)

    if framework.image.tag is None and (not props or 'tag' in props):
        framework.image.tag = framework.version.lower()

    if framework.image.image is None and (not props or 'image' in props):
        framework.image.image = framework.name.lower()

    if framework.image.author is None and (not props or 'author' in props):
        framework.image.author = ""
Example #15
def _add_default_image(framework: Namespace, config: Namespace):
    if "image" not in framework:
        framework.image = copy.deepcopy(config.docker.image_defaults)
    else:
        framework.image = Namespace.merge(config.docker.image_defaults, framework.image)

    if framework.image.tag is None:
        framework.image.tag = framework.version.lower()

    if framework.image.image is None:
        framework.image.image = framework.name.lower()

    if framework.image.author is None:
        framework.image.author = ""
def _load_and_merge_framework_definitions(frameworks_file: Union[str, List[str]], config) -> Namespace:
    """ Load and merge the framework file(s), does not allow duplicate definitions. """
    log.info("Loading frameworks definitions from %s.", frameworks_file)
    if not isinstance(frameworks_file, list):
        frameworks_file = [frameworks_file]

    definitions_by_tag = Namespace()
    for tag in [default_tag]+config.frameworks.tags:
        definitions_by_file = [config_load(_definition_file(file, tag)) for file in frameworks_file]
        if not config.frameworks.allow_duplicates:
            for d1, d2 in itertools.combinations([set(dir(d)) for d in definitions_by_file], 2):
                if d1.intersection(d2) != set():
                    raise ValueError(f"Duplicate entry '{d1.intersection(d2).pop()}' found.")
        definitions_by_tag[tag] = Namespace.merge(*definitions_by_file)

    return definitions_by_tag
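The duplicate check compares the top-level keys of every pair of definition files; a standalone illustration of the same pattern (the key sets are illustrative):

import itertools

keys_per_file = [{"gama", "h2o_automl"}, {"tpot"}, {"h2o_automl"}]
for d1, d2 in itertools.combinations(keys_per_file, 2):
    if d1 & d2:
        raise ValueError(f"Duplicate entry '{(d1 & d2).pop()}' found.")  # raises for 'h2o_automl'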
Example #17
def load_oml_benchmark(
        benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    path = None  # benchmark file does not exist on disk
    name = benchmark  # the name is later passed as CLI input again for containers, so it must remain parsable

    if openml.config.retry_policy != "robot":
        log.debug("Setting openml retry_policy from '%s' to 'robot'." %
                  openml.config.retry_policy)
        openml.config.set_retry_policy("robot")

    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # We first have to retrieve the task because we don't know the dataset id
        t = openml.tasks.get_task(oml_id,
                                  download_data=False,
                                  download_qualities=False)
        data = openml.datasets.get_dataset(t.dataset_id,
                                           download_data=False,
                                           download_qualities=False)
        tasks = [
            Namespace(name=str_sanitize(data.name),
                      description=data.description,
                      openml_task_id=t.id)
        ]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)

        # Here we know the (task, dataset) pairs, so downloading only the dataset meta-data is sufficient
        tasks = []
        datasets = openml.datasets.list_datasets(data_id=suite.data,
                                                 output_format='dataframe')
        datasets.set_index('did', inplace=True)
        for tid, did in zip(suite.tasks, suite.data):
            tasks.append(
                Namespace(
                    name=str_sanitize(datasets.loc[did]['name']),
                    description=f"{openml.config.server.replace('/api/v1/xml', '')}/d/{did}",
                    openml_task_id=tid))
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
def test_find_all_parents_returns_frameworks_closest_first_if_two_parents(framework):
    frameworks = Namespace(
        gama=Namespace(name="gama", version="latest", description="flexible automl"),
        gama_old=Namespace(name="gama_20.1.0", version="20.1.0", extends="gama"),
        gama_older=Namespace(name="gama_20.0.0", version="20.0.0", extends="gama_old"),
        h2o_automl=Namespace(name="h2o", version="latest"),
        h2o_automl_old=Namespace(name="h2o_1.2", version="1.2", extends="h2o_automl"),
        h2o_automl_older=Namespace(name="h2o_1.1", version="1.1", extends="h2o_automl_old"),
    )
    parents = _find_all_parents(frameworks[f"{framework}_older"], frameworks)
    assert parents == [frameworks[f"{framework}_old"], frameworks[framework]]
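The _find_all_parents tests fix the contract: follow the extends chain and return ancestors closest-first. A hypothetical reconstruction consistent with that behavior:

def _find_all_parents_sketch(framework, frameworks):
    # Walk the `extends` chain, collecting ancestors closest-first.
    # Assumes the chain is acyclic and every parent name resolves.
    parents = []
    while "extends" in framework:
        framework = frameworks[framework.extends]
        parents.append(framework)
    return parents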
def _add_default_params(framework):
    if "params" not in framework:
        framework.params = dict()
    else:
        framework.params = Namespace.dict(framework.params)
# (argument parsing truncated in the source listing; minimally reconstructed below)
parser = argparse.ArgumentParser()
parser.add_argument('--extra',
                    default=[],
                    action='append',
                    help=argparse.SUPPRESS)
args = parser.parse_args()
extras = {
    t[0]: t[1] if len(t) > 1 else True
    for t in [x.split('=', 1) for x in args.extra]
}

# script_name = os.path.splitext(os.path.basename(__file__))[0]
# log_dir = os.path.join(args.outdir if args.outdir else '.', 'logs')
# os.makedirs(log_dir, exist_ok=True)
# now_str = datetime_iso(date_sep='', time_sep='')
amlb.logger.setup(root_level='DEBUG', console_level='INFO')

root_dir = os.path.dirname(__file__)
config = config_load(os.path.join(root_dir, "resources", "config.yaml"))
config_args = ns.parse(
    root_dir=root_dir,
    script=os.path.basename(__file__),
    run_mode='script',
) + ns.parse(extras)
config_args = ns({k: v for k, v in config_args if v is not None})
amlb.resources.from_configs(config, config_args)

if args.reconnect:
    amlb.AWSBenchmark.reconnect(args.instances)
else:
    amlb.AWSBenchmark.fetch_results(args.instances)
Example #21
def run_in_venv(caller_file,
                script_file: str,
                *args,
                input_data: Union[dict, ns],
                dataset: Dataset,
                config: TaskConfig,
                process_results=None,
                python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(
                cmd,
                *args,
                _input_str_=params,
                _live_output_=True,
                _error_level_=logging.DEBUG,
                _env_=dict(PATH=os.pathsep.join(
                    [venv_bin_path, os.environ['PATH']]),
                           PYTHONPATH=os.pathsep.join([
                               rconfig().root_dir,
                           ]),
                           AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")),
            )

        res = ns(lambda: None)  # namespace whose missing attributes default to None
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)

        if not res:
            raise NoResultError(f"Process crashed:\n{err}")

        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(
                res[name],
                allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(
                dataset=dataset,
                output_file=res.output_file,
                predictions=res.predictions.reshape(-1)
                if res.predictions is not None else None,
                truth=res.truth.reshape(-1) if res.truth is not None else None,
                probabilities=res.probabilities,
                probabilities_labels=res.probabilities_labels,
                target_is_encoded=res.target_is_encoded)

        return dict(models_count=res.models_count
                    if res.models_count is not None else 1,
                    training_duration=res.training_duration if
                    res.training_duration is not None else proc_timer.duration,
                    predict_duration=res.predict_duration,
                    **res.others.__dict__)
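In this pattern, a framework integration would typically call run_in_venv from its own module. A hedged sketch of such a caller, assuming the Dataset exposes train/test splits with X/y arrays (the script name exec.py is illustrative):

def run(dataset, config):
    # Arrays in input_data are turned into .npy paths by run_in_venv before serialization.
    data = dict(train=dict(X=dataset.train.X, y=dataset.train.y),
                test=dict(X=dataset.test.X, y=dataset.test.y))
    return run_in_venv(__file__, "exec.py",
                       input_data=data, dataset=dataset, config=config)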
def test_version_is_set_to_stable_if_undefined():
    framework = Namespace()
    _add_default_version(framework)
    assert "version" in framework
    assert framework.version == "stable"
def test_setup_script_kept_if_defined(simple_resource):
    framework = Namespace(module="my_module", setup_script="t.sh")
    _add_default_setup_script(framework, simple_resource.config)
    assert framework.setup_script == "t.sh"
def test_setup_script_set_to_none_if_undefined(simple_resource):
    framework = Namespace()
    _add_default_setup_script(framework, simple_resource.config)
    assert framework.setup_script is None
def test_setup_args_kept_if_defined():
    f_with_extra = Namespace(setup_args="w_extra", version="my_version", repo="my_repo")
    _add_default_setup_args(f_with_extra)
    assert f_with_extra.setup_args == ["w_extra"]
def test_params_set_to_empty_dict_if_undefined():
    framework = Namespace()
    _add_default_params(framework)
    assert framework.params == dict()
def test_setup_args_also_includes_repo_if_repo_is_defined(version, repo):
    f_my_version = Namespace(version=version, repo=repo)
    _add_default_setup_args(f_my_version)
    assert f_my_version.setup_args == [version, repo]
def test_version_is_kept_if_defined():
    framework = Namespace(version="v1.0")
    _add_default_version(framework)
    assert "version" in framework
    assert framework.version == "v1.0"
Example #29
# (preceding lines truncated in the source listing; the logger setup call is minimally reconstructed)
amlb.logger.setup(root_file=os.path.join(log_dir, '{script}.{now}.full.log'.format(script=script_name, now=now_str)),
                  root_level='INFO', app_level='DEBUG', console_level='INFO', print_to_log=True)

log.info("Running `%s` on `%s` benchmarks in `%s` mode.", args.framework, args.benchmark, args.mode)
log.debug("Script args: %s.", args)

config = config_load(os.path.join(root_dir, "resources", "config.yaml"))
# allowing config overrides from user_dir: useful e.g. to define custom benchmarks and frameworks
config_user = config_load(os.path.join(args.userdir if args.userdir is not None else config.user_dir, "config.yaml"))
# config listing properties set by command line
config_args = ns.parse(
    {'results.save': args.keep_scores},
    input_dir=args.indir,
    output_dir=args.outdir,
    user_dir=args.userdir,
    root_dir=root_dir,
    script=os.path.basename(__file__),
    run_mode=args.mode,
    parallel_jobs=args.parallel,
    sid=sid,
) + ns.parse(extras)
if args.mode != 'local':
    config_args += ns.parse({'monitoring.frequency_seconds': 0})
config_args = ns({k: v for k, v in config_args if v is not None})
log.debug("Config args: %s.", config_args)
# merging all configuration files
amlb.resources.from_configs(config, config_user, config_args)

try:
    if args.mode == 'local':
        bench = amlb.Benchmark(args.framework, args.benchmark, args.constraint)
def test_setup_script_interpolates_module(simple_resource):
    framework = Namespace(module="my_module", setup_script="{module}/t.sh")
    _add_default_setup_script(framework, simple_resource.config)
    assert framework.setup_script == "my_module/t.sh"
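The three setup_script tests (kept if defined, None if undefined, {module} interpolated) suggest a defaulting rule like the sketch below, inferred from the tests rather than the verified source; config is kept only for signature parity with the calls above:

def _add_default_setup_script_sketch(framework, config):
    if "setup_script" not in framework:
        framework.setup_script = None
    else:
        framework.setup_script = framework.setup_script.format(module=framework.module)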