def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    path = None  # benchmark file does not exist on disk
    name = benchmark  # name is later passed as cli input again for containers, so it needs to remain parsable
    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # We first have to retrieve the task because we don't know the dataset id
        t = openml.tasks.get_task(oml_id, download_data=False)
        data = openml.datasets.get_dataset(t.dataset_id, download_data=False)
        tasks = [Namespace(name=str_sanitize(data.name),
                           description=data.description,
                           openml_task_id=t.id)]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)
        # Here we know the (task, dataset) pairs, so downloading only the dataset meta-data is sufficient
        tasks = []
        for tid, did in zip(suite.tasks, suite.data):
            data = openml.datasets.get_dataset(did, download_data=False)
            tasks.append(Namespace(name=str_sanitize(data.name),
                                   description=data.description,
                                   openml_task_id=tid))
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
def test_framework_definition_lookup_is_case_insensitive(frameworks, lookup, expected):
    res = NS(_frameworks=frameworks)
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition(lookup) == (frameworks[expected], frameworks[expected].name)
def test_find_all_parents_returns_empty_list_if_framework_has_no_parent():
    frameworks = Namespace(
        gama=Namespace(name="gama", version="latest", description="flexible automl"),
        h2o_automl=Namespace(name="h2o", version="1.3"),
    )
    parents = _find_all_parents(frameworks.gama, frameworks)
    assert parents == []
def test_update_frameworks_with_parent_definitions_does_not_overwrite_child(field, p_value, c_value):
    frameworks = Namespace(
        gama=Namespace(name="gama", **{field: p_value}),
        gama_old=Namespace(name="gama_20.1", **{field: c_value}, extends="gama"),
    )
    _update_frameworks_with_parent_definitions(frameworks)
    assert frameworks.gama_old[field] == c_value
def test_sanitize_and_add_defaults_child_inherits_module(simple_resource):
    frameworks = Namespace(
        auto_sklearn=Namespace(),
        auto_sklearn_old=Namespace(extends="auto_sklearn")
    )
    _sanitize_and_add_defaults(frameworks, simple_resource.config)
    assert frameworks.auto_sklearn_old.module == "frameworks.auto_sklearn"
def test_update_frameworks_with_parent_definitions_add_missing_field_from_parent(field, value):
    frameworks = Namespace(
        gama=Namespace(name="gama", **{field: value}),
        gama_old=Namespace(name="gama_20.1.0", extends="gama"),
    )
    _update_frameworks_with_parent_definitions(frameworks)
    assert frameworks.gama_old[field] == value
def call_run(run_fn):
    import numpy as np

    params = NS.from_dict(json.loads(sys.stdin.read()))

    def load_data(name, path, **ignored):
        # reload arrays that the parent process serialized to .npy files
        if isinstance(path, str) and data_keys.match(name):
            return name, np.load(path, allow_pickle=True)
        return name, path

    ds = NS.walk(params.dataset, load_data)

    config = params.config
    config.framework_params = NS.dict(config.framework_params)

    try:
        result = run_fn(ds, config)
        res = dict(result)
        # large arrays are saved to disk instead of being serialized to json
        for name in ['predictions', 'truth', 'probabilities']:
            arr = result[name]
            if arr is not None:
                res[name] = os.path.join(config.result_dir, '.'.join([name, 'npy']))
                np.save(res[name], arr, allow_pickle=True)
    except Exception as e:
        log.exception(e)
        res = dict(
            error_message=str(e),
            models_count=0
        )

    print(config.result_token)
    print(json.dumps(res, separators=(',', ':')))
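# A minimal usage sketch (hypothetical, not part of the sources above): a framework's
# exec script would define a `run(dataset, config)` function and hand it to `call_run`,
# which reads the JSON task parameters from stdin, reloads any .npy-backed arrays, and
# prints the result JSON after `config.result_token` so the parent process can pick it up.
def run(dataset, config):
    # a dummy run; a real implementation would train a model and predict here
    return dict(predictions=None, truth=None, probabilities=None, models_count=1)

if __name__ == '__main__':
    call_run(run)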
def test_update_frameworks_with_parent_definitions_parent_overwrites_grandparent_yaml(field, g_value, p_value):
    frameworks = Namespace(
        gama=Namespace(name="gama", **{field: g_value}),
        gama_old=Namespace(name="gama_2", **{field: p_value}, extends="gama"),
        gama_oldest=Namespace(name="gama_1", extends="gama_old"),
    )
    _update_frameworks_with_parent_definitions(frameworks)
    assert frameworks.gama_oldest[field] == p_value
def test_remove_frameworks_with_unknown_parent_keeps_children_with_known_parents():
    f = Namespace(
        dummy=Namespace(name="dummy", extends="does_exist"),
        does_exist=Namespace(name="does_exist"),
    )
    _remove_frameworks_with_unknown_parent(f)
    assert "dummy" in f
    assert "does_exist" in f
def test_framework_definition_raises_error_if_no_matching_framework():
    res = NS(
        config=NS(frameworks=NS(definition_file="none")),
        _frameworks=NS(present=NS(name="present"))
    )
    # binding `framework_definition` method to our resource mock: use pytest-mock instead?
    res.framework_definition = Resources.framework_definition.__get__(res)
    assert res.framework_definition("present")
    with pytest.raises(ValueError, match=r"Incorrect framework `missing`"):
        res.framework_definition("missing")
def test_find_all_parents_returns_parent_of_framework_with_single_parent(framework):
    frameworks = Namespace(
        gama=Namespace(name="gama", version="latest", description="flexible automl"),
        gama_old=Namespace(name="gama_20.1.0", version="20.1.0", extends="gama"),
        h2o_automl=Namespace(name="h2o", version="latest"),
        h2o_automl_old=Namespace(name="h2o_1.2", version="1.2", extends="h2o_automl"),
    )
    parents = _find_all_parents(frameworks[f"{framework}_old"], frameworks)
    assert parents == [frameworks[framework]]
def test_setup_args_set_to_version_if_undefined():
    f_my_version = Namespace(version="my_version")
    f_my_other_version = Namespace(version="my_other_version")
    _add_default_setup_args(f_my_version)
    _add_default_setup_args(f_my_other_version)
    assert f_my_version.setup_args == ["my_version"]
    assert f_my_other_version.setup_args == ["my_other_version"]
def simple_resource():
    return Resources(
        Namespace(
            input_dir="my_input",
            output_dir="my_output",
            user_dir="my_user_dir",
            root_dir="my_root_dir",
            docker=Namespace(
                image_defaults=Namespace(
                    author="author",
                    image=None,
                    tag=None,
                )
            ),
            frameworks=Namespace(
                root_module="frameworks",
                definition_file=[]
            )
        )
    )
def _add_default_image(framework: Namespace, config: Namespace, props: Optional[List[str]] = None):
    if "image" not in framework:
        framework.image = copy.deepcopy(config.docker.image_defaults)
    else:
        framework.image = Namespace.merge(config.docker.image_defaults, framework.image)

    if framework.image.tag is None and (not props or 'tag' in props):
        framework.image.tag = framework.version.lower()
    if framework.image.image is None and (not props or 'image' in props):
        framework.image.image = framework.name.lower()
    if framework.image.author is None and (not props or 'author' in props):
        framework.image.author = ""
def _add_default_image(framework: Namespace, config: Namespace):
    if "image" not in framework:
        framework.image = copy.deepcopy(config.docker.image_defaults)
    else:
        framework.image = Namespace.merge(config.docker.image_defaults, framework.image)

    if framework.image.tag is None:
        framework.image.tag = framework.version.lower()
    if framework.image.image is None:
        framework.image.image = framework.name.lower()
    if framework.image.author is None:
        framework.image.author = ""
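# A small illustration of the defaulting above, with hypothetical values: properties
# still unset after merging with `config.docker.image_defaults` fall back to the
# framework's own name and version, lower-cased.
framework = Namespace(name="MyFramework", version="1.2")
config = Namespace(docker=Namespace(image_defaults=Namespace(author="author", image=None, tag=None)))
_add_default_image(framework, config)
assert framework.image.author == "author"       # kept from the defaults
assert framework.image.image == "myframework"   # from framework.name.lower()
assert framework.image.tag == "1.2"             # from framework.version.lower()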
def _load_and_merge_framework_definitions(frameworks_file: Union[str, List[str]], config) -> Namespace:
    """ Load and merge the framework file(s); duplicate definitions are not allowed. """
    log.info("Loading frameworks definitions from %s.", frameworks_file)
    if not isinstance(frameworks_file, list):
        frameworks_file = [frameworks_file]

    definitions_by_tag = Namespace()
    for tag in [default_tag] + config.frameworks.tags:
        definitions_by_file = [config_load(_definition_file(file, tag)) for file in frameworks_file]
        if not config.frameworks.allow_duplicates:
            for d1, d2 in itertools.combinations([set(dir(d)) for d in definitions_by_file], 2):
                if d1.intersection(d2) != set():
                    raise ValueError(f"Duplicate entry '{d1.intersection(d2).pop()}' found.")
        definitions_by_tag[tag] = Namespace.merge(*definitions_by_file)

    return definitions_by_tag
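# A standalone sketch of the duplicate check above (assuming `dir(d)` yields the
# top-level framework names defined in one file): any name appearing in two
# definition files is reported as a duplicate entry.
definitions_by_file = [{"gama", "h2o_automl"}, {"gama", "autosklearn"}]
try:
    for d1, d2 in itertools.combinations(definitions_by_file, 2):
        if d1 & d2:
            raise ValueError(f"Duplicate entry '{(d1 & d2).pop()}' found.")
except ValueError as e:
    print(e)  # Duplicate entry 'gama' found.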
def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    path = None  # benchmark file does not exist on disk
    name = benchmark  # name is later passed as cli input again for containers, so it needs to remain parsable

    if openml.config.retry_policy != "robot":
        log.debug("Setting openml retry_policy from '%s' to 'robot'.", openml.config.retry_policy)
        openml.config.set_retry_policy("robot")

    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # We first have to retrieve the task because we don't know the dataset id
        t = openml.tasks.get_task(oml_id, download_data=False, download_qualities=False)
        data = openml.datasets.get_dataset(t.dataset_id, download_data=False, download_qualities=False)
        tasks = [Namespace(name=str_sanitize(data.name),
                           description=data.description,
                           openml_task_id=t.id)]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)
        # Here we know the (task, dataset) pairs, so listing only the dataset meta-data is sufficient
        tasks = []
        datasets = openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe')
        datasets.set_index('did', inplace=True)
        for tid, did in zip(suite.tasks, suite.data):
            tasks.append(Namespace(
                name=str_sanitize(datasets.loc[did]['name']),
                description=f"{openml.config.server.replace('/api/v1/xml', '')}/d/{did}",
                openml_task_id=tid))
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
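# Usage sketch (the suite/task ids are hypothetical): both forms resolve to the same
# (name, path, tasks) triple; `path` is always None because openml benchmarks are not
# backed by a file on disk, and `name` stays parsable as cli input for containers.
name, path, tasks = load_oml_benchmark("openml/s/271")  # an openml suite
name, path, tasks = load_oml_benchmark("openml/t/59")   # a single openml task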
def test_find_all_parents_returns_frameworks_closest_first_if_two_parents(framework):
    frameworks = Namespace(
        gama=Namespace(name="gama", version="latest", description="flexible automl"),
        gama_old=Namespace(name="gama_20.1.0", version="20.1.0", extends="gama"),
        gama_older=Namespace(name="gama_20.0.0", version="20.0.0", extends="gama_old"),
        h2o_automl=Namespace(name="h2o", version="latest"),
        h2o_automl_old=Namespace(name="h2o_1.2", version="1.2", extends="h2o_automl"),
        h2o_automl_older=Namespace(name="h2o_1.1", version="1.1", extends="h2o_automl_old"),
    )
    parents = _find_all_parents(frameworks[f"{framework}_older"], frameworks)
    assert parents == [frameworks[f"{framework}_old"], frameworks[framework]]
def _add_default_params(framework):
    if "params" not in framework:
        framework.params = dict()
    else:
        framework.params = Namespace.dict(framework.params)
                    '--extra', default=[], action='append', help=argparse.SUPPRESS)
args = parser.parse_args()

extras = {t[0]: t[1] if len(t) > 1 else True
          for t in [x.split('=', 1) for x in args.extra]}

# script_name = os.path.splitext(os.path.basename(__file__))[0]
# log_dir = os.path.join(args.outdir if args.outdir else '.', 'logs')
# os.makedirs(log_dir, exist_ok=True)
# now_str = datetime_iso(date_sep='', time_sep='')
amlb.logger.setup(root_level='DEBUG', console_level='INFO')

root_dir = os.path.dirname(__file__)
config = config_load(os.path.join(root_dir, "resources", "config.yaml"))
config_args = ns.parse(
    root_dir=root_dir,
    script=os.path.basename(__file__),
    run_mode='script',
) + ns.parse(extras)
config_args = ns({k: v for k, v in config_args if v is not None})
amlb.resources.from_configs(config, config_args)

if args.reconnect:
    amlb.AWSBenchmark.reconnect(args.instances)
else:
    amlb.AWSBenchmark.fetch_results(args.instances)
def run_in_venv(caller_file, script_file: str, *args,
                input_data: Union[dict, ns], dataset: Dataset, config: TaskConfig,
                process_results=None, python_exec=None):

    here = dir_of(caller_file)
    venv_bin_path = os.path.join(here, 'venv', 'bin')
    if python_exec is None:  # use local virtual env by default
        python_exec = os.path.join(venv_bin_path, 'python -W ignore')
    script_path = os.path.join(here, script_file)
    cmd = f"{python_exec} {script_path}"

    input_data = ns.from_dict(input_data)
    with TemporaryDirectory() as tmpdir:

        def make_path(k, v, parents=None):
            # serialize arrays to .npy files so they can be passed to the subprocess
            if isinstance(v, np.ndarray):
                path = os.path.join(tmpdir, '.'.join(parents + [k, 'npy']))
                if vector_keys.match(k):
                    v = v.reshape(-1, 1)
                np.save(path, v, allow_pickle=True)
                return k, path
            return k, v

        ds = ns.walk(input_data, make_path)
        dataset.release()

        config.result_dir = tmpdir
        config.result_file = mktemp(dir=tmpdir)

        params = json_dumps(dict(dataset=ds, config=config), style='compact')
        with Timer() as proc_timer:
            output, err = run_cmd(cmd, *args,
                                  _input_str_=params,
                                  _live_output_=True,
                                  _error_level_=logging.DEBUG,
                                  _env_=dict(
                                      PATH=os.pathsep.join([venv_bin_path, os.environ['PATH']]),
                                      PYTHONPATH=os.pathsep.join([rconfig().root_dir]),
                                      AMLB_PATH=os.path.join(rconfig().root_dir, "amlb")
                                  ))

        res = ns(lambda: None)
        if os.path.exists(config.result_file):
            res = json_load(config.result_file, as_namespace=True)

        log.debug("Result from subprocess:\n%s", res)
        if not res:
            raise NoResultError(f"Process crashed:\n{err}")
        if res.error_message is not None:
            raise NoResultError(res.error_message)

        for name in ['predictions', 'truth', 'probabilities']:
            res[name] = np.load(res[name], allow_pickle=True) if res[name] is not None else None

        if callable(process_results):
            res = process_results(res)

        if res.output_file:
            save_predictions(dataset=dataset,
                             output_file=res.output_file,
                             predictions=res.predictions.reshape(-1) if res.predictions is not None else None,
                             truth=res.truth.reshape(-1) if res.truth is not None else None,
                             probabilities=res.probabilities,
                             probabilities_labels=res.probabilities_labels,
                             target_is_encoded=res.target_is_encoded)

        return dict(
            models_count=res.models_count if res.models_count is not None else 1,
            training_duration=res.training_duration if res.training_duration is not None else proc_timer.duration,
            predict_duration=res.predict_duration,
            **res.others.__dict__
        )
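# A hedged sketch of how a framework integration might call `run_in_venv`
# (`exec.py` and the `data` layout are assumptions following the conventions visible
# above): arrays are serialized to .npy files in a temp dir, the child process gets
# the JSON params on stdin, and `call_run` on the other side mirrors the protocol.
def run(dataset, config):
    data = dict(train=dict(X=dataset.train.X, y=dataset.train.y),
                test=dict(X=dataset.test.X, y=dataset.test.y))
    return run_in_venv(__file__, "exec.py",
                       input_data=data, dataset=dataset, config=config)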
def test_version_is_set_to_stable_if_undefined():
    framework = Namespace()
    _add_default_version(framework)
    assert "version" in framework
    assert framework.version == "stable"
def test_setup_script_kept_if_defined(simple_resource):
    framework = Namespace(module="my_module", setup_script="t.sh")
    _add_default_setup_script(framework, simple_resource.config)
    assert framework.setup_script == "t.sh"
def test_setup_script_set_to_none_if_undefined(simple_resource):
    framework = Namespace()
    _add_default_setup_script(framework, simple_resource.config)
    assert framework.setup_script is None
def test_setup_args_kept_if_defined():
    f_with_extra = Namespace(setup_args="w_extra", version="my_version", repo="my_repo")
    _add_default_setup_args(f_with_extra)
    assert f_with_extra.setup_args == ["w_extra"]
def test_params_set_to_empty_dict_if_undefined():
    framework = Namespace()
    _add_default_params(framework)
    assert framework.params == dict()
def test_setup_args_also_includes_repo_if_repo_is_defined(version, repo):
    f_my_version = Namespace(version=version, repo=repo)
    _add_default_setup_args(f_my_version)
    assert f_my_version.setup_args == [version, repo]
def test_version_is_kept_if_defined():
    framework = Namespace(version="v1.0")
    _add_default_version(framework)
    assert "version" in framework
    assert framework.version == "v1.0"
                  root_file=os.path.join(log_dir, '{script}.{now}.full.log'.format(script=script_name, now=now_str)),
                  root_level='INFO', app_level='DEBUG', console_level='INFO', print_to_log=True)

log.info("Running `%s` on `%s` benchmarks in `%s` mode.", args.framework, args.benchmark, args.mode)
log.debug("Script args: %s.", args)

config = config_load(os.path.join(root_dir, "resources", "config.yaml"))
# allowing config override from user_dir: useful e.g. to define custom benchmarks and frameworks.
config_user = config_load(os.path.join(args.userdir if args.userdir is not None else config.user_dir, "config.yaml"))
# config listing properties set by command line
config_args = ns.parse(
    {'results.save': args.keep_scores},
    input_dir=args.indir,
    output_dir=args.outdir,
    user_dir=args.userdir,
    root_dir=root_dir,
    script=os.path.basename(__file__),
    run_mode=args.mode,
    parallel_jobs=args.parallel,
    sid=sid,
) + ns.parse(extras)
if args.mode != 'local':
    config_args += ns.parse({'monitoring.frequency_seconds': 0})
config_args = ns({k: v for k, v in config_args if v is not None})
log.debug("Config args: %s.", config_args)
# merging all configuration files
amlb.resources.from_configs(config, config_user, config_args)

try:
    if args.mode == 'local':
        bench = amlb.Benchmark(args.framework, args.benchmark, args.constraint)
def test_setup_script_interpolates_module(simple_resource):
    framework = Namespace(module="my_module", setup_script="{module}/t.sh")
    _add_default_setup_script(framework, simple_resource.config)
    assert framework.setup_script == "my_module/t.sh"