import logging
from typing import List, Optional, Tuple

import openml

from amlb.utils import Namespace, str_sanitize

log = logging.getLogger(__name__)


def load_oml_benchmark(benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads the benchmark defined by an OpenML suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    path = None  # benchmark file does not exist on disk
    name = benchmark  # name is later passed as cli input again for containers, so it needs to remain parsable

    if openml.config.retry_policy != "robot":
        log.debug("Setting openml retry_policy from '%s' to 'robot'.", openml.config.retry_policy)
        openml.config.set_retry_policy("robot")

    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # We first have to retrieve the task, because we don't know the dataset id
        t = openml.tasks.get_task(oml_id, download_data=False, download_qualities=False)
        data = openml.datasets.get_dataset(t.dataset_id, download_data=False, download_qualities=False)
        tasks = [Namespace(name=str_sanitize(data.name),
                           description=data.description,
                           openml_task_id=t.id)]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)

        # Here we know the (task, dataset) pairs, so downloading only the dataset meta-data is sufficient
        tasks = []
        datasets = openml.datasets.list_datasets(data_id=suite.data, output_format='dataframe')
        datasets.set_index('did', inplace=True)
        for tid, did in zip(suite.tasks, suite.data):
            tasks.append(Namespace(name=str_sanitize(datasets.loc[did]['name']),
                                   description=f"{openml.config.server.replace('/api/v1/xml', '')}/d/{did}",
                                   openml_task_id=tid))
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
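# Illustrative usage sketch (not part of the module); the task id below is a
# hypothetical placeholder, and the call needs network access to openml.org:
#
#   name, path, tasks = load_oml_benchmark("openml/t/59")
#   # path is None because OpenML benchmarks have no definition file on disk;
#   # tasks holds a single Namespace with openml_task_id set to the task's id.
#   # A suite such as "openml/s/269" instead yields one Namespace per task in the suite.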
def benchmark_load(name: str, benchmark_definition_dirs: List[str]):
    """ Loads the benchmark definition for the 'benchmark' cli input string.

    :param name: the value for 'benchmark'
    :param benchmark_definition_dirs: directories in which benchmark definitions can be found
    :return: a tuple with constraint defaults, tasks, the benchmark path (if it is a local file) and the benchmark name
    """
    # Identify where the resource is located. All name structures are clearly defined,
    # but a local file benchmark may require probing the disk to see if it is valid,
    # which is why it is tried last.
    if is_openml_benchmark(name):
        benchmark_name, benchmark_path, tasks = load_oml_benchmark(name)
    # elif is_kaggle_benchmark(name):
    else:
        benchmark_name, benchmark_path, tasks = load_file_benchmark(name, benchmark_definition_dirs)

    hard_defaults = next((task for task in tasks if task.name == '__defaults__'), None)
    tasks = [task for task in tasks if task is not hard_defaults]
    for t in tasks:
        t.name = str_sanitize(t.name)
    return hard_defaults, tasks, benchmark_path, str_sanitize(benchmark_name)
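# Usage sketch (illustrative; the directory below is a hypothetical local path
# containing yaml benchmark definitions such as 'test.yaml'):
#
#   defaults, tasks, path, name = benchmark_load("test", ["resources/benchmarks"])
#   # 'defaults' holds the optional '__defaults__' pseudo-task from the definition;
#   # it is stripped from 'tasks' so callers only iterate over real tasks.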
def _add_framework_name(frameworks: Namespace):
    """ Adds a 'name' attribute to each framework. """
    for name, framework in frameworks:
        framework.name = str_sanitize(name)
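# Minimal sketch: the Namespace here iterates as (key, value) pairs, so each
# framework definition ends up carrying a sanitized copy of its own key:
#
#   frameworks = Namespace(RandomForest=Namespace(version="stable"))
#   _add_framework_name(frameworks)
#   assert frameworks.RandomForest.name == "RandomForest"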
# On top of this, the user can now override the aws.region setting in their custom
# ~/.config/automlbenchmark/config.yaml settings.
# parser.add_argument('-r', '--region', metavar='aws_region', default=None,
#                     help="The region on which to run the benchmark when using AWS.")

root_dir = os.path.dirname(__file__)
args = parser.parse_args()
script_name = os.path.splitext(os.path.basename(__file__))[0]
extras = {t[0]: t[1] if len(t) > 1 else True
          for t in [x.split('=', 1) for x in args.extra]}

now_str = datetime_iso(date_sep='', time_sep='')
sid = (args.session if args.session is not None
       else "{}.{}".format(
           '.'.join([
               str_sanitize(args.framework.split(':', 1)[0]),
               str_sanitize(args.benchmark if re.fullmatch(r"(openml)/[st]/\d+", args.benchmark)
                            else os.path.splitext(os.path.basename(args.benchmark))[0]),
               str_sanitize(args.constraint),
               extras.get('run_mode', args.mode)
           ]).lower(),
           now_str))

log_dir = amlb.resources.output_dirs(
    args.outdir or os.path.join(os.getcwd(), 'logs'),
    session=sid,
    subdirs='logs' if args.outdir else '',
    create=True
)['logs' if args.outdir else 'session']

# now_str = datetime_iso(time=False, no_sep=True)
if args.profiling:
    logging.TRACE = logging.INFO
amlb.logger.setup(log_file=os.path.join(
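# For illustration (hypothetical invocation and output): a run such as
#   python runbenchmark.py randomforest openml/t/59 test
# composes a session id along the lines of
#   randomforest.openml_t_59.test.local.20240101T120000
# i.e. framework.benchmark.constraint.mode joined with '.', lowercased, plus a
# timestamp; the exact separators depend on str_sanitize and datetime_iso.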