Example #1
0
def load_oml_benchmark(
        benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    # No benchmark file exists on disk, and the raw identifier is kept as the
    # name because it is later passed back as cli input for containers and
    # must remain parsable.
    path = None
    name = benchmark
    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # The dataset id is unknown until the task meta-data has been fetched.
        task = openml.tasks.get_task(oml_id, download_data=False)
        dataset = openml.datasets.get_dataset(task.dataset_id,
                                              download_data=False)
        tasks = [
            Namespace(name=str_sanitize(dataset.name),
                      description=dataset.description,
                      openml_task_id=task.id)
        ]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)
        # The (task, dataset) pairs are known here, so fetching only the
        # dataset meta-data is sufficient.
        tasks = [
            Namespace(name=str_sanitize(dataset.name),
                      description=dataset.description,
                      openml_task_id=task_id)
            for task_id, dataset in (
                (tid, openml.datasets.get_dataset(did, download_data=False))
                for tid, did in zip(suite.tasks, suite.data)
            )
        ]
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
Example #2
0
def load_oml_benchmark(
        benchmark: str) -> Tuple[str, Optional[str], List[Namespace]]:
    """ Loads benchmark defined by openml suite or task, from openml/s/X or openml/t/Y. """
    domain, oml_type, oml_id = benchmark.split('/')
    path = None  # benchmark file does not exist on disk
    name = benchmark  # name is later passed as cli input again for containers, it needs to remain parsable

    if openml.config.retry_policy != "robot":
        # Lazy %-style logging args (consistent with the log.info calls below)
        # instead of eager string interpolation with '%'.
        log.debug("Setting openml retry_policy from '%s' to 'robot'.",
                  openml.config.retry_policy)
        openml.config.set_retry_policy("robot")

    if oml_type == 't':
        log.info("Loading openml task %s.", oml_id)
        # We first have to retrieve the task because we don't know the dataset id
        t = openml.tasks.get_task(oml_id,
                                  download_data=False,
                                  download_qualities=False)
        data = openml.datasets.get_dataset(t.dataset_id,
                                           download_data=False,
                                           download_qualities=False)
        tasks = [
            Namespace(name=str_sanitize(data.name),
                      description=data.description,
                      openml_task_id=t.id)
        ]
    elif oml_type == 's':
        log.info("Loading openml suite %s.", oml_id)
        suite = openml.study.get_suite(oml_id)

        # Here we know the (task, dataset) pairs so only download dataset meta-data is sufficient
        tasks = []
        datasets = openml.datasets.list_datasets(data_id=suite.data,
                                                 output_format='dataframe')
        datasets.set_index('did', inplace=True)
        for tid, did in zip(suite.tasks, suite.data):
            tasks.append(
                Namespace(
                    name=str_sanitize(datasets.loc[did]['name']),
                    # Use the dataset page url as description; the server url
                    # has its '/api/v1/xml' suffix stripped.
                    description=
                    f"{openml.config.server.replace('/api/v1/xml', '')}/d/{did}",
                    openml_task_id=tid))
    else:
        raise ValueError(f"The oml_type is {oml_type} but must be 's' or 't'")
    return name, path, tasks
Example #3
0
def benchmark_load(name, benchmark_definition_dirs: List[str]):
    """ Loads the benchmark definition for the 'benchmark' cli input string.

    :param name: the value for 'benchmark'
    :param benchmark_definition_dirs: directories in which benchmark definitions can be found
    :return: a tuple with constraint defaults, tasks, the benchmark path (if it is a local file) and benchmark name
    """
    # Resource locations follow clearly defined name structures, but a local
    # file benchmark can require probing the disk to see if it is valid,
    # which is why it is tried last.
    if is_openml_benchmark(name):
        loaded = load_oml_benchmark(name)
    # elif is_kaggle_benchmark(name):
    else:
        loaded = load_file_benchmark(name, benchmark_definition_dirs)
    benchmark_name, benchmark_path, tasks = loaded

    # A task named '__defaults__' is a pseudo-task carrying hard defaults;
    # pull it out of the task list if present.
    hard_defaults = None
    for task in tasks:
        if task.name == '__defaults__':
            hard_defaults = task
            break
    tasks = [task for task in tasks if task is not hard_defaults]
    for task in tasks:
        task.name = str_sanitize(task.name)
    return hard_defaults, tasks, benchmark_path, str_sanitize(benchmark_name)
def _add_framework_name(frameworks: Namespace):
    """ Adds a 'name' attribute to each framework. """
    # Iterating the Namespace yields (key, value) pairs.
    for framework_name, framework in frameworks:
        framework.name = str_sanitize(framework_name)
Example #5
0
#  on top of this, user can now override the aws.region setting in his custom ~/.config/automlbenchmark/config.yaml settings.
# parser.add_argument('-r', '--region', metavar='aws_region', default=None,
#                     help="The region on which to run the benchmark when using AWS.")

root_dir = os.path.dirname(__file__)
args = parser.parse_args()
script_name = os.path.splitext(os.path.basename(__file__))[0]
# Parse 'key=value' extras into a dict; a bare key (no '=') maps to True.
extras = {
    t[0]: t[1] if len(t) > 1 else True
    for t in [x.split('=', 1) for x in args.extra]
}

now_str = datetime_iso(date_sep='', time_sep='')
# Session id: the user-supplied --session if given, otherwise a lowercase
# '<framework>.<benchmark>.<constraint>.<mode>' joined string plus a
# timestamp. An 'openml/[st]/<id>' benchmark keeps its raw identifier; a
# file benchmark uses only its basename without extension.
sid = (args.session if args.session is not None else "{}.{}".format(
    '.'.join([
        str_sanitize(args.framework.split(':', 1)[0]),
        str_sanitize(args.benchmark if re.fullmatch(r"(openml)/[st]/\d+", args.
                                                    benchmark) else os.path.
                     splitext(os.path.basename(args.benchmark))[0]),
        str_sanitize(args.constraint),
        extras.get('run_mode', args.mode)
    ]).lower(), now_str))
# Resolve where log files go: the 'logs' subdir under --outdir when given,
# otherwise the session dir under ./logs.
log_dir = amlb.resources.output_dirs(
    args.outdir or os.path.join(os.getcwd(), 'logs'),
    session=sid,
    subdirs='logs' if args.outdir else '',
    create=True)['logs' if args.outdir else 'session']
# now_str = datetime_iso(time=False, no_sep=True)
if args.profiling:
    # Presumably promotes TRACE-level output to INFO so profiling is visible
    # — TODO confirm against amlb.logger's TRACE handling.
    logging.TRACE = logging.INFO
amlb.logger.setup(log_file=os.path.join(