Esempio n. 1
0
def get_objective_signature(model_name, dataset, scorer, data_root=None):
    """Get signature of an objective function specified by an sklearn model and dataset.

    This routine specializes :func:`.signatures.get_func_signature` for the `sklearn` study case.

    Parameters
    ----------
    model_name : str
        Which sklearn model we are attempting to tune, must be an element of `constants.MODEL_NAMES`.
    dataset : str
        Which data set the model is being tuned to, which must be either a) an element of
        `constants.DATA_LOADER_NAMES`, or b) the name of a csv file in the `data_root` folder for a custom data set.
    scorer : str
        Which metric to use when evaluating the model. This must be an element of `sklearn_funcs.SCORERS_CLF` for
        classification models, or `sklearn_funcs.SCORERS_REG` for regression models.
    data_root : str
        Absolute path to folder containing custom data sets. This may be ``None`` if no custom data sets are used.``

    Returns
    -------
    signature : list(str)
        The signature of this test function.
    """
    function_instance = SklearnModel(model_name,
                                     dataset,
                                     scorer,
                                     data_root=data_root)
    api_config = function_instance.get_api_config()
    signature = get_func_signature(function_instance.evaluate, api_config)
    return signature
Esempio n. 2
0
def run_sklearn_study(opt_class,
                      opt_kwargs,
                      model_name,
                      dataset,
                      scorer,
                      n_calls,
                      n_suggestions,
                      data_root=None):
    """Run a study for a single optimizer on a single `sklearn` model/data set combination.

    This routine is meant for benchmarking when tuning `sklearn` models, as opposed to the more general
    :func:`.run_study`.

    Parameters
    ----------
    opt_class : :class:`.abstract_optimizer.AbstractOptimizer`
        Type of wrapper optimizer must be subclass of :class:`.abstract_optimizer.AbstractOptimizer`.
    opt_kwargs : kwargs
        `kwargs` to use when instantiating the wrapper class.
    model_name : str
        Which sklearn model we are attempting to tune, must be an element of `constants.MODEL_NAMES`.
    dataset : str
        Which data set the model is being tuned to, which must be either a) an element of
        `constants.DATA_LOADER_NAMES`, or b) the name of a csv file in the `data_root` folder for a custom data set.
    scorer : str
        Which metric to use when evaluating the model. This must be an element of `sklearn_funcs.SCORERS_CLF` for
        classification models, or `sklearn_funcs.SCORERS_REG` for regression models.
    n_calls : int
        How many iterations of minimization to run.
    n_suggestions : int
        How many parallel evaluation we run each iteration. Must be ``>= 1``.
    data_root : str
        Absolute path to folder containing custom data sets. This may be ``None`` if no custom data sets are used.``

    Returns
    -------
    function_evals : :class:`numpy:numpy.ndarray` of shape (n_calls, n_suggestions)
        Value of objective for each evaluation.
    timing_evals : (:class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`, :class:`numpy:numpy.ndarray`)
        Tuple of 3 timing results: ``(suggest_time, eval_time, observe_time)`` with shapes ``(n_calls,)``,
        ``(n_calls, n_suggestions)``, and ``(n_calls,)``. These are the time to make each suggestion, the time for each
        evaluation of the objective function, and the time to make an observe call.
    """
    # Setup test function
    function_instance = SklearnModel(model_name,
                                     dataset,
                                     scorer,
                                     data_root=data_root)

    # Setup optimizer
    api_config = function_instance.get_api_config()
    optimizer_instance = opt_class(api_config, **opt_kwargs)

    # Now actually do the experiment
    results = run_study(optimizer_instance, function_instance, n_calls,
                        n_suggestions)
    return results
def test_run_study(model_name, dataset, scorer, n_calls, n_suggestions, seed):
    prob_type = data.get_problem_type(dataset)
    assume(scorer in data.METRICS_LOOKUP[prob_type])

    function_instance = SklearnModel(model_name, dataset, scorer)
    optimizer = RandomOptimizer(function_instance.get_api_config(),
                                random=np.random.RandomState(seed))
    optimizer.get_version()
    exp.run_study(optimizer,
                  function_instance,
                  n_calls,
                  n_suggestions,
                  n_obj=len(function_instance.objective_names))
Esempio n. 4
0
def _build_test_problem(model_name, dataset, scorer, path):
    """Build the class with the class to use an objective. Sort of a factory.

    Parameters
    ----------
    model_name : str
        Which sklearn model we are attempting to tune, must be an element of `constants.MODEL_NAMES`.
    dataset : str
        Which data set the model is being tuned to, which must be either a) an element of
        `constants.DATA_LOADER_NAMES`, or b) the name of a csv file in the `data_root` folder for a custom data set.
    scorer : str
        Which metric to use when evaluating the model. This must be an element of `sklearn_funcs.SCORERS_CLF` for
        classification models, or `sklearn_funcs.SCORERS_REG` for regression models.
    path : str or None
        Absolute path to folder containing custom data sets/pickle files with surrogate model.

    Returns
    -------
    prob : :class:`.sklearn_funcs.TestFunction`
        The test function to evaluate in experiments.
    """
    if model_name.endswith("-surr"):
        # Requires IO to test these, so will add the pargma here. Maybe that points towards a possible design change.
        model_name = chomp(model_name, "-surr")  # pragma: io
        prob = SklearnSurrogate(model_name, dataset, scorer,
                                path=path)  # pragma: io
    else:
        prob = SklearnModel(model_name, dataset, scorer, data_root=path)
    return prob
def load_experiments(uuid_list, db_root, dbid):  # pragma: io
    """Generator to load the results of the experiments.

    Parameters
    ----------
    uuid_list : list(uuid.UUID)
        List of UUIDs corresponding to experiments to load.
    db_root : str
        Root location for data store as requested by the serializer used.
    dbid : str
        Name of the data store as requested by the serializer used.

    Yields
    ------
    meta_data : (str, str, str)
        The `meta_data` contains a `tuple` of `str` with ``test_case, optimizer, uuid``.
    data : (:class:`xarray:xarray.Dataset`, :class:`xarray:xarray.Dataset`, :class:`xarray:xarray.Dataset` list(float))
        The `data` contains a tuple of ``(perf_ds, time_ds, suggest_ds, sig)``. The `perf_ds` is a
        :class:`xarray:xarray.Dataset` containing the evaluation results with dimensions ``(ITER, SUGGEST)``, each
        variable is an objective. The `time_ds` is an :class:`xarray:xarray.Dataset` containing the timing results of
        the form accepted by `summarize_time`. The coordinates must be compatible with `perf_ds`. The suggest_ds is a
        :class:`xarray:xarray.Dataset` containing the inputs to the function evaluations. Each variable is a function
        input. Finally, `sig` contains the `test_case` signature and must be `list(float)`.
    """
    uuids_seen = set()
    for uuid_ in uuid_list:
        logger.info(uuid_.hex)

        # Load perf and timing data
        perf_ds, meta = XRSerializer.load(db_root, db=dbid, key=cc.EVAL, uuid_=uuid_)
        time_ds, meta_t = XRSerializer.load(db_root, db=dbid, key=cc.TIME, uuid_=uuid_)
        assert meta == meta_t, "meta data should between time and eval files"
        suggest_ds, meta_t = XRSerializer.load(db_root, db=dbid, key=cc.SUGGEST_LOG, uuid_=uuid_)
        assert meta == meta_t, "meta data should between suggest and eval files"

        # Get signature to pass out as well
        _, sig = meta["signature"]
        logger.info(meta)
        logger.info(sig)

        # Build the new indices for combined data, this could be put in function for easier testing
        eval_args = unserializable_dict(meta["args"])  # Unpack meta-data
        test_case = SklearnModel.test_case_str(
            eval_args[CmdArgs.classifier], eval_args[CmdArgs.data], eval_args[CmdArgs.metric]
        )
        optimizer = str_join_safe(
            ARG_DELIM, (eval_args[CmdArgs.optimizer], eval_args[CmdArgs.opt_rev], eval_args[CmdArgs.rev])
        )
        args_uuid = eval_args[CmdArgs.uuid]

        # Check UUID sanity
        assert isinstance(args_uuid, str)
        assert args_uuid == uuid_.hex, "UUID meta-data does not match filename"
        assert args_uuid not in uuids_seen, "uuids being reused between studies"
        uuids_seen.add(args_uuid)

        # Return key -> data so this generator can be iterated over in dict like manner
        meta_data = (test_case, optimizer, args_uuid)
        data = (perf_ds, time_ds, suggest_ds, sig)
        yield meta_data, data
def test_run_study_bounds_fail(model_name, dataset, scorer, n_calls,
                               n_suggestions, seed):
    prob_type = data.get_problem_type(dataset)
    assume(scorer in data.METRICS_LOOKUP[prob_type])

    function_instance = SklearnModel(model_name, dataset, scorer)
    optimizer = OutOfBoundsOptimizer(function_instance.get_api_config(),
                                     random=np.random.RandomState(seed))
    optimizer.get_version()

    # pytest have some assert failed tools we could use instead, but this is ok for now
    bounds_fails = False
    try:
        exp.run_study(optimizer,
                      function_instance,
                      n_calls,
                      n_suggestions,
                      n_obj=len(function_instance.objective_names))
    except Exception as e:
        bounds_fails = str(e) == "Optimizer suggestion is out of range."
    assert bounds_fails
def test_run_study_callback(model_name, dataset, scorer, n_calls,
                            n_suggestions, seed):
    prob_type = data.get_problem_type(dataset)
    assume(scorer in data.METRICS_LOOKUP[prob_type])

    function_instance = SklearnModel(model_name, dataset, scorer)
    optimizer = RandomOptimizer(function_instance.get_api_config(),
                                random=np.random.RandomState(seed))
    optimizer.get_version()
    n_obj = len(function_instance.objective_names)

    function_evals_cmin = np.zeros((n_calls, n_obj), dtype=float)
    iters_list = []

    def callback(f_min, iters):
        assert f_min.shape == (n_obj, )

        iters_list.append(iters)
        if iters == 0:
            assert np.all(f_min == np.inf)
            return

        function_evals_cmin[iters - 1, :] = f_min

    function_evals, _, _ = exp.run_study(optimizer,
                                         function_instance,
                                         n_calls,
                                         n_suggestions,
                                         n_obj=n_obj,
                                         callback=callback)

    assert iters_list == list(range(n_calls + 1))

    for ii in range(n_obj):
        for jj in range(n_calls):
            idx0, idx1 = np_util.argmin_2d(function_evals[:jj + 1, :, 0])
            assert function_evals_cmin[jj, ii] == function_evals[idx0, idx1,
                                                                 ii]
Esempio n. 8
0
def experiment_main(opt_class, args=None):  # pragma: main
    """This is in effect the `main` routine for this experiment. However, it is called from the optimizer wrapper file
    so the class can be passed in. The optimizers are assumed to be outside the package, so the optimizer class can't
    be named from inside the main function without using hacky stuff like `eval`.
    """
    if args is None:
        description = "Run a study with one benchmark function and an optimizer"
        args = cmd.parse_args(cmd.experiment_parser(description))
    args[CmdArgs.opt_rev] = opt_class.get_version()

    run_uuid = uuid.UUID(args[CmdArgs.uuid])

    logging.captureWarnings(True)

    # Setup logging to both a file and stdout (if verbose is set to True)
    logger.setLevel(logging.INFO)  # Note this is the module-wide logger
    logfile = XRSerializer.logging_path(args[CmdArgs.db_root],
                                        args[CmdArgs.db], run_uuid)
    logger_file_handler = logging.FileHandler(logfile, mode="w")
    logger.addHandler(logger_file_handler)
    if args[CmdArgs.verbose]:
        logger.addHandler(logging.StreamHandler())

    warnings_logger = logging.getLogger("py.warnings")
    warnings_logger.addHandler(logger_file_handler)
    if args[CmdArgs.verbose]:
        warnings_logger.addHandler(logging.StreamHandler())

    logger.info("running: %s" % str(cmd.serializable_dict(args)))
    logger.info("cmd: %s" % cmd.cmd_str())

    assert (args[CmdArgs.metric]
            in METRICS_LOOKUP[get_problem_type(args[CmdArgs.data])]
            ), "reg/clf metrics can only be used on compatible dataset"

    # Setup random streams for computing the signature, must use same seed
    # across all runs to ensure signature is consistent. This seed is random:
    _setup_seeds(
        "7e9f2cabb0dd4f44bc10cf18e440b427")  # pragma: allowlist secret
    signature = get_objective_signature(args[CmdArgs.classifier],
                                        args[CmdArgs.data],
                                        args[CmdArgs.metric],
                                        data_root=args[CmdArgs.data_root])
    logger.info("computed signature: %s" % str(signature))

    opt_kwargs = load_optimizer_kwargs(args[CmdArgs.optimizer],
                                       args[CmdArgs.optimizer_root])

    # Setup the call back for intermediate logging
    if cc.BASELINE not in XRSerializer.get_derived_keys(args[CmdArgs.db_root],
                                                        db=args[CmdArgs.db]):
        warnings.warn("Baselines not found. Will not log intermediate scores.")
        callback = None
    else:
        test_case_str = SklearnModel.test_case_str(args[CmdArgs.classifier],
                                                   args[CmdArgs.data],
                                                   args[CmdArgs.metric])
        optimizer_str = str_join_safe(
            ARG_DELIM, (args[CmdArgs.optimizer], args[CmdArgs.opt_rev],
                        args[CmdArgs.rev]))

        baseline_ds, baselines_meta = XRSerializer.load_derived(
            args[CmdArgs.db_root], db=args[CmdArgs.db], key=cc.BASELINE)

        # Check the objective function signatures match in the baseline file
        sig_errs, _ = analyze_signature_pair({test_case_str: signature[1]},
                                             baselines_meta["signature"])
        logger.info("Signature errors:\n%s" % sig_errs.to_string())
        print(json.dumps({"exp sig errors": sig_errs.T.to_dict()}))

        def log_mean_score_json(evals, iters):
            assert evals.shape == (len(OBJECTIVE_NAMES), )
            assert not np.any(np.isnan(evals))

            log_msg = {
                cc.TEST_CASE: test_case_str,
                cc.METHOD: optimizer_str,
                cc.TRIAL: args[CmdArgs.uuid],
                cc.ITER: iters,
            }

            for idx, obj in enumerate(OBJECTIVE_NAMES):
                assert OBJECTIVE_NAMES[idx] == obj

                # Extract relevant rescaling info
                slice_ = {cc.TEST_CASE: test_case_str, OBJECTIVE: obj}
                best_opt = baseline_ds[cc.PERF_BEST].sel(
                    slice_, drop=True).values.item()
                base_clip_val = baseline_ds[cc.PERF_CLIP].sel(
                    slice_, drop=True).values.item()

                # Perform the same rescaling as found in experiment_analysis.compute_aggregates()
                score = linear_rescale(evals[idx],
                                       best_opt,
                                       base_clip_val,
                                       0.0,
                                       1.0,
                                       enforce_bounds=False)
                # Also, clip the score from below at -1 to limit max influence of single run on final average
                score = np.clip(score, -1.0, 1.0)
                score = score.item()  # Make easiest for logging in JSON
                assert isinstance(score, float)

                # Note: This is not the raw score but the rescaled one!
                log_msg[obj] = score
            log_msg = json.dumps(log_msg)
            print(log_msg, flush=True)
            # One second safety delay to protect against subprocess stdout getting lost
            sleep(1)

        callback = log_mean_score_json

    # Now set the seeds for the actual experiment
    _setup_seeds(args[CmdArgs.uuid])

    # Now do the experiment
    logger.info("starting sklearn study %s %s %s %s %d %d" % (
        args[CmdArgs.optimizer],
        args[CmdArgs.classifier],
        args[CmdArgs.data],
        args[CmdArgs.metric],
        args[CmdArgs.n_calls],
        args[CmdArgs.n_suggest],
    ))
    logger.info("with data root: %s" % args[CmdArgs.data_root])
    function_evals, timing, suggest_log = run_sklearn_study(
        opt_class,
        opt_kwargs,
        args[CmdArgs.classifier],
        args[CmdArgs.data],
        args[CmdArgs.metric],
        args[CmdArgs.n_calls],
        args[CmdArgs.n_suggest],
        data_root=args[CmdArgs.data_root],
        callback=callback,
    )

    # Curate results into clean dataframes
    eval_ds = build_eval_ds(function_evals, OBJECTIVE_NAMES)
    time_ds = build_timing_ds(*timing)
    suggest_ds = build_suggest_ds(suggest_log)

    # setup meta:
    meta = {"args": cmd.serializable_dict(args), "signature": signature}
    logger.info("saving meta data: %s" % str(meta))

    # Now the final IO to export the results
    logger.info("saving results")
    XRSerializer.save(eval_ds,
                      meta,
                      args[CmdArgs.db_root],
                      db=args[CmdArgs.db],
                      key=cc.EVAL,
                      uuid_=run_uuid)

    logger.info("saving timing")
    XRSerializer.save(time_ds,
                      meta,
                      args[CmdArgs.db_root],
                      db=args[CmdArgs.db],
                      key=cc.TIME,
                      uuid_=run_uuid)

    logger.info("saving suggest log")
    XRSerializer.save(suggest_ds,
                      meta,
                      args[CmdArgs.db_root],
                      db=args[CmdArgs.db],
                      key=cc.SUGGEST_LOG,
                      uuid_=run_uuid)

    logger.info("done")