def test_evaluate_criterion_returns_nan(minimal_params):
    """Evaluating a criterion whose output contains NaN raises a ValueError."""

    @expand_criterion_output
    def introduce_nan(params, useless_arg):
        # Write a NaN at row label 0 of the "value" column before returning it,
        # so the criterion output is guaranteed to contain a NaN.
        params.loc[0, "value"] = np.nan
        return params["value"].to_numpy()

    with pytest.raises(ValueError):
        tp._evaluate_criterion(
            introduce_nan, minimal_params, {"useless_arg": "hello world"}
        )
def test_evaluate_criterion_array(minimal_params):
    """Check the evaluation of a criterion that returns an array.

    The criterion returns the "value" column as a numpy array. The test checks
    the scalar fitness value and the comparison plot data returned by
    ``_evaluate_criterion``; the third return value is not inspected.

    """

    def return_array(params, useless_arg):
        return params["value"].to_numpy()

    expanded_crit = expand_criterion_output(return_array)
    crit_kwargs = {"useless_arg": "hello world"}

    # NOTE(review): presumably the mean of the squared entries of the fixture's
    # "value" column -- confirm against the ``minimal_params`` fixture.
    expected_fitness_eval = 13.66666666666666666666666
    expected_comparison_plot_data = minimal_params

    res_fitness, res_cp_data, _ = tp._evaluate_criterion(
        expanded_crit, minimal_params, crit_kwargs
    )

    # The expected value is not exactly representable as a float, so compare
    # with a tolerance instead of relying on bitwise equality.
    assert res_fitness == pytest.approx(expected_fitness_eval)
    afe(res_cp_data, expected_comparison_plot_data)
def test_evaluate_criterion_scalar(minimal_params):
    """Check the evaluation of a criterion that returns a single scalar.

    The fitness value must equal 3 and the comparison plot data must be a
    DataFrame with one "value" column holding a single NaN.

    """

    @expand_criterion_output
    def crit_func(params, useless_arg):
        return params["value"].mean()

    # Expected comparison plot data: a single NaN in the "value" column.
    nan_frame = pd.DataFrame()
    nan_frame["value"] = [np.nan]

    fitness, cp_data, _ = tp._evaluate_criterion(
        crit_func, minimal_params, {"useless_arg": "hello world"}
    )

    assert fitness == 3
    afe(cp_data, nan_frame)
Exemple #4
0
def _single_minimize(
    criterion,
    params,
    algorithm,
    criterion_kwargs,
    constraints,
    general_options,
    algo_options,
    gradient,
    gradient_options,
    logging,
    log_options,
    dashboard,
    db_options,
):
    """Minimize *criterion* using *algorithm* subject to *constraints* and bounds.

    Only one minimization is carried out.

    Args:
        criterion (function):
            Python function that takes a pandas DataFrame with parameters as the first
            argument. Its (first) return is either a scalar or an array; for an array
            the start fitness is the mean of the squared entries. An optional second
            return holds the comparison plot data.

        params (pd.DataFrame):
            See :ref:`params`.

        algorithm (str):
            specifies the optimization algorithm. See :ref:`list_of_algorithms`.

        criterion_kwargs (dict):
            additional keyword arguments for criterion

        constraints (list):
            list with constraint dictionaries.

        general_options (dict):
            additional configurations for the optimization. This dictionary is
            modified in place: the key ``"_maximization"`` is removed and the key
            ``"start_criterion_value"`` is set.

        algo_options (dict):
            algorithm specific configurations for the optimization

        gradient (callable or None):
            Gradient function.

        gradient_options (dict):
            Options for the gradient function.

        logging (str or pathlib.Path): Path to an sqlite3 file which typically has the
            file extension ``.db``. If the file does not exist, it will be created. See
            :ref:`logging` for details. A falsy value disables logging.

        log_options (dict): Keyword arguments to influence the logging. See
            :ref:`logging` for details.

        dashboard (bool):
            whether to create and show a dashboard

        db_options (dict):
            dictionary with kwargs to be supplied to the run_server function.

    Returns:
        tuple: The result and the params as returned by ``_internal_minimize``.

    Raises:
        ValueError: If the criterion evaluated at the start parameters is NaN.

    """
    # NOTE: this changes the process-wide warnings filter and is not undone.
    simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
    params = _process_params(params)

    # Apply decorator to handle criterion functions with one or two returns.
    criterion = expand_criterion_output(criterion)

    is_maximization = general_options.pop("_maximization", False)
    criterion = negative_criterion(criterion) if is_maximization else criterion
    # The criterion is already sign-flipped for maximization; the factor flips
    # the reported start value once more.
    fitness_factor = -1 if is_maximization else 1

    criterion_out, comparison_plot_data = criterion(params, **criterion_kwargs)
    if np.isscalar(criterion_out):
        fitness_eval = fitness_factor * criterion_out
    else:
        fitness_eval = fitness_factor * np.mean(np.square(criterion_out))

    if np.any(np.isnan(fitness_eval)):
        raise ValueError(
            "The criterion function evaluated at the start parameters returns NaNs."
        )

    database = (
        prepare_database(logging, params, comparison_plot_data, log_options)
        if logging
        else False
    )

    general_options["start_criterion_value"] = fitness_eval

    constraints, params = process_constraints(constraints, params)
    internal_params = reparametrize_to_internal(params, constraints)

    queue = Queue() if dashboard else None
    if dashboard:
        stop_signal = Event()
        outer_server_process = Process(
            target=run_server,
            kwargs={
                "queue": queue,
                "db_options": db_options,
                "start_param_df": params,
                "start_fitness": fitness_eval,
                "stop_signal": stop_signal,
            },
            daemon=False,
        )
        outer_server_process.start()

    try:
        result, params = _internal_minimize(
            criterion=criterion,
            criterion_kwargs=criterion_kwargs,
            params=params,
            internal_params=internal_params,
            constraints=constraints,
            algorithm=algorithm,
            algo_options=algo_options,
            gradient=gradient,
            gradient_options=gradient_options,
            general_options=general_options,
            database=database,
            queue=queue,
            fitness_factor=fitness_factor,
        )
    finally:
        # The server process is not a daemon, so it has to be shut down even if
        # the optimization fails; otherwise it would outlive the optimization.
        if dashboard:
            stop_signal.set()
            outer_server_process.terminate()

    return result, params
Exemple #5
0
def maximize_log_likelihood(
    log_like_obs,
    params,
    algorithm,
    criterion_kwargs=None,
    constraints=None,
    general_options=None,
    algo_options=None,
    gradient_options=None,
    logging=DEFAULT_DATABASE_NAME,
    log_options=None,
    dashboard=False,
    dash_options=None,
):
    """Estimate parameters via maximum likelihood.

    Convenience interface for maximum likelihood estimation. In the future, it will
    also calculate standard errors for the solution.

    ``log_like_obs`` has to return an array of log likelihood contributions at the
    first position, not the mean log likelihood. The array is aggregated internally:
    the mean is maximized and, after the optimization, the reported ``"fitness"`` is
    multiplied by the number of contributions to convert the mean log likelihood
    into the log likelihood.

    The second return can be a :class:`pandas.DataFrame` in the `tidy data format`_
    to display the distribution of contributions for subgroups via the comparison
    plot in the future.

    Restricting the interface to log likelihoods instead of likelihoods may seem
    unnecessarily restrictive, but it is preferred for two reasons.

    1. Optimization methods which rely on gradients generally work better optimizing
       the log transformation. See `1`_ for a simplified example.

    2. Converting products of probabilities to sums of log probabilities is
       numerically more stable as it prevents over- and underflows. See `2`_ for an
       example.

    Args:
        log_like_obs (callable or list of callables):
            Python function that takes a pandas DataFrame with parameters as the
            first argument and returns an array of log likelihood contributions as
            the first return.

        params (pd.DataFrame or list of pd.DataFrames):
            See :ref:`params`.

        algorithm (str or list of strings):
            specifies the optimization algorithm. See :ref:`list_of_algorithms`.

        criterion_kwargs (dict or list of dicts):
            additional keyword arguments for criterion

        constraints (list or list of lists):
            list with constraint dictionaries.

        general_options (dict):
            additional configurations for the optimization

        algo_options (dict or list of dicts):
            algorithm specific configurations for the optimization

        gradient_options (dict):
            Options for the gradient function.

        logging (str or pathlib.Path): Path to an sqlite3 file which typically has
            the file extension ``.db``. If the file does not exist, it will be
            created. See :ref:`logging` for details.

        log_options (dict): Keyword arguments to influence the logging. See
            :ref:`logging` for details.

        dashboard (bool):
            whether to create and show a dashboard. See :ref:`dashboard` for details.

        dash_options (dict):
            dictionary with kwargs for the dashboard. See :ref:`dashboard` for
            details.

    Returns:
        results (tuple or list of tuples):
            The return is either a tuple containing a dictionary of the results and
            the parameters or a list of tuples containing multiples of the former.

    .. _tidy data format:
        http://dx.doi.org/10.18637/jss.v059.i10

    .. _1:
        https://stats.stackexchange.com/a/176563/218971

    .. _2:
        https://statmodeling.stat.columbia.edu/2016/06/11/log-sum-of-exponentials/

    """
    # Harmonize the criterion output first, then aggregate the contributions with
    # the mean so that the optimizer works on a scalar.
    if not isinstance(log_like_obs, list):
        expanded = expand_criterion_output(log_like_obs)
        mean_criterion = aggregate_criterion_output(np.mean)(expanded)
    else:
        expanded = list(map(expand_criterion_output, log_like_obs))
        mean_criterion = [
            aggregate_criterion_output(np.mean)(func) for func in expanded
        ]

    results = maximize(
        mean_criterion,
        params,
        algorithm,
        criterion_kwargs,
        constraints,
        general_options,
        algo_options,
        gradient_options,
        logging,
        log_options,
        dashboard,
        dash_options,
    )

    # The optimizer reports the mean log likelihood. Evaluate the non-aggregated
    # criterion once per optimization to learn the number of contributions.
    arguments = broadcast_arguments(
        criterion=expanded, params=params, criterion_kwargs=criterion_kwargs
    )
    check_arguments(arguments)

    n_contributions = []
    for run in arguments:
        output = run["criterion"](run["params"], **run["criterion_kwargs"])
        n_contributions.append(len(output[0]))

    # Rescale the mean log likelihood to the log likelihood.
    if not isinstance(results, list):
        results[0]["fitness"] = results[0]["fitness"] * n_contributions[0]
    else:
        for single_result, n_obs in zip(results, n_contributions):
            single_result[0]["fitness"] = single_result[0]["fitness"] * n_obs

    return results
def transform_problem(
    criterion,
    params,
    algorithm,
    criterion_kwargs,
    constraints,
    general_options,
    algo_options,
    gradient,
    gradient_kwargs,
    gradient_options,
    logging,
    log_options,
    dashboard,
    dash_options,
):
    """Transform the user supplied problem.

    The transformed optimization problem is derived from the original problem, which
    consists of the user supplied criterion, params DataFrame, criterion_kwargs,
    constraints and gradient (if supplied). In addition, the transformed problem
    provides sophisticated logging tools if activated by the user.

    The transformed problem can be solved by almost any optimizer package:
        1. The only constraints are bounds on the parameters.
        2. The internal_criterion function takes an one dimensional np.array as input.
        3. The internal criterion function returns a scalar value
            (except for the case of the tao_pounders algorithm).

    Note that because of the reparametrizations done by estimagic to implement
    constraints on behalf of the user the internal params cannot be interpreted
    without reparametrizing it to the full params DataFrame.

    Args:
        criterion (callable or list of callables): Python function that takes a pandas
            DataFrame with parameters as the first argument. Supported outputs are:
                - scalar floating point
                - np.ndarray: contributions for the tao Pounders algorithm.
                - tuple of a scalar floating point and a pd.DataFrame:
                    In this case the first output is the criterion value.
                    The second output are the comparison_plot_data.
                    See :ref:`comparison_plot`.
                    .. warning::
                        This feature is not implemented in the dashboard yet.
        params (pd.DataFrame or list of pd.DataFrames): See :ref:`params`.
        algorithm (str or list of strings): Name of the optimization algorithm.
            See :ref:`list_of_algorithms`.
        criterion_kwargs (dict or list of dict): Additional criterion keyword arguments.
        constraints (list or list of lists): List with constraint dictionaries.
            See :ref:`constraints` for details.
        general_options (dict): Additional configurations for the optimization.
            The key ``"_maximization"`` is removed from the passed dictionary.
            Keys can include:
                - keep_dashboard_alive (bool): if True and dashboard is True the process
                    in which the dashboard is run is not terminated when maximize or
                    minimize finish.
        algo_options (dict or list of dicts): Algorithm specific configurations.
        gradient: User supplied gradient, if any. It is forwarded to the construction
            of the internal gradient.
        gradient_kwargs (dict): Additional keyword arguments for the gradient.
        gradient_options (dict): Options for the gradient function.
        logging (str or pathlib.Path or list thereof): Path to an sqlite3 file which
            typically has the file extension ``.db``. If the file does not exist,
            it will be created. See :ref:`logging` for details.
        log_options (dict or list of dict): Keyword arguments to influence the logging.
            See :ref:`logging` for details.
        dashboard (bool): Whether to create and show a dashboard, default is False.
            See :ref:`dashboard` for details.
        dash_options (dict or list of dict, optional): Options passed to the dashboard.
            Supported keys are:
                - port (int): port where to display the dashboard
                - no_browser (bool): whether to display the dashboard in a browser
                - rollover (int): how many iterations to keep in the monitoring plots

    Returns:
        optim_kwargs (dict): Dictionary collecting all arguments that are going to be
            passed to _internal_minimize.
        database_path (str or pathlib.Path or None): Path to the database.
        result_kwargs (dict): Arguments needed to reparametrize back from the internal
            parameter array to the params DataFrame of the user supplied problem.
            In addition it contains whether the dashboard process should be kept alive
            after the optimization(s) terminate(s).

    """
    optim_kwargs, params, dash_options, database_path = _pre_process_arguments(
        params=params,
        algorithm=algorithm,
        algo_options=algo_options,
        logging=logging,
        dashboard=dashboard,
        dash_options=dash_options,
    )

    # Harmonize the criterion interface; a maximization is expressed through the
    # negated criterion.
    is_maximization = general_options.pop("_maximization", False)
    criterion = expand_criterion_output(criterion)
    if is_maximization:
        criterion = negative_criterion(criterion)

    # First criterion evaluation for the database and the pounders algorithm.
    fitness_eval, comparison_plot_data, raw_result = _evaluate_criterion(
        criterion=criterion, params=params, criterion_kwargs=criterion_kwargs
    )

    # Work on a copy from here on so the start values do not end up in the dictionary
    # passed by the caller.
    general_options = general_options.copy()
    general_options.update(
        _start_criterion_value=raw_result, start_criterion_value=fitness_eval
    )

    # Transform the user supplied inputs into the internal inputs while silencing
    # pandas performance warnings.
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

        constraints, params = process_constraints(constraints, params)
        internal_params = reparametrize_to_internal(params, constraints)
        bounds = _get_internal_bounds(params)

    # Set up the database that is passed to the internal functions for logging.
    database = (
        prepare_database(
            path=logging,
            params=params,
            comparison_plot_data=comparison_plot_data,
            dash_options=dash_options,
            constraints=constraints,
            **log_options,
        )
        if logging
        else False
    )

    # The logging decorator must be passed to _create_internal_criterion because the
    # internal gradient creates its own internal criterion function whose calls are
    # logged differently by the database.
    logged_tables = [
        "params_history",
        "criterion_history",
        "comparison_plot",
        "timestamps",
    ]
    logging_decorator = functools.partial(
        log_evaluation, database=database, tables=logged_tables
    )

    # Internal counterparts of the user supplied criterion and gradient.
    internal_criterion = _create_internal_criterion(
        criterion=criterion,
        params=params,
        constraints=constraints,
        criterion_kwargs=criterion_kwargs,
        logging_decorator=logging_decorator,
        general_options=general_options,
        database=database,
    )

    internal_gradient = _create_internal_gradient(
        gradient=gradient,
        gradient_kwargs=gradient_kwargs,
        gradient_options=gradient_options,
        criterion=criterion,
        params=params,
        constraints=constraints,
        criterion_kwargs=criterion_kwargs,
        general_options=general_options,
        database=database,
    )

    optim_kwargs.update(
        internal_criterion=internal_criterion,
        internal_params=internal_params,
        bounds=bounds,
        internal_gradient=internal_gradient,
        database=database,
        general_options=general_options,
    )

    # Removing the key here also removes it from the general_options stored in
    # optim_kwargs, because both names refer to the same dictionary.
    keep_dashboard_alive = general_options.pop("keep_dashboard_alive", False)
    result_kwargs = {
        "params": params,
        "constraints": constraints,
        "keep_dashboard_alive": keep_dashboard_alive,
    }
    return optim_kwargs, database_path, result_kwargs