コード例 #1
0
    def test_filtering_out_invalid_rows(self):
        """Check that `filter_out_invalid_rows` agrees with point-by-point membership tests.

        For one hierarchical and one flat parameter space, a random dataframe is generated and the
        first continuous/discrete column is doubled so that some rows no longer belong to the space.
        The index of the rows that survive `space.filter_out_invalid_rows(...)` must equal the index
        obtained by converting every row to a `Point` and testing `point in space`. For flat spaces the
        expected index is additionally recomputed via `DataFrame.apply` as a second cross-check.
        """
        spaces = [
            bayesian_optimizer_config_store.parameter_space,
            glow_worm_swarm_optimizer_config_store.parameter_space
        ]

        # Just to make sure we are testing both hierarchical and flat code paths.
        #
        assert any(space.is_hierarchical() for space in spaces)
        assert any(not space.is_hierarchical() for space in spaces)

        num_samples = 1000
        for space in spaces:
            random_dataframe_with_invalid_rows = space.random_dataframe(num_samples=num_samples)
            for dimension in space.dimensions:
                if isinstance(dimension, (ContinuousDimension, DiscreteDimension)):
                    # This makes about half of the rows invalid. Only the first numeric dimension is
                    # touched, hence the break.
                    #
                    random_dataframe_with_invalid_rows.loc[:, [dimension.name]] *= 2
                    break

            with traced(scope_name="slow_filtering"):
                # Let's filter out invalid rows the slow way: one single-row dataframe -> Point -> membership test.
                #
                valid_indices = [
                    idx
                    for idx in random_dataframe_with_invalid_rows.index
                    if Point.from_dataframe(random_dataframe_with_invalid_rows.loc[[idx]]) in space
                ]
                expected_valid_rows_index = pd.Index(valid_indices)

            print(
                f"{len(expected_valid_rows_index)}/{len(random_dataframe_with_invalid_rows.index)} rows are valid."
            )
            # The test is only meaningful if some, but not all, rows were invalidated.
            assert 0 < len(expected_valid_rows_index)
            assert len(expected_valid_rows_index) < num_samples

            # Let's filter out invalid rows the fast way.
            #
            actual_valid_rows_index = space.filter_out_invalid_rows(
                original_dataframe=random_dataframe_with_invalid_rows,
                exclude_extra_columns=True).index
            assert expected_valid_rows_index.equals(actual_valid_rows_index)

            if not space.is_hierarchical():
                # For flat spaces we can choose between the column-wise operators and the row-wise validation. This is to get the tracing data to see the
                # perf difference, but also to validate correctness by computing the desired index in yet another way.
                #
                with traced(scope_name="faster_filtering"):
                    # Each row is a Series labelled by column name, so positional access must go through
                    # .iloc: integer keys passed to Series.__getitem__ are deprecated as positions and
                    # will be treated as labels in future pandas versions.
                    row_is_valid = random_dataframe_with_invalid_rows.apply(
                        lambda row: Point(**{
                            dim_name: row.iloc[i]
                            for i, dim_name in enumerate(space.dimension_names)
                        }) in space,
                        axis=1)
                    expected_valid_rows_index_2 = random_dataframe_with_invalid_rows[row_is_valid].index
                assert expected_valid_rows_index_2.equals(actual_valid_rows_index)
コード例 #2
0
ファイル: PolynomialObjective.py プロジェクト: sycomix/MLOS
class PolynomialObjective:
    """A polynomial objective function used to study optimizer convergence.

    The polynomial is either generated randomly (coefficients drawn uniformly from a configurable
    interval, optionally with a fraction of them zeroed out and optionally keeping only the terms in
    which some variable reaches `max_degree`) or built from caller-supplied coefficients.

    `CONFIG_SPACE` describes the allowed configuration values and `_DEFAULT` supplies the defaults
    used by the constructor signature.
    """

    CONFIG_SPACE = SimpleHypergrid(
        name="polynomial_objective_config",
        dimensions=[
            DiscreteDimension(name='seed', min=1, max=2**32),
            DiscreteDimension(name='input_domain_dimension', min=1, max=5),
            ContinuousDimension(name='input_domain_min', min=-2**20, max=2**20),
            ContinuousDimension(name='input_domain_width', min=1, max=2**21),
            DiscreteDimension(name='max_degree', min=1, max=5),
            CategoricalDimension(name='include_mixed_coefficients', values=[False, True]),
            ContinuousDimension(name='percent_coefficients_zeroed', min=0.0, max=1.0),
            ContinuousDimension(name='coefficient_domain_min', min=-2**32, max=2**32),
            ContinuousDimension(name='coefficient_domain_width', min=1, max=2**32),
            CategoricalDimension(name='include_noise', values=[False, True]),
            ContinuousDimension(name='noise_coefficient_of_variation', min=0.0, max=1.0),
        ],
    )
    # needs constraint coefficient_domain_min < coefficient_domain_max

    _DEFAULT = Point(
        seed=17,
        input_domain_dimension=2,
        input_domain_min=-2**10,
        input_domain_width=2**11,
        max_degree=2,
        include_mixed_coefficients=True,
        percent_coefficients_zeroed=0.0,
        coefficient_domain_min=-10.0,
        coefficient_domain_width=9.0,
        include_noise=False,
        noise_coefficient_of_variation=0.0,
    )

    def __init__(
            self,
            seed: int = 17,
            input_domain_dimension: int = _DEFAULT.input_domain_dimension,
            max_degree: int = _DEFAULT.max_degree,
            include_mixed_coefficients: bool = _DEFAULT.include_mixed_coefficients,
            percent_coefficients_zeroed: float = _DEFAULT.percent_coefficients_zeroed,
            coefficient_domain_min: float = _DEFAULT.coefficient_domain_min,
            coefficient_domain_width: float = _DEFAULT.coefficient_domain_width,
            include_noise: bool = _DEFAULT.include_noise,
            noise_coefficient_of_variation: float = _DEFAULT.noise_coefficient_of_variation,
            coefficients=None,
            logger=None):
        """Build the polynomial, either randomly or from explicit coefficients.

        :param seed: Passed to `np.random.seed` before random coefficients are drawn.
        :param input_domain_dimension: Number of input variables of the polynomial.
        :param max_degree: Degree handed to `PolynomialFeatures`.
        :param include_mixed_coefficients: If False, every term whose largest single-variable power
            differs from `max_degree` gets a zero coefficient.
        :param percent_coefficients_zeroed: Fraction of coefficients randomly reset to 0.
        :param coefficient_domain_min: Lower bound of the uniform coefficient distribution.
        :param coefficient_domain_width: Width of the uniform coefficient distribution.
        :param include_noise: If True, `evaluate` returns normally distributed noisy values.
        :param noise_coefficient_of_variation: Ratio of noise standard deviation to |y|.
        :param coefficients: If specified, will override random generation of a polynomial even if
            `seed` arg is specified. Its length must equal the number of polynomial terms.
        :param logger: Optional logger; one is created when omitted.
        """
        self.logger = create_logger("PolynomialObjective") if logger is None else logger

        self.seed = seed
        self.input_domain_dimension = input_domain_dimension
        self.max_degree = max_degree
        self.include_mixed_coefficients = include_mixed_coefficients
        self.percent_coefficients_zeroed = percent_coefficients_zeroed
        self.coefficient_domain_min = coefficient_domain_min
        self.coefficient_domain_max = coefficient_domain_min + coefficient_domain_width
        self.coefficients = coefficients
        self.include_noise = include_noise
        self.noise_coefficient_of_variation = noise_coefficient_of_variation

        self.coef_ = []

        # confirm min < max constraint
        assert coefficient_domain_min < self.coefficient_domain_max, 'Minimum coefficient range must be less than maximum'

        # Fit the feature expansion on a throw-away row of ones purely to learn how many
        # polynomial terms (and therefore coefficients) this configuration has.
        self.polynomial_features_ = PolynomialFeatures(degree=self.max_degree)
        probe_row = np.array([1] * self.input_domain_dimension).reshape(1, -1)
        expanded_probe = self.polynomial_features_.fit_transform(probe_row)
        self.num_expected_coefficients_ = len(expanded_probe[0])

        if coefficients is not None:
            # test if degree specified is consistent with number of coefficients passed
            num_specified_coefficients = len(self.coefficients)
            assert num_specified_coefficients == self.num_expected_coefficients_, \
                'Failed to find sufficient number of coefficients for specified polynomial degree'

            self.coef_ = np.array(self.coefficients)
            return

        # generate random polynomial if coefficients not specified
        np.random.seed(self.seed)

        # Kept as a plain list while individual entries are being zeroed.
        drawn_coefficients = list(
            np.random.uniform(self.coefficient_domain_min,
                              self.coefficient_domain_max,
                              self.num_expected_coefficients_))

        if self.percent_coefficients_zeroed > 0.0:
            # reset a random subset of coefficients to 0
            num_coef_to_zero = int(self.percent_coefficients_zeroed * self.num_expected_coefficients_)
            indices_to_zero = np.random.choice(
                range(self.num_expected_coefficients_),
                size=num_coef_to_zero,
                replace=False)
            for term_index in indices_to_zero:
                drawn_coefficients[term_index] = 0.0

        # eliminate mixed variable terms if requested
        if not self.include_mixed_coefficients:
            # zero term coef where input's power != max_degree
            for term_index, term_powers in enumerate(self.polynomial_features_.powers_):
                if np.max(term_powers) != self.max_degree:
                    drawn_coefficients[term_index] = 0.0

        # convert to np.array to enable matmul evaluations
        self.coef_ = np.array(drawn_coefficients)

    def evaluate(self, x):
        """Evaluate the polynomial at the rows of `x`.

        `x` is expanded with `PolynomialFeatures.fit_transform` and multiplied by the coefficient
        vector. When `include_noise` is set, each value is replaced by a draw from a normal
        distribution centred on it with standard deviation |noise_coefficient_of_variation * y|.
        """
        y = np.matmul(self.polynomial_features_.fit_transform(x), self.coef_)
        if not self.include_noise:
            return y

        num_values = len(y)
        y_cv = np.tile(self.noise_coefficient_of_variation, [num_values])
        y_std = np.abs(y_cv * y)
        return np.random.normal(y, y_std, [num_values])
コード例 #3
0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
from mlos.Spaces import CategoricalDimension, ContinuousDimension, DiscreteDimension, Point, SimpleHypergrid
from mlos.Spaces.Configs import ComponentConfigStore

# Configuration store for the hypersphere objective: number of objectives, which of them are
# minimized ("all", "none" or "some") and a strictly positive radius of at most 100.
hypersphere_config_store = ComponentConfigStore(
    parameter_space=SimpleHypergrid(
        name="hypersphere_config",
        dimensions=[
            DiscreteDimension(name="num_objectives", min=1, max=100),
            CategoricalDimension(name="minimize", values=["all", "none", "some"]),
            ContinuousDimension(name="radius", min=0, max=100, include_min=False),
        ],
    ),
    default=Point(num_objectives=3, minimize="all", radius=10),
)

# Register one named config per (objective count, minimization mode) combination, all with radius 10.
# NOTE(review): the description always says "to maximize", regardless of the `minimize` value - confirm
# whether that wording is intended.
for num_objectives, minimize in [(count, mode) for count in [2, 10] for mode in ["all", "none", "some"]]:
    hypersphere_config_store.add_config_by_name(
        config_name=f"{num_objectives}d_hypersphere_minimize_{minimize}",
        config_point=Point(num_objectives=num_objectives, minimize=minimize, radius=10),
        description=f"An objective function with {num_objectives + 1} parameters and {num_objectives} objectives to maximize.",
    )
コード例 #4
0
ファイル: ExperimentDesigner.py プロジェクト: microsoft/MLOS
                on_external_dimension=CategoricalDimension(
                    'numeric_optimizer_implementation',
                    values=[RandomSearchOptimizer.__name__])).
    join(subgrid=glow_worm_swarm_optimizer_config_store.parameter_space,
         on_external_dimension=CategoricalDimension(
             'numeric_optimizer_implementation',
             values=[GlowWormSwarmOptimizer.__name__])).
    join(subgrid=random_near_incumbent_optimizer_config_store.parameter_space,
         on_external_dimension=CategoricalDimension(
             'numeric_optimizer_implementation',
             values=[RandomNearIncumbentOptimizer.__name__])),
    default=Point(
        utility_function_implementation=ConfidenceBoundUtilityFunction.
        __name__,
        numeric_optimizer_implementation=RandomSearchOptimizer.__name__,
        confidence_bound_utility_function_config=
        confidence_bound_utility_function_config_store.default,
        random_search_optimizer_config=random_search_optimizer_config_store.
        default,
        fraction_random_suggestions=0.5))

experiment_designer_config_store.add_config_by_name(
    config_name="default_random_near_incumbent_config",
    config_point=Point(
        utility_function_implementation=
        MultiObjectiveProbabilityOfImprovementUtilityFunction.__name__,
        numeric_optimizer_implementation=RandomNearIncumbentOptimizer.__name__,
        multi_objective_probability_of_improvement_config=
        multi_objective_probability_of_improvement_utility_function_config_store
        .default,
        random_near_incumbent_optimizer_config=
コード例 #5
0
ファイル: TestBayesianOptimizer.py プロジェクト: sycomix/MLOS
    def test_optimization_with_context(self):
        """Exercise the optimizer API on a problem that has a context space.

        The objective is a negated Gaussian bump in `x` whose centre, 0.5 * y + 0.5, depends on the
        context variable `y`. The test checks, in order:
          * the local optimizer rejects register/suggest/predict calls with missing or misaligned context,
          * the local optimizer rejects optimum definitions that do not fit a contextual problem,
          * the remote optimizer raises NotImplementedError whenever context is supplied and
            grpc.RpcError when required context is missing,
          * after 100 suggest/register rounds the speculative optimum within context y=1 has x > 0.6
            and within context y=-1 has x < 0.4.
        """
        # Gaussian blob in x with position dependent on context variable y.
        def f(parameters, context):
            # Accepts either dataframes (vectorized) or single points; for points a one-row frame is returned.
            if isinstance(parameters, pd.DataFrame):
                index = parameters.index
            else:
                index = [0]
            return pd.DataFrame(
                {
                    'function_value':
                    -np.exp(-50 * (parameters.x - 0.5 * context.y - 0.5)**2)
                },
                index=index)

        input_space = SimpleHypergrid(
            name="input",
            dimensions=[ContinuousDimension(name="x", min=0, max=1)])
        output_space = SimpleHypergrid(name="objective",
                                       dimensions=[
                                           ContinuousDimension(
                                               name="function_value",
                                               min=-10,
                                               max=10)
                                       ])
        context_space = SimpleHypergrid(
            name="context",
            dimensions=[ContinuousDimension(name="y", min=-1, max=1)])

        optimization_problem = OptimizationProblem(
            parameter_space=input_space,
            objective_space=output_space,
            # we want to minimize the function
            objectives=[Objective(name="function_value", minimize=True)],
            context_space=context_space)

        # create some data points to eval
        n_samples = 100
        parameter_df = input_space.random_dataframe(n_samples)
        context_df = context_space.random_dataframe(n_samples)

        target_df = f(parameter_df, context_df)

        local_optimizer = self.bayesian_optimizer_factory.create_local_optimizer(
            optimization_problem=optimization_problem, )

        # Registering without context must fail because the problem declares a context space.
        with pytest.raises(ValueError, match="Context required"):
            local_optimizer.register(
                parameter_values_pandas_frame=parameter_df,
                target_values_pandas_frame=target_df)

        # Context with one row fewer than the parameters must be rejected.
        with pytest.raises(
                ValueError,
                match="Incompatible shape of parameters and context"):
            local_optimizer.register(
                parameter_values_pandas_frame=parameter_df,
                target_values_pandas_frame=target_df,
                context_values_pandas_frame=context_df.iloc[:-1])

        # Properly aligned parameters, targets and context are accepted.
        local_optimizer.register(parameter_values_pandas_frame=parameter_df,
                                 target_values_pandas_frame=target_df,
                                 context_values_pandas_frame=context_df)

        with pytest.raises(ValueError, match="Context required"):
            local_optimizer.suggest()

        with pytest.raises(ValueError, match="Context required"):
            local_optimizer.predict(parameter_values_pandas_frame=parameter_df)

        # With a context supplied, the suggestion must be a Point inside the parameter space.
        suggestion = local_optimizer.suggest(context=context_space.random())
        assert isinstance(suggestion, Point)
        assert suggestion in input_space

        with pytest.raises(
                ValueError,
                match="Incompatible shape of parameters and context"):
            # unaligned parameters and context
            local_optimizer.predict(
                parameter_values_pandas_frame=parameter_df,
                context_values_pandas_frame=context_df.iloc[:-1])

        # Aligned prediction yields one prediction row per parameter row.
        predictions = local_optimizer.predict(
            parameter_values_pandas_frame=parameter_df,
            context_values_pandas_frame=context_df)
        predictions_df = predictions.get_dataframe()
        assert len(predictions_df) == len(parameter_df)

        remote_optimizer = self.bayesian_optimizer_factory.create_remote_optimizer(
            optimization_problem=optimization_problem, )

        # BEST_OBSERVATION is expected to be rejected both with and without an explicit context argument.
        with pytest.raises(ValueError,
                           match="not supported if context is provided"):
            local_optimizer.optimum(
                optimum_definition=OptimumDefinition.BEST_OBSERVATION,
                context=Point(y=0).to_dataframe())

        with pytest.raises(ValueError,
                           match="not supported if context is provided"):
            local_optimizer.optimum(
                optimum_definition=OptimumDefinition.BEST_OBSERVATION)

        # BEST_SPECULATIVE_WITHIN_CONTEXT cannot be computed without a context.
        with pytest.raises(ValueError,
                           match="requires context to be not None"):
            local_optimizer.optimum(optimum_definition=OptimumDefinition.
                                    BEST_SPECULATIVE_WITHIN_CONTEXT)

        # can't register, predict, suggest with context on remote optimizer
        with pytest.raises(NotImplementedError,
                           match="Context not currently supported"):
            remote_optimizer.register(
                parameter_values_pandas_frame=parameter_df,
                target_values_pandas_frame=target_df,
                context_values_pandas_frame=context_df)

        with pytest.raises(NotImplementedError,
                           match="Context not currently supported"):
            remote_optimizer.predict(
                parameter_values_pandas_frame=parameter_df,
                context_values_pandas_frame=context_df)

        with pytest.raises(NotImplementedError,
                           match="Context not currently supported"):
            remote_optimizer.suggest(context=context_df)

        # context is missing but required by problem, should give error
        with pytest.raises(grpc.RpcError):
            remote_optimizer.register(
                parameter_values_pandas_frame=parameter_df,
                target_values_pandas_frame=target_df)

        # run some iterations on local optimizer to see we do something sensible
        for _ in range(100):
            # pick context at random
            context = context_space.random()
            suggested_config = local_optimizer.suggest(context=context)
            target_values = f(suggested_config, context)
            local_optimizer.register(
                parameter_values_pandas_frame=suggested_config.to_dataframe(),
                target_values_pandas_frame=target_values,
                context_values_pandas_frame=context.to_dataframe())

        # f is minimal at x = 0.5 * y + 0.5, i.e. x = 0 for y = -1 and x = 1 for y = 1, so the
        # context-specific optima should land on opposite sides of the input interval.
        optimum_y_1 = local_optimizer.optimum(
            optimum_definition=OptimumDefinition.
            BEST_SPECULATIVE_WITHIN_CONTEXT,
            context=Point(y=-1).to_dataframe())
        optimum_y1 = local_optimizer.optimum(
            optimum_definition=OptimumDefinition.
            BEST_SPECULATIVE_WITHIN_CONTEXT,
            context=Point(y=1).to_dataframe())
        assert optimum_y1.x > .6
        assert optimum_y_1.x < .4
コード例 #6
0
            dimensions=[ContinuousDimension(name="value", min=0, max=2 ** 10)]
        )
    ).join(
        on_external_dimension=CategoricalDimension(name="uncertainty_type", values=["coefficient_of_variation"]),
        subgrid=SimpleHypergrid(
            name="coefficient_of_variation_config",
            dimensions=[ContinuousDimension(name="value", min=0, max=1)]
        )
    ).join(
        on_external_dimension=CategoricalDimension(name="use_objective_function", values=[True]),
        subgrid=objective_function_config_store.parameter_space
    ),
    default=Point(
        uncertainty_type="constant",
        use_objective_function=True,
        predicted_value_degrees_of_freedom=10,
        constant_uncertainty_config=Point(value=1),
        objective_function_config=objective_function_config_store.get_config_by_name("three_level_quadratic")
    ),
    description=""
)

multi_objective_pass_through_model_config_store.add_config_by_name(
    config_name="three_level_quadratic",
    config_point=Point(
        uncertainty_type="constant",
        use_objective_function=True,
        predicted_value_degrees_of_freedom=10,
        constant_uncertainty_config=Point(value=1),
        objective_function_config=objective_function_config_store.get_config_by_name("three_level_quadratic")
    )
コード例 #7
0
 def _untranslate_point(self, point: Point) -> Point:
     """Return a new Point whose dimension names are mapped back through `_backward_name_mapping`.

     Every (name, value) pair yielded by iterating `point` keeps its value; only the name is replaced.
     """
     restored_values = {}
     for translated_name, dimension_value in point:
         restored_values[self._backward_name_mapping[translated_name]] = dimension_value
     return Point(**restored_values)
コード例 #8
0
    def suggest(self, context_values_dataframe=None):  # pylint: disable=unused-argument
        """ Returns the next best configuration to try.

        The idea is pretty simple:
            1. We start with a random population of glowworms, whose luciferin levels are equal to their utility function value.
            2. Each glowworm looks around for all other glowworms in its neighborhood and finds ones that are brighter.
            3. Each glowworm randomly selects from its brighter neighbors the one to walk towards (with probability proportional to the diff in brightness).
            4. Everybody takes a step.
            5. Everybody updates step size to have the desired number of neighbors.
            6. Update luciferin levels.

        :param context_values_dataframe: None, or a dataframe with exactly one row of context values.
        :return: The parameter adapter's unprojection of the worm position with the highest utility
            after the final iteration.
        :raises UtilityValueUnavailableException: if the utility function yields no values for the
            initial random population.
        """
        assert context_values_dataframe is None or len(context_values_dataframe.index) == 1
        self.logger.info(f"Suggesting config for context: {context_values_dataframe}")

        config = self.optimizer_config

        # TODO: consider remembering great features from previous invocations of the suggest() method.
        initial_population_size = config.num_worms * config.num_initial_points_multiplier
        parameters_df = self.optimization_problem.parameter_space.random_dataframe(num_samples=initial_population_size)

        features_df = self.optimization_problem.construct_feature_dataframe(
            parameters_df=parameters_df.copy(deep=False),
            context_df=context_values_dataframe,
            product=True
        )

        self.logger.info(f"Computing utility values for the initial {len(features_df.index)} configurations.")
        utility_function_values = self.utility_function(feature_values_pandas_frame=features_df.copy(deep=False))
        num_utility_function_values = len(utility_function_values.index)
        self.logger.info(f"Obtained {num_utility_function_values} utility function values.")
        if num_utility_function_values == 0:
            error_message = f"Utility function {self.utility_function.__class__.__name__} produced no values."
            self.logger.info(error_message)
            raise UtilityValueUnavailableException(error_message)

        # TODO: keep getting configs until we have enough utility values to get started. Or assign 0 to missing ones,
        #  and let them climb out of their infeasible holes.
        top_utility_values = utility_function_values.nlargest(n=config.num_worms, columns=['utility'])
        self.logger.info(f"Selected {len(top_utility_values.index)} initial glow worm positions.")

        self.logger.info("Initializing glow worm parameters.")
        # TODO: could it be in place?
        selected_parameters = parameters_df.loc[top_utility_values.index]
        params_for_top_utility = self.parameter_adapter.project_dataframe(selected_parameters, in_place=False)
        worms = pd.concat([params_for_top_utility, top_utility_values], axis=1)
        # Let's reset the index to make keeping track down the road easier.
        #
        worms.index = pd.Index(range(len(worms.index)))

        # Initialize luciferin to the value of the utility function
        #
        worms['decision_radius'] = config.initial_decision_radius
        worms['luciferin'] = worms['utility']

        num_iterations = config.num_iterations
        self.logger.info(f"Starting {num_iterations} iterations.")
        for iteration_number in range(1, num_iterations + 1):
            progress = f"[{iteration_number}/{num_iterations}]"

            self.logger.info(f"{progress} Updating glow-worm positions.")
            worms = self.run_iteration(worms=worms)

            # TODO: keep track of the max configs over iterations
            self.logger.info(f"{progress} Computing utility.")
            worms = self.compute_utility(worms, context_values_dataframe)

            self.logger.info(f"{progress} Updating luciferin levels.")
            # Old luciferin decays while the freshly computed utility adds to it.
            retained_luciferin = (1 - config.luciferin_decay_constant) * worms['luciferin']
            gained_luciferin = config.luciferin_enhancement_constant * worms['utility']
            worms['luciferin'] = retained_luciferin + gained_luciferin

        # TODO: return the max of all seen configs - not just the configs that the glowworms occupied in this iteration.
        idx_of_max = worms['utility'].idxmax()
        best_config = worms.loc[[idx_of_max], self.dimension_names]
        config_to_suggest = Point.from_dataframe(best_config)
        self.logger.info(f"Suggesting: {str(config_to_suggest)}.")
        # TODO: we might have to go for second or nth best if the projection won't work out. But then again if we were
        # TODO: able to compute the utility function then the projection has worked out once before...
        return self.parameter_adapter.unproject_point(config_to_suggest)
コード例 #9
0
             name="max_features",
             values=[function.value for function in MaxFeaturesFunc]),
         DiscreteDimension(name="max_leaf_nodes", min=0, max=2**10),
         ContinuousDimension(name="min_impurity_decrease",
                             min=0.0,
                             max=2**10),
         ContinuousDimension(name="ccp_alpha", min=0.0, max=2**10),
         DiscreteDimension(name="min_samples_to_fit", min=1, max=32),
         DiscreteDimension(name="n_new_samples_before_refit", min=1, max=32)
     ]),
 default=Point(criterion=Criterion.MSE.value,
               splitter=Splitter.BEST.value,
               max_depth=0,
               min_samples_split=2,
               min_samples_leaf=3,
               min_weight_fraction_leaf=0.0,
               max_features=MaxFeaturesFunc.AUTO.value,
               max_leaf_nodes=0,
               min_impurity_decrease=0.0,
               ccp_alpha=0.0,
               min_samples_to_fit=10,
               n_new_samples_before_refit=10),
 description=
 "Governs the construction of an instance of a decision tree regressor. Most of the parameters are passed directly"
 "to the DecisionTreeRegressor constructor. Two exceptions: "
 "min_samples_to_fit determines the minimum number of samples required for the tree to be fitted."
 "n_new_samples_before_refit determines the number of new samples before a tree will be refitted."
 "Copied from scikit-learn docs:"
 "criterion: The function to measure the quality of a split."
 "splitter: The strategy used to choose the split at each node."
 "max_depth: The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than"
 " min_samples_split samples."
コード例 #10
0
from mlos.Spaces.Configs.ComponentConfigStore import ComponentConfigStore
from mlos.Tracer import trace

# Configuration store for the confidence bound utility function.
# The parameter space has two dimensions:
#   * utility_function_name - one of "lower_confidence_bound_on_improvement" or
#     "upper_confidence_bound_on_improvement",
#   * alpha - a continuous value between 0.01 and 0.5.
# The default configuration selects the upper confidence bound with alpha = 0.01.
confidence_bound_utility_function_config_store = ComponentConfigStore(
    parameter_space=SimpleHypergrid(
        name="confidence_bound_utility_function_config",
        dimensions=[
            CategoricalDimension(name="utility_function_name",
                                 values=[
                                     "lower_confidence_bound_on_improvement",
                                     "upper_confidence_bound_on_improvement"
                                 ]),
            ContinuousDimension(name="alpha", min=0.01, max=0.5)
        ]),
    default=Point(
        utility_function_name="upper_confidence_bound_on_improvement",
        alpha=0.01))


class ConfidenceBoundUtilityFunction(UtilityFunction):
    def __init__(self,
                 function_config: Point,
                 surrogate_model,
                 minimize,
                 logger=None):
        if logger is None:
            logger = create_logger(self.__class__.__name__)
        self.logger = logger

        self.config = function_config
        self.minimize = minimize
コード例 #11
0
            DiscreteDimension(name="num_iterations", min=1, max=20), # TODO: consider other stopping criteria too
            ContinuousDimension(name="luciferin_decay_constant", min=0, max=1),
            ContinuousDimension(name="luciferin_enhancement_constant", min=0, max=1),
            ContinuousDimension(name="step_size", min=0, max=1),  # TODO: make this adaptive
            ContinuousDimension(name="initial_decision_radius", min=0, max=1, include_min=False),
            ContinuousDimension(name="max_sensory_radius", min=0.5, max=10), # TODO: add constraints
            DiscreteDimension(name="desired_num_neighbors", min=1, max=100),  # TODO: add constraint to make it smaller than num_worms
            ContinuousDimension(name="decision_radius_adjustment_constant", min=0, max=1)
        ]
    ),
    default=Point(
        num_initial_points_multiplier=5,
        num_worms=100,
        num_iterations=10,
        luciferin_decay_constant=0.2,
        luciferin_enhancement_constant=0.2,
        step_size=0.01,
        initial_decision_radius=0.2,
        max_sensory_radius=2,
        desired_num_neighbors=10,
        decision_radius_adjustment_constant=0.05
    )
)


class GlowWormSwarmOptimizer(UtilityFunctionOptimizer):
    """ Searches the utility function for maxima using glowworms.

    The first part of this has a good description:
        https://www.hindawi.com/journals/mpe/2016/5481602/

    The main benefits are:
コード例 #12
0
    def read_from_disk(target_folder):
        """Load an OptimizerEvaluationReport from the files found in target_folder.

        Mirrors write_to_disk. The two configuration files ("optimizer_config.json" and
        "objective_function_config.json") are always read; every other artifact is only
        loaded if the corresponding file or directory exists.

        Note: several artifacts are deserialized with pickle.load, which can execute arbitrary
        code, so target_folder must come from a trusted source.

        :param target_folder: path to the directory containing the serialized report.
        :return: the populated OptimizerEvaluationReport.
        """
        report = OptimizerEvaluationReport()

        def in_target(name):
            # Path of an artifact stored directly under target_folder.
            return os.path.join(target_folder, name)

        def read_contents(path, mode):
            # Whole-file read; the file is closed before the contents are returned.
            with open(path, mode) as in_file:
                return in_file.read()

        def unpickle(path):
            with open(path, 'rb') as in_file:
                return pickle.load(in_file)

        # Mandatory artifacts: both configurations are stored as Point json.
        #
        report.optimizer_configuration = Point.from_json(
            read_contents(in_target("optimizer_config.json"), 'r'))
        report.objective_function_configuration = Point.from_json(
            read_contents(in_target("objective_function_config.json"), 'r'))

        # Pickled optimizers are kept as raw bytes, one "<iteration_number>.pickle" file each.
        #
        optimizers_dir = in_target("pickled_optimizers")
        if os.path.exists(optimizers_dir):
            for pickle_file_name in os.listdir(optimizers_dir):
                stem, extension = pickle_file_name.split(".")
                assert extension == "pickle"
                iteration_number = int(stem)
                pickled_optimizer = read_contents(
                    os.path.join(optimizers_dir, pickle_file_name), 'rb')
                report.pickled_optimizers_over_time[iteration_number] = pickled_optimizer

        # Objective function states are also kept in their pickled (bytes) form.
        #
        raw_pickle_artifacts = (
            ("objective_function_initial_state.pickle", "pickled_objective_function_initial_state"),
            ("objective_function_final_state.pickle", "pickled_objective_function_final_state"),
        )
        for artifact_file_name, attribute_name in raw_pickle_artifacts:
            artifact_path = in_target(artifact_file_name)
            if os.path.exists(artifact_path):
                setattr(report, attribute_name, read_contents(artifact_path, 'rb'))

        # These artifacts are unpickled into live objects.
        #
        unpickled_artifacts = (
            ("regression_model_goodness_of_fit_state.pickle", "regression_model_fit_state"),
            ("optima_over_time.pickle", "optima_over_time"),
            ("pareto_over_time.pickle", "pareto_over_time"),
        )
        for artifact_file_name, attribute_name in unpickled_artifacts:
            artifact_path = in_target(artifact_file_name)
            if os.path.exists(artifact_path):
                setattr(report, attribute_name, unpickle(artifact_path))

        pareto_volume_path = in_target("pareto_volume_over_time.json")
        if os.path.exists(pareto_volume_path):
            with open(pareto_volume_path, 'r') as in_file:
                report.pareto_volume_over_time = json.load(in_file)

        execution_info_path = in_target("execution_info.json")
        if os.path.exists(execution_info_path):
            with open(execution_info_path, 'r') as in_file:
                execution_info = json.load(in_file)
                # (report attribute, key in execution_info.json) - note the traceback is stored under a different name.
                attribute_to_key = (
                    ("success", 'success'),
                    ("num_optimization_iterations", 'num_optimization_iterations'),
                    ("evaluation_frequency", 'evaluation_frequency'),
                    ("exception", 'exception'),
                    ("exception_traceback", 'exception_stack_trace'),
                )
                for attribute_name, key in attribute_to_key:
                    setattr(report, attribute_name, execution_info[key])

        return report
コード例 #13
0
    def suggest(self, context_values_dataframe: pd.DataFrame = None):
        """ Produces the next configuration to try.

        Outline of the search:
            1. Seed the search with the initial incumbents prepared by _prepare_initial_params_df().
            2. Project them into the unit hypercube and attach utility, per-dimension velocity, speed and an 'active' flag.
            3. Repeatedly call _run_iteration() while any incumbent is active and max_num_iterations has not been reached.
            4. Cache the good incumbents and return the (unprojected) incumbent with the highest utility.

        :param context_values_dataframe: None, or a dataframe with exactly one row describing the context.
        :return: the suggested configuration, unprojected by self.parameter_adapter.
        :raises UtilityValueUnavailableException: if no utility values could be computed for the incumbents.
        :raises UnableToProduceGuidedSuggestionException: if not a single iteration was performed.
        """
        self.logger.info(f"Suggesting config for context: {context_values_dataframe}")

        assert context_values_dataframe is None or len(context_values_dataframe.index) == 1

        initial_params_df = self._prepare_initial_params_df()
        initial_utility_df = self._compute_utility_for_params(
            params_df=initial_params_df,
            context_df=context_values_dataframe
        )

        if not len(initial_utility_df.index):
            error_message = f"Utility function {self.utility_function.__class__.__name__} produced no values."
            self.logger.info(error_message)
            raise UtilityValueUnavailableException(error_message)

        # Random neighbors are generated in the unit hypercube, so the parameters have to be projected into it first.
        # The projected frame then doubles as the incumbents frame: params + utility + speed + per-dimension velocity.
        #
        incumbents_df = self.parameter_adapter.project_dataframe(df=initial_params_df, in_place=False)
        incumbents_df['utility'] = initial_utility_df['utility']

        initial_velocity = self.optimizer_config.initial_velocity
        squared_initial_velocity = initial_velocity**2
        incumbents_df['speed'] = 0
        for dimension_name in self.parameter_dimension_names:
            incumbents_df[f'{dimension_name}_velocity'] = initial_velocity
            incumbents_df['speed'] += squared_initial_velocity

        # Speed is the euclidean norm of the velocity vector. Incumbents slower than the threshold count as converged.
        #
        incumbents_df['speed'] = np.sqrt(incumbents_df['speed'])
        convergence_threshold = self.optimizer_config.velocity_convergence_threshold
        incumbents_df['active'] = incumbents_df['speed'] > convergence_threshold

        # Incumbents without a utility value cannot take part in the search.
        #
        utility_is_missing = incumbents_df['utility'].isna()
        rows_without_utility = incumbents_df[utility_is_missing].index
        incumbents_df.loc[rows_without_utility, 'active'] = False

        num_iterations = 0
        while num_iterations < self.optimizer_config.max_num_iterations and incumbents_df['active'].any():
            num_iterations += 1
            incumbents_df = self._run_iteration(
                incumbents_df,
                context_df=context_values_dataframe,
                iteration_number=num_iterations
            )

        if incumbents_df['utility'].isna().all():
            error_message = "Utility values were not available for the incumbent."
            self.logger.info(error_message)
            raise UtilityValueUnavailableException(error_message)

        if num_iterations == 0:
            error_message = f"{self.__class__.__name__} performed 0 iterations."
            self.logger.info(error_message)
            raise UnableToProduceGuidedSuggestionException(error_message)

        # idxmax() below needs a numeric column, so coerce the utility if it ended up with a non-float dtype.
        #
        if incumbents_df.dtypes['utility'] != float:
            self.logger.info(
                f"The type of incumbents_df['utility'] is {incumbents_df.dtypes['utility']}. Utility function: {self.utility_function.__class__.__name__}, "
                f"incumbents_df length: {len(incumbents_df.index)}"
            )
            incumbents_df['utility'] = pd.to_numeric(arg=incumbents_df['utility'], errors='raise')

        self._cache_good_incumbents(incumbents_df)

        best_row_label = incumbents_df['utility'].idxmax()
        best_projected_config = Point.from_dataframe(
            incumbents_df.loc[[best_row_label], self.parameter_dimension_names]
        )
        suggestion = self.parameter_adapter.unproject_point(best_projected_config)
        self.logger.info(f"After {num_iterations} iterations suggesting: {suggestion.to_json(indent=2)}")
        return suggestion
コード例 #14
0
         DiscreteDimension(name="num_neighbors", min=1, max=1000),
         DiscreteDimension(name="num_cached_good_params", min=0, max=2**16),
         ContinuousDimension(name="initial_points_pareto_weight",
                             min=0,
                             max=1),
         ContinuousDimension(
             name="initial_points_cached_good_params_weight", min=0, max=1),
         ContinuousDimension(name="initial_points_random_params_weight",
                             min=0,
                             max=1),
     ]),
 default=Point(num_starting_configs=10,
               initial_velocity=0.3,
               velocity_update_constant=0.5,
               velocity_convergence_threshold=0.01,
               max_num_iterations=50,
               num_neighbors=20,
               num_cached_good_params=2**10,
               initial_points_pareto_weight=0.5,
               initial_points_cached_good_params_weight=0.3,
               initial_points_random_params_weight=0.2),
 description="""
 * num_starting_configs - how many points to start the search from?
 * initial_velocity - how far from the incumbent should the random neighbors be generated?
 * velocity_update_constant - how quickly to change the velocity (0 - don't change it at all, 1 - change it as fast as possible)?
 * velocity_convergence_threshold - when an incumbent's velocity drops below this threshold, it is assumed to have converged.
 * max_num_iterations - cap on the number of iterations. A failsafe - should be higher than what the algorithm needs to converge on average.
 * num_neighbors - how many random neighbors to generate for each incumbent?
 * num_cached_good_params - how many good configurations should this optimizer cache for future use?
 * initial_points_pareto_weight - what proportion of initial points should come from the pareto frontier?
 * initial_points_cached_good_params_weight - what proportion of initial points should come from the good params cache?
 * initial_points_random_params_weight - what proportion of initial points should be randomly generated?
コード例 #15
0
                                ]),
        on_external_dimension=CategoricalDimension('workload_type',
                                                   values=['fibonacci']),
    ).join(
        subgrid=SimpleHypergrid(name='random_key_from_range_config',
                                dimensions=[
                                    DiscreteDimension('min', min=0, max=2**10),
                                    DiscreteDimension('range_width',
                                                      min=0,
                                                      max=2**20)
                                ]),
        on_external_dimension=CategoricalDimension(
            'workload_type', values=['random_key_from_range']),
    ).join(
        subgrid=SimpleHypergrid(name='sequential_key_from_range_config',
                                dimensions=[
                                    DiscreteDimension('min', min=0, max=2**10),
                                    DiscreteDimension('range_width',
                                                      min=0,
                                                      max=2**20)
                                ]),
        on_external_dimension=CategoricalDimension(
            'workload_type', values=['sequential_key_from_range']),
    )

# Default workload generator config: the 'fibonacci' workload with its matching 'fibonacci_config' sub-point.
# The assert below runs when this module is executed and fails fast if the default ever falls outside
# of smart_cache_workload_generator_config_space.
#
smart_cache_workload_generator_default_config = Point(
    workload_type='fibonacci',
    reconfiguration_interval=10,
    fibonacci_config=Point(min=2**10, range_width=2**10))
assert smart_cache_workload_generator_default_config in smart_cache_workload_generator_config_space
コード例 #16
0
        name="optimizer_evaluator",
        dimensions=[
            DiscreteDimension(name="num_iterations", min=1, max=2**32),
            DiscreteDimension(name="evaluation_frequency", min=1, max=2**10),
            CategoricalDimension(name="include_pickled_optimizer_in_report", values=[True, False]),
            CategoricalDimension(name="include_pickled_objective_function_in_report", values=[True, False]),
            CategoricalDimension(name="report_regression_model_goodness_of_fit", values=[True, False]),
            CategoricalDimension(name="report_optima_over_time", values=[True, False]),
            CategoricalDimension(name="include_execution_trace_in_report", values=[True, False]),
        ]
    ),
    default=Point(
        num_iterations=100,
        evaluation_frequency=10,
        include_pickled_optimizer_in_report=True,
        include_pickled_objective_function_in_report=True,
        report_regression_model_goodness_of_fit=True,
        report_optima_over_time=True,
        include_execution_trace_in_report=True,
    )
)


# Parallel unit tests config.
#
parallel_unit_tests_config = optimizer_evaluator_config_store.default
parallel_unit_tests_config.num_iterations = 50
parallel_unit_tests_config.evaluation_frequency = 10
optimizer_evaluator_config_store.add_config_by_name(
    config_name="parallel_unit_tests_config",
    description="This config is to be used in our parallel unit tests.",
コード例 #17
0
class SmartCache:
    """ A tunable and observable cache that is integrated with Mlos.

    The cache delegates storage to an interchangeable implementation (LRU or MRU) which is selected,
    together with its size, by the component's current configuration.

    Parameters
    ----------
    logger : Logger
        Logger to use.

    Attributes
    ----------
    RuntimeAttributes : MlosSmartComponentRuntimeAttributes
    parameter_search_space : SimpleHypergrid
    default_config : Point
    telemetry_message_types : list
    runtime_decision_contexts : list
    """

    # Used during registration
    RuntimeAttributes = MlosSmartComponentRuntimeAttributes(
        smart_component_name="SmartCache",
        attribute_names=[]
    )

    # One categorical 'implementation' dimension, with the matching implementation's config space joined on each value.
    parameter_search_space = SimpleHypergrid(
        name='smart_cache_config',
        dimensions=[
            CategoricalDimension(name='implementation', values=['LRU', 'MRU'])
        ]
    ).join(
        subgrid=LruCacheConfig.CONFIG_SPACE,
        on_external_dimension=CategoricalDimension(name='implementation', values=['LRU'])
    ).join(
        subgrid=MruCacheConfig.CONFIG_SPACE,
        on_external_dimension=CategoricalDimension(name='implementation', values=['MRU'])
    )

    # Used if no intelligence is hooked up
    default_config = Point(
        implementation='LRU',
        lru_cache_config=LruCacheConfig.DEFAULT
    )

    # Used to inform the Mlos Global Context about all types of telemetry messages that this component can emit
    telemetry_message_types = [
        (SmartCachePush, 0b1),
        (SmartCacheGet, 0b10),
        (SmartCacheEvict, 0b100),
    ]

    # Used to inform the Mlos Global Context about all types of runtime decisions that can be expected
    runtime_decision_contexts = [
        PushRuntimeDecisionContext,
        ReconfigurationRuntimeDecisionContext,
    ]

    def __init__(self, logger):
        self.logger = logger

        runtime_attributes = self.RuntimeAttributes(component_id=id(self))
        self.mlos_object = MlosObject(
            smart_component_type=type(self),
            smart_component_runtime_attributes=runtime_attributes
        )

        # Start out with the default (LRU) config; reconfigure() below may replace it.
        self.current_config = Configuration(
            component_type=SmartCache,
            values=self.default_config,
            id=-1
        )
        default_cache_size = self.current_config.values.lru_cache_config.cache_size
        self.cache_implementation = LruCache(max_size=default_cache_size, logger=self.logger)
        self.mlos_object.register()

        self.reconfigure()

    def __del__(self):
        self.mlos_object.unregister()

    def __iter__(self):
        return self.cache_implementation.__iter__()

    def __len__(self):
        return len(self.cache_implementation)

    def __contains__(self, item):
        return item in self.cache_implementation

    def push(self, key, value):
        """ Inserts the (key, value) pair unless the key is already cached or the runtime decision says not to.

        Emits a SmartCachePush message (if enabled) and a SmartCacheEvict message if an entry got evicted.
        """
        self.reconfigure()  # TODO: make this less frequent

        if key in self:
            return

        push_decision_context = PushRuntimeDecisionContext(
            mlos_object=self.mlos_object,
            current_config=self.current_config
        )
        if not self.mlos_object.make_runtime_decision(push_decision_context):
            return

        # The 'is enabled' check guards the message construction, which matters when
        # assembling a message is expensive.
        if self.mlos_object.is_message_type_enabled(SmartCachePush):
            self.mlos_object.send_telemetry_message(SmartCachePush(key=key))

        evicted_entry = self.cache_implementation.push(CacheEntry(key, value))
        if evicted_entry is None:
            return

        # No 'is enabled' check here: this message is cheap to assemble and
        # mlos_object can perform the check itself.
        self.mlos_object.send_telemetry_message(SmartCacheEvict(key=evicted_entry.key))

    def get(self, key):
        """ Returns the cached value for key, or None on a miss. A SmartCacheGet message is sent either way. """
        was_hit = key in self.cache_implementation
        self.mlos_object.send_telemetry_message(SmartCacheGet(key=key, was_hit=was_hit))
        if was_hit:
            return self.cache_implementation.get(key)
        return None

    def reconfigure(self):
        """ Replaces the cache implementation if self.mlos_object holds a new configuration.

        Nothing happens when the runtime decision is negative, when the config is unchanged, or when it is None.
        Note that a reconfiguration creates a brand new (empty) cache implementation.

        :return:
        """
        reconfiguration_context = ReconfigurationRuntimeDecisionContext(self.mlos_object)
        if not self.mlos_object.make_runtime_decision(reconfiguration_context):
            return
        if self.current_config == self.mlos_object.config:
            return
        if self.mlos_object.config is None:
            return

        self.current_config = self.mlos_object.config
        self.logger.info(f"Reconfiguring. New config values: {self.current_config.values.to_json()}")

        config_values = self.current_config.values
        if config_values.implementation == 'LRU':
            self.cache_implementation = LruCache(
                max_size=config_values.lru_cache_config.cache_size,
                logger=self.logger
            )
            return
        if config_values.implementation == 'MRU':
            self.cache_implementation = MruCache(
                max_size=config_values.mru_cache_config.cache_size,
                logger=self.logger
            )
            return
        raise RuntimeError("Invalid config")
コード例 #18
0
class SklearnRidgeRegressionModelConfig(metaclass=DefaultConfigMeta):
    """Hyper-parameter configuration for sklearn.linear_model.Ridge.

    CONFIG_SPACE describes the legal values of every tunable parameter and _DEFAULT holds the defaults
    used by __init__. random_state is accepted by __init__ but is deliberately not a dimension of
    CONFIG_SPACE.
    """

    class Solver(Enum):
        """
        From https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html:
        Solver to use in the computational routines:
            * 'auto' chooses the solver automatically based on the type of data.
            * 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for
                singular matrices than 'cholesky'.
            * 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution.
            * 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg.
                As an iterative algorithm, this solver is more appropriate than 'cholesky' for
                large-scale data (possibility to set tol and max_iter).
            * 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr.
                It is the fastest and uses an iterative procedure.
            * 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its improved,
                unbiased version named SAGA. Both methods also use an iterative procedure, and are
                often faster than other solvers when both n_samples and n_features are large.
                Note that 'sag' and 'saga' fast convergence is only guaranteed on features with
                approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing.

        All last five solvers support both dense and sparse data. However, only 'sag' and 'sparse_cg' supports
        sparse input when fit_intercept is True.
        """
        AUTO = 'auto'  # default
        SVD = 'svd'
        CHOLESKY = 'cholesky'
        LSQR = 'lsqr'
        SPARSE_CG = 'sparse_cg'
        SAG = 'sag'
        SAGA = 'saga'

    CONFIG_SPACE = SimpleHypergrid(
        name="sklearn_ridge_regression_model_config",
        dimensions=[
            ContinuousDimension(name="alpha", min=0, max=2**16),
            CategoricalDimension(name="fit_intercept", values=[False, True]),
            CategoricalDimension(name="normalize", values=[False, True]),
            CategoricalDimension(name="copy_x", values=[False, True]),
            DiscreteDimension(name="max_iter", min=0, max=10**5),
            ContinuousDimension(name="tol", min=0, max=2**10),
            CategoricalDimension(name="solver",
                                 values=[solver.value for solver in Solver]),
        ])
    _DEFAULT = Point(alpha=1.0,
                     fit_intercept=False,
                     normalize=False,
                     copy_x=True,
                     max_iter=1000,
                     tol=10**-4,
                     solver=Solver.AUTO.value)

    @classmethod
    def contains(cls, config):
        """Check whether the hyper-parameters carried by config belong to CONFIG_SPACE.

        :param config: any object exposing the Ridge hyper-parameters as attributes
            (an instance of this class or a config Point).
        :return: True if the point built from config's attributes is in CONFIG_SPACE.
        """
        # random_state is not a dimension of CONFIG_SPACE (nor part of _DEFAULT), so points drawn from the
        # config space do not carry it. Fall back to None, the same default __init__ uses, instead of
        # failing on the attribute lookup.
        random_state = getattr(config, "random_state", None)
        return Point(alpha=config.alpha,
                     fit_intercept=config.fit_intercept,
                     normalize=config.normalize,
                     copy_x=config.copy_x,
                     max_iter=config.max_iter,
                     tol=config.tol,
                     random_state=random_state,
                     solver=config.solver) in cls.CONFIG_SPACE

    @classmethod
    def create_from_config_point(cls, config_point):
        """Build a config object from a Point whose (name, value) pairs match __init__'s parameters.

        :param config_point: a Point that must satisfy contains().
        :return: a new instance of this class.
        """
        assert cls.contains(config_point)
        config_key_value_pairs = {
            param_name: value
            for param_name, value in config_point
        }
        return cls(**config_key_value_pairs)

    def __init__(self,
                 alpha=_DEFAULT.alpha,
                 fit_intercept=_DEFAULT.fit_intercept,
                 normalize=_DEFAULT.normalize,
                 copy_x=_DEFAULT.copy_x,
                 max_iter=_DEFAULT.max_iter,
                 tol=_DEFAULT.tol,
                 random_state=None,
                 solver=_DEFAULT.solver):
        """
        Ridge parameters:
        :param alpha: Regularization strength; must be a positive float. Defaults to 1.0.
        :param fit_intercept: Whether to calculate the intercept for this model.
        :param normalize: This parameter is ignored when ``fit_intercept`` is set to False.
            If True, the regressors X will be normalized before regression by
            subtracting the mean and dividing by the l2-norm.
        :param copy_x: If ``True``, X will be copied; else, it may be overwritten.
        :param max_iter: Maximum number of iterations for the iterative solvers.
        :param tol: Precision of the solution.
        :param solver: Solver to use in the computational routines:
            - 'auto' chooses the solver automatically based on the type of data.
            - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
              coefficients. More stable for singular matrices than 'cholesky'.
            - 'cholesky' uses the standard scipy.linalg.solve function to
              obtain a closed-form solution.
            - 'sparse_cg' uses the conjugate gradient solver as found in
              scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
              more appropriate than 'cholesky' for large-scale data
              (possibility to set `tol` and `max_iter`).
            - 'lsqr' uses the dedicated regularized least-squares routine
              scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
              procedure.
            - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
              its improved, unbiased version named SAGA. Both methods also use an
              iterative procedure, and are often faster than other solvers when
              both n_samples and n_features are large. Note that 'sag' and
              'saga' fast convergence is only guaranteed on features with
              approximately the same scale. You can preprocess the data with a
              scaler from sklearn.preprocessing.
        :param random_state: Seed of the pseudo random number generator used to shuffle the data.
            Used when ``solver`` is 'sag' or 'saga'.
        """
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_x = copy_x
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.solver = solver
コード例 #19
0
 def _translate_point(self, point: Point) -> Point:
     """Translate ``point`` by returning the result of its ``flat_copy()`` method."""
     flattened_point = point.flat_copy()
     return flattened_point
コード例 #20
0
                max=100)
        ]).join(
            subgrid=homogeneous_random_forest_config_store.parameter_space,
            on_external_dimension=CategoricalDimension(
                name="surrogate_model_implementation",
                values=[
                    HomogeneousRandomForestRegressionModel.__name__
                ])).join(
                    subgrid=experiment_designer_config_store.parameter_space,
                    on_external_dimension=CategoricalDimension(
                        name="experiment_designer_implementation",
                        values=[ExperimentDesigner.__name__])),
    default=Point(
        surrogate_model_implementation=HomogeneousRandomForestRegressionModel.
        __name__,
        experiment_designer_implementation=ExperimentDesigner.__name__,
        min_samples_required_for_guided_design_of_experiments=10,
        homogeneous_random_forest_regression_model_config=
        homogeneous_random_forest_config_store.default,
        experiment_designer_config=experiment_designer_config_store.default),
    description="TODO")

# Add a config with homogeneous random forest where the decision trees refit for every new observation
# (n_new_samples_before_refit = 1) and the forest uses 50 estimators. It is registered under the name
# 'default_refit_tree_every_time'.
#
# NOTE(review): the object returned by bayesian_optimizer_config_store.default is mutated in place below.
# Presumably .default hands out a copy - confirm, otherwise the store's default config is modified too.
#
optimizer_config = bayesian_optimizer_config_store.default
optimizer_config.homogeneous_random_forest_regression_model_config.decision_tree_regression_model_config.n_new_samples_before_refit = 1
optimizer_config.homogeneous_random_forest_regression_model_config.n_estimators = 50
bayesian_optimizer_config_store.add_config_by_name(
    config_name='default_refit_tree_every_time', config_point=optimizer_config)

# Add a default config with glowworm swarm optimizer
#
コード例 #21
0
class RegressionEnhancedRandomForestRegressionModelConfig(RegressionModelConfig):
    """A configuration object for the RERF model.

    The class declares the hyper parameter search space (``CONFIG_SPACE``) and the default
    configuration (``_DEFAULT``). The search space joins in the config spaces of the nested models:
       Lasso (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html),
       Ridge
        (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge)
    and
       RandomForest (no link was provided for it in the original documentation).

    Note that ``contains()`` currently accepts every config, so no actual validation is performed here.
    """

    # The Lasso and Ridge subgrids are joined on "boosting_root_model_name" and the random forest subgrid
    # is joined on "residual_model_name".
    #
    CONFIG_SPACE = SimpleHypergrid(
        name="regression_enhanced_random_forest_regression_model_config",
        dimensions=[
            DiscreteDimension(name="max_basis_function_degree", min=1, max=10),
            CategoricalDimension(name="residual_model_name",
                                 values=[SklearnRandomForestRegressionModelConfig.__name__]),
            CategoricalDimension(name="boosting_root_model_name",
                                 values=[SklearnLassoRegressionModelConfig.__name__,
                                         SklearnRidgeRegressionModelConfig.__name__]),
            ContinuousDimension(name="min_abs_root_model_coef", min=0, max=2 ** 10),
            CategoricalDimension(name="perform_initial_root_model_hyper_parameter_search", values=[False, True]),
            CategoricalDimension(name="perform_initial_random_forest_hyper_parameter_search", values=[False, True])
        ]
    ).join(
        subgrid=SklearnLassoRegressionModelConfig.CONFIG_SPACE,
        on_external_dimension=CategoricalDimension(name="boosting_root_model_name",
                                                   values=[SklearnLassoRegressionModelConfig.__name__])
    ).join(
        subgrid=SklearnRidgeRegressionModelConfig.CONFIG_SPACE,
        on_external_dimension=CategoricalDimension(name="boosting_root_model_name",
                                                   values=[SklearnRidgeRegressionModelConfig.__name__])
    ).join(
        subgrid=SklearnRandomForestRegressionModelConfig.CONFIG_SPACE,
        on_external_dimension=CategoricalDimension(name="residual_model_name",
                                                   values=[SklearnRandomForestRegressionModelConfig.__name__])
    )

    # Default configuration. Its attributes also serve as the default argument values of __init__ below.
    #
    _DEFAULT = Point(
        max_basis_function_degree=2,
        residual_model_name=SklearnRandomForestRegressionModelConfig.__name__,
        boosting_root_model_name=SklearnLassoRegressionModelConfig.__name__,
        min_abs_root_model_coef=0.01,
        sklearn_lasso_regression_model_config=SklearnLassoRegressionModelConfig.DEFAULT,
        sklearn_ridge_regression_model_config=SklearnRidgeRegressionModelConfig.DEFAULT,
        sklearn_random_forest_regression_model_config=SklearnRandomForestRegressionModelConfig.DEFAULT,
        perform_initial_root_model_hyper_parameter_search=True,
        perform_initial_random_forest_hyper_parameter_search=False
    )

    @classmethod
    def contains(cls, config):
        """Return True for any ``config``; no validation is performed.

        :param config: the candidate config (ignored).
        :return: always True.
        """
        # following example set in HomogeneousRandomForestRegressionModelConfig.contains()
        return True

    @classmethod
    def create_from_config_point(cls, config_point):
        """Create a config object from the (name, value) pairs yielded by iterating ``config_point``.

        :param config_point: an iterable of (parameter name, value) pairs; each pair is forwarded
            to ``__init__`` as a keyword argument.
        :return: a new instance of this class.
        """
        assert cls.contains(config_point)
        # NOTE(review): if iterating config_point yields the nested "sklearn_*_regression_model_config"
        # entries declared in CONFIG_SPACE / _DEFAULT, those names do not match any __init__ parameter
        # and the call below would raise a TypeError - verify against Point's iteration semantics.
        config_key_value_pairs = {param_name: value for param_name, value in config_point}
        return cls(**config_key_value_pairs)

    def __init__(
            self,
            max_basis_function_degree=_DEFAULT.max_basis_function_degree,
            boosting_root_model_name=_DEFAULT.boosting_root_model_name,
            min_abs_root_model_coef=_DEFAULT.min_abs_root_model_coef,
            boosting_root_model_config: Point = _DEFAULT.sklearn_lasso_regression_model_config,
            random_forest_model_config: Point = _DEFAULT.sklearn_random_forest_regression_model_config,
            residual_model_name=_DEFAULT.residual_model_name,
            perform_initial_root_model_hyper_parameter_search=_DEFAULT.perform_initial_root_model_hyper_parameter_search,
            perform_initial_random_forest_hyper_parameter_search=_DEFAULT.perform_initial_random_forest_hyper_parameter_search
    ):
        """Store the hyper parameters and build the nested model config objects.

        :param max_basis_function_degree: stored as is.
        :param boosting_root_model_name: name of the root model config class; selects whether
            ``boosting_root_model_config`` is turned into a Lasso or a Ridge config object.
        :param min_abs_root_model_coef: stored as is.
        :param boosting_root_model_config: config point passed to the selected root model config
            class' ``create_from_config_point``.
        :param random_forest_model_config: config point passed to
            ``SklearnRandomForestRegressionModelConfig.create_from_config_point``.
        :param residual_model_name: stored as is.
        :param perform_initial_root_model_hyper_parameter_search: stored as is.
        :param perform_initial_random_forest_hyper_parameter_search: stored as is.
        """
        self.max_basis_function_degree = max_basis_function_degree
        self.residual_model_name = residual_model_name
        self.min_abs_root_model_coef = min_abs_root_model_coef
        self.perform_initial_root_model_hyper_parameter_search = perform_initial_root_model_hyper_parameter_search
        self.perform_initial_random_forest_hyper_parameter_search = perform_initial_random_forest_hyper_parameter_search

        self.boosting_root_model_name = boosting_root_model_name
        # Stays None if the root model name is not recognized (a message is printed in that case).
        #
        # NOTE(review): the default boosting_root_model_config is the Lasso default, which is also what
        # gets used when the Ridge model is selected without an explicit config - confirm this is intended.
        self.boosting_root_model_config = None
        if self.boosting_root_model_name == SklearnLassoRegressionModelConfig.__name__:
            self.boosting_root_model_config = SklearnLassoRegressionModelConfig \
                .create_from_config_point(boosting_root_model_config)
        elif self.boosting_root_model_name == SklearnRidgeRegressionModelConfig.__name__:
            self.boosting_root_model_config = SklearnRidgeRegressionModelConfig \
                .create_from_config_point(boosting_root_model_config)
        else:
            print('Unrecognized boosting_root_model_name "{}"'.format(self.boosting_root_model_name))

        self.random_forest_model_config = SklearnRandomForestRegressionModelConfig \
            .create_from_config_point(random_forest_model_config)
コード例 #22
0
        name="homogeneous_random_forest_regression_model_config",
        dimensions=[
            DiscreteDimension(name="n_estimators", min=1, max=256),
            ContinuousDimension(name="features_fraction_per_estimator",
                                min=0,
                                max=1,
                                include_min=False,
                                include_max=True),
            ContinuousDimension(name="samples_fraction_per_estimator",
                                min=0.2,
                                max=1,
                                include_min=False,
                                include_max=True),
            CategoricalDimension(name="regressor_implementation",
                                 values=[DecisionTreeRegressionModel.__name__
                                         ]),
            CategoricalDimension(name="bootstrap", values=[True, False])
        ]).join(subgrid=decision_tree_config_store.parameter_space,
                on_external_dimension=CategoricalDimension(
                    name="regressor_implementation",
                    values=[DecisionTreeRegressionModel.__name__])),
    default=Point(
        n_estimators=10,
        features_fraction_per_estimator=1,
        samples_fraction_per_estimator=0.7,
        regressor_implementation=DecisionTreeRegressionModel.__name__,
        decision_tree_regression_model_config=decision_tree_config_store.
        default,
        bootstrap=True),
    description="TODO")
コード例 #23
0
    def test_multi_objective_optimization(self,
                                          objective_function_implementation,
                                          minimize, num_output_dimensions,
                                          num_points):
        """Run a multi-objective optimization loop and check that the pareto volume does not shrink.

        :param objective_function_implementation: either Hypersphere or
            MultiObjectiveNestedPolynomialObjective; selects which objective function config is built.
        :param minimize: for the Hypersphere it is passed through to the hypersphere config. For the
            nested polynomial objective: "all" minimizes every objective, "some" minimizes the objectives
            with an even index, and any other value minimizes none of them.
        :param num_output_dimensions: number of objectives.
        :param num_points: number of suggest/register iterations to run.
        """
        if objective_function_implementation == Hypersphere:
            hypersphere_radius = 10
            objective_function_config = Point(
                implementation=Hypersphere.__name__,
                hypersphere_config=Point(num_objectives=num_output_dimensions,
                                         minimize=minimize,
                                         radius=hypersphere_radius))
        else:
            objective_function_config = Point(
                implementation=MultiObjectiveNestedPolynomialObjective.
                __name__,
                multi_objective_nested_polynomial_config=Point(
                    num_objectives=num_output_dimensions,
                    objective_function_implementation=NestedPolynomialObjective
                    .__name__,
                    nested_polynomial_objective_config=Point(
                        num_nested_polynomials=2,
                        nested_function_implementation=PolynomialObjective.
                        __name__,
                        polynomial_objective_config=Point(
                            seed=17,
                            input_domain_dimension=2,
                            input_domain_min=-2**10,
                            input_domain_width=2**11,
                            max_degree=2,
                            include_mixed_coefficients=True,
                            percent_coefficients_zeroed=0.0,
                            coefficient_domain_min=-10.0,
                            coefficient_domain_width=9.0,
                            include_noise=False,
                            noise_coefficient_of_variation=0.0))))
        objective_function = ObjectiveFunctionFactory.create_objective_function(
            objective_function_config)
        optimization_problem = objective_function.default_optimization_problem

        if objective_function_implementation == MultiObjectiveNestedPolynomialObjective:
            # We need to modify the default optimization problem to respect the "minimize" argument.
            #
            # The per-objective decision is kept in its own local variable: overwriting the "minimize"
            # argument inside the loop would replace the "all"/"some" string with a bool after the first
            # iteration, so every subsequent objective would fall through to the else branch.
            #
            objectives = []
            for i, default_objective in enumerate(
                    optimization_problem.objectives):
                if minimize == "all":
                    minimize_this_objective = True
                elif minimize == "some":
                    minimize_this_objective = ((i % 2) == 0)
                else:
                    minimize_this_objective = False
                new_objective = Objective(name=default_objective.name,
                                          minimize=minimize_this_objective)
                objectives.append(new_objective)
            optimization_problem.objectives = objectives

        optimizer_config = bayesian_optimizer_config_store.get_config_by_name(
            "default_multi_objective_optimizer_config")
        self.logger.info(optimizer_config)

        optimizer = self.bayesian_optimizer_factory.create_local_optimizer(
            optimization_problem=optimization_problem,
            optimizer_config=optimizer_config)

        assert optimizer.optimizer_config.surrogate_model_implementation == MultiObjectiveHomogeneousRandomForest.__name__

        # We can now go through the optimization loop, at each point validating that:
        #   1) The suggested point is valid.
        #   2) The volume of the pareto frontier is monotonically increasing.

        lower_bounds_on_pareto_volume = []
        upper_bounds_on_pareto_volume = []

        for i in range(num_points):
            suggestion = optimizer.suggest()
            assert suggestion in optimization_problem.parameter_space
            objective_values = objective_function.evaluate_point(suggestion)
            optimizer.register(
                parameter_values_pandas_frame=suggestion.to_dataframe(),
                target_values_pandas_frame=objective_values.to_dataframe())

            # The pareto volume is only estimated once more than 11 observations have been registered.
            #
            if i > 10:
                pareto_volume_estimator = optimizer.pareto_frontier.approximate_pareto_volume(
                    num_samples=1000000)
                lower_bound, upper_bound = pareto_volume_estimator.get_two_sided_confidence_interval_on_pareto_volume(
                    alpha=0.95)
                lower_bounds_on_pareto_volume.append(lower_bound)
                upper_bounds_on_pareto_volume.append(upper_bound)

        pareto_volumes_over_time_df = pd.DataFrame({
            'lower_bounds':
            lower_bounds_on_pareto_volume,
            'upper_bounds':
            upper_bounds_on_pareto_volume
        })

        # If we had precise volume measurements, we would want to ascertain that the volume of the pareto frontier is monotonically increasing.
        # However, we only have estimates so we cannot assert that they are monotonic. But we can assert that they are approximately monotonic:
        # we can make sure that any dip between consecutive volumes is smaller than some small number. Actually we can make sure that there
        # is no drift, by looking over larger windows too.
        #
        threshold = -0.1
        for periods in [1, 10, 20]:
            for bounds_column in ('lower_bounds', 'upper_bounds'):
                min_pct_increase = pareto_volumes_over_time_df[
                    bounds_column].pct_change(periods=periods).fillna(0).min()
                if not (min_pct_increase > threshold):
                    # Print the whole history before failing to make the failure easier to diagnose.
                    #
                    print(pareto_volumes_over_time_df)
                    assert min_pct_increase > threshold
コード例 #24
0
ファイル: TestBayesianOptimizer.py プロジェクト: sycomix/MLOS
    def test_registering_multiple_objectives(self):
        """Check how the optimizer handles observations with extra, missing and invalid output dimensions.

        The objective space has two dimensions (y_1, y_2) but only y_1 is an objective. The test expects that:
          * observations containing both output dimensions can be registered and predicted on,
          * output columns that are not part of the objective space are not remembered,
          * a missing output dimension is stored as a null,
          * registering an observation without any valid output dimension raises a ValueError.
        """

        input_space = SimpleHypergrid(name='input',
                                      dimensions=[
                                          ContinuousDimension(name="x_1",
                                                              min=0,
                                                              max=10),
                                          ContinuousDimension(name="x_2",
                                                              min=0,
                                                              max=10)
                                      ])

        output_space = SimpleHypergrid(name='output',
                                       dimensions=[
                                           ContinuousDimension(name="y_1",
                                                               min=0,
                                                               max=10),
                                           ContinuousDimension(name="y_2",
                                                               min=0,
                                                               max=10)
                                       ])

        optimization_problem = OptimizationProblem(
            parameter_space=input_space,
            objective_space=output_space,
            objectives=[Objective(name='y_1', minimize=True)])

        optimizer = self.bayesian_optimizer_factory.create_local_optimizer(
            optimization_problem=optimization_problem)

        # Local names deliberately avoid "input" so that the builtin is not shadowed.
        #
        for _ in range(100):
            suggested_point = optimizer.suggest()
            observed_output = Point(y_1=suggested_point.x_1, y_2=suggested_point.x_2)

            optimizer.register(suggested_point.to_dataframe(), observed_output.to_dataframe())

        num_predictions = 100
        prediction = optimizer.predict(
            parameter_values_pandas_frame=optimization_problem.parameter_space.
            random_dataframe(num_predictions))
        prediction_df = prediction.get_dataframe()
        assert len(prediction_df.index) == num_predictions

        # Let's test invalid observations.
        #
        random_input = input_space.random()
        input_df = random_input.to_dataframe()

        # We should only remember the valid dimensions.
        #
        output_with_extra_dimension = Point(y_1=random_input.x_1,
                                            y_2=random_input.x_2,
                                            invalid_dimension=42)
        output_with_extra_dimension_df = output_with_extra_dimension.to_dataframe(
        )
        optimizer.register(input_df, output_with_extra_dimension_df)

        # Let's make sure that the invalid_dimension was not remembered.
        #
        _, all_outputs_df, _ = optimizer.get_all_observations()
        assert all(column in {'y_1', 'y_2'}
                   for column in all_outputs_df.columns)

        # We should accept inputs with missing output dimensions, as long as at least one is specified.
        #
        output_with_missing_dimension = Point(y_1=random_input.x_1)
        output_with_missing_dimension_df = output_with_missing_dimension.to_dataframe(
        )
        optimizer.register(input_df, output_with_missing_dimension_df)
        _, all_outputs_df, _ = optimizer.get_all_observations()

        # Let's make sure the missing dimension ends up being a null.
        #
        last_observation = all_outputs_df.iloc[[-1]]
        assert last_observation['y_2'].isnull().values.all()

        # Inserting an observation with no valid dimensions should fail.
        #
        empty_output = Point()
        empty_output_df = empty_output.to_dataframe()
        with pytest.raises(ValueError):
            optimizer.register(input_df, empty_output_df)

        only_invalid_outputs = Point(invalid_col1=0, invalid_col2=2)
        only_invalid_outputs_df = only_invalid_outputs.to_dataframe()

        with pytest.raises(ValueError):
            optimizer.register(input_df, only_invalid_outputs_df)