def __call__(
    self,
    search_space: Optional[SearchSpace] = None,
    experiment: Optional[Experiment] = None,
    data: Optional[Data] = None,
    silently_filter_kwargs: bool = True,  # TODO[Lena]: default to False
    **kwargs: Any,
) -> ModelBridge:
    """Instantiate the model and model-bridge pair registered for this enum
    value and return the resulting ``ModelBridge``.

    Args:
        search_space: Search space to fit the model bridge on; if omitted,
            taken from ``experiment`` (one of the two is required).
        experiment: Experiment the model bridge is associated with.
        data: Data to pass through to the model bridge constructor.
        silently_filter_kwargs: When False, validate that every passed kwarg
            is typed correctly for the model or bridge constructor.
        kwargs: Extra keyword arguments, split between model and bridge
            constructors by matching their argument names.

    Returns:
        A ``ModelBridge`` with the consolidated kwargs recorded on it for
        later storage on generator runs.
    """
    assert self.value in MODEL_KEY_TO_MODEL_SETUP, f"Unknown model {self.value}"
    # All model bridges require either a search space or an experiment.
    assert search_space or experiment, "Search space or experiment required."
    setup = MODEL_KEY_TO_MODEL_SETUP[self.value]
    model_cls = setup.model_class
    bridge_cls = setup.bridge_class
    if not silently_filter_kwargs:
        validate_kwarg_typing(  # TODO[Lena]: T46467254, pragma: no cover
            typed_callables=[model_cls, bridge_cls],
            search_space=search_space,
            experiment=experiment,
            data=data,
            **kwargs,
        )
    # Model arguments: class defaults, overridden by caller-passed kwargs.
    model_kwargs = consolidate_kwargs(
        kwargs_iterable=[get_function_default_arguments(model_cls), kwargs],
        keywords=get_function_argument_names(model_cls),
    )
    model = model_cls(**model_kwargs)
    # Bridge arguments: defaults, then registry-standard kwargs and
    # transforms, finally caller-passed kwargs (highest precedence).
    bridge_kwargs = consolidate_kwargs(
        kwargs_iterable=[
            get_function_default_arguments(bridge_cls),
            setup.standard_bridge_kwargs,
            {"transforms": setup.transforms},
            kwargs,
        ],
        keywords=get_function_argument_names(
            function=bridge_cls, omit=["experiment", "search_space", "data"]
        ),
    )
    # Build the bridge; fall back to the experiment's search space when
    # one was not passed explicitly.
    model_bridge = bridge_cls(
        search_space=search_space or not_none(experiment).search_space,
        experiment=experiment,
        data=data,
        model=model,
        **bridge_kwargs,
    )
    # Record the kwargs (with callables encoded as references, so they are
    # serializable) for later storage on generator runs.
    model_bridge._set_kwargs_to_save(
        model_key=self.value,
        model_kwargs=_encode_callables_as_references(model_kwargs),
        bridge_kwargs=_encode_callables_as_references(bridge_kwargs),
    )
    return model_bridge
def test_ModelSetups_do_not_share_kwargs(self):
    """Tests that none of the preset model and bridge combinations share a kwarg."""
    for setup in MODEL_KEY_TO_MODEL_SETUP.values():
        model_arg_names = set(get_function_argument_names(setup.model_class))
        bridge_arg_names = set(get_function_argument_names(setup.bridge_class))
        # No argument name may appear in both constructor signatures.
        self.assertEqual(model_arg_names.intersection(bridge_arg_names), set())
def _get_model_kwargs(
    info: ModelSetup, kwargs: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Consolidate model-constructor kwargs for a given ``ModelSetup``:
    class defaults overridden by caller-passed ``kwargs``, filtered down to
    argument names the model class actually accepts.
    """
    defaults = get_function_default_arguments(info.model_class)
    accepted_names = get_function_argument_names(info.model_class)
    return consolidate_kwargs([defaults, kwargs], keywords=accepted_names)
def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Produce the next points in the experiment.

    Fetches experiment data when none is passed, enforces the current
    step's parallelism limit, then delegates to the underlying model's
    ``gen`` with the step's ``model_gen_kwargs`` merged with ``kwargs``.
    """
    self.experiment = experiment
    self._set_model(experiment=experiment, data=data or experiment.fetch_data())
    parallelism_limit = self._curr.max_parallelism
    currently_running = self.num_running_trials_for_current_step
    if parallelism_limit is not None and currently_running >= parallelism_limit:
        raise MaxParallelismReachedException(
            step=self._curr, num_running=currently_running
        )
    model = not_none(self.model)
    # Step-level gen kwargs first; caller kwargs take precedence.
    merged_gen_kwargs = consolidate_kwargs(
        kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
        keywords=get_function_argument_names(model.gen),
    )
    generator_run = model.gen(n=n, **merged_gen_kwargs)
    generator_run._generation_step_index = self._curr.index
    self._generator_runs.append(generator_run)
    return generator_run
def gen(
    self,
    experiment: Experiment,
    new_data: Optional[Data] = None,  # Take in just the new data.
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Produce the next points in the experiment.

    Decides, based on how many arms have been generated and observed so
    far, whether to keep the current model (optionally updating it with
    ``new_data``), switch to the next model in the strategy, or
    instantiate the first model. Internal state (``_data``, ``_generated``,
    ``_observed``, ``_generator_runs``) is only committed after the model's
    ``gen`` call succeeds.

    Args:
        experiment: Experiment this strategy generates arms for.
        new_data: Only the data that is new since the last call.
        n: Number of arms to generate.
        kwargs: Passed through to the underlying model's ``gen``, merged
            with the current step's ``model_gen_kwargs``.

    Returns:
        The generator run produced by the current model.

    Raises:
        ValueError: If the next model cannot be fit yet (not enough
            observations), or if fewer than ``n`` arms remain to be
            generated by the current model.
    """
    self._set_experiment(experiment=experiment)
    # Get arm signatures for each entry in new_data that is indeed new.
    new_arms = self._get_new_arm_signatures(experiment=experiment, new_data=new_data)
    enough_observed = (
        len(self._observed) + len(new_arms)
    ) >= self._curr.min_arms_observed
    # num_arms == -1 means the current step may generate indefinitely.
    unlimited_arms = self._curr.num_arms == -1
    enough_generated = (
        not unlimited_arms and len(self._generated) >= self._curr.num_arms
    )
    # NOTE(review): negative when unlimited_arms, but that case is guarded
    # out before `remaining_arms` is used below.
    remaining_arms = self._curr.num_arms - len(self._generated)
    # Check that minimum observed_arms is satisfied if it's enforced.
    if self._curr.enforce_num_arms and enough_generated and not enough_observed:
        raise ValueError(
            "All trials for current model have been generated, but not enough "
            "data has been observed to fit next model. Try again when more data "
            "are available.")
    # TODO[Lena, T44021164]: take into account failed trials. Potentially
    # reduce `_generated` count when a trial mentioned in new data failed.
    if (
        self._curr.enforce_num_arms
        and not unlimited_arms
        and 0 < remaining_arms < n
    ):
        raise ValueError(
            f"Cannot generate {n} new arms as there are only {remaining_arms} "
            "remaining arms to generate using the current model.")
    # Combine previously-seen data with the new data, if any.
    all_data = (
        Data.from_multiple_data(data=[self._data, new_data])
        if new_data
        else self._data
    )
    if self._model is None:
        # Instantiate the first model.
        self._set_current_model(experiment=experiment, data=all_data)
    elif enough_generated and enough_observed:
        # Change to the next model.
        self._change_model(experiment=experiment, data=all_data)
    elif new_data is not None:
        # We're sticking with the curr. model, but should update with new data.
        # pyre-fixme[16]: `Optional` has no attribute `update`.
        self._model.update(experiment=experiment, data=new_data)
    # Step-level gen kwargs first; caller kwargs take precedence.
    kwargs = consolidate_kwargs(
        kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
        keywords=get_function_argument_names(not_none(self._model).gen),
    )
    gen_run = not_none(self._model).gen(n=n, **kwargs)
    # If nothing failed, update known data, _generated, and _observed.
    self._data = all_data
    self._generated.extend([arm.signature for arm in gen_run.arms])
    self._observed.extend(new_arms)
    self._generator_runs.append(gen_run)
    return gen_run
def gen(self, **model_gen_kwargs: Any) -> GeneratorRun:
    """Generate candidates from the fitted model.

    Merges the model-gen kwargs stored on this model spec with any passed
    to this call — locally passed kwargs take precedence — then delegates
    to the fitted model's ``gen``.

    NOTE: The model must have been fit before calling ``gen``.

    Args:
        n: Integer representing how many arms should be in the generator
            run produced by this method. NOTE: Some underlying models may
            ignore the ``n`` and produce a model-determined number of arms.
            In that case this method will also output a generator run with
            number of arms that can differ from ``n``.
        pending_observations: A map from metric name to pending
            observations for that metric, used by some models to avoid
            resuggesting points that are currently being evaluated.
    """
    model = self.fitted_model
    merged_kwargs = consolidate_kwargs(
        kwargs_iterable=[self.model_gen_kwargs, model_gen_kwargs],
        keywords=get_function_argument_names(model.gen),
    )
    return model.gen(**merged_kwargs)
def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Produce the next points in the experiment.

    Extra kwargs passed to this method go straight to the underlying
    model's ``gen``, merged with the current generation step's
    ``model_gen_kwargs``.

    Args:
        experiment: Experiment for which this generation strategy produces
            a new generator run; trial statuses stored on it determine
            which model produces the run.
        data: Optional data passed to the underlying model's ``gen``. By
            default, this is all data on the experiment when `use_update`
            is False, and only data new since the last call when
            `use_update` is True.
        n: How many arms the resulting generator run should contain. Some
            underlying models may ignore `n` and produce a
            model-determined number of arms.

    Raises:
        MaxParallelismReachedException: If the current step's parallelism
            limit is already saturated by running trials.
    """
    self.experiment = experiment
    self._set_or_update_model(data=data)
    # Snapshot trial statuses so later calls can tell what is new.
    self._seen_trial_indices_by_status = deepcopy(
        experiment.trial_indices_by_status
    )
    parallelism_limit = self._curr.max_parallelism
    currently_running = self.num_running_trials_for_current_step
    if parallelism_limit is not None and currently_running >= parallelism_limit:
        raise MaxParallelismReachedException(
            step_index=self._curr.index,
            model_name=self._curr.model_name,
            num_running=currently_running,
        )
    model = not_none(self.model)
    # Step-level gen kwargs first; caller kwargs take precedence.
    merged_gen_kwargs = consolidate_kwargs(
        kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
        keywords=get_function_argument_names(model.gen),
    )
    generator_run = model.gen(n=n, **merged_gen_kwargs)
    generator_run._generation_step_index = self._curr.index
    self._generator_runs.append(generator_run)
    return generator_run
def _get_bridge_kwargs(
    info: ModelSetup, kwargs: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """Consolidate bridge-constructor kwargs for a given ``ModelSetup``:
    class defaults, then registry-standard kwargs and transforms, finally
    caller-passed ``kwargs`` (highest precedence). ``experiment``,
    ``search_space`` and ``data`` are supplied separately, so they are
    omitted from the accepted argument names.
    """
    layered_kwargs = [
        get_function_default_arguments(info.bridge_class),
        info.standard_bridge_kwargs,
        {"transforms": info.transforms},
        kwargs,
    ]
    accepted_names = get_function_argument_names(
        info.bridge_class, omit=["experiment", "search_space", "data"]
    )
    return consolidate_kwargs(layered_kwargs, keywords=accepted_names)
def __call__(
    self,
    search_space: Optional[SearchSpace] = None,
    experiment: Optional[Experiment] = None,
    data: Optional[Data] = None,
    silently_filter_kwargs: bool = True,  # TODO[Lena]: default to False
    **kwargs: Any,
) -> ModelBridge:
    """Instantiate the model and model-bridge pair registered for this enum
    value and return the resulting ``ModelBridge``.

    Args:
        search_space: Search space to fit the model bridge on; if omitted,
            taken from ``experiment`` (one of the two is required).
        experiment: Experiment the model bridge is associated with.
        data: Data to pass through to the model bridge constructor.
        silently_filter_kwargs: When False, validate that every passed kwarg
            is typed correctly for the model or bridge constructor.
        kwargs: Extra keyword arguments, split between model and bridge
            constructors by matching their argument names.

    Returns:
        A ``ModelBridge`` with the consolidated kwargs recorded on it for
        later storage on generator runs.
    """
    assert self.value in MODEL_KEY_TO_MODEL_SETUP
    # All model bridges require either a search space or an experiment.
    assert search_space or experiment, "Search space or experiment required."
    setup = MODEL_KEY_TO_MODEL_SETUP[self.value]
    model_cls = setup.model_class
    bridge_cls = setup.bridge_class
    if not silently_filter_kwargs:
        validate_kwarg_typing(  # TODO[Lena]: T46467254, pragma: no cover
            typed_callables=[model_cls, bridge_cls],
            search_space=search_space,
            experiment=experiment,
            data=data,
            **kwargs,
        )
    # Model arguments: class defaults, overridden by caller-passed kwargs.
    model_kwargs = consolidate_kwargs(
        kwargs_iterable=[get_function_default_arguments(model_cls), kwargs],
        keywords=get_function_argument_names(model_cls),
    )
    model = model_cls(**model_kwargs)
    # Bridge arguments: defaults, then registry-standard kwargs and
    # transforms, finally caller-passed kwargs (highest precedence).
    bridge_kwargs = consolidate_kwargs(
        kwargs_iterable=[
            get_function_default_arguments(bridge_cls),
            setup.standard_bridge_kwargs,
            {"transforms": setup.transforms},
            kwargs,
        ],
        keywords=get_function_argument_names(
            function=bridge_cls, omit=["experiment", "search_space", "data"]
        ),
    )
    # Build the bridge; fall back to the experiment's search space when
    # one was not passed explicitly.
    model_bridge = bridge_cls(
        search_space=search_space or not_none(experiment).search_space,
        experiment=experiment,
        data=data,
        model=model,
        **bridge_kwargs,
    )
    # Temporarily ignore Botorch callable & torch-typed arguments, as those
    # are not serializable to JSON out-of-the-box. TODO[Lena]: T46527142
    if isinstance(model, TorchModel):
        model_kwargs = {
            kw: p for kw, p in model_kwargs.items() if not callable(p)
        }
        bridge_kwargs = {
            kw: p for kw, p in bridge_kwargs.items() if not kw.startswith("torch")
        }
    # Store all kwargs on model bridge, to be saved on generator run.
    model_bridge._set_kwargs_to_save(
        model_key=self.value,
        model_kwargs=model_kwargs,
        bridge_kwargs=bridge_kwargs,
    )
    return model_bridge
def _gen_multiple(
    self,
    experiment: Experiment,
    num_generator_runs: int,
    data: Optional[Data] = None,
    n: int = 1,
    pending_observations: Optional[Dict[str, List[ObservationFeatures]]] = None,
    **kwargs: Any,
) -> List[GeneratorRun]:
    """Produce multiple generator runs at once, to be made into multiple
    trials on the experiment.

    NOTE: This is used to ensure that maximum paralellism and number
    of trials per step are not violated when producing many generator
    runs from this generation strategy in a row. Without this function,
    if one generates multiple generator runs without first making any
    of them into running trials, generation strategy cannot enforce that it only
    produces as many generator runs as are allowed by the paralellism
    limit and the limit on number of trials in current step.

    Args:
        experiment: Experiment, for which the generation strategy is producing
            a new generator run in the course of `gen`, and to which that
            generator run will be added as trial(s). Information stored on the
            experiment (e.g., trial statuses) is used to determine which model
            will be used to produce the generator run returned from this method.
        data: Optional data to be passed to the underlying model's `gen`, which
            is called within this method and actually produces the resulting
            generator run. By default, data is all data on the `experiment` if
            `use_update` is False and only the new data since the last call to
            this method if `use_update` is True.
        n: Integer representing how many arms should be in the generator run
            produced by this method. NOTE: Some underlying models may ignore
            the `n` and produce a model-determined number of arms. In that
            case this method will also output a generator run with number of
            arms that can differ from `n`.
        pending_observations: A map from metric name to pending
            observations for that metric, used by some models to avoid
            resuggesting points that are currently being evaluated.

    Returns:
        The list of generator runs produced (possibly fewer than
        ``num_generator_runs`` if the model runs out of data mid-way).
    """
    self.experiment = experiment
    self._maybe_move_to_next_step()
    self._set_or_update_current_model(data=data)
    # Snapshot trial statuses so later calls can tell what is new.
    self._save_seen_trial_indices()
    # Make sure to not make too many generator runs and
    # exceed maximum allowed paralellism for the step.
    num_until_max_parallelism = self._num_remaining_trials_until_max_parallelism()
    if num_until_max_parallelism is not None:
        num_generator_runs = min(num_generator_runs, num_until_max_parallelism)
    # Make sure not to extend number of trials expected in step.
    if self._curr.enforce_num_trials and self._curr.num_trials > 0:
        num_generator_runs = min(
            num_generator_runs,
            self._curr.num_trials - self.num_can_complete_this_step,
        )
    model = not_none(self.model)
    # Step-level gen kwargs first; caller kwargs take precedence.
    model_gen_kwargs = consolidate_kwargs(
        kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
        keywords=get_function_argument_names(model.gen),
    )
    generator_runs = []
    for _ in range(num_generator_runs):
        try:
            # Helper handles deduplication against existing arms and caps
            # the number of draws at MAX_GEN_DRAWS.
            generator_run = _produce_generator_run_from_model(
                model=model,
                input_max_gen_draws=MAX_GEN_DRAWS,
                n=n,
                pending_observations=pending_observations,
                model_gen_kwargs=model_gen_kwargs,
                should_deduplicate=self._curr.should_deduplicate,
                arms_by_signature=self.experiment.arms_by_signature,
            )
            generator_run._generation_step_index = self._curr.index
            self._generator_runs.append(generator_run)
            generator_runs.append(generator_run)
        except DataRequiredError as err:
            # Model needs more data, so we log the error and return
            # as many generator runs as we were able to produce, unless
            # no trials were produced at all (in which case its safe to raise).
            if len(generator_runs) == 0:
                raise
            logger.debug(f"Model required more data: {err}.")
            break
    return generator_runs
def _gen_multiple(
    self,
    experiment: Experiment,
    num_generator_runs: int,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> List[GeneratorRun]:
    """Produce multiple generator runs at once, to be made into multiple
    trials on the experiment.

    NOTE: This is used to ensure that maximum paralellism and number
    of trials per step are not violated when producing many generator
    runs from this generation strategy in a row. Without this function,
    if one generates multiple generator runs without first making any
    of them into running trials, generation strategy cannot enforce that it only
    produces as many generator runs as are allowed by the paralellism
    limit and the limit on number of trials in current step.

    Args:
        experiment: Experiment for which the generation strategy produces
            new generator runs; trial statuses stored on it determine which
            model produces the runs.
        num_generator_runs: How many generator runs to produce (may be
            reduced to respect parallelism and per-step trial limits).
        data: Optional data passed to the underlying model's `gen`.
        n: How many arms each generator run should contain.
        kwargs: Passed through to the underlying model's `gen`, merged with
            the current step's `model_gen_kwargs`.

    Returns:
        Clones of the generator runs produced (possibly fewer than
        ``num_generator_runs`` if the model runs out of data mid-way).

    Raises:
        MaxParallelismReachedException: If the current step's parallelism
            limit is already saturated by running trials.
        DataRequiredError: If the model needs more data before producing
            even a single generator run.
    """
    self.experiment = experiment
    self._set_or_update_model(data=data)
    self._save_seen_trial_indices()
    max_parallelism = self._curr.max_parallelism
    num_running = self.num_running_trials_this_step
    # Make sure to not make too many generator runs and
    # exceed maximum allowed paralellism for the step.
    if max_parallelism is not None:
        if num_running >= max_parallelism:
            raise MaxParallelismReachedException(
                step_index=self._curr.index,
                model_name=self._curr.model_name,
                num_running=num_running,
            )
        else:
            num_generator_runs = min(
                num_generator_runs, max_parallelism - num_running
            )
    # Make sure not to extend number of trials expected in step.
    if self._curr.enforce_num_trials and self._curr.num_trials > 0:
        num_generator_runs = min(
            num_generator_runs,
            self._curr.num_trials - self.num_can_complete_this_step,
        )
    model = not_none(self.model)
    # TODO[T79183560]: Cloning generator runs here is a temporary measure
    # to ensure a 1-to-1 correspondence between user-facing generator runs
    # and their stored SQL counterparts. This will be no longer needed soon
    # as we move to use foreign keys to avoid storing generotor runs on both
    # experiment and generation strategy like we do now.
    generator_run_clones = []
    for _ in range(num_generator_runs):
        try:
            generator_run = model.gen(
                n=n,
                **consolidate_kwargs(
                    kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
                    keywords=get_function_argument_names(model.gen),
                ),
            )
            generator_run._generation_step_index = self._curr.index
            self._generator_runs.append(generator_run)
            generator_run_clones.append(generator_run.clone())
        except DataRequiredError as err:
            # Model needs more data, so we log the error and return
            # as many generator runs as we were able to produce, unless
            # no trials were produced at all (in which case its safe to raise).
            if len(generator_run_clones) == 0:
                raise
            logger.debug(f"Model required more data: {err}.")
            # Bug fix: stop the loop once the model has signaled it needs
            # more data — without this `break`, remaining iterations would
            # re-invoke `model.gen` just to raise and log the same error
            # again (matches the sibling `_gen_multiple` implementation).
            break
    return generator_run_clones
def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Produce the next points in the experiment.

    Diffs the passed (or fetched) data against the data this strategy has
    already seen to find newly-observed arms, decides whether to advance to
    the next model or keep (and re-fit) the current one, then delegates to
    the model's ``gen``. Internal state (``_data``, ``_generated``,
    ``_observed``, ``_generator_runs``) is only committed after ``gen``
    succeeds.

    Args:
        experiment: Experiment this strategy generates arms for.
        data: Data to use for the model; fetched from the experiment when
            not provided.
        n: Number of arms to generate.
        kwargs: Passed through to the underlying model's ``gen``, merged
            with the current step's ``model_gen_kwargs``.

    Returns:
        The generator run produced by the current model.

    Raises:
        DataRequiredError: If the next model cannot be fit yet because not
            enough observations are available.
    """
    self._set_experiment(experiment=experiment)
    new_arm_signatures = set()
    data = data or experiment.fetch_data()
    if data is not None and not data.df.empty:
        if self._data.df.empty:
            # No data seen before: everything in `data` is new.
            new_data = data.df
        else:
            # Select only the new data to determine how many new arms were
            # evaluated since the generation strategy was last updated with
            # data (find rows that are in `data.df`, but not in `self._data.df`)
            merged = data.df.merge(
                self._data.df,
                on=[
                    "arm_name", "trial_index", "metric_name", "mean", "sem"
                ],
                how="left",
                indicator=True,
            )
            # "_merge" == "left_only" marks rows present only in `data.df`.
            new_data = merged[merged["_merge"] == "left_only"]
        # Get arm signatures for each entry in data that the GS hasn't seen yet.
        # Rows for arms not on the experiment or on failed trials are skipped.
        new_arm_signatures = {
            not_none(experiment.arms_by_name.get(
                row["arm_name"])).signature
            for _, row in new_data.iterrows()
            if (row["arm_name"] in experiment.arms_by_name
                and not not_none(experiment.trials.get(
                    row["trial_index"])).status.is_failed)
        }
    enough_observed = (
        len(self._observed) + len(new_arm_signatures)
    ) >= self._curr.min_arms_observed
    # num_arms == -1 means the current step may generate indefinitely.
    unlimited_arms = self._curr.num_arms == -1
    enough_generated = (
        not unlimited_arms and len(self._generated) >= self._curr.num_arms
    )
    # Check that minimum observed_arms is satisfied if it's enforced.
    if self._curr.enforce_num_arms and enough_generated and not enough_observed:
        raise DataRequiredError(
            "All trials for current model have been generated, but not enough "
            "data has been observed to fit next model. Try again when more data "
            "are available.")
    # TODO[Lena, T44021164]: take into account failed trials. Potentially
    # reduce `_generated` count when a trial mentioned in new data failed.
    lgr = self.last_generator_run
    if enough_generated and enough_observed:
        # Change to the next model.
        self._change_model(experiment=experiment, data=data)
    elif lgr is not None and lgr._model_state_after_gen is not None:
        # Restore the current model from the state captured after its
        # last `gen` call.
        model_state = not_none(lgr._model_state_after_gen)
        self._set_current_model(experiment=experiment, data=data, **model_state)
    else:
        self._set_current_model(experiment=experiment, data=data)
    model = not_none(self._model)
    # Step-level gen kwargs first; caller kwargs take precedence.
    kwargs = consolidate_kwargs(
        kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
        keywords=get_function_argument_names(not_none(self._model).gen),
    )
    gen_run = model.gen(n=n, **kwargs)
    # If nothing failed, update known data, _generated, and _observed.
    self._data = data
    self._generated.extend([arm.signature for arm in gen_run.arms])
    self._observed.extend(new_arm_signatures)
    self._generator_runs.append(gen_run)
    return gen_run
def _gen_multiple(
    self,
    experiment: Experiment,
    num_generator_runs: int,
    data: Optional[Data] = None,
    n: int = 1,
    pending_observations: Optional[Dict[str, List[ObservationFeatures]]] = None,
    **kwargs: Any,
) -> List[GeneratorRun]:
    """Produce multiple generator runs at once, to be made into multiple
    trials on the experiment.

    NOTE: This is used to ensure that maximum paralellism and number
    of trials per step are not violated when producing many generator
    runs from this generation strategy in a row. Without this function,
    if one generates multiple generator runs without first making any
    of them into running trials, generation strategy cannot enforce that it only
    produces as many generator runs as are allowed by the paralellism
    limit and the limit on number of trials in current step.

    Args:
        experiment: Experiment, for which the generation strategy is producing
            a new generator run in the course of `gen`, and to which that
            generator run will be added as trial(s). Information stored on the
            experiment (e.g., trial statuses) is used to determine which model
            will be used to produce the generator run returned from this method.
        data: Optional data to be passed to the underlying model's `gen`, which
            is called within this method and actually produces the resulting
            generator run. By default, data is all data on the `experiment` if
            `use_update` is False and only the new data since the last call to
            this method if `use_update` is True.
        n: Integer representing how many arms should be in the generator run
            produced by this method. NOTE: Some underlying models may ignore
            the `n` and produce a model-determined number of arms. In that
            case this method will also output a generator run with number of
            arms that can differ from `n`.
        pending_observations: A map from metric name to pending
            observations for that metric, used by some models to avoid
            resuggesting points that are currently being evaluated.

    Returns:
        Clones of the generator runs produced (possibly fewer than
        ``num_generator_runs`` if the model runs out of data mid-way).

    Raises:
        MaxParallelismReachedException: If the current step's parallelism
            limit is already saturated by running trials.
        DataRequiredError: If the model needs more data before producing
            even a single generator run.
    """
    self.experiment = experiment
    self._set_or_update_model(data=data)
    # Snapshot trial statuses so later calls can tell what is new.
    self._save_seen_trial_indices()
    max_parallelism = self._curr.max_parallelism
    num_running = self.num_running_trials_this_step
    # Make sure to not make too many generator runs and
    # exceed maximum allowed paralellism for the step.
    if max_parallelism is not None:
        if num_running >= max_parallelism:
            raise MaxParallelismReachedException(
                step_index=self._curr.index,
                model_name=self._curr.model_name,
                num_running=num_running,
            )
        else:
            num_generator_runs = min(
                num_generator_runs, max_parallelism - num_running
            )
    # Make sure not to extend number of trials expected in step.
    if self._curr.enforce_num_trials and self._curr.num_trials > 0:
        num_generator_runs = min(
            num_generator_runs,
            self._curr.num_trials - self.num_can_complete_this_step,
        )
    model = not_none(self.model)
    # TODO[T79183560]: Cloning generator runs here is a temporary measure
    # to ensure a 1-to-1 correspondence between user-facing generator runs
    # and their stored SQL counterparts. This will be no longer needed soon
    # as we move to use foreign keys to avoid storing generotor runs on both
    # experiment and generation strategy like we do now.
    generator_run_clones = []
    for _ in range(num_generator_runs):
        try:
            generator_run = model.gen(
                n=n,
                pending_observations=pending_observations,
                # Step-level gen kwargs first; caller kwargs take precedence.
                **consolidate_kwargs(
                    kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
                    keywords=get_function_argument_names(model.gen),
                ),
            )
            generator_run._generation_step_index = self._curr.index
            self._generator_runs.append(generator_run)
            generator_run_clones.append(generator_run.clone())
        except DataRequiredError as err:
            # Model needs more data, so we log the error and return
            # as many generator runs as we were able to produce, unless
            # no trials were produced at all (in which case its safe to raise).
            if len(generator_run_clones) == 0:
                raise
            logger.debug(f"Model required more data: {err}.")
            break
    return generator_run_clones