def testEmptyMetrics(self):
    empty_experiment = Experiment(
        name="test_experiment", search_space=get_search_space()
    )
    self.assertEqual(empty_experiment.num_trials, 0)
    with self.assertRaises(ValueError):
        empty_experiment.fetch_data()
    batch = empty_experiment.new_batch_trial()
    self.assertEqual(empty_experiment.num_trials, 1)
    with self.assertRaises(ValueError):
        batch.fetch_data()
    empty_experiment.add_tracking_metric(Metric(name="some_metric"))
    empty_experiment.attach_data(get_data())
    self.assertFalse(empty_experiment.fetch_data().df.empty)

def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Produce the next points in the experiment."""
    self.experiment = experiment
    self._set_model(experiment=experiment, data=data or experiment.fetch_data())
    max_parallelism = self._curr.max_parallelism
    num_running = self.num_running_trials_for_current_step
    if max_parallelism is not None and num_running >= max_parallelism:
        raise MaxParallelismReachedException(
            step=self._curr, num_running=num_running
        )
    model = not_none(self.model)
    generator_run = model.gen(
        n=n,
        **consolidate_kwargs(
            kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
            keywords=get_function_argument_names(model.gen),
        ),
    )
    generator_run._generation_step_index = self._curr.index
    self._generator_runs.append(generator_run)
    return generator_run

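# A minimal usage sketch for the parallelism gate above: when the current
# generation step already has its maximum number of trials running, `gen`
# raises `MaxParallelismReachedException`, and the caller typically waits for
# data before retrying. The `generation_strategy`, `experiment`, and polling
# interval below are assumed to exist at the call site; this is a sketch, not
# the library's own retry logic.
import time

from ax.exceptions.generation_strategy import MaxParallelismReachedException


def generate_when_capacity_allows(generation_strategy, experiment, poll_seconds=60):
    """Retry `gen` until the current generation step has free parallelism."""
    while True:
        try:
            return generation_strategy.gen(experiment=experiment)
        except MaxParallelismReachedException:
            # Too many trials from this step are still running; wait for
            # results to come in before asking for another candidate.
            time.sleep(poll_seconds)
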
def testEmptyMetrics(self):
    empty_experiment = Experiment(
        name="test_experiment", search_space=get_search_space()
    )
    self.assertEqual(empty_experiment.num_trials, 0)
    with self.assertRaises(ValueError):
        empty_experiment.fetch_data()
    batch = empty_experiment.new_batch_trial()
    batch.mark_running(no_runner_required=True)
    self.assertEqual(empty_experiment.num_trials, 1)
    with self.assertRaises(ValueError):
        batch.fetch_data()
    empty_experiment.add_tracking_metric(Metric(name="ax_test_metric"))
    self.assertTrue(empty_experiment.fetch_data().df.empty)
    empty_experiment.attach_data(get_data())
    batch.mark_completed()
    self.assertFalse(empty_experiment.fetch_data().df.empty)

def _extract_optimization_trace_from_metrics(experiment: Experiment) -> np.ndarray:
    names = []
    for trial in experiment.trials.values():
        for i, arm in enumerate(trial.arms):
            reps = int(trial.weights[i]) if isinstance(trial, BatchTrial) else 1
            names.extend([arm.name] * reps)
    iters_df = pd.DataFrame({"arm_name": names})
    data_df = experiment.fetch_data(noisy=False).df
    metrics = data_df["metric_name"].unique()
    true_values = {}
    for metric in metrics:
        df_m = data_df[data_df["metric_name"] == metric]
        # Get one row per arm
        df_m = df_m.groupby("arm_name").first().reset_index()
        df_b = pd.merge(iters_df, df_m, how="left", on="arm_name")
        true_values[metric] = df_b["mean"].values
    if isinstance(experiment.optimization_config, MultiObjectiveOptimizationConfig):
        return feasible_hypervolume(
            # pyre-fixme[6]: Expected `OptimizationConfig` for 1st param but got
            #  `Optional[ax.core.optimization_config.OptimizationConfig]`.
            optimization_config=experiment.optimization_config,
            values=true_values,
        )
    return best_feasible_objective(
        # pyre-fixme[6]: Expected `OptimizationConfig` for 1st param but got
        #  `Optional[ax.core.optimization_config.OptimizationConfig]`.
        optimization_config=experiment.optimization_config,
        values=true_values,
    )

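# Illustrative, self-contained sketch (not Ax's implementation) of what a
# "best feasible objective" trace looks like for a single minimized metric:
# at each iteration the trace holds the best feasible value seen so far, with
# infeasible points contributing +inf. The numbers below are made up.
import numpy as np

objective = np.array([5.0, 3.2, 4.1, 2.7, 2.9])
feasible = np.array([True, False, True, True, True])

trace = np.minimum.accumulate(np.where(feasible, objective, np.inf))
print(trace)  # [5.  5.  4.1 2.7 2.7]
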
def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Generate new points, rotating through projections each time."""
    if data is None:
        data = experiment.fetch_data()
    if not isinstance(data, Data):
        raise ValueError(
            "Data fetched from experiment not an instance of PTS-supporting `Data`"
        )
    # Get the next model in the rotation
    i = self.current_iteration % self.k
    data_by_proj = self._filter_data_to_projection(
        experiment=experiment,
        data=data,
        arm_sigs=self.arms_by_proj[i],
    )
    lgr = self.last_generator_run
    # NOTE: May need to `model_class.deserialize_model_state` in the
    # future if using non-readily serializable state.
    model_state = (
        not_none(lgr._model_state_after_gen)
        if lgr is not None and lgr._model_state_after_gen is not None
        else {}
    )
    A, bounds_d = self.projections[i]
    if (
        data_by_proj is None
        or len(data_by_proj.df["arm_name"].unique()) < self.init_per_proj
    ):
        # Not enough data to switch to GP, use Sobol for initialization
        m = get_rembo_initializer(
            search_space=experiment.search_space,
            A=A.double().numpy(),
            bounds_d=bounds_d,
            **model_state,
        )
    else:
        # We have enough data to switch to GP.
        m = get_REMBO(
            experiment=experiment,
            data=data_by_proj,
            A=A,
            initial_X_d=torch.tensor(
                self.X_d_by_proj[i], dtype=self.dtype, device=self.device
            ),
            bounds_d=bounds_d,
            **self.gp_kwargs,
        )
    self.current_iteration += 1
    # Call gen
    gr = m.gen(n=n)
    self.X_d_by_proj[i].extend(not_none(m.model).X_d_gen)  # pyre-ignore[16]
    self.arms_by_proj[i].update(a.signature for a in gr.arms)
    self._generator_runs.append(gr)
    return gr

def testEmptyMetrics(self):
    empty_experiment = Experiment(
        name="test_experiment",
        search_space=get_search_space(),
        default_data_type=DataType.MAP_DATA,
    )
    self.assertEqual(empty_experiment.num_trials, 0)
    empty_experiment.add_tracking_metric(Metric(name="ax_test_metric"))
    self.assertTrue(empty_experiment.fetch_data().df.empty)
    empty_experiment.attach_data(get_map_data())

def get_best_raw_objective_point_with_trial_index(
    experiment: Experiment,
    optimization_config: Optional[OptimizationConfig] = None,
) -> Tuple[int, TParameterization, Dict[str, Tuple[float, float]]]:
    """Given an experiment, identifies the arm that had the best raw objective,
    based on the data fetched from the experiment.

    Args:
        experiment: Experiment, on which to identify best raw objective arm.
        optimization_config: Optimization config to use in absence or in place of
            the one stored on the experiment.

    Returns:
        Tuple of trial index, parameterization, and a mapping from metric name to
        a tuple of the corresponding objective mean and SEM.
    """
    # pyre-ignore [16]
    if isinstance(experiment.optimization_config.objective, MultiObjective):
        logger.warning(
            "get_best_raw_objective_point is deprecated for multi-objective "
            "optimization. This method will return an arbitrary point on the "
            "pareto frontier."
        )
    opt_config = optimization_config or experiment.optimization_config
    assert opt_config is not None, (
        "Cannot identify the best point without an optimization config, but no "
        "optimization config was provided on the experiment or as an argument."
    )
    dat = experiment.fetch_data()
    if dat.df.empty:
        raise ValueError("Cannot identify best point if experiment contains no data.")
    objective = opt_config.objective
    if isinstance(objective, ScalarizedObjective):
        best_row = _get_best_row_for_scalarized_objective(
            df=dat.df, objective=objective
        )
    else:
        best_row = _get_best_feasible_row_for_single_objective(
            df=dat.df,
            optimization_config=opt_config,
            status_quo=experiment.status_quo,
        )
    # pyre-fixme[6]: Expected `str` for 1st param but got `Series`.
    best_arm = experiment.arms_by_name[best_row["arm_name"]]
    best_trial_index = best_row["trial_index"]
    objective_rows = dat.df.loc[
        (dat.df["arm_name"] == best_arm.name)
        & (dat.df["trial_index"] == best_trial_index)
    ]
    vals = {
        row["metric_name"]: (row["mean"], row["sem"])
        for _, row in objective_rows.iterrows()
    }
    # pyre-fixme[7]: Expected `int` for 1st param but got `Series`.
    return best_trial_index, not_none(best_arm).parameters, vals

def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Generate new points, rotating through projections each time."""
    # Use all data in experiment if none is supplied
    data = data or experiment.fetch_data()
    # Get the next model in the rotation
    i = self.current_iteration % self.k
    data_by_proj = self._filter_data_to_projection(
        experiment=experiment, data=data, arm_sigs=self.arms_by_proj[i]
    )
    lgr = self.last_generator_run
    model_state = (
        not_none(lgr._model_state_after_gen)
        if lgr is not None and lgr._model_state_after_gen is not None
        else {}
    )
    A, bounds_d = self.projections[i]
    if (
        data_by_proj is None
        or len(data_by_proj.df["arm_name"].unique()) < self.init_per_proj
    ):
        # Not enough data to switch to GP, use Sobol for initialization
        m = get_rembo_initializer(
            search_space=experiment.search_space,
            A=A.double().numpy(),
            bounds_d=bounds_d,
            **model_state,
        )
    else:
        # We have enough data to switch to GP.
        m = get_REMBO(
            experiment=experiment,
            data=data_by_proj,
            A=A,
            initial_X_d=torch.tensor(
                self.X_d_by_proj[i], dtype=self.dtype, device=self.device
            ),
            bounds_d=bounds_d,
            **self.gp_kwargs,
        )
    self.current_iteration += 1
    # Call gen
    gr = m.gen(n=n)
    self.X_d_by_proj[i].extend(not_none(m.model).X_d_gen)  # pyre-ignore[16]
    self.arms_by_proj[i].update(a.signature for a in gr.arms)
    return gr

def _get_objective_trace_plot(
    experiment: Experiment,
    metric_name: str,
    model_transitions: List[int],
    optimization_direction: Optional[str] = None,
) -> Optional[go.Figure]:
    best_objectives = np.array([experiment.fetch_data().df["mean"]])
    return optimization_trace_single_method_plotly(
        y=best_objectives,
        title="Best objective found vs. # of iterations",
        ylabel=metric_name,
        model_transitions=model_transitions,
        optimization_direction=optimization_direction,
        plot_trial_points=True,
    )

def get_best_raw_objective_point(
    experiment: Experiment,
    optimization_config: Optional[OptimizationConfig] = None,
) -> Tuple[TParameterization, Dict[str, Tuple[float, float]]]:
    """Given an experiment, identifies the arm that had the best raw objective,
    based on the data fetched from the experiment.

    Args:
        experiment: Experiment, on which to identify best raw objective arm.
        optimization_config: Optimization config to use in absence or in place of
            the one stored on the experiment.

    Returns:
        Tuple of parameterization and a mapping from metric name to a tuple of
        the corresponding objective mean and SEM.
    """
    dat = experiment.fetch_data()
    if dat.df.empty:
        raise ValueError("Cannot identify best point if experiment contains no data.")
    opt_config = optimization_config or experiment.optimization_config
    assert opt_config is not None, (
        "Cannot identify the best point without an optimization config, but no "
        "optimization config was provided on the experiment or as an argument."
    )
    objective_name = opt_config.objective.metric.name
    objective_rows = dat.df.loc[dat.df["metric_name"] == objective_name]
    if objective_rows.empty:
        raise ValueError(f'No data has been logged for objective "{objective_name}".')
    best_row = (
        objective_rows.loc[objective_rows["mean"].idxmin()]
        if opt_config.objective.minimize
        else objective_rows.loc[objective_rows["mean"].idxmax()]
    )
    best_arm = experiment.arms_by_name.get(best_row["arm_name"])
    objective_rows = dat.df.loc[
        (dat.df["arm_name"] == best_row["arm_name"])
        & (dat.df["trial_index"] == best_row["trial_index"])
    ]
    vals = {
        row["metric_name"]: (row["mean"], row["sem"])
        for _, row in objective_rows.iterrows()
    }
    return not_none(best_arm).parameters, vals

def get_best_raw_objective_point(
    experiment: Experiment,
    optimization_config: Optional[OptimizationConfig] = None,
) -> Tuple[TParameterization, Dict[str, Tuple[float, float]]]:
    """Given an experiment, identifies the arm that had the best raw objective,
    based on the data fetched from the experiment.

    Args:
        experiment: Experiment, on which to identify best raw objective arm.
        optimization_config: Optimization config to use in absence or in place of
            the one stored on the experiment.

    Returns:
        Tuple of parameterization and a mapping from metric name to a tuple of
        the corresponding objective mean and SEM.
    """
    opt_config = optimization_config or experiment.optimization_config
    assert opt_config is not None, (
        "Cannot identify the best point without an optimization config, but no "
        "optimization config was provided on the experiment or as an argument."
    )
    dat = experiment.fetch_data()
    if dat.df.empty:
        raise ValueError("Cannot identify best point if experiment contains no data.")
    objective = opt_config.objective
    if isinstance(objective, ScalarizedObjective):
        best_row = _get_best_row_for_scalarized_objective(dat.df, objective)
    else:
        best_row = _get_best_row_for_single_objective(dat.df, objective)
    best_arm = experiment.arms_by_name[best_row["arm_name"]]
    best_trial_index = best_row["trial_index"]
    objective_rows = dat.df.loc[
        (dat.df["arm_name"] == best_arm.name)
        & (dat.df["trial_index"] == best_trial_index)
    ]
    vals = {
        row["metric_name"]: (row["mean"], row["sem"])
        for _, row in objective_rows.iterrows()
    }
    return not_none(best_arm).parameters, vals

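# A minimal usage sketch for the helper above. It assumes an `experiment` that
# already has attached data and an optimization config; "objective_metric" is
# a placeholder for whatever objective metric name the experiment defines.
best_params, metric_values = get_best_raw_objective_point(experiment)
mean, sem = metric_values["objective_metric"]  # hypothetical metric name
print(f"Best raw parameterization: {best_params} (mean={mean}, sem={sem})")
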
def _check_validity_and_get_data(
    self, experiment: Experiment
) -> Optional[MapData]:
    """Validity checks and returns the `MapData` used for early stopping."""
    if experiment.optimization_config is None:
        raise UnsupportedError(  # pragma: no cover
            "Experiment must have an optimization config in order to use an "
            "early stopping strategy."
        )
    optimization_config = not_none(experiment.optimization_config)
    objective_name = optimization_config.objective.metric.name
    data = experiment.fetch_data()
    if data.df.empty:
        logger.info(
            f"{self.__class__.__name__} received empty data. "
            "Not stopping any trials."
        )
        return None
    if objective_name not in set(data.df["metric_name"]):
        logger.info(
            f"{self.__class__.__name__} did not receive data "
            "from the objective metric. Not stopping any trials."
        )
        return None
    if not isinstance(data, MapData):
        logger.info(
            f"{self.__class__.__name__} expects MapData, but the "
            f"data attached to experiment is of type {type(data)}. "
            "Not stopping any trials."
        )
        return None
    data = checked_cast(MapData, data)
    map_keys = data.map_keys
    if len(list(map_keys)) > 1:
        logger.info(
            f"{self.__class__.__name__} expects MapData with a single "
            "map key, but the data attached to the experiment has multiple: "
            f"{data.map_keys}. Not stopping any trials."
        )
        return None
    return data

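# Illustrative sketch of the shape of data this check accepts: a long-format
# dataframe with exactly one map key (here "step") in addition to the usual
# trial/arm/metric columns. The frame below is made up and only meant to show
# why multiple map keys would be ambiguous for early stopping.
import pandas as pd

map_df = pd.DataFrame(
    {
        "trial_index": [0, 0, 1, 1],
        "arm_name": ["0_0", "0_0", "1_0", "1_0"],
        "metric_name": ["loss"] * 4,
        "step": [1, 2, 1, 2],  # the single map key
        "mean": [0.9, 0.7, 0.95, 0.8],
        "sem": [0.0] * 4,
    }
)
print(map_df)
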
def gen(
    self,
    experiment: Experiment,
    data: Optional[Data] = None,
    n: int = 1,
    **kwargs: Any,
) -> GeneratorRun:
    """Produce the next points in the experiment."""
    self._set_experiment(experiment=experiment)
    new_arm_signatures = set()
    data = data or experiment.fetch_data()
    if data is not None and not data.df.empty:
        if self._data.df.empty:
            new_data = data.df
        else:
            # Select only the new data to determine how many new arms were
            # evaluated since the generation strategy was last updated with
            # data (find rows that are in `data.df`, but not in `self._data.df`)
            merged = data.df.merge(
                self._data.df,
                on=["arm_name", "trial_index", "metric_name", "mean", "sem"],
                how="left",
                indicator=True,
            )
            new_data = merged[merged["_merge"] == "left_only"]
        # Get arm signatures for each entry in data that the GS hasn't seen yet.
        new_arm_signatures = {
            not_none(experiment.arms_by_name.get(row["arm_name"])).signature
            for _, row in new_data.iterrows()
            if (
                row["arm_name"] in experiment.arms_by_name
                and not not_none(
                    experiment.trials.get(row["trial_index"])
                ).status.is_failed
            )
        }
    enough_observed = (
        len(self._observed) + len(new_arm_signatures)
    ) >= self._curr.min_arms_observed
    unlimited_arms = self._curr.num_arms == -1
    enough_generated = (
        not unlimited_arms and len(self._generated) >= self._curr.num_arms
    )
    # Check that minimum observed_arms is satisfied if it's enforced.
    if self._curr.enforce_num_arms and enough_generated and not enough_observed:
        raise DataRequiredError(
            "All trials for current model have been generated, but not enough "
            "data has been observed to fit next model. Try again when more data "
            "are available."
        )
    # TODO[Lena, T44021164]: take into account failed trials. Potentially
    # reduce `_generated` count when a trial mentioned in new data failed.
    lgr = self.last_generator_run
    if enough_generated and enough_observed:
        # Change to the next model.
        self._change_model(experiment=experiment, data=data)
    elif lgr is not None and lgr._model_state_after_gen is not None:
        model_state = not_none(lgr._model_state_after_gen)
        self._set_current_model(experiment=experiment, data=data, **model_state)
    else:
        self._set_current_model(experiment=experiment, data=data)
    model = not_none(self._model)
    kwargs = consolidate_kwargs(
        kwargs_iterable=[self._curr.model_gen_kwargs, kwargs],
        keywords=get_function_argument_names(not_none(self._model).gen),
    )
    gen_run = model.gen(n=n, **kwargs)
    # If nothing failed, update known data, _generated, and _observed.
    self._data = data
    self._generated.extend([arm.signature for arm in gen_run.arms])
    self._observed.extend(new_arm_signatures)
    self._generator_runs.append(gen_run)
    return gen_run

def compute_pareto_frontier(
    experiment: Experiment,
    primary_objective: Metric,
    secondary_objective: Metric,
    data: Optional[Data] = None,
    outcome_constraints: Optional[List[OutcomeConstraint]] = None,
    absolute_metrics: Optional[List[str]] = None,
    num_points: int = 10,
    trial_index: Optional[int] = None,
    chebyshev: bool = True,
) -> ParetoFrontierResults:
    """Compute the Pareto frontier between two objectives. For experiments
    with batch trials, a trial index or data object must be provided.

    Args:
        experiment: The experiment to compute a pareto frontier for.
        primary_objective: The primary objective to optimize.
        secondary_objective: The secondary objective against which
            to trade off the primary objective.
        outcome_constraints: Outcome constraints to be respected by the optimization.
            Can only contain constraints on metrics that are not primary or
            secondary objectives.
        absolute_metrics: List of outcome metrics that should NOT be relativized
            w.r.t. the status quo (all other outcomes will be in % relative to
            status_quo).
        num_points: The number of points to compute on the Pareto frontier.
        chebyshev: Whether to use augmented_chebyshev_scalarization
            when computing Pareto Frontier points.

    Returns:
        ParetoFrontierResults: A NamedTuple with the following fields:
            - param_dicts: The parameter dicts of the points generated on the
                Pareto Frontier.
            - means: The posterior mean predictions of the model for each metric
                (same order as the param dicts).
            - sems: The posterior sem predictions of the model for each metric
                (same order as the param dicts).
            - primary_metric: The name of the primary metric.
            - secondary_metric: The name of the secondary metric.
            - absolute_metrics: List of outcome metrics that are NOT relativized
                w.r.t. the status quo (all other metrics are in % relative to
                status_quo).
    """
    # TODO(jej): Implement using MultiObjectiveTorchModelBridge's _pareto_frontier
    model_gen_options = {
        "acquisition_function_kwargs": {"chebyshev_scalarization": chebyshev}
    }
    if (
        trial_index is None
        and data is None
        and any(isinstance(t, BatchTrial) for t in experiment.trials.values())
    ):
        raise UnsupportedError(
            "Must specify trial index or data for experiment with batch trials"
        )
    absolute_metrics = [] if absolute_metrics is None else absolute_metrics
    for metric in absolute_metrics:
        if metric not in experiment.metrics:
            raise ValueError(f"Model was not fit on metric `{metric}`")
    if outcome_constraints is None:
        outcome_constraints = []
    else:
        # ensure we don't constrain an objective
        _validate_outcome_constraints(
            outcome_constraints=outcome_constraints,
            primary_objective=primary_objective,
            secondary_objective=secondary_objective,
        )
    # build posterior mean model
    if not data:
        try:
            data = (
                experiment.trials[trial_index].fetch_data()
                if trial_index is not None
                else experiment.fetch_data()
            )
        except Exception as e:
            logger.info(f"Could not fetch data from experiment or trial: {e}")
    oc = _build_new_optimization_config(
        weights=np.array([0.5, 0.5]),
        primary_objective=primary_objective,
        secondary_objective=secondary_objective,
        outcome_constraints=outcome_constraints,
    )
    model = Models.MOO(
        experiment=experiment,
        data=data,
        acqf_constructor=get_PosteriorMean,
        optimization_config=oc,
    )
    status_quo = experiment.status_quo
    if status_quo:
        try:
            status_quo_prediction = model.predict(
                [
                    ObservationFeatures(
                        parameters=status_quo.parameters,
                        # pyre-fixme [6]: Expected `Optional[np.int64]` for trial_index
                        trial_index=trial_index,
                    )
                ]
            )
        except ValueError as e:
            logger.warning(f"Could not predict OOD status_quo outcomes: {e}")
            status_quo = None
            status_quo_prediction = None
    else:
        status_quo_prediction = None
    param_dicts: List[TParameterization] = []
    # Construct weightings with linear angular spacing.
    # TODO: Verify whether 0, 1 weights cause problems because of subset_model.
    alpha = np.linspace(0 + 0.01, np.pi / 2 - 0.01, num_points)
    primary_weight = (-1 if primary_objective.lower_is_better else 1) * np.cos(alpha)
    secondary_weight = (-1 if secondary_objective.lower_is_better else 1) * np.sin(
        alpha
    )
    weights_list = np.stack([primary_weight, secondary_weight]).transpose()
    for weights in weights_list:
        oc = _build_new_optimization_config(
            weights=weights,
            primary_objective=primary_objective,
            secondary_objective=secondary_objective,
            outcome_constraints=outcome_constraints,
        )
        # TODO: (jej) T64002590 Let this serve as a starting point for optimization.
        # ex. Add global spacing criterion. Implement on BoTorch side.
        # pyre-fixme [6]: Expected different type for model_gen_options
        run = model.gen(
            1, model_gen_options=model_gen_options, optimization_config=oc
        )
        param_dicts.append(run.arms[0].parameters)
    # Call predict on points to get their decomposed metrics.
    means, cov = model.predict(
        [ObservationFeatures(parameters) for parameters in param_dicts]
    )
    return _extract_pareto_frontier_results(
        param_dicts=param_dicts,
        means=means,
        variances=cov,
        primary_metric=primary_objective.name,
        secondary_metric=secondary_objective.name,
        absolute_metrics=absolute_metrics,
        outcome_constraints=outcome_constraints,
        status_quo_prediction=status_quo_prediction,
    )

def _extract_asynchronous_optimization_trace(
    experiment: Experiment,
    start_time: float,
    end_time: float,
    delta_t: float,
    completed_time_key: str,
    include_only_completed_trials: bool,
) -> np.ndarray:
    """Extract optimization trace for an asynchronous benchmark run.

    This involves getting the `completed_time` from the trial `run_metadata`,
    as described by the `completed_time_key`. From the `start_time`, `end_time`,
    and `delta_t` arguments, a sequence of times is constructed. The returned
    optimization trace is the best achieved value so far for each time, amongst
    completed (or early stopped) trials.

    Args:
        experiment: The experiment from which to generate results.
        start_time: The starting time.
        end_time: The ending time.
        delta_t: The increment between successive time points.
        completed_time_key: The key from which we look up completed run times
            from trial `run_metadata`.
        include_only_completed_trials: Include results only from completed trials.
            This will ignore trials that were early stopped.

    Returns:
        An array representing the optimization trace as a function of time.
    """
    if any(isinstance(trial, BatchTrial) for trial in experiment.trials.values()):
        raise NotImplementedError("Batched trials are not yet supported.")

    def get_completed_time(row):
        time = experiment.trials[row.trial_index].run_metadata[completed_time_key]
        return pd.Series({"completed_time": time})

    if include_only_completed_trials:
        completed_trials = experiment.trial_indices_by_status[TrialStatus.COMPLETED]
        data_df = experiment.fetch_trials_data(
            trial_indices=completed_trials, noisy=False
        ).df
    else:
        data_df = experiment.fetch_data(noisy=False).df
    minimize = experiment.optimization_config.objective.minimize  # pyre-ignore[16]
    num_periods_running = int((end_time - start_time) // delta_t + 1)
    # TODO: Currently, the timestamps generated below must exactly match the
    # `completed_time` column
    iters_df = pd.DataFrame(
        {"completed_time": np.arange(num_periods_running) * delta_t + start_time}
    )
    true_values = {}
    for metric, df_m in data_df.groupby("metric_name"):
        df_m = data_df[data_df["metric_name"] == metric]
        # only keep the last data point for each arm
        df_m = (
            df_m.sort_values(["timestamp"], ascending=True)
            .groupby("arm_name")
            .tail(n=1)
        )
        # get completed times from run metadata
        df_m["completed_time"] = df_m.apply(get_completed_time, axis=1)
        # for trials that completed at the same time, keep only the best
        df_m_g = df_m.groupby("completed_time")
        df_m = (df_m_g.min() if minimize else df_m_g.max()).reset_index()
        # take cumulative best wrt the completed time
        df_m = df_m.sort_index()
        df_m["mean"] = df_m["mean"].cummin() if minimize else df_m["mean"].cummax()
        df_b = pd.merge(iters_df, df_m, how="left", on="completed_time")
        # replace nans with Infs, which can be handled by `best_feasible_objective`
        true_values[metric] = df_b["mean"].fillna(np.Inf if minimize else -np.Inf)
    return best_feasible_objective(
        # pyre-fixme[6]: Expected `OptimizationConfig` for 1st param but got
        #  `Optional[ax.core.optimization_config.OptimizationConfig]`.
        optimization_config=experiment.optimization_config,
        values=true_values,
    )

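# Self-contained sketch of the time-bucketing idea above, with made-up numbers:
# build a regular time grid, align each completed trial's objective to its
# completion time, take the running best, and fill gaps so that times before
# the first completion stay "infinitely bad" for a minimized metric. This is an
# illustration of the idea, not the function's exact implementation.
import numpy as np
import pandas as pd

grid = pd.DataFrame({"completed_time": np.arange(5) * 1.0})  # t = 0..4
completions = pd.DataFrame({"completed_time": [1.0, 3.0], "mean": [0.8, 0.5]})
completions["mean"] = completions["mean"].cummin()  # running best (minimize)

trace = pd.merge(grid, completions, how="left", on="completed_time")
trace["mean"] = trace["mean"].fillna(np.inf).cummin()
print(trace["mean"].to_list())  # [inf, 0.8, 0.8, 0.5, 0.5]
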
def exp_to_df(
    exp: Experiment,
    metrics: Optional[List[Metric]] = None,
    run_metadata_fields: Optional[List[str]] = None,
    trial_properties_fields: Optional[List[str]] = None,
    **kwargs: Any,
) -> pd.DataFrame:
    """Transforms an experiment to a DataFrame. Only supports Experiment and
    SimpleExperiment.

    Transforms an Experiment into a dataframe with rows keyed by trial_index
    and arm_name, metrics pivoted into one row.

    Args:
        exp: An Experiment that may have pending trials.
        metrics: Override list of metrics to return. Return all metrics if None.
        run_metadata_fields: fields to extract from trial.run_metadata for trial
            in experiment.trials. If there are multiple arms per trial, these
            fields will be replicated across the arms of a trial.
        trial_properties_fields: fields to extract from trial._properties for
            trial in experiment.trials. If there are multiple arms per trial,
            these fields will be replicated across the arms of a trial. Output
            column names will be prepended with "trial_properties_".
        **kwargs: Custom named arguments, useful for passing complex objects from
            call-site to the `fetch_data` callback.

    Returns:
        DataFrame: A dataframe of inputs, metadata and metrics by trial and arm.
        If no trials are available, returns an empty dataframe. If no metric
        outputs are available, returns a dataframe of inputs and metadata.
    """

    def prep_return(
        df: pd.DataFrame, drop_col: str, sort_by: List[str]
    ) -> pd.DataFrame:
        return not_none(not_none(df.drop(drop_col, axis=1)).sort_values(sort_by))

    def merge_trials_dict_with_df(
        df: pd.DataFrame, trials_dict: Dict[int, Any], column_name: str
    ) -> None:
        """Add a column ``column_name`` to a DataFrame ``df`` containing a column
        ``trial_index``. Each value of the new column is given by the element of
        ``trials_dict`` indexed by ``trial_index``.

        Args:
            df: Pandas DataFrame with column ``trial_index``, to be appended with
                a new column.
            trials_dict: Dict mapping each ``trial_index`` to a value. The new
                column of df will be populated with the value corresponding with
                the ``trial_index`` of each row.
            column_name: Name of the column to be appended to ``df``.
        """
        if "trial_index" not in df.columns:
            raise ValueError("df must have trial_index column")
        if any(trials_dict.values()):  # field present for any trial
            if not all(trials_dict.values()):  # not present for all trials
                logger.warning(
                    f"Column {column_name} missing for some trials. "
                    "Filling with None when missing."
                )
            df[column_name] = [
                trials_dict[trial_index] for trial_index in df.trial_index
            ]
        else:
            logger.warning(
                f"Column {column_name} missing for all trials. "
                "Not appending column."
            )

    def get_generation_method_str(trial: BaseTrial) -> str:
        generation_methods = {
            not_none(generator_run._model_key)
            for generator_run in trial.generator_runs
            if generator_run._model_key is not None
        }
        # add "Manual" if any generator_runs are manual
        if any(
            generator_run.generator_run_type == GeneratorRunType.MANUAL.name
            for generator_run in trial.generator_runs
        ):
            generation_methods.add("Manual")
        return ", ".join(generation_methods) if generation_methods else "Unknown"

    # Accept Experiment and SimpleExperiment
    if isinstance(exp, MultiTypeExperiment):
        raise ValueError("Cannot transform MultiTypeExperiments to DataFrames.")

    key_components = ["trial_index", "arm_name"]

    # Get each trial-arm with parameters
    arms_df = pd.DataFrame()
    for trial_index, trial in exp.trials.items():
        for arm in trial.arms:
            arms_df = arms_df.append(
                {"arm_name": arm.name, "trial_index": trial_index, **arm.parameters},
                ignore_index=True,
            )

    # Fetch results; in case arms_df is empty, return empty results (legacy behavior)
    results = exp.fetch_data(metrics, **kwargs).df
    if len(arms_df.index) == 0:
        if len(results.index) != 0:
            raise ValueError(
                "exp.fetch_data().df returned more rows than there are experimental "
                "arms. This is an inconsistent experimental state. Please report to "
                "Ax support."
            )
        return results

    # Create key column from key_components
    arms_df["trial_index"] = arms_df["trial_index"].astype(int)
    key_col = "-".join(key_components)
    key_vals = (
        arms_df[key_components[0]].astype("str")
        + arms_df[key_components[1]].astype("str")
    )
    arms_df[key_col] = key_vals

    # Add trial status
    trials = exp.trials.items()
    trial_to_status = {index: trial.status.name for index, trial in trials}
    merge_trials_dict_with_df(
        df=arms_df, trials_dict=trial_to_status, column_name="trial_status"
    )

    # Add generation_method, accounting for the generic case that generator_runs is of
    # arbitrary length. Repeated methods within a trial are condensed via `set` and an
    # empty set will yield "Unknown" as the method.
    trial_to_generation_method = {
        trial_index: get_generation_method_str(trial) for trial_index, trial in trials
    }
    merge_trials_dict_with_df(
        df=arms_df,
        trials_dict=trial_to_generation_method,
        column_name="generation_method",
    )

    # Add any trial properties fields to arms_df
    if trial_properties_fields is not None:
        # add trial._properties fields
        for field in trial_properties_fields:
            trial_to_properties_field = {
                trial_index: (
                    trial._properties[field] if field in trial._properties else None
                )
                for trial_index, trial in trials
            }
            merge_trials_dict_with_df(
                df=arms_df,
                trials_dict=trial_to_properties_field,
                column_name="trial_properties_" + field,
            )

    # Add any run_metadata fields to arms_df
    if run_metadata_fields is not None:
        # add run_metadata fields
        for field in run_metadata_fields:
            trial_to_metadata_field = {
                trial_index: (
                    trial.run_metadata[field] if field in trial.run_metadata else None
                )
                for trial_index, trial in trials
            }
            merge_trials_dict_with_df(
                df=arms_df,
                trials_dict=trial_to_metadata_field,
                column_name=field,
            )

    if len(results.index) == 0:
        logger.info(
            f"No results present for the specified metrics `{metrics}`. "
            "Returning arm parameters and metadata only."
        )
        exp_df = arms_df
    elif not all(col in results.columns for col in key_components):
        logger.warning(
            f"At least one of key columns `{key_components}` not present in results df "
            f"`{results}`. Returning arm parameters and metadata only."
        )
        exp_df = arms_df
    else:
        # prepare results for merge
        key_vals = (
            results[key_components[0]].astype("str")
            + results[key_components[1]].astype("str")
        )
        results[key_col] = key_vals
        metric_vals = results.pivot(
            index=key_col, columns="metric_name", values="mean"
        ).reset_index()
        # dedupe results by key_components
        metadata = results[key_components + [key_col]].drop_duplicates()
        metrics_df = pd.merge(metric_vals, metadata, on=key_col)
        # merge and return
        exp_df = pd.merge(
            metrics_df, arms_df, on=key_components + [key_col], how="outer"
        )
    return prep_return(df=exp_df, drop_col=key_col, sort_by=["arm_name"])

def exp_to_df(
    exp: Experiment,
    metrics: Optional[List[Metric]] = None,
    run_metadata_fields: Optional[List[str]] = None,
    trial_properties_fields: Optional[List[str]] = None,
    **kwargs: Any,
) -> pd.DataFrame:
    """Transforms an experiment to a DataFrame with rows keyed by trial_index
    and arm_name, metrics pivoted into one row. If the pivot results in more than
    one row per arm (or one row per ``arm * map_keys`` combination if ``map_keys``
    are present), results are omitted and a warning is produced. Only supports
    ``Experiment``.

    Transforms an ``Experiment`` into a ``pd.DataFrame``.

    Args:
        exp: An ``Experiment`` that may have pending trials.
        metrics: Override list of metrics to return. Return all metrics if ``None``.
        run_metadata_fields: fields to extract from ``trial.run_metadata`` for trial
            in ``experiment.trials``. If there are multiple arms per trial, these
            fields will be replicated across the arms of a trial.
        trial_properties_fields: fields to extract from ``trial._properties`` for
            trial in ``experiment.trials``. If there are multiple arms per trial,
            these fields will be replicated across the arms of a trial. Output
            column names will be prepended with ``"trial_properties_"``.
        **kwargs: Custom named arguments, useful for passing complex objects from
            call-site to the `fetch_data` callback.

    Returns:
        DataFrame: A dataframe of inputs, metadata and metrics by trial and arm (and
        ``map_keys``, if present). If no trials are available, returns an empty
        dataframe. If no metric outputs are available, returns a dataframe of
        inputs and metadata.
    """
    # Accept Experiment and SimpleExperiment
    if isinstance(exp, MultiTypeExperiment):
        raise ValueError("Cannot transform MultiTypeExperiments to DataFrames.")

    key_components = ["trial_index", "arm_name"]

    # Get each trial-arm with parameters
    arms_df = pd.DataFrame()
    for trial_index, trial in exp.trials.items():
        for arm in trial.arms:
            arms_df = arms_df.append(
                {"arm_name": arm.name, "trial_index": trial_index, **arm.parameters},
                ignore_index=True,
            )

    # Fetch results; in case arms_df is empty, return empty results (legacy behavior)
    data = exp.fetch_data(metrics, **kwargs)
    results = data.df
    if len(arms_df.index) == 0:
        if len(results.index) != 0:
            raise ValueError(
                "exp.fetch_data().df returned more rows than there are experimental "
                "arms. This is an inconsistent experimental state. Please report to "
                "Ax support."
            )
        return results

    # Create key column from key_components
    arms_df["trial_index"] = arms_df["trial_index"].astype(int)

    # Add trial status
    trials = exp.trials.items()
    trial_to_status = {index: trial.status.name for index, trial in trials}
    _merge_trials_dict_with_df(
        df=arms_df, trials_dict=trial_to_status, column_name="trial_status"
    )

    # Add generation_method, accounting for the generic case that generator_runs is of
    # arbitrary length. Repeated methods within a trial are condensed via `set` and an
    # empty set will yield "Unknown" as the method.
    trial_to_generation_method = {
        trial_index: _get_generation_method_str(trial) for trial_index, trial in trials
    }
    _merge_trials_dict_with_df(
        df=arms_df,
        trials_dict=trial_to_generation_method,
        column_name="generation_method",
    )

    # Add any trial properties fields to arms_df
    if trial_properties_fields is not None:
        # add trial._properties fields
        for field in trial_properties_fields:
            trial_to_properties_field = {
                trial_index: (
                    trial._properties[field] if field in trial._properties else None
                )
                for trial_index, trial in trials
            }
            _merge_trials_dict_with_df(
                df=arms_df,
                trials_dict=trial_to_properties_field,
                column_name="trial_properties_" + field,
            )

    # Add any run_metadata fields to arms_df
    if run_metadata_fields is not None:
        # add run_metadata fields
        for field in run_metadata_fields:
            trial_to_metadata_field = {
                trial_index: (
                    trial.run_metadata[field] if field in trial.run_metadata else None
                )
                for trial_index, trial in trials
            }
            _merge_trials_dict_with_df(
                df=arms_df,
                trials_dict=trial_to_metadata_field,
                column_name=field,
            )

    exp_df = _merge_results_if_no_duplicates(
        arms_df=arms_df,
        data=data,
        key_components=key_components,
        metrics=metrics or list(exp.metrics.values()),
    )

    return not_none(not_none(exp_df).sort_values(["trial_index"]))

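# A minimal usage sketch for `exp_to_df`, assuming an existing `experiment`
# whose trials store a "start_time" entry in `run_metadata` (that field name is
# a placeholder; pass whichever metadata keys your runner actually records).
df = exp_to_df(
    exp=experiment,
    run_metadata_fields=["start_time"],  # hypothetical metadata key
    trial_properties_fields=None,
)
print(df[["trial_index", "arm_name", "trial_status", "generation_method"]].head())
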
def get_standard_plots(
    experiment: Experiment,
    model: Optional[ModelBridge],
    data: Optional[Data] = None,
    model_transitions: Optional[List[int]] = None,
    true_objective_metric_name: Optional[str] = None,
) -> List[go.Figure]:
    """Extract standard plots for single-objective optimization.

    Extracts a list of plots from an ``Experiment`` and ``ModelBridge`` of general
    interest to an Ax user. Currently not supported are
    - TODO: multi-objective optimization
    - TODO: ChoiceParameter plots

    Args:
        - experiment: The ``Experiment`` from which to obtain standard plots.
        - model: The ``ModelBridge`` used to suggest trial parameters.
        - data: If specified, data to which to fit the model before generating plots.
        - model_transitions: The arm numbers at which shifts in generation_strategy
          occur.

    Returns:
        - a plot of objective value vs. trial index, to show experiment progression
        - a plot of objective value vs. range parameter values, only included if the
          model associated with generation_strategy can create predictions. This
          consists of:

            - a plot_slice plot if the search space contains one range parameter
            - an interact_contour plot if the search space contains multiple
              range parameters
    """
    if (
        true_objective_metric_name is not None
        and true_objective_metric_name not in experiment.metrics.keys()
    ):
        raise ValueError(
            f"true_objective_metric_name='{true_objective_metric_name}' is not present "
            f"in experiment.metrics={experiment.metrics}. Please add a valid "
            "true_objective_metric_name or remove the optional parameter to get "
            "standard plots."
        )

    objective = not_none(experiment.optimization_config).objective
    if isinstance(objective, ScalarizedObjective):
        logger.warning(
            "get_standard_plots does not currently support ScalarizedObjective "
            "optimization experiments. Returning an empty list."
        )
        return []

    if data is None:
        data = experiment.fetch_data()

    if data.df.empty:
        logger.info(f"Experiment {experiment} does not yet have data, nothing to plot.")
        return []

    output_plot_list = []
    output_plot_list.extend(
        _get_objective_trace_plot(
            experiment=experiment,
            data=data,
            model_transitions=model_transitions
            if model_transitions is not None
            else [],
            true_objective_metric_name=true_objective_metric_name,
        )
    )

    # Objective vs. parameter plot requires a `Model`, so add it only if model
    # is already available. In cases where initially custom trials are attached,
    # model might not yet be set on the generation strategy.
    if model:
        # TODO: Check if model can predict in favor of try/catch.
        try:
            if true_objective_metric_name is not None:
                output_plot_list.append(
                    _objective_vs_true_objective_scatter(
                        model=model,
                        objective_metric_name=objective.metric_names[0],
                        true_objective_metric_name=true_objective_metric_name,
                    )
                )
            output_plot_list.extend(
                _get_objective_v_param_plots(
                    experiment=experiment,
                    model=model,
                )
            )
            output_plot_list.extend(_get_cross_validation_plots(model=model))
            feature_importance_plot = plot_feature_importance_by_feature_plotly(
                model=model, relative=False, caption=FEATURE_IMPORTANCE_CAPTION
            )
            feature_importance_plot.layout.title = "[ADVANCED] " + str(
                # pyre-fixme[16]: go.Figure has no attribute `layout`
                feature_importance_plot.layout.title.text
            )
            output_plot_list.append(feature_importance_plot)
            output_plot_list.append(interact_fitted_plotly(model=model, rel=False))
        except NotImplementedError:
            # Model does not implement `predict` method.
            pass

    return [plot for plot in output_plot_list if plot is not None]

def exp_to_df(
    exp: Experiment,
    metrics: Optional[List[Metric]] = None,
    key_components: Optional[List[str]] = None,
    run_metadata_fields: Optional[List[str]] = None,
    **kwargs: Any,
) -> pd.DataFrame:
    """Transforms an experiment to a DataFrame. Only supports Experiment and
    SimpleExperiment.

    Transforms an Experiment into a dataframe with rows keyed by trial_index
    and arm_name, metrics pivoted into one row.

    Args:
        exp: An Experiment that may have pending trials.
        metrics: Override list of metrics to return. Return all metrics if None.
        key_components: fields that combine to make a unique key corresponding
            to rows, similar to the list of fields passed to a GROUP BY.
            Defaults to ['arm_name', 'trial_index'].
        run_metadata_fields: fields to extract from trial.run_metadata for trial
            in experiment.trials. If there are multiple arms per trial, these
            fields will be replicated across the arms of a trial.
        **kwargs: Custom named arguments, useful for passing complex objects from
            call-site to the `fetch_data` callback.

    Returns:
        DataFrame: A dataframe of inputs and metrics by trial and arm.
    """

    def prep_return(
        df: pd.DataFrame, drop_col: str, sort_by: List[str]
    ) -> pd.DataFrame:
        return not_none(not_none(df.drop(drop_col, axis=1)).sort_values(sort_by))

    key_components = key_components or ["trial_index", "arm_name"]

    # Accept Experiment and SimpleExperiment
    if isinstance(exp, MultiTypeExperiment):
        raise ValueError("Cannot transform MultiTypeExperiments to DataFrames.")

    results = exp.fetch_data(metrics, **kwargs).df
    if len(results.index) == 0:  # Handle empty case
        return results

    # create key column from key_components
    key_col = "-".join(key_components)
    key_vals = results[key_components[0]].astype("str")
    for key in key_components[1:]:
        key_vals = key_vals + results[key].astype("str")
    results[key_col] = key_vals

    # pivot dataframe from long to wide
    metric_vals = results.pivot(
        index=key_col, columns="metric_name", values="mean"
    ).reset_index()

    # dedupe results by key_components
    metadata = results[key_components + [key_col]].drop_duplicates()
    metric_and_metadata = pd.merge(metric_vals, metadata, on=key_col)

    # get params of each arm and merge with deduped results
    arm_names_and_params = pd.DataFrame(
        [{"arm_name": name, **arm.parameters} for name, arm in exp.arms_by_name.items()]
    )
    exp_df = pd.merge(metric_and_metadata, arm_names_and_params, on="arm_name")

    # add trial status
    trials = exp.trials.items()
    trial_to_status = {index: trial.status.name for index, trial in trials}
    exp_df["trial_status"] = [trial_to_status[key] for key in exp_df.trial_index]

    # if no run_metadata fields are requested, return exp_df so far
    if run_metadata_fields is None:
        return prep_return(df=exp_df, drop_col=key_col, sort_by=key_components)
    if not isinstance(run_metadata_fields, list):
        raise ValueError("run_metadata_fields must be List[str] or None.")

    # add additional run_metadata fields
    for field in run_metadata_fields:
        trial_to_metadata_field = {
            index: (trial.run_metadata[field] if field in trial.run_metadata else None)
            for index, trial in trials
        }
        if any(trial_to_metadata_field.values()):  # field present for any trial
            if not all(trial_to_metadata_field.values()):  # not present for all trials
                logger.warning(
                    f"Field {field} missing for some trials' run_metadata. "
                    "Returning None when missing."
                )
            exp_df[field] = [trial_to_metadata_field[key] for key in exp_df.trial_index]
        else:
            logger.warning(
                f"Field {field} missing for all trials' run_metadata. "
                "Not appending column."
            )
    return prep_return(df=exp_df, drop_col=key_col, sort_by=key_components)

def get_standard_plots(
    experiment: Experiment, generation_strategy: GenerationStrategy
) -> List[go.Figure]:
    """Extract standard plots for single-objective optimization.

    Extracts a list of plots from an Experiment and GenerationStrategy of general
    interest to an Ax user. Currently not supported are
    - TODO: multi-objective optimization
    - TODO: ChoiceParameter plots

    Args:
        - experiment: the Experiment from which to obtain standard plots.
        - generation_strategy: the GenerationStrategy used to suggest trial
          parameters in experiment

    Returns:
        - a plot of objective value vs. trial index, to show experiment progression
        - a plot of objective value vs. range parameter values, only included if the
          model associated with generation_strategy can create predictions. This
          consists of:

            - a plot_slice plot if the search space contains one range parameter
            - an interact_contour plot if the search space contains multiple
              range parameters
    """
    objective = not_none(experiment.optimization_config).objective
    if isinstance(objective, MultiObjective):
        logger.warning(
            "get_standard_plots does not currently support MultiObjective "
            "optimization experiments. Returning an empty list."
        )
        return []
    if isinstance(objective, ScalarizedObjective):
        logger.warning(
            "get_standard_plots does not currently support ScalarizedObjective "
            "optimization experiments. Returning an empty list."
        )
        return []
    if experiment.fetch_data().df.empty:
        logger.info(f"Experiment {experiment} does not yet have data, nothing to plot.")
        return []

    output_plot_list = []
    output_plot_list.append(
        _get_objective_trace_plot(
            experiment=experiment,
            metric_name=not_none(experiment.optimization_config).objective.metric.name,
            model_transitions=generation_strategy.model_transitions,
            optimization_direction=(
                "minimize"
                if not_none(experiment.optimization_config).objective.minimize
                else "maximize"
            ),
        )
    )

    try:
        output_plot_list.append(
            _get_objective_v_param_plot(
                search_space=experiment.search_space,
                model=not_none(generation_strategy.model),
                metric_name=not_none(
                    experiment.optimization_config
                ).objective.metric.name,
                trials=experiment.trials,
            )
        )
    except NotImplementedError:
        # Model does not implement `predict` method.
        pass

    return [plot for plot in output_plot_list if plot is not None]

def get_observed_pareto_frontiers(
    experiment: Experiment,
    data: Optional[Data] = None,
    rel: bool = True,
) -> List[ParetoFrontierResults]:
    """
    Find all Pareto points from an experiment.

    Uses only values as observed in the data; no modeling is involved. Makes no
    assumption about the search space or types of parameters. If `data` is
    provided, it will be used; otherwise all data attached to the experiment
    will be used.

    Uses all arms present in data; does not filter according to experiment
    search space.

    Assumes experiment has a multiobjective optimization config from which the
    objectives and outcome constraints will be extracted.

    Will generate a ParetoFrontierResults for every pair of metrics in the
    experiment's multiobjective optimization config.
    """
    if data is None:
        data = experiment.fetch_data()
    if experiment.optimization_config is None:
        raise ValueError("Experiment must have an optimization config")
    mb = get_tensor_converter_model(experiment=experiment, data=data)
    pareto_observations = observed_pareto_frontier(modelbridge=mb)
    # Convert to ParetoFrontierResults
    metric_names = [
        metric.name
        for metric in experiment.optimization_config.objective.metrics  # pyre-ignore
    ]
    pfr_means = {name: [] for name in metric_names}
    pfr_sems = {name: [] for name in metric_names}

    for obs in pareto_observations:
        for i, name in enumerate(obs.data.metric_names):
            pfr_means[name].append(obs.data.means[i])
            pfr_sems[name].append(np.sqrt(obs.data.covariance[i, i]))

    # Relativize as needed
    if rel and experiment.status_quo is not None:
        # Get status quo values
        sq_df = data.df[
            data.df["arm_name"] == experiment.status_quo.name  # pyre-ignore
        ]
        sq_df = sq_df.to_dict(orient="list")  # pyre-ignore
        sq_means = {}
        sq_sems = {}
        for i, metric in enumerate(sq_df["metric_name"]):
            sq_means[metric] = sq_df["mean"][i]
            sq_sems[metric] = sq_df["sem"][i]
        # Relativize
        for name in metric_names:
            if np.isnan(sq_sems[name]) or np.isnan(pfr_sems[name]).any():
                # Just relativize means
                pfr_means[name] = [
                    (mu / sq_means[name] - 1) * 100 for mu in pfr_means[name]
                ]
            else:
                # Use delta method
                pfr_means[name], pfr_sems[name] = relativize(
                    means_t=pfr_means[name],
                    sems_t=pfr_sems[name],
                    mean_c=sq_means[name],
                    sem_c=sq_sems[name],
                    as_percent=True,
                )
        absolute_metrics = []
    else:
        absolute_metrics = metric_names

    objective_thresholds = {}
    if experiment.optimization_config.objective_thresholds is not None:  # pyre-ignore
        for objth in experiment.optimization_config.objective_thresholds:
            is_rel = objth.metric.name not in absolute_metrics
            if objth.relative != is_rel:
                raise ValueError(
                    f"Objective threshold for {objth.metric.name} has "
                    f"rel={objth.relative} but was specified here as rel={is_rel}"
                )
            objective_thresholds[objth.metric.name] = objth.bound

    # Construct ParetoFrontierResults for each pair
    pfr_list = []
    param_dicts = [obs.features.parameters for obs in pareto_observations]
    arm_names = [obs.arm_name for obs in pareto_observations]

    for metric_a, metric_b in combinations(metric_names, 2):
        pfr_list.append(
            ParetoFrontierResults(
                param_dicts=param_dicts,
                means=pfr_means,
                sems=pfr_sems,
                primary_metric=metric_a,
                secondary_metric=metric_b,
                absolute_metrics=absolute_metrics,
                objective_thresholds=objective_thresholds,
                arm_names=arm_names,
            )
        )
    return pfr_list

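# Quick numeric illustration of the mean-only relativization used above when
# SEMs are unavailable: values are expressed as a percent change relative to
# the status quo. The numbers are made up.
sq_mean = 2.0
pareto_means = [2.1, 2.5]
print([(mu / sq_mean - 1) * 100 for mu in pareto_means])  # [5.0, 25.0]
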
def get_standard_plots(
    experiment: Experiment,
    model: Optional[ModelBridge],
    model_transitions: Optional[List[int]] = None,
) -> List[go.Figure]:
    """Extract standard plots for single-objective optimization.

    Extracts a list of plots from an ``Experiment`` and ``ModelBridge`` of general
    interest to an Ax user. Currently not supported are
    - TODO: multi-objective optimization
    - TODO: ChoiceParameter plots

    Args:
        - experiment: The ``Experiment`` from which to obtain standard plots.
        - model: The ``ModelBridge`` used to suggest trial parameters.
        - model_transitions: The arm numbers at which shifts in generation_strategy
          occur.

    Returns:
        - a plot of objective value vs. trial index, to show experiment progression
        - a plot of objective value vs. range parameter values, only included if the
          model associated with generation_strategy can create predictions. This
          consists of:

            - a plot_slice plot if the search space contains one range parameter
            - an interact_contour plot if the search space contains multiple
              range parameters
    """
    objective = not_none(experiment.optimization_config).objective
    if isinstance(objective, MultiObjective):
        logger.warning(
            "get_standard_plots does not currently support MultiObjective "
            "optimization experiments. Returning an empty list."
        )
        return []
    if isinstance(objective, ScalarizedObjective):
        logger.warning(
            "get_standard_plots does not currently support ScalarizedObjective "
            "optimization experiments. Returning an empty list."
        )
        return []
    if experiment.fetch_data().df.empty:
        logger.info(f"Experiment {experiment} does not yet have data, nothing to plot.")
        return []

    output_plot_list = []
    output_plot_list.append(
        _get_objective_trace_plot(
            experiment=experiment,
            metric_name=not_none(experiment.optimization_config).objective.metric.name,
            model_transitions=model_transitions
            if model_transitions is not None
            else [],
            optimization_direction=(
                "minimize"
                if not_none(experiment.optimization_config).objective.minimize
                else "maximize"
            ),
        )
    )

    # Objective vs. parameter plot requires a `Model`, so add it only if model
    # is already available. In cases where initially custom trials are attached,
    # model might not yet be set on the generation strategy.
    if model:
        # TODO: Check if model can predict in favor of try/catch.
        try:
            output_plot_list.append(
                _get_objective_v_param_plot(
                    search_space=experiment.search_space,
                    model=model,
                    metric_name=not_none(
                        experiment.optimization_config
                    ).objective.metric.name,
                    trials=experiment.trials,
                )
            )
            output_plot_list.append(_get_cross_validation_plot(model))
        except NotImplementedError:
            # Model does not implement `predict` method.
            pass

    return [plot for plot in output_plot_list if plot is not None]

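# A minimal usage sketch for `get_standard_plots`, assuming an existing
# single-objective `experiment` with attached data and a `generation_strategy`
# whose current model is a fitted ModelBridge (both assumed at the call site).
figures = get_standard_plots(
    experiment=experiment,
    model=generation_strategy.model,  # may be None early in the experiment
    model_transitions=generation_strategy.model_transitions,
)
for fig in figures:
    fig.show()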