Example #1
0
 def test_start_and_end_time_in_trial_completion(self):
     # Timing metadata passed to `complete_trial` should show up as
     # `start_time` / `end_time` columns in the fetched data frame.
     began_at = current_timestamp_in_millis()
     search_space = [
         {"name": "x", "type": "range", "bounds": [-5.0, 10.0]},
         {"name": "y", "type": "range", "bounds": [0.0, 15.0]},
     ]
     client = AxClient()
     client.create_experiment(parameters=search_space, minimize=True)
     # Only the trial index is needed; the suggested parameters are ignored.
     _, trial_idx = client.get_next_trial()
     timing = {
         "start_time": began_at,
         "end_time": current_timestamp_in_millis(),
     }
     client.complete_trial(trial_index=trial_idx, raw_data=1.0, metadata=timing)
     fetched = client.experiment.fetch_data().df
     self.assertGreater(fetched["end_time"][0], fetched["start_time"][0])
Example #2
0
 def testFromFidelityEvaluations(self):
     # Two evaluations for arm "0_1", each pairing a fidelity dict with a
     # metric dict; the resulting frame is expected to hold one row per pair.
     first_eval = ({"f1": 1.0, "f2": 0.5}, {"b": (3.7, 0.5)})
     second_eval = ({"f1": 1.0, "f2": 0.75}, {"b": (3.8, 0.5)})
     data = Data.from_fidelity_evaluations(
         evaluations={"0_1": [first_eval, second_eval]},
         trial_index=0,
         sample_sizes={"0_1": 2},
         start_time=current_timestamp_in_millis(),
         end_time=current_timestamp_in_millis(),
     )
     frame = data.df
     self.assertEqual(len(frame), 2)
     for timing_column in ("start_time", "end_time"):
         self.assertIn(timing_column, frame)
Example #3
0
 def testFromEvaluations(self):
     # A single evaluation of metric "b" for arm "0_1" should give a one-row
     # frame carrying the timing columns, and differ from the fixture data.
     single_eval = {"0_1": {"b": (3.7, 0.5)}}
     data = Data.from_evaluations(
         evaluations=single_eval,
         trial_index=0,
         sample_sizes={"0_1": 2},
         start_time=current_timestamp_in_millis(),
         end_time=current_timestamp_in_millis(),
     )
     self.assertEqual(len(data.df), 1)
     self.assertNotEqual(data, Data(self.df))
     for timing_column in ("start_time", "end_time"):
         self.assertIn(timing_column, data.df)
Example #4
0
 def test_init(self):
     # Construction is expected to raise for the experiment fixture whose
     # metrics do not implement fetching.
     with self.assertRaisesRegex(
         UnsupportedError, ".* metrics .* implemented fetching"
     ):
         BareBonesTestScheduler(
             experiment=self.branin_experiment_no_impl_metrics,
             generation_strategy=self.sobol_GPEI_GS,
             options=SchedulerOptions(total_trials=10),
         )

     # A valid scheduler must expose exactly what it was constructed with.
     scheduler = BareBonesTestScheduler(
         experiment=self.branin_experiment,
         generation_strategy=self.sobol_GPEI_GS,
         options=SchedulerOptions(
             total_trials=0,
             tolerated_trial_failure_rate=0.2,
             init_seconds_between_polls=10,
         ),
     )
     self.assertEqual(scheduler.experiment, self.branin_experiment)
     self.assertEqual(scheduler.generation_strategy, self.sobol_GPEI_GS)
     options = scheduler.options
     self.assertEqual(options.total_trials, 0)
     self.assertEqual(options.tolerated_trial_failure_rate, 0.2)
     self.assertEqual(options.init_seconds_between_polls, 10)
     self.assertIsNone(scheduler._latest_optimization_start_timestamp)

     # Every status property starts out as an empty list.
     experiment_properties = scheduler.experiment._properties
     for status_prop in ExperimentStatusProperties:
         self.assertEqual(experiment_properties[status_prop.value], [])

     # With `total_trials=0` no trials run, but the optimization start
     # timestamp should be recorded by the call.
     scheduler.run_all_trials()
     self.assertLessEqual(
         scheduler._latest_optimization_start_timestamp,
         current_timestamp_in_millis(),
     )
Example #5
0
 def generation_strategy_to_sqa(
         self, generation_strategy: GenerationStrategy,
         experiment_id: Optional[int]) -> SQAGenerationStrategy:
     """Encode an Ax `GenerationStrategy` as its SQLAlchemy counterpart.

     The strategy's name, JSON-encoded steps, generated/observed state,
     current step index, generator runs and data are all passed to the
     SQA class looked up in `self.config.class_to_sqa_class`.

     Args:
         generation_strategy: Strategy to encode.
         experiment_id: Value stored as `experiment_id` on the result; may
             be `None`.

     Returns:
         The newly constructed SQA generation strategy object.
     """
     # pyre-ignore[9]: Expected Base, but redeclared to `SQAGenerationStrategy`.
     gs_class: SQAGenerationStrategy = self.config.class_to_sqa_class[cast(
         Type[Base], GenerationStrategy)]

     # Locals are computed in the same order the constructor arguments are
     # listed below.
     strategy_name = generation_strategy.name
     steps_json = object_to_json(generation_strategy._steps)
     generated = generation_strategy._generated
     observed = generation_strategy._observed
     current_step_index = generation_strategy._curr.index
     sqa_generator_runs = [
         self.generator_run_to_sqa(run)
         for run in generation_strategy._generator_runs
     ]
     # Generation strategy data is a compilation of data, so it does not
     # strictly speaking have a timestamp. Setting timestamp to current.
     sqa_data = self.data_to_sqa(
         data=generation_strategy._data,
         timestamp=current_timestamp_in_millis(),
         trial_index=None,
     )
     # pyre-fixme[29]: `SQAGenerationStrategy` is not a function.
     return gs_class(
         name=strategy_name,
         steps=steps_json,
         generated=generated,
         observed=observed,
         curr_index=current_step_index,
         generator_runs=sqa_generator_runs,
         data=sqa_data,
         experiment_id=experiment_id,
     )
Example #6
0
    def attach_data(self, data: Data, combine_with_last_data: bool = False) -> int:
        """Attach data to experiment. Stores data in `experiment._data_by_trial`,
        to be looked up via `experiment.lookup_data_by_trial`.

        The rows of `data.df` are split by `trial_index`, and each trial's rows
        are stored under one shared millisecond timestamp.

        Args:
            data: Data object to store.
            combine_with_last_data: By default, the new rows for a trial are
                stored as a separate `Data` object keyed by the new timestamp.
                When set to `True` and the trial already has stored data, the
                new rows are instead combined with the most recently stored
                `Data` for that trial and the combined object is stored under
                the new timestamp (the earlier entry is kept as well). Before
                combining, this validates that the new rows share no
                (`trial_index`, `metric_name`, `arm_name`) combination with
                the most recent data stored.

        Returns:
            Timestamp of storage in millis.

        Raises:
            ValueError: If `data.df` is empty, or if `combine_with_last_data`
                is set and the new rows overlap with the last stored data.
        """
        if data.df.empty:
            raise ValueError("Data to attach is empty.")
        cur_time_millis = current_timestamp_in_millis()
        # NOTE: trials are processed one at a time, so if a later trial raises,
        # the trials handled before it have already been updated.
        for trial_index, trial_df in data.df.groupby(data.df["trial_index"]):
            current_trial_data = (
                self._data_by_trial[trial_index]
                if trial_index in self._data_by_trial
                else OrderedDict()
            )
            if combine_with_last_data and len(current_trial_data) > 0:
                # Entries are in insertion order, so the last value is the
                # most recently attached data.
                last_data = list(current_trial_data.values())[-1]
                merged = pd.merge(
                    last_data.df,
                    trial_df,
                    on=["trial_index", "metric_name", "arm_name"],
                    how="inner",
                )
                if not merged.empty:
                    # Report the distinct metric names rather than the repr of
                    # a pandas Series (index, name and dtype included).
                    conflicting_metrics = ", ".join(
                        str(name) for name in merged["metric_name"].unique()
                    )
                    raise ValueError(
                        f"Last data for trial {trial_index} already contained an "
                        f"observation for metric {conflicting_metrics}."
                    )
                current_trial_data[cur_time_millis] = Data.from_multiple_data(
                    [last_data, Data(trial_df)]
                )
            else:
                current_trial_data[cur_time_millis] = Data(trial_df)
            self._data_by_trial[trial_index] = current_trial_data

        return cur_time_millis
Example #7
0
def _observations_from_dataframe(
    experiment: Experiment, df: pd.DataFrame, cols: List[str], arm_name_only: bool
) -> List[Observation]:
    """Helper method for extracting observations grouped by `cols` from `df`.

    Args:
        experiment: Experiment whose `arms_by_name` supplies each arm's
            parameters and whose `trials` supply candidate metadata.
        df: Data frame with `metric_name`, `mean` and `sem` columns, plus
            every column named in `cols`.
        cols: Columns to group the rows by.
        arm_name_only: If `True`, the group key is taken to be the arm name
            itself and no trial index is attached. Otherwise the group key
            is zipped with `cols` to recover the feature values, which must
            then include `"arm_name"`.

    Returns:
        One `Observation` per group, in the order `groupby` yields them.
    """
    observations = []
    for g, d in df.groupby(by=cols):
        # pandas >= 2.0 yields a 1-tuple as the key when grouping by a
        # length-1 list, while older versions yield the bare scalar.
        # Normalize both shapes so the lookups below work on either.
        if arm_name_only:
            arm_name = g[0] if isinstance(g, tuple) and len(g) == 1 else g
            features = {"arm_name": arm_name}
            trial_index = None
        else:
            group_key = g if isinstance(g, tuple) else (g,)
            features = dict(zip(cols, group_key))
            arm_name = features["arm_name"]
            trial_index = features.get("trial_index", None)
        obs_kwargs = {}
        obs_parameters = experiment.arms_by_name[arm_name].parameters.copy()
        if obs_parameters:
            obs_kwargs["parameters"] = obs_parameters
        obs_kwargs.update(
            {f: val for f, val in features.items() if f in OBS_KWARGS}
        )
        fidelities = features.get("fidelities")
        if fidelities is not None:
            # Updates the same dict object stored in `obs_kwargs` above.
            # NOTE(review): when the arm has no parameters, nothing was stored
            # above and these fidelities are dropped — confirm this is intended.
            obs_parameters.update(json.loads(fidelities))
        if trial_index is not None:
            trial = experiment.trials[trial_index]
            metadata = (
                trial._get_candidate_metadata_from_all_generator_runs().get(arm_name)
                or {}
            )
            # NOTE(review): if the helper hands back the stored metadata dict
            # rather than a copy, this write also changes it — confirm.
            if Keys.OBS_FROM_DF_TIMESTAMP not in metadata:
                metadata[Keys.OBS_FROM_DF_TIMESTAMP] = current_timestamp_in_millis()
            obs_kwargs[Keys.METADATA] = metadata
        observations.append(
            Observation(
                features=ObservationFeatures(**obs_kwargs),
                data=ObservationData(
                    metric_names=d["metric_name"].tolist(),
                    means=d["mean"].values,
                    # Diagonal covariance built from the squared SEMs.
                    covariance=np.diag(d["sem"].values ** 2),
                ),
                arm_name=arm_name,
            )
        )
    return observations
Example #8
0
    def attach_data(self, data: Data) -> int:
        """Store `data` on the experiment, split by trial and keyed by timestamp.

        Args:
            data: Data object whose `df` rows are grouped by `trial_index`.

        Returns:
            The millisecond timestamp under which the data was stored.
        """
        stored_at = current_timestamp_in_millis()
        frame = data.df
        for trial_idx, trial_rows in frame.groupby(frame["trial_index"]):
            # Reuse the trial's existing timestamp -> Data mapping if present.
            if trial_idx in self._data_by_trial:
                per_trial = self._data_by_trial[trial_idx]
            else:
                per_trial = OrderedDict()
            per_trial[stored_at] = Data(trial_rows)
            self._data_by_trial[trial_idx] = per_trial

        return stored_at
Example #9
0
    def attach_data(self,
                    data: AbstractDataFrameData,
                    combine_with_last_data: bool = False) -> int:
        """Attach data to experiment. Stores data in `experiment._data_by_trial`,
        to be looked up via `experiment.lookup_data_for_trial`.

        The rows of `data.df` are split by `trial_index`, and each trial's rows
        are stored under one shared millisecond timestamp. Metrics present in
        the data but not on the experiment are only logged, not added.

        Args:
            data: Data object to store.
            combine_with_last_data: By default, the new rows for a trial are
                stored as a separate data object keyed by the new timestamp.
                When set to `True` and the trial already has stored data, the
                new rows are instead combined with the most recently stored
                data for that trial (using that data's type) and the combined
                object is stored under the new timestamp; the earlier entry is
                kept as well. Before combining, this validates that the new
                rows share no (`trial_index`, `metric_name`, `arm_name`)
                combination with the most recent data stored.

        Returns:
            Timestamp of storage in millis.

        Raises:
            ValueError: If `data.df` is empty, or if `combine_with_last_data`
                is set and the new rows overlap with the last stored data.
        """
        data_type = type(data)
        data_init_args = data.serialize_init_args(data)
        if data.df.empty:
            raise ValueError("Data to attach is empty.")
        metrics_not_on_exp = set(data.df["metric_name"].values) - set(
            self.metrics.keys())
        if metrics_not_on_exp:
            logger.info(
                f"Attached data has some metrics ({metrics_not_on_exp}) that are "
                "not among the metrics on this experiment. Note that attaching data "
                "will not automatically add those metrics to the experiment. "
                "For these metrics to be automatically fetched by `experiment."
                "fetch_data`, add them via `experiment.add_tracking_metric` or update "
                "the experiment's optimization config.")
        cur_time_millis = current_timestamp_in_millis()
        # NOTE: trials are processed one at a time, so if a later trial raises,
        # the trials handled before it have already been updated.
        for trial_index, trial_df in data.df.groupby(data.df["trial_index"]):
            current_trial_data = (self._data_by_trial[trial_index]
                                  if trial_index in self._data_by_trial else
                                  OrderedDict())
            if combine_with_last_data and len(current_trial_data) > 0:
                # Entries are in insertion order, so the last value is the
                # most recently attached data.
                last_data = list(current_trial_data.values())[-1]
                merged = pd.merge(
                    last_data.df,
                    trial_df,
                    on=["trial_index", "metric_name", "arm_name"],
                    how="inner",
                )
                if not merged.empty:
                    # Report the distinct metric names rather than the repr of
                    # a pandas Series (index, name and dtype included).
                    conflicting_metrics = ", ".join(
                        str(name) for name in merged["metric_name"].unique()
                    )
                    raise ValueError(
                        f"Last data for trial {trial_index} already contained an "
                        f"observation for metric {conflicting_metrics}."
                    )
                last_data_type = type(last_data)
                # pyre-ignore [6]: 2nd Param is `AbstractData`,
                #   but we know class is concrete.
                current_trial_data[
                    cur_time_millis] = last_data_type.from_multiple_data([
                        last_data,
                        # pyre-ignore [45]: Cannot instantiate abstract class.
                        #   But we know the class is concrete.
                        last_data_type(trial_df, **data_init_args),
                    ])
            else:
                # pyre-ignore [45]: Cannot instantiate `AbstractDataFrameData`.
                current_trial_data[cur_time_millis] = data_type(
                    trial_df, **data_init_args)
            self._data_by_trial[trial_index] = current_trial_data

        return cur_time_millis
Example #10
0
    def attach_data(
        self,
        data: Data,
        combine_with_last_data: bool = False,
        overwrite_existing_data: bool = False,
    ) -> int:
        """Attach data to experiment. Stores data in `experiment._data_by_trial`,
        to be looked up via `experiment.lookup_data_for_trial`.

        The rows of `data.true_df` are split by `trial_index`, and each trial's
        rows are stored under one shared millisecond timestamp. Metrics present
        in the data but not on the experiment are only logged, not added.

        Args:
            data: Data object to store.
            combine_with_last_data: By default, when attaching data, it's identified
                by its timestamp, and `experiment.lookup_data_for_trial` returns
                data by most recent timestamp. Sometimes, however, we want to combine
                the data from multiple calls to `attach_data` into one dataframe.
                This might be because:
                    - We attached data for some metrics at one point and data for
                    the rest of the metrics later on.
                    - We attached data for some fidelity at one point and data for
                    another fidelity later on.
                To achieve that goal, set `combine_with_last_data` to `True`.
                In this case, we will take the most recent previously attached
                data, append the newly attached data to it, attach a new
                Data object with the merged result, and delete the old one.
                Afterwards, calls to `lookup_data_for_trial` will return this
                new combined data object. This operation will also validate that the
                newly added data does not contain observations for metrics that
                already have observations at the same fidelity in the most recent data.
            overwrite_existing_data: By default, we keep around all data that has
                ever been attached to the experiment. However, if we know that
                the incoming data contains all the information we need for a given
                trial, we can replace the existing data for that trial, thereby
                reducing the amount we need to store in the database.

        Returns:
            Timestamp of storage in millis.

        Raises:
            UnsupportedError: If both `combine_with_last_data` and
                `overwrite_existing_data` are `True`.
            ValueError: If `data.df` is empty; if combining and the new rows
                overlap with the last stored data; or if overwriting and the
                new rows lack metrics present in the last stored data.
        """
        if combine_with_last_data and overwrite_existing_data:
            raise UnsupportedError(
                "Cannot set both combine_with_last_data=True and "
                "overwrite_existing_data=True. Data can either be "
                "combined, or overwritten, or neither.")
        data_type = type(data)
        data_init_args = data.serialize_init_args(data)
        if data.df.empty:
            raise ValueError("Data to attach is empty.")
        metrics_not_on_exp = set(data.true_df["metric_name"].values) - set(
            self.metrics.keys())
        if metrics_not_on_exp:
            logger.info(
                f"Attached data has some metrics ({metrics_not_on_exp}) that are "
                "not among the metrics on this experiment. Note that attaching data "
                "will not automatically add those metrics to the experiment. "
                "For these metrics to be automatically fetched by `experiment."
                "fetch_data`, add them via `experiment.add_tracking_metric` or update "
                "the experiment's optimization config.")
        cur_time_millis = current_timestamp_in_millis()
        # NOTE: trials are processed one at a time, so if a later trial raises,
        # the trials handled before it have already been updated.
        for trial_index, trial_df in data.true_df.groupby(
                data.true_df["trial_index"]):
            current_trial_data = (self._data_by_trial[trial_index]
                                  if trial_index in self._data_by_trial else
                                  OrderedDict())
            if combine_with_last_data and len(current_trial_data) > 0:
                # Entries are in insertion order, so the last item is the
                # most recently attached data.
                last_ts, last_data = list(current_trial_data.items())[-1]
                last_data_type = type(last_data)
                # For map data the map keys are part of a row's identity too.
                merge_keys = ["trial_index", "metric_name", "arm_name"
                              ] + (last_data.map_keys if issubclass(
                                  last_data_type, MapData) else [])
                merged = pd.merge(
                    last_data.true_df,
                    trial_df,
                    on=merge_keys,
                    how="inner",
                )
                if not merged.empty:
                    # Report the distinct metric names rather than the repr of
                    # a pandas Series (index, name and dtype included).
                    conflicting_metrics = ", ".join(
                        str(name) for name in merged["metric_name"].unique()
                    )
                    raise ValueError(
                        f"Last data for trial {trial_index} already contained an "
                        f"observation for metric {conflicting_metrics}."
                    )
                # The combined object replaces the entry it was built from.
                del current_trial_data[last_ts]
                current_trial_data[
                    cur_time_millis] = last_data_type.from_multiple_data([
                        last_data,
                        last_data_type(trial_df, **data_init_args),
                    ])
            elif overwrite_existing_data:
                if len(current_trial_data) > 0:
                    last_data = list(current_trial_data.values())[-1]
                    last_data_metrics = set(last_data.df["metric_name"])
                    new_data_metrics = set(trial_df["metric_name"])
                    # Refuse to overwrite if that would lose a metric.
                    if last_data_metrics.difference(new_data_metrics):
                        raise ValueError(
                            "overwrite_existing_data is True, but the new data "
                            "contains only a subset of the metrics that are "
                            "present in the previous data.")
                # Drop everything stored so far for this trial.
                current_trial_data = OrderedDict(
                    {cur_time_millis: data_type(trial_df, **data_init_args)})
            else:
                current_trial_data[cur_time_millis] = data_type(
                    trial_df, **data_init_args)
            self._data_by_trial[trial_index] = current_trial_data

        return cur_time_millis