Ejemplo n.º 1
0
    def lookup_or_fetch_experiment_data_multi(
        cls,
        experiment: core.experiment.Experiment,
        metrics: Iterable[Metric],
        trials: Optional[Iterable[core.base_trial.BaseTrial]] = None,
        **kwargs: Any,
    ) -> Tuple[Data, bool]:
        """Look up cached data for the given metrics, fetching what is missing.

        If this metric class is available while a trial is running, its data
        can change, so everything is re-fetched via
        `fetch_experiment_data_multi`. Otherwise the data is treated as
        immutable once a trial completes: for every completed trial the data
        already stored on the experiment is looked up, and only the metrics
        that are not present in that cached data are fetched.

        NOTE: This method itself does not attach newly fetched data to the
        experiment; if the data should be cached for later lookups, that
        presumably has to be done by the caller (TODO confirm).

        Args:
            experiment: Experiment whose cached data is consulted and for
                which data is fetched.
            metrics: Metrics to obtain data for. May be any iterable,
                including a one-shot iterator.
            trials: Trials to consider. If None, all trials the experiment
                reports as COMPLETED are used; otherwise only the given
                trials whose status is completed.
            **kwargs: Passed through to `fetch_experiment_data_multi`.

        Returns:
            A tuple of the resulting data and a boolean that is True if any
            data was newly fetched and False if everything came from the
            cache (or there was nothing to look up).
        """
        # Data of metrics available while running may change between calls,
        # so the cache is bypassed entirely.
        if cls.is_available_while_running():
            fetched_data = cls.fetch_experiment_data_multi(
                experiment=experiment,
                metrics=metrics,
                trials=trials,
                **kwargs)
            return fetched_data, True

        # Metric is only available upon trial completion: restrict the work
        # to completed trials.
        if trials is None:
            completed_trials = experiment.trials_by_status[
                core.base_trial.TrialStatus.COMPLETED]
        else:
            completed_trials = [t for t in trials if t.status.is_completed]

        if not completed_trials:
            return cls.data_constructor(), False

        # `metrics` is only guaranteed to be an Iterable and is needed once
        # per trial plus once for the final subsetting; materialize it so a
        # one-shot iterator is not silently exhausted after the first trial.
        metric_list = list(metrics)
        metric_names = [m.name for m in metric_list]

        trials_data = []
        contains_new_data = False
        for trial in completed_trials:
            # The first element of the lookup result is the cached data.
            cached_trial_data = experiment.lookup_data_for_trial(
                trial_index=trial.index, )[0]

            cached_metric_names = cached_trial_data.metric_names
            metrics_to_fetch = [
                m for m in metric_list if m.name not in cached_metric_names
            ]
            if not metrics_to_fetch:
                # Everything needed for this trial is already cached.
                trials_data.append(cached_trial_data)
                continue

            try:
                fetched_trial_data = cls.fetch_experiment_data_multi(
                    experiment=experiment,
                    metrics=metrics_to_fetch,
                    trials=[trial],
                    **kwargs,
                )
            except NotImplementedError:
                # Metric does not implement fetching logic and only uses lookup.
                fetched_trial_data = cls.data_constructor()
            else:
                contains_new_data = True

            final_data = cls.data_constructor.from_multiple_data(
                [cached_trial_data, fetched_trial_data])

            trials_data.append(final_data)

        # Cached data may contain metrics nobody asked for; keep only the
        # requested ones in the combined result.
        return (
            cls.data_constructor.from_multiple_data(
                trials_data, subset_metrics=metric_names),
            contains_new_data,
        )
Ejemplo n.º 2
0
    def lookup_or_fetch_experiment_data_multi(
        cls,
        experiment: core.experiment.Experiment,
        metrics: Iterable[Metric],
        trials: Optional[Iterable[core.base_trial.BaseTrial]] = None,
        **kwargs: Any,
    ) -> AbstractDataFrameData:
        """Look up cached data for the given metrics, fetching what is missing.

        If this metric class is available while a trial is running, its data
        can change, so everything is re-fetched via
        `fetch_experiment_data_multi`. Otherwise the data is treated as
        immutable once a trial completes: for every completed trial the data
        already stored on the experiment is looked up, and only the metrics
        that are not present in that cached data are fetched.

        NOTE: Non-empty data obtained here is attached to the experiment via
        `experiment.attach_data`: the freshly fetched data in the
        available-while-running case, and the combination of cached and
        newly fetched data per trial otherwise.

        Args:
            experiment: Experiment whose cached data is consulted, for which
                data is fetched, and to which new data is attached.
            metrics: Metrics to obtain data for. May be any iterable,
                including a one-shot iterator.
            trials: Trials to consider. If None, all trials the experiment
                reports as COMPLETED are used in the completion-only case;
                otherwise only the given trials whose status is completed.
            **kwargs: Passed through to `fetch_experiment_data_multi`.

        Returns:
            The data for the requested metrics. An empty data object is
            returned when the metric is completion-only and there are no
            completed trials.
        """
        # Data of metrics available while running may change between calls,
        # so the cache is bypassed and the fresh data is attached.
        if cls.is_available_while_running():
            fetched_data = cls.fetch_experiment_data_multi(
                experiment=experiment,
                metrics=metrics,
                trials=trials,
                **kwargs)
            if not fetched_data.df.empty:
                experiment.attach_data(
                    fetched_data,
                    overwrite_existing_data=cls.overwrite_existing_data(),
                    combine_with_last_data=cls.combine_with_last_data(),
                )
            return fetched_data

        # Metric is only available upon trial completion: restrict the work
        # to completed trials.
        if trials is None:
            completed_trials = experiment.trials_by_status[
                core.base_trial.TrialStatus.COMPLETED]
        else:
            completed_trials = [t for t in trials if t.status.is_completed]

        if not completed_trials:
            return cls.data_constructor()

        # `metrics` is only guaranteed to be an Iterable and is needed once
        # per trial plus once for the final subsetting; materialize it so a
        # one-shot iterator is not silently exhausted after the first trial.
        metric_list = list(metrics)
        metric_names = [m.name for m in metric_list]

        trials_data = []
        for trial in completed_trials:
            # The first element of the lookup result is the cached data.
            cached_trial_data = experiment.lookup_data_for_trial(
                trial_index=trial.index, )[0]

            cached_metric_names = cached_trial_data.metric_names
            metrics_to_fetch = [
                m for m in metric_list if m.name not in cached_metric_names
            ]
            if not metrics_to_fetch:
                # Everything needed for this trial is already cached.
                trials_data.append(cached_trial_data)
                continue

            try:
                fetched_trial_data = cls.fetch_experiment_data_multi(
                    experiment=experiment,
                    metrics=metrics_to_fetch,
                    trials=[trial],
                    **kwargs,
                )
            except NotImplementedError:
                # Metric does not implement fetching logic and only uses lookup.
                fetched_trial_data = cls.data_constructor()

            final_data = cls.data_constructor.from_multiple_data(
                # pyre-fixme [6]: Incompatible paramtype: Expected `Data`
                #   but got `AbstractDataFrameData`.
                [cached_trial_data, fetched_trial_data])
            # Persist the combined data so later calls can serve this trial
            # from the cache; empty data is not worth attaching.
            if not final_data.df.empty:
                experiment.attach_data(
                    final_data,
                    overwrite_existing_data=cls.overwrite_existing_data(),
                    combine_with_last_data=cls.combine_with_last_data(),
                )
            trials_data.append(final_data)

        # Cached data may contain metrics nobody asked for; keep only the
        # requested ones in the combined result.
        return cls.data_constructor.from_multiple_data(
            trials_data, subset_metrics=metric_names)
Ejemplo n.º 3
0
    def lookup_or_fetch_experiment_data_multi(
        cls,
        experiment: core.experiment.Experiment,
        metrics: Iterable[Metric],
        trials: Optional[Iterable[core.base_trial.BaseTrial]] = None,
        **kwargs: Any,
    ) -> AbstractDataFrameData:
        """Look up cached data for the given metrics, fetching what is missing.

        If this metric class is available while a trial is running, its data
        can change, so everything is re-fetched via
        `fetch_experiment_data_multi`. Otherwise the data is treated as
        immutable once a trial completes: for every completed trial the data
        already stored on the experiment is looked up, and only the metrics
        that are not present in that cached data are fetched.

        NOTE: Data obtained here is attached to the experiment via
        `experiment.attach_data`. In the available-while-running case the
        attached data may be the fetched data combined with data already
        cached for the given trials (see the inline comments); the value
        returned is always just the freshly fetched data.

        Args:
            experiment: Experiment whose cached data is consulted, for which
                data is fetched, and to which new data is attached.
            metrics: Metrics to obtain data for. May be any iterable,
                including a one-shot iterator.
            trials: Trials to consider. If None, all trials the experiment
                reports as COMPLETED are used in the completion-only case;
                otherwise only the given trials whose status is completed.
            **kwargs: Passed through to `fetch_experiment_data_multi`.

        Returns:
            The data for the requested metrics. An empty data object is
            returned when the metric is completion-only and there are no
            completed trials.
        """
        # Both arguments are only guaranteed to be Iterables but are
        # traversed more than once below (and also handed to
        # `fetch_experiment_data_multi`); materialize them so one-shot
        # iterators are not silently exhausted after the first pass.
        metric_list = list(metrics)
        trial_list = list(trials) if trials is not None else None

        # Data of metrics available while running may change between calls,
        # so it is always re-fetched.
        if cls.is_available_while_running():
            if trial_list:
                cached_data = experiment.lookup_data(
                    trial_indices=[trial.index for trial in trial_list],
                    keep_latest_map_values_only=False,
                )
            else:
                cached_data = None

            fetched_data = cls.fetch_experiment_data_multi(
                experiment=experiment,
                metrics=metric_list,
                trials=trial_list,
                **kwargs)

            # If there is cached data, consider combining it with fetched data.
            # That way, if this function is being called from within a loop over
            # multiple metric classes (as is currently the case in lookup_data),
            # we'll combine data from all metric classes into a single dataframe
            # before attaching it to the experiment.
            if cached_data:
                cached_metric_names = cached_data.metric_names
                requested_metric_names = {m.name for m in metric_list}

                # If there is a collision (i.e. fetched = A, cached = AB), just
                # use the recently fetched data. That way, if we call
                # `fetch_data` twice in a row (not within a for loop), we don't
                # end up with duplicate data.
                overlap = cached_metric_names.intersection(
                    requested_metric_names)
                if len(overlap) > 0:
                    final_data = fetched_data
                else:
                    final_data = cls.data_constructor.from_multiple_data(
                        # pyre-fixme [6]: Incompatible paramtype: Expected `Data`
                        #   but got `AbstractDataFrameData`.
                        [cached_data, fetched_data])
            else:
                final_data = fetched_data

            # Only attach when something was actually fetched; the (possibly
            # combined) `final_data` is stored, but callers get the fresh
            # `fetched_data` back.
            if not fetched_data.df.empty:
                experiment.attach_data(
                    final_data,
                    overwrite_existing_data=cls.overwrite_existing_data(),
                    combine_with_last_data=cls.combine_with_last_data(),
                )
            return fetched_data

        # Metric is only available upon trial completion: restrict the work
        # to completed trials.
        if trial_list is None:
            completed_trials = experiment.trials_by_status[
                core.base_trial.TrialStatus.COMPLETED]
        else:
            completed_trials = [
                t for t in trial_list if t.status.is_completed
            ]

        if not completed_trials:
            return cls.data_constructor()

        metric_names = [m.name for m in metric_list]

        trials_data = []
        for trial in completed_trials:
            # The first element of the lookup result is the cached data.
            cached_trial_data = experiment.lookup_data_for_trial(
                trial_index=trial.index,
                keep_latest_map_values_only=False,
            )[0]

            cached_metric_names = cached_trial_data.metric_names
            metrics_to_fetch = [
                m for m in metric_list if m.name not in cached_metric_names
            ]
            if not metrics_to_fetch:
                # Everything needed for this trial is already cached.
                trials_data.append(cached_trial_data)
                continue

            try:
                fetched_trial_data = cls.fetch_experiment_data_multi(
                    experiment=experiment,
                    metrics=metrics_to_fetch,
                    trials=[trial],
                    **kwargs,
                )
            except NotImplementedError:
                # Metric does not implement fetching logic and only uses lookup.
                fetched_trial_data = cls.data_constructor()

            final_data = cls.data_constructor.from_multiple_data(
                # pyre-fixme [6]: Incompatible paramtype: Expected `Data`
                #   but got `AbstractDataFrameData`.
                [cached_trial_data, fetched_trial_data])
            # Persist the combined data so later calls can serve this trial
            # from the cache; empty data is not worth attaching.
            if not final_data.df.empty:
                experiment.attach_data(
                    final_data,
                    overwrite_existing_data=cls.overwrite_existing_data(),
                    combine_with_last_data=cls.combine_with_last_data(),
                )
            trials_data.append(final_data)

        # Cached data may contain metrics nobody asked for; keep only the
        # requested ones in the combined result.
        return cls.data_constructor.from_multiple_data(
            trials_data, subset_metrics=metric_names)