Example 1
def save_to_cache_for_jackknife(self, key, val, split_by=None):
    """Used to monkey-patch save_to_cache() during Jackknife.precompute().

    Which cache_key to use for the point estimate of Jackknife is tricky,
    because we want to support two use cases at the same time.
    1. We want sumx to be computed only once in
       MetricList([Jackknife(sumx), sumx]).compute_on(df, return_dataframe=False),
       so the key for the point estimate should be the same one sumx uses.
    2. But that fails when multiple Jackknifes are involved. For example,
       (Jackknife(unit1, sumx) - Jackknife(unit2, sumx)).compute_on(df)
       fails because the two Jackknifes share the point estimate but not the
       leave-one-out (LOO) estimates. When the 2nd Jackknife precomputes its
       point estimate, it uses the same key as the 1st one, so it mistakenly
       assumes the LOO estimates have been cached, which unfortunately isn't
       true.
    The solution here is to use a different key for each Jackknife, so the
    LOO estimates are always precomputed. Additionally, we cache the point
    estimate again under the key that other Metrics like Sum would use, so
    they can reuse it.

    Args:
      self: An instance of metrics.Metric.
      key: The cache key currently being used in computation.
      val: The value to cache.
      split_by: Something that can be passed to df.groupby().
    """
    key = self.wrap_cache_key(key, split_by)
    if isinstance(key.key, tuple) and key.key[:2] == ('_RESERVED', 'jk'):
        # This is a Jackknife-specific key. Also cache the value under the
        # base key so other Metrics like Sum can reuse the point estimate.
        val = val.copy() if isinstance(val, (pd.Series, pd.DataFrame)) else val
        base_key = key.key[2]
        base_key = utils.CacheKey(base_key, key.where, key.split_by,
                                  key.slice_val)
        self.cache[base_key] = val
        if utils.is_tmp_key(base_key):
            self.tmp_cache_keys.add(base_key)
    # Copy mutable values so later in-place mutation can't corrupt the cache.
    val = val.copy() if isinstance(val, (pd.Series, pd.DataFrame)) else val
    self.cache[key] = val
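
The two use cases in the docstring can be made concrete with a short sketch. This is illustrative only, assuming meterstick-style imports; the DataFrame and the column names unit1, unit2, and x are invented for illustration:

import pandas as pd
from meterstick import metrics, operations

df = pd.DataFrame({'unit1': [1, 1, 2, 2],
                   'unit2': [1, 2, 1, 2],
                   'x': [1., 2., 3., 4.]})
sumx = metrics.Sum('x')

# Use case 1: the point estimate of the Jackknife is the plain Sum, so
# sumx should only be computed once for both entries of the MetricList.
metrics.MetricList(
    [operations.Jackknife('unit1', sumx), sumx]
).compute_on(df, return_dataframe=False)

# Use case 2: the two Jackknifes share the point estimate of sumx but
# jackknife over different units, so each needs its own LOO estimates.
(operations.Jackknife('unit1', sumx) -
 operations.Jackknife('unit2', sumx)).compute_on(df)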
Example 2
    def compute_on(self,
                   df: pd.DataFrame,
                   split_by: Optional[Union[Text, List[Text]]] = None,
                   melted: bool = False,
                   return_dataframe: bool = True,
                   cache_key: Any = None):
        """Key API of Metric.

        Wraps the computing logic with caching.

        This is what you should call to use Metric. It's compute_through +
        final_compute + caching. As caching is the part shared by all Metrics,
        we suggest you NOT override this method. Overriding compute_slices
        and/or final_compute should be enough. If not, contact us with your
        use cases.

        Args:
          df: The DataFrame to compute on.
          split_by: Something that can be passed to df.groupby().
          melted: Whether to transform the result to long format.
          return_dataframe: Whether to convert the result to a DataFrame if it
            isn't one already. If False, the result may still be a DataFrame.
          cache_key: The key to cache the result under. You can use anything
            that can be a dict key except '_RESERVED' and tuples like
            ('_RESERVED', ..).

        Returns:
          The final result returned to the user. If split_by is set, it's a
          pd.Series or a pd.DataFrame; otherwise it may be a base type.
        """
        need_clean_up = True
        try:
            # Normalize split_by to a list of column names.
            split_by = [split_by] if isinstance(split_by,
                                                str) else split_by or []
            if cache_key is not None:
                cache_key = self.wrap_cache_key(cache_key, split_by)
                if self.in_cache(cache_key):
                    # Served from the cache; nothing temporary was created,
                    # so skip the clean-up in the finally clause.
                    need_clean_up = False
                    raw_res = self.get_cached(cache_key)
                    res = self.manipulate(raw_res, melted, return_dataframe)
                    res = self.final_compute(res, melted, return_dataframe,
                                             split_by, df)
                    return res
                else:
                    self.cache_key = cache_key
                    raw_res = self.compute_through(df, split_by)
                    self.save_to_cache(cache_key, raw_res)
                    if utils.is_tmp_key(cache_key):
                        self.tmp_cache_keys.add(cache_key)
            else:
                raw_res = self.compute_through(df, split_by)

            res = self.manipulate(raw_res, melted, return_dataframe)
            res = self.final_compute(res, melted, return_dataframe, split_by,
                                     df)
            return res

        finally:
            if need_clean_up:
                self.flush_tmp_cache()
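
A minimal usage sketch of compute_on, assuming meterstick's metrics.Sum; the DataFrame and the column names grp and x are invented for illustration:

import pandas as pd
from meterstick import metrics

df = pd.DataFrame({'grp': ['a', 'a', 'b'], 'x': [1., 2., 3.]})
m = metrics.Sum('x')

m.compute_on(df)                               # 1x1 DataFrame
m.compute_on(df, split_by='grp')               # one row per group
m.compute_on(df, split_by='grp', melted=True)  # long format
m.compute_on(df, return_dataframe=False)       # raw value, a float here

# A non-reserved cache_key persists across calls, so the second call below
# is served from the cache instead of recomputing.
m.compute_on(df, cache_key='run1')
m.compute_on(df, cache_key='run1')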