Example #1
def test_get_metric_names():
    assert get_metric_names(
        [STR_METRIC, SIMPLE_SUM_ADHOC_METRIC,
         SQL_ADHOC_METRIC]) == ["my_metric", "my SUM", "my_sql"]
    assert get_metric_names(
        [STR_METRIC, SIMPLE_SUM_ADHOC_METRIC, SQL_ADHOC_METRIC],
        {STR_METRIC: "My Metric"},
    ) == ["My Metric", "my SUM", "my_sql"]
Example #2
def pivot_table(
    df: pd.DataFrame,
    form_data: Dict[str, Any],
    datasource: Optional["BaseDatasource"] = None,
) -> pd.DataFrame:
    """
    Pivot table (v1).
    """
    verbose_map = datasource.data["verbose_map"] if datasource else None
    if form_data.get("granularity") == "all" and DTTM_ALIAS in df:
        del df[DTTM_ALIAS]

    # v1 func names => v2 func names
    func_map = {
        "sum": "Sum",
        "mean": "Average",
        "min": "Minimum",
        "max": "Maximum",
        "std": "Sample Standard Deviation",
        "var": "Sample Variance",
    }

    return pivot_df(
        df,
        rows=get_column_names(form_data.get("groupby"), verbose_map),
        columns=get_column_names(form_data.get("columns"), verbose_map),
        metrics=get_metric_names(form_data["metrics"], verbose_map),
        aggfunc=func_map.get(form_data.get("pandas_aggfunc", "sum"), "Sum"),
        transpose_pivot=bool(form_data.get("transpose_pivot")),
        combine_metrics=bool(form_data.get("combine_metric")),
        show_rows_total=bool(form_data.get("pivot_margins")),
        show_columns_total=bool(form_data.get("pivot_margins")),
        apply_metrics_on_rows=False,
    )
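A hedged invocation sketch for the v1 chart: the keys are exactly the snake_case fields read above, while the DataFrame and metric names are invented for illustration:

form_data = {
    "granularity": "all",
    "groupby": ["country"],
    "columns": ["year"],
    "metrics": ["sum__sales"],        # hypothetical saved-metric name
    "pandas_aggfunc": "mean",         # mapped to "Average" via func_map
    "pivot_margins": True,            # turns on both row and column totals
}
pivoted = pivot_table(df, form_data)  # df: hypothetical query result frame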
Example #3
def pivot_table_v2(
    df: pd.DataFrame,
    form_data: Dict[str, Any],
    datasource: Optional["BaseDatasource"] = None,
) -> pd.DataFrame:
    """
    Pivot table v2.
    """
    verbose_map = datasource.data["verbose_map"] if datasource else None
    if form_data.get("granularity_sqla") == "all" and DTTM_ALIAS in df:
        del df[DTTM_ALIAS]

    return pivot_df(
        df,
        rows=get_column_names(form_data.get("groupbyRows"), verbose_map),
        columns=get_column_names(form_data.get("groupbyColumns"), verbose_map),
        metrics=get_metric_names(form_data["metrics"], verbose_map),
        aggfunc=form_data.get("aggregateFunction", "Sum"),
        transpose_pivot=bool(form_data.get("transposePivot")),
        combine_metrics=bool(form_data.get("combineMetric")),
        show_rows_total=bool(form_data.get("rowTotals")),
        show_columns_total=bool(form_data.get("colTotals")),
        apply_metrics_on_rows=form_data.get("metricsLayout") == "ROWS",
    )
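The v2 form data mirrors the same call with camelCase keys, passes the aggregate function through verbatim, and controls row and column totals independently. A sketch of an equivalent payload (values invented):

form_data = {
    "granularity_sqla": "all",
    "groupbyRows": ["country"],
    "groupbyColumns": ["year"],
    "metrics": ["sum__sales"],
    "aggregateFunction": "Average",
    "rowTotals": True,
    "colTotals": False,
    "metricsLayout": "ROWS",  # apply metrics on rows instead of columns
}
pivoted = pivot_table_v2(df, form_data)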
Example #4
    def processing_time_offsets(  # pylint: disable=too-many-locals
        self, df: pd.DataFrame, query_object: QueryObject,
    ) -> CachedTimeOffset:
        # work on a shallow copy so the original query_object is not mutated
        query_object_clone = copy.copy(query_object)
        queries: List[str] = []
        cache_keys: List[Optional[str]] = []
        rv_dfs: List[pd.DataFrame] = [df]

        time_offsets = query_object.time_offsets
        outer_from_dttm = query_object.from_dttm
        outer_to_dttm = query_object.to_dttm
        for offset in time_offsets:
            try:
                query_object_clone.from_dttm = get_past_or_future(
                    offset, outer_from_dttm,
                )
                query_object_clone.to_dttm = get_past_or_future(offset, outer_to_dttm)
            except ValueError as ex:
                raise QueryObjectValidationError(str(ex)) from ex
            # make sure the subquery uses the main query's WHERE clause
            query_object_clone.inner_from_dttm = outer_from_dttm
            query_object_clone.inner_to_dttm = outer_to_dttm
            query_object_clone.time_offsets = []
            query_object_clone.post_processing = []

            if not query_object.from_dttm or not query_object.to_dttm:
                raise QueryObjectValidationError(
                    _(
                        "An enclosed time range (both start and end) must be specified "
                        "when using a Time Comparison."
                    )
                )
            # `offset` is included in the cache key hash
            cache_key = self.query_cache_key(query_object_clone, time_offset=offset)
            cache = QueryCacheManager.get(cache_key, CacheRegion.DATA, self.force)
            # on a cache hit, reuse the cached offset slice
            if cache.is_loaded:
                rv_dfs.append(cache.df)
                queries.append(cache.query)
                cache_keys.append(cache_key)
                continue

            query_object_clone_dct = query_object_clone.to_dict()
            # rename metrics: SUM(value) => SUM(value) 1 year ago
            metrics_mapping = {
                metric: TIME_COMPARISION.join([metric, offset])
                for metric in get_metric_names(
                    query_object_clone_dct.get("metrics", [])
                )
            }
            join_keys = [col for col in df.columns if col not in metrics_mapping.keys()]

            result = self.datasource.query(query_object_clone_dct)
            queries.append(result.query)
            cache_keys.append(None)

            offset_metrics_df = result.df
            if offset_metrics_df.empty:
                offset_metrics_df = pd.DataFrame(
                    {
                        col: [np.NaN]
                        for col in join_keys + list(metrics_mapping.values())
                    }
                )
            else:
                # 1. normalize df, set dttm column
                offset_metrics_df = self.normalize_df(
                    offset_metrics_df, query_object_clone
                )

                # 2. rename extra query columns
                offset_metrics_df = offset_metrics_df.rename(columns=metrics_mapping)

                # 3. set time offset for dttm column
                offset_metrics_df[DTTM_ALIAS] = offset_metrics_df[
                    DTTM_ALIAS
                ] - DateOffset(**normalize_time_delta(offset))

            # left-join `offset_metrics_df` onto `df`
            offset_df = self.left_join_df(
                left_df=df, right_df=offset_metrics_df, join_keys=join_keys,
            )
            offset_slice = offset_df[metrics_mapping.values()]

            # cache `offset_slice` and stack it for the final concat
            value = {
                "df": offset_slice,
                "query": result.query,
            }
            cache.set(
                key=cache_key,
                value=value,
                timeout=self.cache_timeout,
                datasource_uid=self.datasource.uid,
                region=CacheRegion.DATA,
            )
            rv_dfs.append(offset_slice)

        rv_df = pd.concat(rv_dfs, axis=1, copy=False) if time_offsets else df
        return CachedTimeOffset(df=rv_df, queries=queries, cache_keys=cache_keys)
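To make the renaming and time-shift steps concrete, here is a standalone illustration. It assumes TIME_COMPARISION is a plain separator string (e.g. "__") and that normalize_time_delta("1 year ago") yields {"years": 1}; both helpers live outside these snippets, so treat the values as assumptions:

import pandas as pd
from pandas import DateOffset

TIME_COMPARISION = "__"  # assumed separator value

# rename metrics: SUM(value) => SUM(value)__1 year ago
offset = "1 year ago"
metrics_mapping = {m: TIME_COMPARISION.join([m, offset]) for m in ["SUM(value)"]}
print(metrics_mapping)  # {'SUM(value)': 'SUM(value)__1 year ago'}

# shift the dttm column back by the offset
dttm = pd.Series(pd.to_datetime(["2021-06-01", "2021-07-01"]))
print(dttm - DateOffset(years=1))  # 2020-06-01, 2020-07-01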
Example #5
    def metric_names(self) -> List[str]:
        """Return metric names (labels), coercing adhoc metrics to strings."""
        return get_metric_names(self.metrics or [])
Example #6
    def metric_names(self) -> List[str]:
        return get_metric_names(self.metrics)
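Note the difference between these two variants: the first guards against self.metrics being None with "or []", while the second assumes the attribute is always a list and would fail inside get_metric_names if it were ever None.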
Example #7
    def processing_time_offsets(
        self,
        df: pd.DataFrame,
        query_object: QueryObject,
    ) -> CachedTimeOffset:
        # work on a shallow copy so the original query_object is not mutated
        query_object_clone = copy.copy(query_object)
        queries = []
        cache_keys = []

        time_offsets = query_object.time_offsets
        outer_from_dttm = query_object.from_dttm
        outer_to_dttm = query_object.to_dttm
        for offset in time_offsets:
            try:
                query_object_clone.from_dttm = get_past_or_future(
                    offset,
                    outer_from_dttm,
                )
                query_object_clone.to_dttm = get_past_or_future(
                    offset, outer_to_dttm)
            except ValueError as ex:
                raise QueryObjectValidationError(str(ex)) from ex
            # make sure the subquery uses the main query's WHERE clause
            query_object_clone.inner_from_dttm = outer_from_dttm
            query_object_clone.inner_to_dttm = outer_to_dttm
            query_object_clone.time_offsets = []
            query_object_clone.post_processing = []

            if not query_object.from_dttm or not query_object.to_dttm:
                raise QueryObjectValidationError(
                    _("An enclosed time range (both start and end) must be specified "
                      "when using a Time Comparison."))
            # `offset` is included in the cache key hash
            cache_key = self.query_cache_key(query_object_clone,
                                             time_offset=offset)
            cache = QueryCacheManager.get(cache_key, CacheRegion.DATA,
                                          self.force)
            # on a cache hit, join the cached offset df and move on
            if cache.is_loaded:
                df = self.left_join_on_dttm(df, cache.df)
                queries.append(cache.query)
                cache_keys.append(cache_key)
                continue

            query_object_clone_dct = query_object_clone.to_dict()
            result = self.datasource.query(query_object_clone_dct)
            queries.append(result.query)
            cache_keys.append(None)

            # rename metrics: SUM(value) => SUM(value) 1 year ago
            columns_name_mapping = {
                metric: TIME_COMPARISION.join([metric, offset])
                for metric in get_metric_names(
                    query_object_clone_dct.get("metrics", []))
            }
            columns_name_mapping[DTTM_ALIAS] = DTTM_ALIAS

            offset_metrics_df = result.df
            if offset_metrics_df.empty:
                offset_metrics_df = pd.DataFrame(
                    {col: [np.NaN]
                     for col in columns_name_mapping.values()})
            else:
                # 1. normalize df, set dttm column
                offset_metrics_df = self.normalize_df(offset_metrics_df,
                                                      query_object_clone)

                # 2. extract `metrics` columns and `dttm` column from extra query
                offset_metrics_df = offset_metrics_df[
                    columns_name_mapping.keys()]

                # 3. rename extra query columns
                offset_metrics_df = offset_metrics_df.rename(
                    columns=columns_name_mapping)

                # 4. set offset for dttm column
                offset_metrics_df[DTTM_ALIAS] = offset_metrics_df[
                    DTTM_ALIAS] - DateOffset(**normalize_time_delta(offset))

            # left-join `offset_metrics_df` onto `df` on the dttm column
            df = self.left_join_on_dttm(df, offset_metrics_df)

            # cache the offset df
            value = {
                "df": offset_metrics_df,
                "query": result.query,
            }
            cache.set(
                key=cache_key,
                value=value,
                timeout=self.cache_timeout,
                datasource_uid=self.datasource.uid,
                region=CacheRegion.DATA,
            )

        return CachedTimeOffset(df=df, queries=queries, cache_keys=cache_keys)
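left_join_on_dttm is not shown in these examples; a plausible minimal version, assuming it simply joins the extra-query frame onto the main frame by the shared DTTM_ALIAS column, would be:

    def left_join_on_dttm(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
    ) -> pd.DataFrame:
        # keep every row of the main query and attach the offset metrics
        # that share the same timestamp
        return left_df.merge(right_df, on=DTTM_ALIAS, how="left")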