def test_get_samples(test_client, login_as_admin, virtual_dataset):
    """
    Dataset API: samples endpoint on a virtual dataset.

    Covers three things in sequence: a repeated request is answered from the
    cache, a ``force=true`` request bypasses the cache while still populating
    it, and the returned rows match what the database yields directly.
    """
    # --- 1. a repeated request should be answered from the cache ---
    uri = (
        f"/datasource/samples?datasource_id={virtual_dataset.id}&datasource_type=table"
    )
    # first call warms the cache, second call is the one under test
    test_client.post(uri)
    cached_response = test_client.post(uri)
    cached_body = json.loads(cached_response.data)
    assert cached_response.status_code == 200
    cached_result = cached_body["result"]
    assert len(cached_result["data"]) == 10
    assert QueryCacheManager.has(
        cached_result["cache_key"],
        region=CacheRegion.DATA,
    )
    assert cached_result["is_cached"]

    # --- 2. a forced request should not be served from the cache ---
    uri2 = f"/datasource/samples?datasource_id={virtual_dataset.id}&datasource_type=table&force=true"
    # first call warms the cache, second call must still re-run the query
    test_client.post(uri2)
    forced_response = test_client.post(uri2)
    forced_body = json.loads(forced_response.data)
    assert forced_response.status_code == 200
    forced_result = forced_body["result"]
    assert len(forced_result["data"]) == 10
    assert QueryCacheManager.has(
        forced_result["cache_key"],
        region=CacheRegion.DATA,
    )
    assert not forced_result["is_cached"]

    # --- 3. payload shape and data precision ---
    for expected_key in ("colnames", "coltypes", "data"):
        assert expected_key in forced_body["result"]

    samples_sql = (
        f"select * from ({virtual_dataset.sql}) as tbl"
        f' limit {app.config["SAMPLES_ROW_LIMIT"]}'
    )
    expected_df = virtual_dataset.database.get_df(samples_sql)
    # col3 comes back as Decimal; cast to float so it compares with the JSON payload
    expected_df["col3"] = expected_df["col3"].apply(float)
    expected_rows = expected_df.to_dict(orient="records")
    assert expected_rows == forced_body["result"]["data"]
Example #2
0
    def get_df_payload(self,
                       query_obj: QueryObject,
                       force_cached: Optional[bool] = False) -> Dict[str, Any]:
        """Build the DataFrame payload for ``query_obj``, going through the data cache.

        The cache entry for the query's key is fetched first. When it is not
        loaded, the requested columns and metrics are validated against the
        datasource, the query is executed, and the result is stored in the
        cache. A validation failure is recorded on the cache object (error
        message and FAILED status) instead of being raised.

        :param query_obj: the query to resolve
        :param force_cached: forwarded to ``QueryCacheManager.get``
        :returns: dict with the DataFrame plus cache and query metadata
        """
        cache_key = self.query_cache_key(query_obj)
        cache = QueryCacheManager.get(
            cache_key,
            CacheRegion.DATA,
            self._query_context.force,
            force_cached,
        )

        if query_obj and cache_key and not cache.is_loaded:
            try:
                requested_names = (
                    get_column_names_from_columns(query_obj.columns) +
                    get_column_names_from_metrics(query_obj.metrics or [])
                )
                # anything the datasource does not know about (other than the
                # temporal alias) is invalid
                invalid_columns = []
                for name in requested_names:
                    if (name not in self._qc_datasource.column_names
                            and name != DTTM_ALIAS):
                        invalid_columns.append(name)

                if invalid_columns:
                    raise QueryObjectValidationError(
                        _(
                            "Columns missing in datasource: %(invalid_columns)s",
                            invalid_columns=invalid_columns,
                        ))

                query_result = self.get_query_result(query_obj)
                annotation_data = self.get_annotation_data(query_obj)
                cache.set_query_result(
                    key=cache_key,
                    query_result=query_result,
                    annotation_data=annotation_data,
                    force_query=self._query_context.force,
                    timeout=self.get_cache_timeout(),
                    datasource_uid=self._qc_datasource.uid,
                    region=CacheRegion.DATA,
                )
            except QueryObjectValidationError as ex:
                # surface the validation problem through the payload
                cache.error_message = str(ex)
                cache.status = QueryStatus.FAILED

        payload: Dict[str, Any] = {
            "cache_key": cache_key,
            "cached_dttm": cache.cache_dttm,
            "cache_timeout": self.get_cache_timeout(),
            "df": cache.df,
            "applied_template_filters": cache.applied_template_filters,
            "annotation_data": cache.annotation_data,
            "error": cache.error_message,
            "is_cached": cache.is_cached,
            "query": cache.query,
            "status": cache.status,
            "stacktrace": cache.stacktrace,
            "rowcount": len(cache.df.index),
            "from_dttm": query_obj.from_dttm,
            "to_dttm": query_obj.to_dttm,
        }
        return payload
Example #3
0
def test_get_samples_on_physical_dataset(test_client, login_as_admin, physical_dataset):
    """
    Dataset API: samples of a physical dataset are returned and cached
    """
    response = test_client.post(
        f"/datasource/samples?datasource_id={physical_dataset.id}&datasource_type=table"
    )
    assert response.status_code == 200
    result = response.json["result"]
    # the response's cache key must point at an entry in the data cache
    assert QueryCacheManager.has(result["cache_key"], region=CacheRegion.DATA)
    assert len(result["data"]) == 10
    def processing_time_offsets(  # pylint: disable=too-many-locals
        self,
        df: pd.DataFrame,
        query_object: QueryObject,
    ) -> CachedTimeOffset:
        """Append time-shifted metric columns to ``df`` for each requested offset.

        For every entry in ``query_object.time_offsets`` a shallow clone of
        the query is shifted by that offset. The shifted result is either read
        from the data cache or computed by querying the datasource, renaming
        the metric columns with the offset, shifting the time index and
        left-joining onto ``df``; a freshly computed slice is then cached.

        :param df: result DataFrame of the main (un-shifted) query
        :param query_object: the main query; attribute changes are applied to
            a shallow copy only
        :returns: a ``CachedTimeOffset`` with ``df`` concatenated column-wise
            with one slice per offset (``df`` itself when there are no
            offsets), the query text per offset, and the cache key per offset
            (``None`` for offsets computed here rather than read from cache)
        :raises QueryObjectValidationError: if an offset cannot be applied to
            the time range, or if the query lacks a start or an end time
        """
        query_context = self._query_context
        # ensure query_object is immutable
        query_object_clone = copy.copy(query_object)
        queries: List[str] = []
        cache_keys: List[Optional[str]] = []
        rv_dfs: List[pd.DataFrame] = [df]

        time_offsets = query_object.time_offsets
        outer_from_dttm = query_object.from_dttm
        outer_to_dttm = query_object.to_dttm
        for offset in time_offsets:
            try:
                query_object_clone.from_dttm = get_past_or_future(
                    offset,
                    outer_from_dttm,
                )
                query_object_clone.to_dttm = get_past_or_future(
                    offset, outer_to_dttm)
            except ValueError as ex:
                raise QueryObjectValidationError(str(ex)) from ex
            # make sure subquery use main query where clause
            query_object_clone.inner_from_dttm = outer_from_dttm
            query_object_clone.inner_to_dttm = outer_to_dttm
            # the shifted query must not recurse into offsets or post-process
            query_object_clone.time_offsets = []
            query_object_clone.post_processing = []

            if not query_object.from_dttm or not query_object.to_dttm:
                raise QueryObjectValidationError(
                    _("An enclosed time range (both start and end) must be specified "
                      "when using a Time Comparison."))
            # `offset` is added to the hash function
            cache_key = self.query_cache_key(query_object_clone,
                                             time_offset=offset)
            cache = QueryCacheManager.get(cache_key, CacheRegion.DATA,
                                          query_context.force)
            # whether hit on the cache
            if cache.is_loaded:
                rv_dfs.append(cache.df)
                queries.append(cache.query)
                cache_keys.append(cache_key)
                continue

            query_object_clone_dct = query_object_clone.to_dict()
            # rename metrics: SUM(value) => SUM(value) 1 year ago
            metrics_mapping = {
                metric: TIME_COMPARISON.join([metric, offset])
                for metric in get_metric_names(
                    query_object_clone_dct.get("metrics", []))
            }
            # every non-metric column of the main result is a join key
            join_keys = [
                col for col in df.columns if col not in metrics_mapping
            ]

            result = self._qc_datasource.query(query_object_clone_dct)
            queries.append(result.query)
            # no cache key is reported for an offset that had to be computed
            cache_keys.append(None)

            offset_metrics_df = result.df
            if offset_metrics_df.empty:
                # single all-NaN row so the left join below still yields the
                # renamed metric columns; `np.nan` is used because the
                # `np.NaN` alias was removed in NumPy 2.0
                offset_metrics_df = pd.DataFrame({
                    col: [np.nan]
                    for col in join_keys + list(metrics_mapping.values())
                })
            else:
                # 1. normalize df, set dttm column
                offset_metrics_df = self.normalize_df(offset_metrics_df,
                                                      query_object_clone)

                # 2. rename extra query columns
                offset_metrics_df = offset_metrics_df.rename(
                    columns=metrics_mapping)

                # 3. set time offset for index
                # TODO: add x-axis to QueryObject, potentially as an array for
                #  multi-dimensional charts
                granularity = query_object.granularity
                index = granularity if granularity in df.columns else DTTM_ALIAS
                offset_metrics_df[index] = offset_metrics_df[
                    index] - DateOffset(**normalize_time_delta(offset))

            # df left join `offset_metrics_df`
            offset_df = df_utils.left_join_df(
                left_df=df,
                right_df=offset_metrics_df,
                join_keys=join_keys,
            )
            # keep only the renamed (offset) metric columns
            offset_slice = offset_df[metrics_mapping.values()]

            # set offset_slice to cache and stack.
            value = {
                "df": offset_slice,
                "query": result.query,
            }
            cache.set(
                key=cache_key,
                value=value,
                timeout=self.get_cache_timeout(),
                datasource_uid=query_context.datasource.uid,
                region=CacheRegion.DATA,
            )
            rv_dfs.append(offset_slice)

        rv_df = pd.concat(rv_dfs, axis=1, copy=False) if time_offsets else df
        return CachedTimeOffset(df=rv_df,
                                queries=queries,
                                cache_keys=cache_keys)
Example #5
0
def get_samples(  # pylint: disable=too-many-arguments,too-many-locals
    datasource_type: str,
    datasource_id: int,
    force: bool = False,
    page: int = 1,
    per_page: int = 1000,
    payload: Optional[SamplesPayloadSchema] = None,
) -> Dict[str, Any]:
    """Fetch one page of sample rows for a datasource together with its total row count.

    Two query contexts are built against the same datasource: one returning
    the sample rows for the requested page and one computing ``COUNT(*)``.
    When ``payload`` is given, its entries are merged into both queries.

    :param datasource_type: type of the datasource to look up
    :param datasource_id: id of the datasource to look up
    :param force: forwarded to both query contexts
    :param page: page number, echoed back in the result
    :param per_page: page size, echoed back in the result
    :param payload: optional extra query fields merged into both queries
    :returns: the samples query result extended with ``page``, ``per_page``
        and ``total_count``
    :raises DatasetSamplesFailedError: if either query reports a failure with
        an error message, or if a result lacks the expected structure
    """
    datasource = DatasourceDAO.get_datasource(
        session=db.session,
        datasource_type=datasource_type,
        datasource_id=datasource_id,
    )

    limit_clause = get_limit_clause(page, per_page)

    # todo(yongjie): Constructing count(*) and samples in the same query_context,
    #  then remove query_type==SAMPLES
    # samples query: caller payload (if any) overlaid with the paging clause
    if payload:
        samples_query = {**payload, **limit_clause}
    else:
        samples_query = limit_clause
    samples_instance = QueryContextFactory().create(
        datasource={"type": datasource.type, "id": datasource.id},
        queries=[samples_query],
        result_type=ChartDataResultType.SAMPLES,
        force=force,
    )

    # count(*) query: caller payload (if any) overlaid with a COUNT(*) metric
    count_star_metric = {
        "metrics": [{
            "expressionType": "SQL",
            "sqlExpression": "COUNT(*)",
            "label": "COUNT(*)",
        }]
    }
    if payload:
        count_star_query = {**payload, **count_star_metric}
    else:
        count_star_query = count_star_metric
    count_star_instance = QueryContextFactory().create(
        datasource={"type": datasource.type, "id": datasource.id},
        queries=[count_star_query],
        result_type=ChartDataResultType.FULL,
        force=force,
    )

    samples_results = samples_instance.get_payload()
    count_star_results = count_star_instance.get_payload()

    try:
        sample_data = samples_results["queries"][0]
        count_star_data = count_star_results["queries"][0]
        failed_status = any(
            query_data.get("status") == QueryStatus.FAILED
            for query_data in (sample_data, count_star_data)
        )
        error_msg = sample_data.get("error") or count_star_data.get("error")
        if failed_status and error_msg:
            # drop the samples cache entry before reporting the failure
            QueryCacheManager.delete(sample_data.get("cache_key"),
                                     region=CacheRegion.DATA)
            raise DatasetSamplesFailedError(error_msg)

        sample_data["page"] = page
        sample_data["per_page"] = per_page
        sample_data["total_count"] = count_star_data["data"][0]["COUNT(*)"]
        return sample_data
    except (IndexError, KeyError) as exc:
        raise DatasetSamplesFailedError from exc
    def get_df_payload(self,
                       query_obj: QueryObject,
                       force_cached: Optional[bool] = False) -> Dict[str, Any]:
        """Build the DataFrame payload for ``query_obj``, going through the data cache.

        The cache entry for the query's key is fetched first. When it is not
        loaded, the requested columns and metrics are validated against the
        datasource, the query is executed, and the result is stored in the
        cache. A validation failure is recorded on the cache object (error
        message and FAILED status) instead of being raised. Finally the
        DataFrame's column labels are unescaped and a ``label_map`` from each
        unescaped label to its unescaped comma-separated parts is produced.

        :param query_obj: the query to resolve
        :param force_cached: forwarded to ``QueryCacheManager.get``
        :returns: dict with the DataFrame, the label map, and cache and query
            metadata
        """
        cache_key = self.query_cache_key(query_obj)
        cache = QueryCacheManager.get(
            cache_key,
            CacheRegion.DATA,
            self._query_context.force,
            force_cached,
        )

        if query_obj and cache_key and not cache.is_loaded:
            try:
                requested_names = (
                    get_column_names_from_columns(query_obj.columns) +
                    get_column_names_from_metrics(query_obj.metrics or [])
                )
                # anything the datasource does not know about (other than the
                # temporal alias) is invalid
                invalid_columns = []
                for name in requested_names:
                    if (name not in self._qc_datasource.column_names
                            and name != DTTM_ALIAS):
                        invalid_columns.append(name)

                if invalid_columns:
                    raise QueryObjectValidationError(
                        _(
                            "Columns missing in datasource: %(invalid_columns)s",
                            invalid_columns=invalid_columns,
                        ))

                query_result = self.get_query_result(query_obj)
                annotation_data = self.get_annotation_data(query_obj)
                cache.set_query_result(
                    key=cache_key,
                    query_result=query_result,
                    annotation_data=annotation_data,
                    force_query=self._query_context.force,
                    timeout=self.get_cache_timeout(),
                    datasource_uid=self._qc_datasource.uid,
                    region=CacheRegion.DATA,
                )
            except QueryObjectValidationError as ex:
                # surface the validation problem through the payload
                cache.error_message = str(ex)
                cache.status = QueryStatus.FAILED

        # An N-dimensional DataFrame has been converted into a flat DataFrame
        # by the `flatten` operator, with commas inside column labels escaped
        # by `escape_separator`. The resulting labels are unescaped here.
        label_map = {}
        for flat_label in cache.df.columns.values:
            unescaped_label = unescape_separator(flat_label)
            # split on ", " only where the comma is not backslash-escaped
            label_map[unescaped_label] = [
                unescape_separator(part)
                for part in re.split(r"(?<!\\),\s", flat_label)
            ]
        cache.df.columns = [
            unescape_separator(flat_label)
            for flat_label in cache.df.columns.values
        ]

        payload: Dict[str, Any] = {
            "cache_key": cache_key,
            "cached_dttm": cache.cache_dttm,
            "cache_timeout": self.get_cache_timeout(),
            "df": cache.df,
            "applied_template_filters": cache.applied_template_filters,
            "annotation_data": cache.annotation_data,
            "error": cache.error_message,
            "is_cached": cache.is_cached,
            "query": cache.query,
            "status": cache.status,
            "stacktrace": cache.stacktrace,
            "rowcount": len(cache.df.index),
            "from_dttm": query_obj.from_dttm,
            "to_dttm": query_obj.to_dttm,
            "label_map": label_map,
        }
        return payload