Ejemplo n.º 1
0
def apply_post_process(
    result: Dict[Any, Any],
    form_data: Optional[Dict[str, Any]] = None,
) -> Dict[Any, Any]:
    """Run the viz-type specific post-processor over every query in *result*.

    Each query's CSV payload is parsed into a DataFrame, transformed by the
    registered post-processor, serialised back to CSV, and its column/row
    metadata refreshed.  Results whose ``viz_type`` has no registered
    post-processor are returned unchanged.
    """
    form_data = {} if form_data is None else form_data

    viz_type = form_data.get("viz_type")
    if viz_type not in post_processors:
        return result
    post_processor = post_processors[viz_type]

    for query in result["queries"]:
        # Data arrives as CSV text; round-trip it through a DataFrame.
        original_df = pd.read_csv(StringIO(query["data"]))
        transformed = post_processor(original_df, form_data)

        output = StringIO()
        transformed.to_csv(output)
        output.seek(0)

        query["data"] = output.getvalue()
        query["colnames"] = list(transformed.columns)
        query["coltypes"] = extract_dataframe_dtypes(transformed)
        query["rowcount"] = len(transformed.index)

    return result
Ejemplo n.º 2
0
    def test_extract_dataframe_dtypes(self):
        """Verify generic-type inference over a DataFrame of mixed columns."""
        slc = self.get_slice("Girls", db.session)
        # (column name, expected generic type, sample values)
        specs: Tuple[Tuple[str, GenericDataType, List[Any]], ...] = (
            ("dt", GenericDataType.TEMPORAL, [date(2021, 2, 4), date(2021, 2, 4)]),
            (
                "dttm",
                GenericDataType.TEMPORAL,
                [datetime(2021, 2, 4, 1, 1, 1), datetime(2021, 2, 4, 1, 1, 1)],
            ),
            ("str", GenericDataType.STRING, ["foo", "foo"]),
            ("int", GenericDataType.NUMERIC, [1, 1]),
            ("float", GenericDataType.NUMERIC, [0.5, 0.5]),
            ("mixed-int-float", GenericDataType.NUMERIC, [0.5, 1.0]),
            ("bool", GenericDataType.BOOLEAN, [True, False]),
            ("mixed-str-int", GenericDataType.STRING, ["abc", 1.0]),
            ("obj", GenericDataType.STRING, [{"a": 1}, {"a": 1}]),
            ("dt_null", GenericDataType.TEMPORAL, [None, date(2021, 2, 4)]),
            (
                "dttm_null",
                GenericDataType.TEMPORAL,
                [None, datetime(2021, 2, 4, 1, 1, 1)],
            ),
            ("str_null", GenericDataType.STRING, [None, "foo"]),
            ("int_null", GenericDataType.NUMERIC, [None, 1]),
            ("float_null", GenericDataType.NUMERIC, [None, 0.5]),
            ("bool_null", GenericDataType.BOOLEAN, [None, False]),
            ("obj_null", GenericDataType.STRING, [None, {"a": 1}]),
            # Non-timestamp columns should be identified as temporal if
            # `is_dttm` is set to `True` in the underlying datasource
            ("ds", GenericDataType.TEMPORAL, [None, {"ds": "2017-01-01"}]),
        )

        names = [spec[0] for spec in specs]
        expected = [spec[1] for spec in specs]
        values = [spec[2] for spec in specs]

        df = pd.DataFrame(data=dict(zip(names, values)))
        assert extract_dataframe_dtypes(df, slc.datasource) == expected
Ejemplo n.º 3
0
def _get_full(
    query_context: "QueryContext",
    query_obj: "QueryObject",
    force_cached: Optional[bool] = False,
) -> Dict[str, Any]:
    """Build the full payload for one query object.

    Enriches the dataframe payload with column metadata and with the
    applied/rejected filter breakdown; for ``RESULTS``-type requests only
    the data portion is returned (unless the query failed).
    """
    datasource = _get_datasource(query_context, query_obj)
    result_type = query_obj.result_type or query_context.result_type
    payload = query_context.get_df_payload(query_obj, force_cached=force_cached)

    df = payload["df"]
    status = payload["status"]
    if status != QueryStatus.FAILED:
        payload["colnames"] = list(df.columns)
        payload["coltypes"] = extract_dataframe_dtypes(df)
        payload["data"] = query_context.get_data(df)
    # The raw frame must not leak into the serialised payload.
    del payload["df"]

    filter_columns = cast(
        List[str], [flt.get("col") for flt in query_obj.filter]
    )
    known_columns = set(datasource.column_names)
    applied_time_columns, rejected_time_columns = get_time_filter_status(
        datasource, query_obj.applied_time_extras
    )

    applied = [
        {"column": col} for col in filter_columns if col in known_columns
    ]
    rejected = [
        {"reason": "not_in_datasource", "column": col}
        for col in filter_columns
        if col not in known_columns
    ]
    payload["applied_filters"] = applied + applied_time_columns
    payload["rejected_filters"] = rejected + rejected_time_columns

    if result_type == ChartDataResultType.RESULTS and status != QueryStatus.FAILED:
        return {"data": payload.get("data")}
    return payload
Ejemplo n.º 4
0
    def test_extract_dataframe_dtypes(self):
        """Verify generic-type inference over a DataFrame of mixed columns."""
        # (column name, expected generic type, sample values)
        specs: Tuple[Tuple[str, GenericDataType, List[Any]], ...] = (
            ("dt", GenericDataType.TEMPORAL, [date(2021, 2, 4), date(2021, 2, 4)]),
            (
                "dttm",
                GenericDataType.TEMPORAL,
                [datetime(2021, 2, 4, 1, 1, 1), datetime(2021, 2, 4, 1, 1, 1)],
            ),
            ("str", GenericDataType.STRING, ["foo", "foo"]),
            ("int", GenericDataType.NUMERIC, [1, 1]),
            ("float", GenericDataType.NUMERIC, [0.5, 0.5]),
            ("mixed-int-float", GenericDataType.NUMERIC, [0.5, 1.0]),
            ("bool", GenericDataType.BOOLEAN, [True, False]),
            ("mixed-str-int", GenericDataType.STRING, ["abc", 1.0]),
            ("obj", GenericDataType.STRING, [{"a": 1}, {"a": 1}]),
            ("dt_null", GenericDataType.TEMPORAL, [None, date(2021, 2, 4)]),
            (
                "dttm_null",
                GenericDataType.TEMPORAL,
                [None, datetime(2021, 2, 4, 1, 1, 1)],
            ),
            ("str_null", GenericDataType.STRING, [None, "foo"]),
            ("int_null", GenericDataType.NUMERIC, [None, 1]),
            ("float_null", GenericDataType.NUMERIC, [None, 0.5]),
            ("bool_null", GenericDataType.BOOLEAN, [None, False]),
            ("obj_null", GenericDataType.STRING, [None, {"a": 1}]),
        )

        names = [spec[0] for spec in specs]
        expected = [spec[1] for spec in specs]
        values = [spec[2] for spec in specs]

        df = pd.DataFrame(data=dict(zip(names, values)))
        assert extract_dataframe_dtypes(df) == expected
Ejemplo n.º 5
0
def apply_post_process(
    result: Dict[Any, Any],
    form_data: Optional[Dict[str, Any]] = None,
    datasource: Optional["BaseDatasource"] = None,
) -> Dict[Any, Any]:
    """Run the viz-type specific post-processor over every query in *result*.

    Each query's payload (JSON dict or CSV text) is parsed into a DataFrame,
    column labels are mapped to their verbose names when a *datasource* is
    provided, the registered post-processor is applied, and the payload plus
    its column/row metadata are written back in the original format.

    Results whose ``viz_type`` has no registered post-processor are returned
    unchanged.

    :raises Exception: if a query's ``result_format`` is not supported.
    """
    form_data = form_data or {}

    viz_type = form_data.get("viz_type")
    if viz_type not in post_processors:
        return result
    post_processor = post_processors[viz_type]

    # Hoisted out of the loop: the set of valid format values is invariant.
    valid_formats = {rf.value for rf in ChartDataResultFormat}

    for query in result["queries"]:
        if query["result_format"] not in valid_formats:
            raise Exception(
                f"Result format {query['result_format']} not supported")

        if not query["data"]:
            # do not try to process empty data
            continue

        if query["result_format"] == ChartDataResultFormat.JSON:
            df = pd.DataFrame.from_dict(query["data"])
        elif query["result_format"] == ChartDataResultFormat.CSV:
            df = pd.read_csv(StringIO(query["data"]))
        else:
            # Bug fix: a valid enum value that is neither JSON nor CSV used to
            # fall through with `df` unbound and crash later with an
            # UnboundLocalError; fail fast with a clear message instead.
            raise Exception(
                f"Result format {query['result_format']} not supported")

        # convert all columns to verbose (label) name
        if datasource:
            df.rename(columns=datasource.data["verbose_map"], inplace=True)

        processed_df = post_processor(df, form_data, datasource)

        query["colnames"] = list(processed_df.columns)
        query["indexnames"] = list(processed_df.index)
        query["coltypes"] = extract_dataframe_dtypes(processed_df, datasource)
        query["rowcount"] = len(processed_df.index)

        # Flatten hierarchical columns/index since they are represented as
        # `Tuple[str]`. Otherwise encoding to JSON later will fail because
        # maps cannot have tuples as their keys in JSON.
        processed_df.columns = [
            " ".join(str(name) for name in column).strip() if isinstance(
                column, tuple) else column for column in processed_df.columns
        ]
        processed_df.index = [
            " ".join(str(name) for name in index).strip() if isinstance(
                index, tuple) else index for index in processed_df.index
        ]

        if query["result_format"] == ChartDataResultFormat.JSON:
            query["data"] = processed_df.to_dict()
        elif query["result_format"] == ChartDataResultFormat.CSV:
            buf = StringIO()
            processed_df.to_csv(buf)
            buf.seek(0)
            query["data"] = buf.getvalue()

    return result
Ejemplo n.º 6
0
    def get_single_payload(
        self,
        query_obj: QueryObject,
        force_cached: Optional[bool] = False,
    ) -> Dict[str, Any]:
        """Return the results payload for a single query."""
        if self.result_type == utils.ChartDataResultType.QUERY:
            # Query-only requests return the SQL without executing it.
            return {
                "query": self.datasource.get_query_str(query_obj.to_dict()),
                "language": self.datasource.query_language,
            }

        if self.result_type == utils.ChartDataResultType.SAMPLES:
            # Rewrite a copy of the query to fetch raw sample rows
            # instead of an aggregated timeseries result.
            requested_limit = query_obj.row_limit or math.inf
            query_obj = copy.copy(query_obj)
            query_obj.is_timeseries = False
            query_obj.orderby = []
            query_obj.groupby = []
            query_obj.metrics = []
            query_obj.post_processing = []
            query_obj.row_limit = min(requested_limit,
                                      config["SAMPLES_ROW_LIMIT"])
            query_obj.row_offset = 0
            query_obj.columns = [
                col.column_name for col in self.datasource.columns
            ]

        payload = self.get_df_payload(query_obj, force_cached=force_cached)
        df = payload["df"]
        status = payload["status"]
        if status != utils.QueryStatus.FAILED:
            payload["colnames"] = list(df.columns)
            payload["coltypes"] = utils.extract_dataframe_dtypes(df)
            payload["data"] = self.get_data(df)
        # The raw frame must not leak into the serialised payload.
        del payload["df"]

        filter_columns = cast(
            List[str], [flt.get("col") for flt in query_obj.filter]
        )
        known_columns = set(self.datasource.column_names)
        applied_time_columns, rejected_time_columns = utils.get_time_filter_status(
            self.datasource, query_obj.applied_time_extras)

        applied = [
            {"column": col} for col in filter_columns if col in known_columns
        ]
        rejected = [
            {"reason": "not_in_datasource", "column": col}
            for col in filter_columns
            if col not in known_columns
        ]
        payload["applied_filters"] = applied + applied_time_columns
        payload["rejected_filters"] = rejected + rejected_time_columns

        if (self.result_type == utils.ChartDataResultType.RESULTS
                and status != utils.QueryStatus.FAILED):
            return {"data": payload["data"]}
        return payload
Ejemplo n.º 7
0
def _get_full(
    query_context: QueryContext,
    query_obj: QueryObject,
    force_cached: Optional[bool] = False,
) -> Dict[str, Any]:
    """Assemble the full payload for one query object.

    Enriches the dataframe payload with column/index metadata and with the
    applied/rejected filter breakdown; for ``RESULTS``-type requests only
    the data-related keys are returned (unless the query failed).
    """
    datasource = _get_datasource(query_context, query_obj)
    result_type = query_obj.result_type or query_context.result_type
    payload = query_context.get_df_payload(query_obj, force_cached=force_cached)
    applied_template_filters = payload.get("applied_template_filters", [])

    df = payload["df"]
    status = payload["status"]
    if status != QueryStatus.FAILED:
        payload["colnames"] = list(df.columns)
        payload["indexnames"] = list(df.index)
        payload["coltypes"] = extract_dataframe_dtypes(df, datasource)
        payload["data"] = query_context.get_data(df)
        payload["result_format"] = query_context.result_format
    # The raw frame must not leak into the serialised payload.
    del payload["df"]

    filter_columns = cast(List[str], [flt.get("col") for flt in query_obj.filter])
    known_columns = set(datasource.column_names)
    applied_time_columns, rejected_time_columns = get_time_filter_status(
        datasource, query_obj.applied_time_extras
    )

    def _is_applied(col: Any) -> bool:
        # A filter counts as applied when it is an ad-hoc column, a known
        # datasource column, or was consumed by a Jinja template filter.
        return (
            is_adhoc_column(col)
            or col in known_columns
            or col in applied_template_filters
        )

    payload["applied_filters"] = [
        {"column": get_column_name(col)}
        for col in filter_columns
        if _is_applied(col)
    ] + applied_time_columns
    payload["rejected_filters"] = [
        {"reason": ExtraFiltersReasonType.COL_NOT_IN_DATASOURCE, "column": col}
        for col in filter_columns
        if not _is_applied(col)
    ] + rejected_time_columns

    if result_type == ChartDataResultType.RESULTS and status != QueryStatus.FAILED:
        return {
            "data": payload.get("data"),
            "colnames": payload.get("colnames"),
            "coltypes": payload.get("coltypes"),
        }
    return payload