Example #1
def test_sort_order_md5_sha():
    obj_1 = {
        "product": "Coffee",
        "price_in_cents": 4000,
        "company": "Gobias Industries",
    }

    obj_2 = {
        "product": "Coffee",
        "company": "Gobias Industries",
        "price_in_cents": 4000,
    }

    assert md5_sha_from_dict(obj_1) == md5_sha_from_dict(obj_2)
    assert md5_sha_from_dict(obj_1) == "35f22273cd6a6798b04f8ddef51135e3"
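The order-independence above comes from serializing with sorted keys before hashing. A minimal sketch of helpers shaped like `md5_sha_from_str` / `md5_sha_from_dict` (illustrative only, assuming a simplejson-based serializer; not necessarily the project's actual implementation):

import hashlib

import simplejson as json


def md5_sha_from_str(val: str) -> str:
    # Hash the serialized payload; hexdigest() yields the 32-character keys
    # asserted in the tests above.
    return hashlib.md5(val.encode("utf-8")).hexdigest()


def md5_sha_from_dict(obj, ignore_nan: bool = False, default=None) -> str:
    # sort_keys=True makes the output independent of key insertion order,
    # which is why obj_1 and obj_2 hash to the same value.
    json_data = json.dumps(obj, sort_keys=True, ignore_nan=ignore_nan, default=default)
    return md5_sha_from_str(json_data)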
Example #2
def test_ignore_nan_md5_sha():
    obj = {
        "product": "Coffee",
        "company": "Gobias Industries",
        "price": math.nan,
    }

    serialized_obj = (
        '{"company": "Gobias Industries", "price": NaN, "product": "Coffee"}'
    )

    assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(obj)
    assert md5_sha_from_str(serialized_obj) == "5d129d1dffebc0bacc734366476d586d"

    serialized_obj = (
        '{"company": "Gobias Industries", "price": null, "product": "Coffee"}'
    )

    assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(obj, ignore_nan=True)
    assert md5_sha_from_str(serialized_obj) == "40e87d61f6add03816bccdeac5713b9f"
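The difference between the two assertions above comes down to the serializer: by default the NaN value is emitted as the non-standard JSON token NaN, while ignore_nan=True (a simplejson option) substitutes null, which changes the serialized string and therefore the hash. A small illustration (assuming a simplejson-based dumps):

import math

import simplejson as json

obj = {"price": math.nan}

# Default behaviour keeps the non-standard token `NaN` in the output.
json.dumps(obj)                   # '{"price": NaN}'

# ignore_nan=True substitutes null, so the serialized string (and its hash) change.
json.dumps(obj, ignore_nan=True)  # '{"price": null}'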
Example #3
    def cache_key(self, **extra: Any) -> str:
        """
        The cache key is made up of the key/values from to_dict(), plus any
        other key/values in `extra`.
        We remove datetime bounds that are hard values and replace them with
        the user-provided inputs to bounds, which may be time-relative (as in
        "5 days ago" or "now").
        """
        cache_dict = self.to_dict()
        cache_dict.update(extra)

        # TODO: the below KVs can all be cleaned up and moved to `to_dict()` at some
        #  predetermined point in time when orgs are aware that the previously
        #  cached results will be invalidated.
        if not self.apply_fetch_values_predicate:
            del cache_dict["apply_fetch_values_predicate"]
        if self.datasource:
            cache_dict["datasource"] = self.datasource.uid
        if self.result_type:
            cache_dict["result_type"] = self.result_type
        if self.time_range:
            cache_dict["time_range"] = self.time_range
        if self.post_processing:
            cache_dict["post_processing"] = self.post_processing
        if self.time_offsets:
            cache_dict["time_offsets"] = self.time_offsets

        for k in ["from_dttm", "to_dttm"]:
            del cache_dict[k]

        annotation_fields = [
            "annotationType",
            "descriptionColumns",
            "intervalEndColumn",
            "name",
            "overrides",
            "sourceType",
            "timeColumn",
            "titleColumn",
            "value",
        ]
        annotation_layers = [
            {field: layer[field] for field in annotation_fields if field in layer}
            for layer in self.annotation_layers
        ]
        # only add to key if there are annotations present that affect the payload
        if annotation_layers:
            cache_dict["annotation_layers"] = annotation_layers

        return md5_sha_from_dict(cache_dict, default=json_int_dttm_ser, ignore_nan=True)
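The **extra hook merges caller-supplied key/values into the dict before hashing, so request-specific context can influence the key without changing to_dict(). A hypothetical call site (the variable and extra keys below are illustrative, not taken from the code above):

# Two otherwise identical queries issued in different contexts get distinct keys.
key = query_obj.cache_key(
    dashboard_id=12,          # hypothetical extra key/value
    changed_on="2021-01-01",  # hypothetical extra key/value
)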
Example #4
def test_basic_md5_sha():
    obj = {
        "product": "Coffee",
        "company": "Gobias Industries",
        "price_in_cents": 4000,
    }

    serialized_obj = (
        '{"company": "Gobias Industries", "price_in_cents": 4000, "product": "Coffee"}'
    )

    assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(obj)
    assert md5_sha_from_str(serialized_obj) == "35f22273cd6a6798b04f8ddef51135e3"
Example #5
    def cache_key(
        self,
        window_size: Optional[Union[bool, WindowSize]] = None,
        thumb_size: Optional[Union[bool, WindowSize]] = None,
    ) -> str:
        window_size = window_size or self.window_size
        thumb_size = thumb_size or self.thumb_size
        args = {
            "thumbnail_type": self.thumbnail_type,
            "digest": self.digest,
            "type": "thumb",
            "window_size": window_size,
            "thumb_size": thumb_size,
        }
        return md5_sha_from_dict(args)
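The key here is derived from the screenshot's identity plus the effective sizes; explicit arguments win over the instance defaults because of the `or` fallbacks. A hypothetical call (the screenshot object and sizes are illustrative):

# Omitting either size falls back to self.window_size / self.thumb_size.
key = screenshot.cache_key(window_size=(800, 600), thumb_size=(400, 300))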
Example #6
def test_custom_default_md5_sha():
    def custom_datetime_serializer(obj: Any):
        if isinstance(obj, datetime.datetime):
            return "<datetime>"

    obj = {
        "product": "Coffee",
        "company": "Gobias Industries",
        "datetime": datetime.datetime.now(),
    }

    serialized_obj = '{"company": "Gobias Industries", "datetime": "<datetime>", "product": "Coffee"}'

    assert md5_sha_from_str(serialized_obj) == md5_sha_from_dict(
        obj, default=custom_datetime_serializer)
    assert md5_sha_from_str(
        serialized_obj) == "dc280121213aabcaeb8087aef268fd0d"
Example #7
def generate_cache_key(values_dict: Dict[str, Any],
                       key_prefix: str = "") -> str:
    hash_str = md5_sha_from_dict(values_dict, default=json_int_dttm_ser)
    return f"{key_prefix}{hash_str}"
class QueryObject:  # pylint: disable=too-many-instance-attributes
    """
    The query object's schema matches the interfaces of DB connectors like sqla
    and druid. The query objects are constructed on the client.
    """

    annotation_layers: List[Dict[str, Any]]
    applied_time_extras: Dict[str, str]
    apply_fetch_values_predicate: bool
    columns: List[Column]
    datasource: Optional[BaseDatasource]
    extras: Dict[str, Any]
    filter: List[QueryObjectFilterClause]
    from_dttm: Optional[datetime]
    granularity: Optional[str]
    inner_from_dttm: Optional[datetime]
    inner_to_dttm: Optional[datetime]
    is_rowcount: bool
    is_timeseries: bool
    metrics: Optional[List[Metric]]
    order_desc: bool
    orderby: List[OrderBy]
    post_processing: List[Dict[str, Any]]
    result_type: Optional[ChartDataResultType]
    row_limit: Optional[int]
    row_offset: int
    series_columns: List[Column]
    series_limit: int
    series_limit_metric: Optional[Metric]
    time_offsets: List[str]
    time_shift: Optional[timedelta]
    time_range: Optional[str]
    to_dttm: Optional[datetime]

    def __init__(  # pylint: disable=too-many-locals
        self,
        *,
        annotation_layers: Optional[List[Dict[str, Any]]] = None,
        applied_time_extras: Optional[Dict[str, str]] = None,
        apply_fetch_values_predicate: bool = False,
        columns: Optional[List[Column]] = None,
        datasource: Optional[BaseDatasource] = None,
        extras: Optional[Dict[str, Any]] = None,
        filters: Optional[List[QueryObjectFilterClause]] = None,
        granularity: Optional[str] = None,
        is_rowcount: bool = False,
        is_timeseries: Optional[bool] = None,
        metrics: Optional[List[Metric]] = None,
        order_desc: bool = True,
        orderby: Optional[List[OrderBy]] = None,
        post_processing: Optional[List[Optional[Dict[str, Any]]]] = None,
        row_limit: Optional[int],
        row_offset: Optional[int] = None,
        series_columns: Optional[List[Column]] = None,
        series_limit: int = 0,
        series_limit_metric: Optional[Metric] = None,
        time_range: Optional[str] = None,
        time_shift: Optional[str] = None,
        **kwargs: Any,
    ):
        self._set_annotation_layers(annotation_layers)
        self.applied_time_extras = applied_time_extras or {}
        self.apply_fetch_values_predicate = apply_fetch_values_predicate or False
        self.columns = columns or []
        self.datasource = datasource
        self.extras = extras or {}
        self.filter = filters or []
        self.granularity = granularity
        self.is_rowcount = is_rowcount
        self._set_is_timeseries(is_timeseries)
        self._set_metrics(metrics)
        self.order_desc = order_desc
        self.orderby = orderby or []
        self._set_post_processing(post_processing)
        self.row_limit = row_limit
        self.row_offset = row_offset or 0
        self._init_series_columns(series_columns, metrics, is_timeseries)
        self.series_limit = series_limit
        self.series_limit_metric = series_limit_metric
        self.time_range = time_range
        self.time_shift = parse_human_timedelta(time_shift)
        self.from_dttm = kwargs.get("from_dttm")
        self.to_dttm = kwargs.get("to_dttm")
        self.result_type = kwargs.get("result_type")
        self.time_offsets = kwargs.get("time_offsets", [])
        self.inner_from_dttm = kwargs.get("inner_from_dttm")
        self.inner_to_dttm = kwargs.get("inner_to_dttm")
        self._rename_deprecated_fields(kwargs)
        self._move_deprecated_extra_fields(kwargs)

    def _set_annotation_layers(
        self, annotation_layers: Optional[List[Dict[str, Any]]]
    ) -> None:
        self.annotation_layers = [
            layer
            for layer in (annotation_layers or [])
            # formula annotations don't affect the payload, hence can be dropped
            if layer["annotationType"] != "FORMULA"
        ]

    def _set_is_timeseries(self, is_timeseries: Optional[bool]) -> None:
        # is_timeseries is True if time column is in either columns or groupby
        # (both are dimensions)
        self.is_timeseries = (
            is_timeseries if is_timeseries is not None else DTTM_ALIAS in self.columns
        )

    def _set_metrics(self, metrics: Optional[List[Metric]] = None) -> None:
        # Support metric reference/definition in the format of
        #   1. 'metric_name'   - name of predefined metric
        #   2. { label: 'label_name' }  - legacy format for a predefined metric
        #   3. { expressionType: 'SIMPLE' | 'SQL', ... } - adhoc metric
        def is_str_or_adhoc(metric: Metric) -> bool:
            return isinstance(metric, str) or is_adhoc_metric(metric)

        self.metrics = metrics and [
            x if is_str_or_adhoc(x) else x["label"] for x in metrics  # type: ignore
        ]
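        # Illustration of the normalization above (hypothetical values):
        #   "count"                                  -> kept as the string "count"
        #   {"label": "count"}                       -> "count" (legacy dict reduced to its label)
        #   {"expressionType": "SQL", "label": "x"}  -> dict kept as-is (adhoc metric)
        #   metrics=None                             -> self.metrics stays None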

    def _set_post_processing(
        self, post_processing: Optional[List[Optional[Dict[str, Any]]]]
    ) -> None:
        post_processing = post_processing or []
        self.post_processing = [post_proc for post_proc in post_processing if post_proc]

    def _init_series_columns(
        self,
        series_columns: Optional[List[Column]],
        metrics: Optional[List[Metric]],
        is_timeseries: Optional[bool],
    ) -> None:
        if series_columns:
            self.series_columns = series_columns
        elif is_timeseries and metrics:
            self.series_columns = self.columns
        else:
            self.series_columns = []

    def _rename_deprecated_fields(self, kwargs: Dict[str, Any]) -> None:
        # rename deprecated fields
        for field in DEPRECATED_FIELDS:
            if field.old_name in kwargs:
                logger.warning(
                    "The field `%s` is deprecated, please use `%s` instead.",
                    field.old_name,
                    field.new_name,
                )
                value = kwargs[field.old_name]
                if value:
                    if hasattr(self, field.new_name):
                        logger.warning(
                            "The field `%s` is already populated, "
                            "replacing value with contents from `%s`.",
                            field.new_name,
                            field.old_name,
                        )
                    setattr(self, field.new_name, value)

    def _move_deprecated_extra_fields(self, kwargs: Dict[str, Any]) -> None:
        # move deprecated extras fields to extras
        for field in DEPRECATED_EXTRAS_FIELDS:
            if field.old_name in kwargs:
                logger.warning(
                    "The field `%s` is deprecated and should "
                    "be passed to `extras` via the `%s` property.",
                    field.old_name,
                    field.new_name,
                )
                value = kwargs[field.old_name]
                if value:
                    if hasattr(self.extras, field.new_name):
                        logger.warning(
                            "The field `%s` is already populated in "
                            "`extras`, replacing value with contents "
                            "from `%s`.",
                            field.new_name,
                            field.old_name,
                        )
                    self.extras[field.new_name] = value

    @property
    def metric_names(self) -> List[str]:
        """Return metrics names (labels), coerce adhoc metrics to strings."""
        return get_metric_names(self.metrics or [])

    @property
    def column_names(self) -> List[str]:
        """Return column names (labels). Gives priority to groupbys if both groupbys
        and metrics are non-empty, otherwise returns column labels."""
        return get_column_names(self.columns)

    def validate(
        self, raise_exceptions: Optional[bool] = True
    ) -> Optional[QueryObjectValidationError]:
        """Validate query object"""
        try:
            self._validate_there_are_no_missing_series()
            self._validate_no_have_duplicate_labels()
            self._sanitize_filters()
            return None
        except QueryObjectValidationError as ex:
            if raise_exceptions:
                raise ex
            return ex

    def _validate_no_have_duplicate_labels(self) -> None:
        all_labels = self.metric_names + self.column_names
        if len(set(all_labels)) < len(all_labels):
            dup_labels = find_duplicates(all_labels)
            raise QueryObjectValidationError(
                _(
                    "Duplicate column/metric labels: %(labels)s. Please make "
                    "sure all columns and metrics have a unique label.",
                    labels=", ".join(f'"{x}"' for x in dup_labels),
                )
            )

    def _sanitize_filters(self) -> None:
        for param in ("where", "having"):
            clause = self.extras.get(param)
            if clause:
                try:
                    sanitized_clause = sanitize_clause(clause)
                    if sanitized_clause != clause:
                        self.extras[param] = sanitized_clause
                except QueryClauseValidationException as ex:
                    raise QueryObjectValidationError(ex.message) from ex

    def _validate_there_are_no_missing_series(self) -> None:
        missing_series = [col for col in self.series_columns if col not in self.columns]
        if missing_series:
            raise QueryObjectValidationError(
                _(
                    "The following entries in `series_columns` are missing "
                    "in `columns`: %(columns)s. ",
                    columns=", ".join(f'"{x}"' for x in missing_series),
                )
            )

    def to_dict(self) -> Dict[str, Any]:
        query_object_dict = {
            "apply_fetch_values_predicate": self.apply_fetch_values_predicate,
            "columns": self.columns,
            "extras": self.extras,
            "filter": self.filter,
            "from_dttm": self.from_dttm,
            "granularity": self.granularity,
            "inner_from_dttm": self.inner_from_dttm,
            "inner_to_dttm": self.inner_to_dttm,
            "is_rowcount": self.is_rowcount,
            "is_timeseries": self.is_timeseries,
            "metrics": self.metrics,
            "order_desc": self.order_desc,
            "orderby": self.orderby,
            "row_limit": self.row_limit,
            "row_offset": self.row_offset,
            "series_columns": self.series_columns,
            "series_limit": self.series_limit,
            "series_limit_metric": self.series_limit_metric,
            "to_dttm": self.to_dttm,
        }
        return query_object_dict

    def __repr__(self) -> str:
        # used when a QueryObject is output via `print` or `logging`
        return json.dumps(
            self.to_dict(),
            sort_keys=True,
            default=str,
        )

    def cache_key(self, **extra: Any) -> str:
        """
        The cache key is made up of the key/values from to_dict(), plus any
        other key/values in `extra`.
        We remove datetime bounds that are hard values and replace them with
        the user-provided inputs to bounds, which may be time-relative (as in
        "5 days ago" or "now").
        """
        cache_dict = self.to_dict()
        cache_dict.update(extra)

        # TODO: the below KVs can all be cleaned up and moved to `to_dict()` at some
        #  predetermined point in time when orgs are aware that the previously
        #  cached results will be invalidated.
        if not self.apply_fetch_values_predicate:
            del cache_dict["apply_fetch_values_predicate"]
        if self.datasource:
            cache_dict["datasource"] = self.datasource.uid
        if self.result_type:
            cache_dict["result_type"] = self.result_type
        if self.time_range:
            cache_dict["time_range"] = self.time_range
        if self.post_processing:
            cache_dict["post_processing"] = self.post_processing
        if self.time_offsets:
            cache_dict["time_offsets"] = self.time_offsets

        for k in ["from_dttm", "to_dttm"]:
            del cache_dict[k]

        annotation_fields = [
            "annotationType",
            "descriptionColumns",
            "intervalEndColumn",
            "name",
            "overrides",
            "sourceType",
            "timeColumn",
            "titleColumn",
            "value",
        ]
        annotation_layers = [
            {field: layer[field] for field in annotation_fields if field in layer}
            for layer in self.annotation_layers
        ]
        # only add to key if there are annotations present that affect the payload
        if annotation_layers:
            cache_dict["annotation_layers"] = annotation_layers

        # Add an impersonation key to cache if impersonation is enabled on the db
        if (
            feature_flag_manager.is_feature_enabled("CACHE_IMPERSONATION")
            and self.datasource
            and hasattr(self.datasource, "database")
            and self.datasource.database.impersonate_user
        ):
            if key := self.datasource.database.db_engine_spec.get_impersonation_key(
                getattr(g, "user", None)
            ):
                logger.debug(
                    "Adding impersonation key to QueryObject cache dict: %s", key
                )
                cache_dict["impersonation_key"] = key

        return md5_sha_from_dict(cache_dict, default=json_int_dttm_ser, ignore_nan=True)
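The impersonation branch only fires when the CACHE_IMPERSONATION feature flag is enabled and the datasource's database impersonates the querying user; mixing the per-user impersonation key into the hash keeps users with different effective credentials from sharing cached results. Enabling the flag is a deployment-level toggle, roughly (illustrative superset_config.py fragment):

# superset_config.py (illustrative): opt in to per-user cache keys when
# database impersonation is enabled on the connection.
FEATURE_FLAGS = {
    "CACHE_IMPERSONATION": True,
}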