def __init__(self, **kwargs):
    """Set up the state shared by every ingredient from keyword arguments.

    Recognised keys are removed from ``kwargs`` and stored as attributes of
    the same name; whatever is left over is exposed through ``self.meta``.

    Raises:
        BadIngredient: if ``formatters`` is not a list or tuple, or if
            ``column_suffixes`` is given with a different length than
            ``columns``.
    """
    # (attribute, fallback) pairs. The tables are rebuilt on every call, so
    # the mutable fallbacks are never shared between instances.
    leading_options = (
        ("id", uuid4().hex[:12]),
        ("columns", []),
        ("filters", []),
        ("havings", []),
        ("group_by", []),
        ("formatters", []),
        ("quickselects", []),
        ("column_suffixes", None),
        ("cache_context", ""),
        ("datatype", None),
        ("datatype_by_role", {}),
    )
    for option, fallback in leading_options:
        setattr(self, option, kwargs.pop(option, fallback))

    # State that can not be supplied through kwargs
    self.anonymize = False
    self.roles = {}
    self._labels = []

    trailing_options = (
        ("error", None),
        # What order should this be in
        ("ordering", "asc"),
        ("group_by_strategy", "labels"),
    )
    for option, fallback in trailing_options:
        setattr(self, option, kwargs.pop(option, fallback))

    if not isinstance(self.formatters, (list, tuple)):
        raise BadIngredient(
            "formatters passed to an ingredient must be a list or tuple"
        )

    # Explicit suffixes, when given, must pair up one-to-one with the columns
    suffixes = self.column_suffixes
    if suffixes is not None and len(suffixes) != len(self.columns):
        raise BadIngredient("column_suffixes must be the same length as columns")

    # Everything that was not consumed above is available in self.meta
    self.meta = AttrDict(kwargs)
def ingredient_from_unvalidated_dict(unvalidated_ingr, selectable):
    """Validate a raw ingredient definition, then build an ingredient from it.

    The definition is normalized against ``ingredient_schema`` (unknown keys
    are allowed). A schema failure reported as ``E.SureError`` is re-raised
    as ``BadIngredient`` carrying the same message.
    """
    try:
        validated = normalize_schema(
            ingredient_schema, unvalidated_ingr, allow_unknown=True
        )
    except E.SureError as schema_error:
        raise BadIngredient(str(schema_error))
    else:
        return create_ingredient_from_config(validated, selectable)
def parse_unvalidated_condition(cond, selectable):
    """Validate a raw condition definition and parse it.

    Returns ``None`` when ``cond`` is ``None``. Otherwise the condition is
    normalized against ``condition_schema`` (unknown keys are rejected) and
    handed to ``parse_validated_condition``. A schema failure reported as
    ``E.SureError`` is re-raised as ``BadIngredient`` with the same message.
    """
    if cond is not None:
        try:
            validated = normalize_schema(condition_schema, cond, allow_unknown=False)
        except E.SureError as schema_error:
            raise BadIngredient(str(schema_error))
        return parse_validated_condition(validated, selectable)
    return None
def parse_unvalidated_field(unvalidated_fld, selectable, aggregated=True):
    """Validate a raw field definition and parse it.

    The field is wrapped in a minimal ingredient definition (kind "Metric"
    when ``aggregated`` is true, "Dimension" otherwise) so that it can be
    normalized against ``ingredient_schema``; the normalized ``field`` entry
    is then passed to ``parse_validated_field``. A schema failure reported as
    ``E.SureError`` is re-raised as ``BadIngredient`` with the same message.
    """
    if aggregated:
        kind = "Metric"
    else:
        kind = "Dimension"
    wrapper = {"field": unvalidated_fld, "kind": kind}

    try:
        validated = normalize_schema(ingredient_schema, wrapper, allow_unknown=True)
    except E.SureError as schema_error:
        raise BadIngredient(str(schema_error))
    else:
        return parse_validated_field(validated["field"], selectable)
def create_ingredient_from_config(ingr_dict, selectable):
    """Create an ingredient from a validated config object.

    ``ingr_dict`` is consumed in place: the keys handled here are popped and
    the remaining entries are passed to the ingredient class as keyword
    arguments.

    Returns the constructed ingredient, or an ``InvalidIngredient`` describing
    the problem when the ingredient class raises ``BadIngredient``.

    Raises:
        BadIngredient: if no ingredient class exists for the configured kind.
    """
    kind = ingr_dict.pop("kind", "metric")
    ingredient_cls = ingredient_class_for_name(kind.title())
    if ingredient_cls is None:
        raise BadIngredient("Unknown ingredient kind")

    field_defn = ingr_dict.pop("field", None)
    divide_by_defn = ingr_dict.pop("divide_by", None)

    expression = parse_validated_field(field_defn, selectable, use_bucket_labels=True)

    # A bucketed field is parsed a second time, without bucket labels, to
    # provide the expression stored under "order_by_expression"
    has_buckets = isinstance(field_defn, dict) and "buckets" in field_defn
    if has_buckets:
        ingr_dict["order_by_expression"] = parse_validated_field(
            field_defn, selectable, use_bucket_labels=False
        )

    if divide_by_defn is not None:
        # Perform a divide by zero safe division
        divisor = parse_validated_field(divide_by_defn, selectable)
        numerator = cast(expression, Float)
        denominator = func.coalesce(cast(divisor, Float), 0.0) + SAFE_DIVISON_EPSILON
        expression = numerator / denominator

    # Quickselect conditions are parsed up front; a missing or empty value
    # becomes an empty list
    raw_quickselects = ingr_dict.pop("quickselects", None) or ()
    ingr_dict["quickselects"] = [
        {
            "name": quickselect["name"],
            "condition": parse_validated_condition(
                quickselect.get("condition", None), selectable
            ),
        }
        for quickselect in raw_quickselects
    ]

    # Each extra field contains a name and a field; the parsed field is
    # passed to the ingredient under that name
    for extra in ingr_dict.pop("extra_fields", []):
        parsed_extra = parse_validated_field(extra.get("field"), selectable)
        ingr_dict[extra.get("name")] = parsed_extra

    try:
        return ingredient_cls(expression, **ingr_dict)
    except BadIngredient as exc:
        return InvalidIngredient(
            error={
                "type": "bad_ingredient",
                "extra": {
                    "details": str(exc),
                },
            }
        )
def make_column_suffixes(self):
    """Return the suffixes that are appended to `id` for each query column.

    Explicitly configured ``column_suffixes`` win. Otherwise an ingredient
    without columns gets no suffixes, and a single-column ingredient gets
    ``"_raw"`` when it has formatters and an empty suffix when it does not.

    Developers note: These are generated when the query runs because the
    recipe may be run with anonymization on or off, which will inject
    a formatter.

    Raises:
        BadIngredient: if there is more than one column and no explicit
            suffixes were supplied.
    """
    explicit_suffixes = self.column_suffixes
    if explicit_suffixes:
        return explicit_suffixes

    column_count = len(self.columns)
    if column_count > 1:
        raise BadIngredient(
            "column_suffixes must be supplied if there is " "more than one column"
        )
    if column_count == 0:
        return ()
    return ("_raw",) if self.formatters else ("",)
def __init__(self, expression, **kwargs):
    """Build a dimension from a value expression and optional role expressions.

    Args:
        expression: The expression stored under the ``value`` role.
        **kwargs: Forwarded to the parent constructor. In addition:

            * every ``<role>_expression`` keyword adds a role named
              ``<role>`` holding that expression;
            * ``lookup`` (a dict) installs a formatter, ahead of any other
              formatter, that maps values through the dict;
            * ``lookup_default`` is the value that formatter returns for
              keys missing from ``lookup``; without it the original value
              is returned unchanged.

    Raises:
        BadIngredient: if a ``raw_expression`` keyword is supplied ("raw" is
            a reserved role) or if ``lookup`` is not a dictionary.
    """
    super(Dimension, self).__init__(**kwargs)
    if self.datatype is None:
        self.datatype = datatype_from_column_expression(expression)

    # We must always have a value role
    self.roles = {"value": expression}
    role_suffix = "_expression"
    for k, v in kwargs.items():
        if not k.endswith(role_suffix):
            continue
        # Remove _expression to get the role
        role = k[: -len(role_suffix)]
        if not role:
            continue
        if role == "raw":
            raise BadIngredient("raw is a reserved role in dimensions")
        self.roles[role] = v

    # Derive the per-role datatypes only when none were passed in
    if not self.datatype_by_role:
        for k, expr in self.roles.items():
            self.datatype_by_role[k] = datatype_from_column_expression(expr)

    # Columns are ordered "id", then "value", then all the other roles in
    # sorted order of role, with order_by coming last.
    # For instance, if the following are passed
    # expression, id_expression, order_by_expression, zed_expression the order
    # of columns would be "id", "value", "zed", "order_by"
    # When using group_bys for ordering we put them in reverse order.
    leading_roles = [r for r in ("id", "value") if r in self.roles]
    other_roles = sorted(
        r for r in self.roles if r not in ("id", "value", "order_by")
    )
    if "order_by" in self.roles:
        other_roles.append("order_by")

    self.role_keys = leading_roles + other_roles
    self.columns = [self.roles[r] for r in self.role_keys]
    self._group_by = list(self.columns)

    if "lookup" in kwargs:
        self.lookup = kwargs.get("lookup")
        if not isinstance(self.lookup, dict):
            raise BadIngredient("lookup must be a dictionary")

        # formatters may have been supplied as a tuple, which has no
        # insert(); switch to a list so the lookup formatter can be put
        # first. A list that was passed in is still updated in place.
        if not isinstance(self.formatters, list):
            self.formatters = list(self.formatters)

        # Inject a formatter that performs the lookup. The lambdas read
        # self.lookup / self.lookup_default at call time, so later changes
        # to those attributes are honoured.
        if "lookup_default" in kwargs:
            self.lookup_default = kwargs.get("lookup_default")
            self.formatters.insert(
                0, lambda value: self.lookup.get(value, self.lookup_default)
            )
        else:
            self.formatters.insert(0, lambda value: self.lookup.get(value, value))
def create_ingredient_from_parsed(ingr_dict, builder, debug=False):
    """Create an ingredient from a config version 2 object.

    ``ingr_dict`` is consumed in place: definitions are popped, parsed with
    ``builder.parse`` and replaced by the parsed expressions, and whatever
    remains is passed to the ingredient class as keyword arguments.

    Args:
        ingr_dict: The ingredient definition. ``kind`` defaults to "metric".
        builder: Object providing ``parse(...)`` (returning an
            ``(expression, datatype)`` pair) and ``drivername``.
        debug: Forwarded to every ``builder.parse`` call.

    Returns:
        The constructed ingredient, or an ``InvalidIngredient`` when a
        definition can not be parsed (``GrammarError`` / ``LarkError``), when
        a metric's field does not have the ``"num"`` datatype, or when the
        ingredient class raises ``BadIngredient``.

    Raises:
        BadIngredient: if no ingredient class exists for ``kind``.
    """
    kind = ingr_dict.pop("kind", "metric")
    IngredientClass = ingredient_class_for_name(kind.title())
    if IngredientClass is None:
        raise BadIngredient(f"Unknown ingredient kind {kind}")

    args = []

    # For some formats, we will automatically convert dates
    fmt = ingr_dict.get("format")
    if isinstance(fmt, str) and fmt.startswith("<") and fmt.endswith(">"):
        fmt = fmt[1:-1]
    convert_dates_lookup = {"%Y": "year_conv", "%B %Y": "month_conv"}
    convert_dates_with = convert_dates_lookup.get(fmt)
    convert_datetimes_lookup = {
        "%Y": "dt_year_conv",
        "%B %Y": "dt_month_conv",
        "%B %-d, %Y": "dt_day_conv",
        "%-d %b %Y": "dt_day_conv",
        "%-m/%-d/%Y": "dt_day_conv",
        "%-m-%-d-%Y": "dt_day_conv",
    }
    convert_datetimes_with = convert_datetimes_lookup.get(fmt)

    if builder.drivername.startswith("mssql"):
        # SQLServer can not use aliases in group bys and also
        # does not support date/time conversions due to an issue with pyodbc
        # parameters in queries
        # https://github.com/mkleehammer/pyodbc/issues/479
        default_group_by_strategy = "direct"
    else:
        default_group_by_strategy = "labels"

    def parse(defn, **aggregation_rule):
        # Every definition is parsed with the same debug and date conversion
        # settings; only the aggregation rule differs between call sites.
        return builder.parse(
            defn,
            debug=debug,
            convert_dates_with=convert_dates_with,
            convert_datetimes_with=convert_datetimes_with,
            **aggregation_rule,
        )

    role_suffix = "_expression"

    try:
        if kind in ("metric", "dimension"):
            if kind == "metric":
                fld_defn = ingr_dict.pop("field", None)
                # SQLAlchemy ingredient with required aggregation
                expr, datatype = parse(fld_defn, enforce_aggregation=True)
                # Save the data type in the ingredient
                ingr_dict["datatype"] = datatype
                if datatype != "num":
                    error = {
                        "type": "Can not parse field",
                        "extra": {"details": "A string can not be aggregated"},
                    }
                    return InvalidIngredient(error=error)
                args = [expr]
            else:
                ingr_dict["group_by_strategy"] = ingr_dict.get(
                    "group_by_strategy", default_group_by_strategy
                )
                fld_defn = ingr_dict.pop("field", None)
                buckets = ingr_dict.pop("buckets", None)
                buckets_default_label = ingr_dict.pop("buckets_default_label", None)
                if buckets:
                    # Buckets replace the field definition and contribute an
                    # extra field that is used for ordering
                    fld_defn, order_by_fld = _convert_bucket_to_field(
                        fld_defn, buckets, buckets_default_label, builder
                    )
                    if "extra_fields" not in ingr_dict:
                        ingr_dict["extra_fields"] = []
                    ingr_dict["extra_fields"].append(
                        {"name": "order_by_expression", "field": order_by_fld}
                    )
                expr, datatype = parse(fld_defn, forbid_aggregation=True)
                # Save the data type in the ingredient
                ingr_dict["datatype"] = datatype
                args = [expr]

            # Convert extra fields to sqlalchemy expressions and add them
            # directly to the kwargs, saving datatypes
            datatype_by_role = {"value": datatype}
            for extra in ingr_dict.pop("extra_fields", []):
                raw_role = extra.get("name")
                if raw_role.endswith(role_suffix):
                    # Remove _expression to get the role
                    role = raw_role[: -len(role_suffix)]
                else:
                    role = raw_role
                extra_expr, extra_datatype = parse(
                    extra.get("field"), forbid_aggregation=True
                )
                datatype_by_role[role] = extra_datatype
                ingr_dict[raw_role] = extra_expr
            ingr_dict["datatype_by_role"] = datatype_by_role

            parsed_quickselects = []
            for qs in ingr_dict.pop("quickselects", []):
                condition_expr, _ = parse(qs.get("condition"), forbid_aggregation=True)
                parsed_quickselects.append(
                    {"name": qs["name"], "condition": condition_expr}
                )
            ingr_dict["quickselects"] = parsed_quickselects

        elif kind == "filter":
            # The condition is read, not popped, so it is also still passed
            # on to the ingredient class as a keyword argument
            expr, _ = parse(ingr_dict.get("condition"), forbid_aggregation=True)
            args = [expr]

        elif kind == "having":
            expr, _ = parse(ingr_dict.get("condition"), forbid_aggregation=False)
            args = [expr]

    except (GrammarError, LarkError) as e:
        error_msg = str(e)
        # Keep only the text in front of the "Expecting:" part of the message
        if "Expecting:" in error_msg:
            error_msg = error_msg.split("Expecting:")[0]
        error = {"type": "Can not parse field", "extra": {"details": error_msg}}
        return InvalidIngredient(error=error)

    try:
        return IngredientClass(*args, **ingr_dict)
    except BadIngredient as e:
        # Some internal error while running the Ingredient constructor
        error = {"type": "bad_ingredient", "extra": {"details": str(e)}}
        return InvalidIngredient(error=error)
def brew_query_parts(self, order_by_keys=()):
    """Make columns, group_bys, filters, havings and order_bys for a query.

    Args:
        order_by_keys: Ingredient ids to order by; a leading "-" is the form
            used here for descending ingredients. The iterable is copied,
            never modified.

    Returns:
        A dict with the keys ``columns`` (list), ``group_bys`` (list),
        ``filters`` (set), ``havings`` (set) and ``order_bys`` (list).

    Raises:
        InvalidColumnError: if an ingredient carries an error of type
            "invalid_column".
        BadIngredient: if an ingredient carries any other error.
    """
    columns, group_bys, filters, havings = [], [], set(), set()
    order_by_keys = list(order_by_keys)
    seen_filter_strings = set()

    for ingredient in self.ingredients():
        if ingredient.error:
            error_type = ingredient.error.get("type")
            if error_type == "invalid_column":
                extra = ingredient.error.get("extra", {})
                column_name = extra.get("column_name")
                ingredient_name = extra.get("ingredient_name")
                error_msg = (
                    f'Invalid column "{column_name}" '
                    f'in ingredient "{ingredient_name}"'
                )
                raise InvalidColumnError(error_msg, column_name=column_name)
            raise BadIngredient(str(ingredient.error))

        if ingredient.query_columns:
            columns.extend(ingredient.query_columns)
        if ingredient.group_by:
            group_bys.extend(ingredient.group_by)
        if ingredient.filters:
            # Imported only when there are filters to process, and once per
            # ingredient rather than once per filter
            from recipe.utils import filter_to_string

            # Ensure we don't add duplicate filters; two filters count as
            # duplicates when their string forms are equal
            for new_f in ingredient.filters:
                new_f_str = filter_to_string(new_f)
                if new_f_str not in seen_filter_strings:
                    filters.add(new_f)
                    seen_filter_strings.add(new_f_str)
        if ingredient.havings:
            havings.update(ingredient.havings)

        # If there is an order_by key on one of the ingredients, make sure
        # the recipe orders by this ingredient
        if "order_by" in ingredient.roles:
            if (
                ingredient.id not in order_by_keys
                and "-" + ingredient.id not in order_by_keys
            ):
                if ingredient.ordering == "desc":
                    order_by_keys.append("-" + ingredient.id)
                else:
                    order_by_keys.append(ingredient.id)

    # Order-by columns are de-duplicated by their string form, keeping the
    # first occurrence
    order_bys = []
    seen_order_by_strings = set()
    for key in order_by_keys:
        try:
            ingr = self.find(key, (Dimension, Metric))
            for c in ingr.order_by_columns:
                c_str = str(c)
                if c_str not in seen_order_by_strings:
                    seen_order_by_strings.add(c_str)
                    order_bys.append(c)
        except BadRecipe:
            # Ignore order_by if the dimension/metric is not used.
            # TODO: Add structlog warning
            pass

    return {
        "columns": columns,
        "group_bys": group_bys,
        "filters": filters,
        "havings": havings,
        "order_bys": order_bys,
    }
def from_config(
    cls,
    obj,
    selectable,
    ingredient_constructor=ingredient_from_validated_dict,
    metadata=None,
):
    """Create a shelf using a dict shelf definition.

    :param obj: A Python dictionary describing a Shelf.
    :param selectable: A SQLAlchemy Table, a Recipe, a table name, or a
        SQLAlchemy join to select from. A table name may be qualified as
        ``schema.table``; everything before the last dot is used as the
        schema, so ``database.schema.table`` is accepted as well.
    :param ingredient_constructor: Callable that turns one validated
        ingredient definition into an ingredient. The default constructor
        additionally receives a shared ``builder`` for definitions whose
        ``_version`` is not "1".
    :param metadata: If `selectable` is passed as a table name, then in
        order to introspect its schema, we must have the SQLAlchemy
        MetaData object to associate it with.
    :return: A shelf that contains the ingredients defined in obj.
    :raises BadIngredient: If `obj` does not validate against the shelf
        schema.
    """
    from recipe import Recipe

    if isinstance(selectable, Recipe):
        selectable = selectable.subquery()
    elif isinstance(selectable, str):
        if "." in selectable:
            # Split on the last dot only: the table name is the final part
            # and any earlier dots belong to the schema
            schema, tablename = selectable.rsplit(".", 1)
        else:
            schema, tablename = None, selectable

        selectable = Table(
            tablename,
            metadata,
            schema=schema,
            extend_existing=True,
            autoload=True,
        )

    try:
        validated_shelf = normalize_schema(shelf_schema, obj, allow_unknown=True)
    except E.SureError as e:
        raise BadIngredient(str(e))

    d = {}
    # Created on first use and shared by every non-version-1 ingredient
    builder = None
    for k, v in validated_shelf.items():
        if ingredient_constructor == ingredient_from_validated_dict:
            version = str(v.get("_version", "1"))
            if version == "1":
                d[k] = ingredient_constructor(v, selectable)
            else:
                if builder is None:
                    builder = SQLAlchemyBuilder.get_builder(selectable=selectable)
                d[k] = ingredient_constructor(v, selectable, builder=builder)
        else:
            d[k] = ingredient_constructor(v, selectable)

        # Record which shelf key an invalid ingredient was defined under
        if isinstance(d[k], InvalidIngredient):
            if not d[k].error.get("extra"):
                d[k].error["extra"] = {}
            d[k].error["extra"]["ingredient_name"] = k

    shelf = cls(d, select_from=selectable)
    return shelf