def test_caching_and_materializing(self):
    """Columns materialized while the lookup cache is warm stay invisible
    until the cache expires (clock is advanced ~1h here — TODO confirm the
    exact TTL against get_materialized_columns)."""
    with freeze_time("2020-01-04T13:01:01Z"):
        materialize("events", "$foo")
        materialize("events", "$bar")
        materialize("person", "$zeta")

        warm_events = ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS]
        self.assertCountEqual(get_materialized_columns("events", use_cache=True).keys(), warm_events)
        self.assertCountEqual(get_materialized_columns("person", use_cache=True).keys(), ["$zeta"])

        # Materializing another column while the cache is warm must not
        # change what the cached lookup returns.
        materialize("events", "abc")
        self.assertCountEqual(get_materialized_columns("events", use_cache=True).keys(), warm_events)

    with freeze_time("2020-01-04T14:00:01Z"):
        # After the cache window has passed, the new column becomes visible.
        self.assertCountEqual(
            get_materialized_columns("events", use_cache=True).keys(),
            ["$foo", "$bar", "abc", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
        )
def get_property_string_expr(
    table: TableWithProperties,
    property_name: PropertyName,
    var: str,
    column: str,
    allow_denormalized_props: bool = True,
    table_alias: Optional[str] = None,
) -> Tuple[str, bool]:
    """
    Build the SQL expression that reads a property value.

    :param table: the full name of the table in the database. used to look up which properties have been materialized
    :param property_name: property to read
    :param var: the value to template in from the data structure for the query e.g. %(key)s
        or a flat value e.g. ["Safari"]. If a flat value it should be escaped before being passed to this function
    :param column: the table column where JSON is stored or the name of a materialized column
    :param allow_denormalized_props: when False, always fall back to JSON extraction
    :param table_alias: (optional) alias of the table being queried
    :return: (SQL expression, whether a materialized column was used)
    """
    prefix = "" if table_alias is None else f"{table_alias}."

    if allow_denormalized_props:
        materialized = get_materialized_columns(table)
        if property_name in materialized:
            # Quoted identifier: materialized column names may contain special characters.
            return f'{prefix}"{materialized[property_name]}"', True

    # Fallback: pull the raw JSON value and strip the surrounding quotes.
    return f"trim(BOTH '\"' FROM JSONExtractRaw({prefix}{column}, {var}))", False
def columns_to_query(
    self, table: TableWithProperties, used_properties: Set[PropertyIdentifier]
) -> Set[ColumnName]:
    "Transforms a list of property names to what columns are needed for that query"
    # Materialized properties map to their dedicated column; everything else
    # is served from the shared "properties" JSON column.
    materialized = get_materialized_columns(table)
    return {materialized.get(name, "properties") for name, _, _ in used_properties}
def test_materialized_column_naming(self):
    """Column names are derived from the property name with special characters
    replaced, colliding names get a random suffix (seeded here for
    determinism), and person-table columns use the "pmat_" prefix."""
    random.seed(0)

    materialize("events", "$foO();--sqlinject")
    materialize("events", "$foO();ääsqlinject")
    materialize("events", "$foO_____sqlinject")
    materialize("person", "SoMePrOp")

    expected_subset = {
        "$foO();--sqlinject": "mat_$foO_____sqlinject",
        "$foO();ääsqlinject": "mat_$foO_____sqlinject_yWAc",
        "$foO_____sqlinject": "mat_$foO_____sqlinject_qGFz",
    }
    events_columns = get_materialized_columns("events")
    # assertDictContainsSubset is deprecated (since Python 3.2; removed in
    # newer unittest) — compare the relevant slice of the actual mapping
    # against the expected one instead, which gives an equally clear diff.
    self.assertEqual({key: events_columns.get(key) for key in expected_subset}, expected_subset)

    self.assertEqual(get_materialized_columns("person"), {"SoMePrOp": "pmat_SoMePrOp"})
def get_property_string_expr(
    table: TableWithProperties,
    property_name: PropertyName,
    var: str,
    prop_var: str,
    allow_denormalized_props: bool = True,
) -> Tuple[str, bool]:
    """Return (SQL expression, used_materialized_column) for reading a property.

    When the property has a materialized column (and denormalized access is
    allowed), the column name itself is the expression; otherwise the value is
    extracted from the JSON stored in *prop_var* and its quotes stripped.
    """
    if allow_denormalized_props:
        materialized = get_materialized_columns(table)
        if property_name in materialized:
            return materialized[property_name], True
    return f"trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, {var}))", False
def materialize_properties_task(
    columns_to_materialize: Optional[List[Suggestion]] = None,
    time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS,
    maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE,
    min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME,
    backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS,
    dry_run: bool = False,
) -> None:
    """
    Creates materialized columns for event and person properties based off of slow queries
    """
    # No explicit suggestions given: derive them from recent slow queries.
    if columns_to_materialize is None:
        columns_to_materialize = analyze(get_queries(time_to_analyze_hours, min_query_time))

    # Drop suggestions whose property is already materialized on its table.
    new_suggestions = [
        suggestion
        for suggestion in columns_to_materialize
        if suggestion[1] not in get_materialized_columns(suggestion[0])
    ]

    if new_suggestions:
        logger.info(
            f"Calculated columns that could be materialized. count={len(new_suggestions)}"
        )
    else:
        logger.info("Found no columns to materialize.")

    # Track what actually gets materialized per table, for the backfill below.
    properties: Dict[TableWithProperties, List[PropertyName]] = {
        "events": [],
        "person": [],
    }
    for table, property_name, cost in new_suggestions[:maximum]:
        logger.info(
            f"Materializing column. table={table}, property_name={property_name}, cost={cost}"
        )
        if not dry_run:
            materialize(table, property_name)
        properties[table].append(property_name)

    if backfill_period_days > 0 and not dry_run:
        logger.info(
            f"Starting backfill for new materialized columns. period_days={backfill_period_days}"
        )
        backfill_window = timedelta(days=backfill_period_days)
        backfill_materialized_columns("events", properties["events"], backfill_window)
        backfill_materialized_columns("person", properties["person"], backfill_window)
def get_materialized_columns_with_default_expression():
    """Yield (table, property_name, column_name) for every materialized column
    (cache bypassed) whose column still uses the default expression."""
    for table in ("events", "person"):
        columns = get_materialized_columns(table, use_cache=False)
        for property_name, column_name in columns.items():
            if is_default_expression(table, column_name):
                yield table, property_name, column_name
def test_get_columns_default(self):
    """Out of the box, the events table reports exactly the group columns
    and the person table reports none."""
    events_columns = get_materialized_columns("events")
    self.assertCountEqual(events_columns, GROUPS_COLUMNS)
    person_columns = get_materialized_columns("person")
    self.assertCountEqual(person_columns, [])
def test_get_columns_default(self):
    """Defaults per table: events exposes the built-in materialized columns,
    person exposes none, and session_recording_events maps
    has_full_snapshot to a column of the same name."""
    per_table = {
        name: get_materialized_columns(name)
        for name in ("events", "person", "session_recording_events")
    }
    self.assertCountEqual(per_table["events"], EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS)
    self.assertCountEqual(per_table["person"], [])
    self.assertEqual(per_table["session_recording_events"], {"has_full_snapshot": "has_full_snapshot"})