Beispiel #1
0
    def test_caching_and_materializing(self):
        # Columns materialized before the first cached lookup must be visible
        # through use_cache=True; a column materialized afterwards is expected
        # to stay invisible at the same frozen time and to show up once the
        # clock is frozen about an hour later.

        def cached_column_names(table):
            return get_materialized_columns(table, use_cache=True).keys()

        with freeze_time("2020-01-04T13:01:01Z"):
            for table, property_name in (
                ("events", "$foo"),
                ("events", "$bar"),
                ("person", "$zeta"),
            ):
                materialize(table, property_name)

            event_names = cached_column_names("events")
            self.assertCountEqual(
                event_names,
                ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
            )
            person_names = cached_column_names("person")
            self.assertCountEqual(person_names, ["$zeta"])

            materialize("events", "abc")

            # "abc" is deliberately absent from the expectation here.
            event_names = cached_column_names("events")
            self.assertCountEqual(
                event_names,
                ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
            )

        with freeze_time("2020-01-04T14:00:01Z"):
            # Later frozen time: the cached lookup is now expected to
            # include "abc".
            event_names = cached_column_names("events")
            self.assertCountEqual(
                event_names,
                ["$foo", "$bar", "abc", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
            )
Beispiel #2
0
def get_property_string_expr(
    table: TableWithProperties,
    property_name: PropertyName,
    var: str,
    column: str,
    allow_denormalized_props: bool = True,
    table_alias: Optional[str] = None,
) -> Tuple[str, bool]:
    """
    Build the SQL expression used to read a property value.

    :param table:
        the full name of the table in the database. used to look up which properties have been materialized
    :param property_name:
        the property to read; also the key used for the materialized column lookup
    :param var:
        the value to template in from the data structure for the query e.g. %(key)s or a flat value e.g. ["Safari"].
        If a flat value it should be escaped before being passed to this function
    :param column:
        the table column where JSON is stored or the name of a materialized column
    :param allow_denormalized_props:
        when False, materialized columns are never looked up and the JSON
        extraction expression is always returned
    :param table_alias:
        (optional) alias of the table being queried; prepended as ``alias.``
    :return:
        ``(expression, is_materialized)`` where ``is_materialized`` is True
        only when the expression refers to a materialized column
    """
    prefix = "" if table_alias is None else f"{table_alias}."

    if allow_denormalized_props:
        materialized_columns = get_materialized_columns(table)
        if property_name in materialized_columns:
            materialized_name = materialized_columns[property_name]
            return f'{prefix}"{materialized_name}"', True

    # No usable materialized column: extract from the JSON column.
    json_expr = f"trim(BOTH '\"' FROM JSONExtractRaw({prefix}{column}, {var}))"
    return json_expr, False
Beispiel #3
0
    def columns_to_query(
            self, table: TableWithProperties,
            used_properties: Set[PropertyIdentifier]) -> Set[ColumnName]:
        """Map the used properties to the set of columns a query must read.

        Each property resolves to its materialized column when
        ``get_materialized_columns(table)`` has an entry for it, and to the
        ``"properties"`` column otherwise.
        """
        column_for = get_materialized_columns(table).get
        return {
            column_for(property_name, "properties")
            for property_name, _, _ in used_properties
        }
Beispiel #4
0
    def test_materialized_column_naming(self):
        # Seed the RNG so the expected column names below are reproducible
        # (presumably the name suffixes are drawn from `random` -- confirm
        # against `materialize`).
        random.seed(0)

        materialize("events", "$foO();--sqlinject")
        materialize("events", "$foO();ääsqlinject")
        materialize("events", "$foO_____sqlinject")
        materialize("person", "SoMePrOp")

        expected_event_columns = {
            "$foO();--sqlinject": "mat_$foO_____sqlinject",
            "$foO();ääsqlinject": "mat_$foO_____sqlinject_yWAc",
            "$foO_____sqlinject": "mat_$foO_____sqlinject_qGFz",
        }
        event_columns = get_materialized_columns("events")
        # assertDictContainsSubset is deprecated since Python 3.2. Comparing
        # the actual dict with itself overlaid by the expected entries passes
        # exactly when every expected key is present with an equal value.
        self.assertEqual(
            event_columns,
            {**event_columns, **expected_event_columns},
        )

        self.assertEqual(get_materialized_columns("person"),
                         {"SoMePrOp": "pmat_SoMePrOp"})
Beispiel #5
0
def get_property_string_expr(
    table: TableWithProperties,
    property_name: PropertyName,
    var: str,
    prop_var: str,
    allow_denormalized_props: bool = True,
) -> Tuple[str, bool]:
    """
    Build the SQL expression used to read a property value.

    :param table: table whose materialized columns are looked up
    :param property_name: property to read; key for the materialized lookup
    :param var: value templated in as the second JSONExtractRaw argument
    :param prop_var: expression templated in as the first JSONExtractRaw argument
    :param allow_denormalized_props:
        when False, materialized columns are never looked up
    :return:
        ``(expression, is_materialized)``; the flag is True only when the
        materialized column name is returned
    """
    if allow_denormalized_props:
        materialized_columns = get_materialized_columns(table)
        if property_name in materialized_columns:
            return materialized_columns[property_name], True

    # No usable materialized column: extract from the JSON expression.
    json_expr = f"trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, {var}))"
    return json_expr, False
Beispiel #6
0
def materialize_properties_task(
    columns_to_materialize: Optional[List[Suggestion]] = None,
    time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS,
    maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE,
    min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME,
    backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS,
    dry_run: bool = False,
) -> None:
    """
    Creates materialized columns for event and person properties based off of slow queries.

    When ``columns_to_materialize`` is None the suggestions come from
    ``analyze(get_queries(time_to_analyze_hours, min_query_time))``.
    Suggestions whose property is already materialized are dropped, at most
    ``maximum`` of the rest are materialized, and the new columns are then
    backfilled over ``backfill_period_days`` days. With ``dry_run`` set,
    nothing is materialized or backfilled; the steps are only logged.
    """
    if columns_to_materialize is None:
        slow_queries = get_queries(time_to_analyze_hours, min_query_time)
        columns_to_materialize = analyze(slow_queries)

    def _not_yet_materialized(suggestion) -> bool:
        table, property_name, _ = suggestion
        return property_name not in get_materialized_columns(table)

    pending = [
        suggestion for suggestion in columns_to_materialize
        if _not_yet_materialized(suggestion)
    ]

    if pending:
        logger.info(
            f"Calculated columns that could be materialized. count={len(pending)}"
        )
    else:
        logger.info("Found no columns to materialize.")

    # Property names grouped per table, for the backfill step below.
    properties: Dict[TableWithProperties, List[PropertyName]] = {
        "events": [],
        "person": [],
    }
    for table, property_name, cost in pending[:maximum]:
        logger.info(
            f"Materializing column. table={table}, property_name={property_name}, cost={cost}"
        )

        if not dry_run:
            materialize(table, property_name)
        properties[table].append(property_name)

    if dry_run or not backfill_period_days > 0:
        return

    logger.info(
        f"Starting backfill for new materialized columns. period_days={backfill_period_days}"
    )
    backfill_period = timedelta(days=backfill_period_days)
    backfill_materialized_columns("events", properties["events"],
                                  backfill_period)
    backfill_materialized_columns("person", properties["person"],
                                  backfill_period)
Beispiel #7
0
def get_materialized_columns_with_default_expression():
    """Yield ``(table, property_name, column_name)`` for default-expression columns.

    Walks the materialized columns of the ``events`` and ``person`` tables
    (looked up with ``use_cache=False``) and yields those for which
    ``is_default_expression(table, column_name)`` is truthy.
    """
    for table in ("events", "person"):
        uncached_columns = get_materialized_columns(table, use_cache=False)
        yield from (
            (table, property_name, column_name)
            for property_name, column_name in uncached_columns.items()
            if is_default_expression(table, column_name)
        )
Beispiel #8
0
 def test_get_columns_default(self):
     # With nothing materialized by the test itself, "events" is expected to
     # expose exactly GROUPS_COLUMNS and "person" nothing.
     events_columns = get_materialized_columns("events")
     self.assertCountEqual(events_columns, GROUPS_COLUMNS)

     person_columns = get_materialized_columns("person")
     self.assertCountEqual(person_columns, [])
Beispiel #9
0
 def test_get_columns_default(self):
     # With nothing materialized by the test itself, each table is expected
     # to expose only its default materialized columns.
     events_columns = get_materialized_columns("events")
     self.assertCountEqual(events_columns,
                           EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS)

     person_columns = get_materialized_columns("person")
     self.assertCountEqual(person_columns, [])

     recording_columns = get_materialized_columns("session_recording_events")
     self.assertEqual(recording_columns,
                      {"has_full_snapshot": "has_full_snapshot"})