def test_prop_person_denormalized(self):
    """Person-property filters should work when the person `email` property
    has been materialized and person tables are joined into the query."""
    _create_person(distinct_ids=["some_id"], team_id=self.team.pk, properties={"email": "*****@*****.**"})
    _create_event(event="$pageview", team=self.team, distinct_id="some_id")
    materialize("person", "email")

    # Renamed from `filter` to avoid shadowing the builtin.
    email_filter = Filter(
        data={
            "properties": [{
                "key": "email",
                "type": "person",
                "value": "posthog",
                "operator": "icontains"
            }],
        })
    self.assertEqual(len(self._run_query(email_filter, join_person_tables=True)), 1)

    email_filter = Filter(
        data={
            "properties": [{
                "key": "email",
                "type": "person",
                "value": "posthog",
                "operator": "not_icontains"
            }],
        })
    self.assertEqual(len(self._run_query(email_filter, join_person_tables=True)), 0)
def testdata(db, team):
    """Fixture data: three persons (distinct_ids "1"-"3") with differing
    email/$os/$browser properties; person `email` is materialized first."""
    materialize("person", "email")
    person_property_sets = [
        {"email": "*****@*****.**", "$os": "windows", "$browser": "chrome"},
        {"email": "*****@*****.**", "$os": "Mac", "$browser": "firefox"},
        {"email": "*****@*****.**", "$os": "windows", "$browser": "mozilla"},
    ]
    for index, props in enumerate(person_property_sets, start=1):
        _create_person(distinct_ids=[str(index)], team_id=team.pk, properties=props)
def test_events_columns_in_inconsistent_state(self):
    """The migration must cope with a half-migrated schema in which
    `mat_$session_id` was already renamed to a bare `$session_id` column."""
    materialize("events", "$session_id")
    materialize("events", "$window_id")
    # Simulate a partially-applied rename, then re-run the migration.
    sync_execute(
        "ALTER TABLE events RENAME COLUMN mat_$session_id TO $session_id")
    materialize_session_and_window_id(CLICKHOUSE_DATABASE)
    self.assert_desired_state()
def create_materialized_columns(database):
    """Ensure `$session_id` and `$window_id` exist as materialized columns on
    the events table, skipping each column that is already materialized.

    `database` is kept for interface compatibility; the column names are fixed.
    """
    # The two original try/except blocks were identical except for the column
    # name, so they are collapsed into one loop.
    for column_name in ("$session_id", "$window_id"):
        try:
            materialize("events", column_name, column_name)
        except ValueError:
            # Column is already materialized, skip
            pass
def test_create_missing_tables(self):
    """`create_missing_tables` should recreate a dropped sharded table using
    the captured schema, including materialized columns."""
    self.recreate_database(create_tables=True)
    materialize("events", "some_property")
    _, create_table_queries, _ = Command().analyze_cluster_tables()
    sync_execute("DROP TABLE sharded_events SYNC")

    self.assertIn("mat_some_property", create_table_queries["sharded_events"])
    Command().create_missing_tables({"test_host": {"sharded_events"}}, create_table_queries)

    recreated_schema = sync_execute("SHOW CREATE TABLE sharded_events")[0][0]
    self.assertIn("mat_some_property", recreated_schema)
def test_column_types(self):
    """The column should start MATERIALIZED, switch to DEFAULT during
    backfill, and return to MATERIALIZED once marked done — with the same
    extraction expression throughout."""
    materialize("events", "myprop")
    extract_expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), '^\"|\"$', '')"

    self.assertEqual(("MATERIALIZED", extract_expr), self._get_column_types("mat_myprop"))

    backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
    self.assertEqual(("DEFAULT", extract_expr), self._get_column_types("mat_myprop"))

    mark_all_materialized()
    self.assertEqual(("MATERIALIZED", extract_expr), self._get_column_types("mat_myprop"))
def test_denormalised_props(self):
    """The generated query should reference the materialized `mat_test_prop`
    column rather than JSON-extracting from `properties`."""
    filters = {
        "events": [
            {
                "id": "user signed up",
                "type": "events",
                "order": 0,
                "properties": [{"key": "test_prop", "value": "hi"}],
            },
        ],
        "date_from": "2020-01-01",
        "properties": [{"key": "test_prop", "value": "hi"}],
        "date_to": "2020-01-14",
    }
    materialize("events", "test_prop")

    # Two persons with different person properties, each with one matching event.
    for distinct_id, person_props in (("p1", {"key": "value"}), ("p2", {"key_2": "value_2"})):
        Person.objects.create(team_id=self.team.pk, distinct_ids=[distinct_id], properties=person_props)
        _create_event(
            team=self.team,
            event="$pageview",
            distinct_id=distinct_id,
            timestamp="2020-01-02T12:00:00Z",
            properties={"test_prop": "hi"},
        )

    _, query = self._run_query(Filter(data=filters))
    self.assertIn("mat_test_prop", query)
def test_caching_and_materializing(self):
    """`get_materialized_columns(use_cache=True)` serves a cached column list
    until the cache window elapses."""
    with freeze_time("2020-01-04T13:01:01Z"):
        materialize("events", "$foo")
        materialize("events", "$bar")
        materialize("person", "$zeta")

        self.assertCountEqual(
            get_materialized_columns("events", use_cache=True).keys(),
            ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
        )
        self.assertCountEqual(
            get_materialized_columns("person", use_cache=True).keys(), ["$zeta"])

        # A column added after the cache was primed is not visible yet...
        materialize("events", "abc")
        self.assertCountEqual(
            get_materialized_columns("events", use_cache=True).keys(),
            ["$foo", "$bar", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
        )

    # ...but shows up once enough (frozen) time has passed for the cache to expire.
    with freeze_time("2020-01-04T14:00:01Z"):
        self.assertCountEqual(
            get_materialized_columns("events", use_cache=True).keys(),
            ["$foo", "$bar", "abc", *EVENTS_TABLE_DEFAULT_MATERIALIZED_COLUMNS],
        )
def materialize_properties_task(
    columns_to_materialize: Optional[List[Suggestion]] = None,
    time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS,
    maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE,
    min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME,
    backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS,
    dry_run: bool = False,
) -> None:
    """
    Creates materialized columns for event and person properties based off of slow queries
    """
    if columns_to_materialize is None:
        columns_to_materialize = analyze(
            get_queries(time_to_analyze_hours, min_query_time))

    # Drop suggestions whose property already has a materialized column.
    candidates: List[Suggestion] = []
    for suggestion in columns_to_materialize:
        table, property_name, _ = suggestion
        if property_name not in get_materialized_columns(table):
            candidates.append(suggestion)

    if candidates:
        logger.info(
            f"Calculated columns that could be materialized. count={len(candidates)}"
        )
    else:
        logger.info("Found no columns to materialize.")

    # Track what was actually materialized per table so backfill can target it.
    materialized_by_table: Dict[TableWithProperties, List[PropertyName]] = {
        "events": [],
        "person": [],
    }
    for table, property_name, cost in candidates[:maximum]:
        logger.info(
            f"Materializing column. table={table}, property_name={property_name}, cost={cost}"
        )
        if not dry_run:
            materialize(table, property_name)
            materialized_by_table[table].append(property_name)

    if backfill_period_days > 0 and not dry_run:
        logger.info(
            f"Starting backfill for new materialized columns. period_days={backfill_period_days}"
        )
        backfill_window = timedelta(days=backfill_period_days)
        backfill_materialized_columns("events", materialized_by_table["events"], backfill_window)
        backfill_materialized_columns("person", materialized_by_table["person"], backfill_window)
def test_column_types(self):
    """Column type should be MATERIALIZED initially, DEFAULT while a backfill
    is in flight, then MATERIALIZED again after being marked done; the
    extraction expression never changes."""
    materialize("events", "myprop")

    # :KLUDGE: ClickHouse replaces our trim(BOTH '"' FROM properties) with this
    extract_expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), concat('^[', regexpQuoteMeta('\"'), ']*|[', regexpQuoteMeta('\"'), ']*$'), '')"
    self.assertEqual(("MATERIALIZED", extract_expr), self._get_column_types("events", "mat_myprop"))

    backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
    self.assertEqual(("DEFAULT", extract_expr), self._get_column_types("events", "mat_myprop"))

    mark_all_materialized()
    self.assertEqual(("MATERIALIZED", extract_expr), self._get_column_types("events", "mat_myprop"))
def test_prop_event_denormalized_ints(self):
    """Numeric operators (gt/lt/exact) should work against materialized
    event properties."""
    _create_event(
        event="$pageview",
        team=self.team,
        distinct_id="whatever",
        properties={"test_prop": 0},
    )
    _create_event(
        event="$pageview",
        team=self.team,
        distinct_id="whatever",
        properties={"test_prop": 2},
    )
    materialize("events", "test_prop")
    materialize("events", "something_else")

    # Renamed from `filter` to avoid shadowing the builtin.
    prop_filter = Filter(data={
        "properties": [{
            "key": "test_prop",
            "value": 1,
            "operator": "gt"
        }],
    })
    self.assertEqual(len(self._run_query(prop_filter)), 1)

    prop_filter = Filter(data={
        "properties": [{
            "key": "test_prop",
            "value": 1,
            "operator": "lt"
        }],
    })
    self.assertEqual(len(self._run_query(prop_filter)), 1)

    prop_filter = Filter(data={
        "properties": [{
            "key": "test_prop",
            "value": 0
        }],
    })
    self.assertEqual(len(self._run_query(prop_filter)), 1)
def test_prop_filter_json_extract_materialized(test_events, property, expected_event_indexes, team):
    """With denormalized props allowed, the generated clause must not use
    JSONExtract and must still select exactly the expected events."""
    materialize("events", "attr")
    materialize("events", "email")
    query, params = prop_filter_json_extract(property, 0, allow_denormalized_props=True)

    assert "JSONExtract" not in query

    # sorted() already returns a list, so the list(sorted([...])) wrappers
    # in the original were redundant.
    uuids = sorted(
        uuid
        for (uuid, ) in sync_execute(
            f"SELECT uuid FROM events WHERE team_id = %(team_id)s {query}",
            {
                "team_id": team.pk,
                **params
            },
        )
    )
    expected = sorted(test_events[index] for index in expected_event_indexes)

    assert uuids == expected
def test_materialized_columns_checks(self):
    """The optimizer should switch from the raw `properties` column to the
    materialized columns once those columns exist."""

    # PEP 8 (E731): use def instead of assigning lambdas to names.
    def optimizer():
        return EnterpriseColumnOptimizer(FILTER_WITH_PROPERTIES, self.team.id)

    def optimizer_groups():
        return EnterpriseColumnOptimizer(FILTER_WITH_GROUPS, self.team.id)

    # Before materialization: raw properties column is queried.
    self.assertEqual(optimizer().event_columns_to_query, {"properties"})
    self.assertEqual(optimizer().person_columns_to_query, {"properties"})
    self.assertEqual(optimizer_groups().event_columns_to_query, {"properties"})
    self.assertEqual(optimizer_groups().person_columns_to_query, {"properties"})

    materialize("events", "event_prop")
    materialize("person", "person_prop")

    # After materialization: the mat_/pmat_ columns are queried instead.
    self.assertEqual(optimizer().event_columns_to_query, {"mat_event_prop"})
    self.assertEqual(optimizer().person_columns_to_query, {"pmat_person_prop"})
    self.assertEqual(optimizer_groups().event_columns_to_query, {"mat_event_prop"})
    self.assertEqual(optimizer_groups().person_columns_to_query, {"pmat_person_prop"})
def test_materialized_column_naming(self):
    """Unsafe characters in property names are sanitized to underscores and
    collisions get a suffix (random is seeded for deterministic suffixes)."""
    random.seed(0)

    materialize("events", "$foO();--sqlinject")
    materialize("events", "$foO();ääsqlinject")
    materialize("events", "$foO_____sqlinject")
    materialize("person", "SoMePrOp")

    expected_subset = {
        "$foO();--sqlinject": "mat_$foO_____sqlinject",
        "$foO();ääsqlinject": "mat_$foO_____sqlinject_yWAc",
        "$foO_____sqlinject": "mat_$foO_____sqlinject_qGFz",
    }
    # assertDictContainsSubset is deprecated (removed in Python 3.12);
    # a dict-items subset comparison is the supported equivalent.
    self.assertLessEqual(expected_subset.items(), get_materialized_columns("events").items())

    self.assertEqual(get_materialized_columns("person"), {"SoMePrOp": "pmat_SoMePrOp"})
def test_columns_already_materialized_prior_to_migration(self):
    """Running the migration when both columns were materialized beforehand
    must still end in the desired state."""
    materialize("events", "$session_id")
    materialize("events", "$window_id")
    materialize_session_and_window_id(CLICKHOUSE_DATABASE)
    self.assert_desired_state()
def test_prop_event_denormalized(self):
    """String operators (exact, is_not, is_set, is_not_set, icontains,
    not_icontains) should all work against materialized event properties."""
    _create_event(
        event="$pageview",
        team=self.team,
        distinct_id="whatever",
        properties={"test_prop": "some_other_val"},
    )
    _create_event(
        event="$pageview",
        team=self.team,
        distinct_id="whatever",
        properties={"test_prop": "some_val"},
    )
    materialize("events", "test_prop")
    materialize("events", "something_else")

    # Table-driven cases; the local is named `prop_filter` rather than
    # `filter` to avoid shadowing the builtin.
    cases = [
        ({"key": "test_prop", "value": "some_val"}, 1),
        ({"key": "test_prop", "value": "some_val", "operator": "is_not"}, 1),
        ({"key": "test_prop", "value": "some_val", "operator": "is_set"}, 2),
        ({"key": "test_prop", "value": "some_val", "operator": "is_not_set"}, 0),
        ({"key": "test_prop", "value": "_other_", "operator": "icontains"}, 1),
        ({"key": "test_prop", "value": "_other_", "operator": "not_icontains"}, 1),
    ]
    for prop, expected_count in cases:
        prop_filter = Filter(data={"properties": [prop]})
        self.assertEqual(len(self._run_query(prop_filter)), expected_count)
def test_backfilling_data(self):
    """Backfill should populate materialized columns for rows inserted before
    materialization, while later inserts are materialized on write."""
    sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_prop")
    sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_another")

    # Six events inserted before the columns are materialized.
    _create_event(event="some_event", distinct_id="1", team=self.team,
                  timestamp="2020-01-01 00:00:00", properties={"prop": 1})
    _create_event(event="some_event", distinct_id="1", team=self.team,
                  timestamp="2021-05-02 00:00:00",
                  properties={"prop": 2, "another": 5})
    _create_event(event="some_event", distinct_id="1", team=self.team,
                  timestamp="2021-05-03 00:00:00", properties={"prop": 3})
    _create_event(event="another_event", distinct_id="1", team=self.team,
                  timestamp="2021-05-04 00:00:00")
    _create_event(event="third_event", distinct_id="1", team=self.team,
                  timestamp="2021-05-05 00:00:00", properties={"prop": 4})
    _create_event(event="fourth_event", distinct_id="1", team=self.team,
                  timestamp="2021-05-06 00:00:00", properties={"another": 6})

    materialize("events", "prop")
    materialize("events", "another")

    # Materializing alone leaves pre-existing rows empty.
    self.assertEqual(self._count_materialized_rows("mat_prop"), 0)
    self.assertEqual(self._count_materialized_rows("mat_another"), 0)

    with freeze_time("2021-05-10T14:00:01Z"):
        backfill_materialized_columns(
            "events", ["prop", "another"],
            timedelta(days=50),
            test_settings={"mutations_sync": "0"})

    # Inserted after materialization — populated on write, no backfill needed.
    _create_event(event="fifth_event", distinct_id="1", team=self.team,
                  timestamp="2021-05-07 00:00:00", properties={"another": 7})

    # Poll (up to ~10s) for the async backfill mutations to finish.
    attempts = 0
    while self._get_count_of_mutations_running() > 0 and attempts < 100:
        sleep(0.1)
        attempts += 1

    self.assertGreaterEqual(self._count_materialized_rows("mat_prop"), 4)
    self.assertGreaterEqual(self._count_materialized_rows("mat_another"), 4)

    self.assertEqual(
        sync_execute(
            "SELECT mat_prop, mat_another FROM events ORDER BY timestamp"),
        [("1", ""), ("2", "5"), ("3", ""), ("", ""), ("4", ""), ("", "6"),
         ("", "7")],
    )