def test_person_cohort_properties(self):
    """Cohort filters built from person properties select the expected distinct_id.

    Two persons and two cohorts are created: ``cohort1`` matches
    ``$some_prop == "something"`` and ``cohort2`` uses the ``is_not`` operator
    on the same key/value. Each cohort is turned into a property clause and
    run against ``person_distinct_id``; the first column of the first row
    must be the distinct_id of the corresponding person.
    """
    person1_distinct_id = "person1"
    Person.objects.create(
        team=self.team, distinct_ids=[person1_distinct_id], properties={"$some_prop": "something"}
    )
    cohort1 = Cohort.objects.create(
        team=self.team,
        groups=[{"properties": [{"type": "person", "key": "$some_prop", "value": "something"}]}],
        name="cohort1",
    )

    person2_distinct_id = "person2"
    Person.objects.create(
        team=self.team, distinct_ids=[person2_distinct_id], properties={"$some_prop": "different"}
    )
    cohort2 = Cohort.objects.create(
        team=self.team,
        groups=[
            {"properties": [{"type": "person", "key": "$some_prop", "value": "something", "operator": "is_not"}]}
        ],
        name="cohort2",
    )

    query_template = """ SELECT distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s {prop_clause} """

    # cohort1 is the plain match, cohort2 is the negated ("is_not") variant.
    expectations = ((cohort1, person1_distinct_id), (cohort2, person2_distinct_id))
    for cohort, expected_distinct_id in expectations:
        cohort_filter = Filter(
            data={"properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}]}, team=self.team
        )
        clause, clause_params = parse_prop_grouped_clauses(
            property_group=cohort_filter.property_groups, has_person_id_joined=False, team_id=self.team.pk
        )
        rows = sync_execute(
            query_template.format(prop_clause=clause), {"team_id": self.team.pk, **clause_params}
        )
        # distinct_id column of the first returned row
        self.assertEqual(rows[0][0], expected_distinct_id)
def test_prop_cohort_basic_action_days(self):
    """Action-based cohorts honour their ``days`` window.

    Two ``$pageview`` events are created, one on 2020-01-09 and one on
    2020-01-05. With time frozen at 2020-01-10, a cohort with ``days: 1``
    is expected to yield one event row and a cohort with ``days: 7`` two.
    """
    _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"})
    _create_person(
        distinct_ids=["some_id"],
        team_id=self.team.pk,
        properties={"$some_prop": "something", "$another_prop": "something"},
    )

    action = _create_action(team=self.team, name="$pageview")

    _create_event(
        event="$pageview",
        team=self.team,
        distinct_id="some_id",
        properties={"attr": "some_val"},
        timestamp=datetime(2020, 1, 9, 12, 0, 1),
    )
    _create_event(
        event="$pageview",
        team=self.team,
        distinct_id="some_other_id",
        properties={"attr": "some_val"},
        timestamp=datetime(2020, 1, 5, 12, 0, 1),
    )

    # (days window, cohort name, expected number of matching events)
    scenarios = ((1, "cohort1", 1), (7, "cohort2", 2))

    with freeze_time("2020-01-10"):
        for days, cohort_name, expected_count in scenarios:
            cohort = Cohort.objects.create(
                team=self.team, groups=[{"action_id": action.pk, "days": days}], name=cohort_name,
            )
            cohort_filter = Filter(
                data={"properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}]}, team=self.team
            )
            clause, clause_params = parse_prop_grouped_clauses(
                team_id=self.team.pk, property_group=cohort_filter.property_groups
            )
            final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(clause)
            rows = sync_execute(final_query, {**clause_params, "team_id": self.team.pk})
            self.assertEqual(len(rows), expected_count)
def format_action_filter(
    team_id: int,
    action: Action,
    prepend: str = "action",
    use_loop: bool = False,
    filter_by_team=True,
    table_name: str = "",
    person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY,
) -> Tuple[str, Dict]:
    """Build the SQL condition and query parameters that match an action's steps.

    Conditions belonging to one step are joined with ``AND``; the per-step
    clauses are then joined with ``OR``.

    Args:
        team_id: Forwarded to ``parse_prop_grouped_clauses`` for step properties.
        action: Action whose ``steps`` are translated.
        prepend: Suffix used when building per-step parameter prefixes.
        use_loop: When true, return a full ``SELECT uuid FROM events ...``
            statement chaining steps through ``uuid IN (...)`` subqueries
            instead of a parenthesised boolean expression.
        filter_by_team: When truthy, ``team_id`` (taken from ``action.team``)
            is included in the returned parameters.
        table_name: Forwarded to ``filter_event`` and the property parser.
        person_properties_mode: Forwarded to the property parser.

    Returns:
        ``(sql, params)``. For an action without steps this is ``("1=2", {})``
        so that the condition never matches.
    """
    # NOTE(review): the use_loop SQL references %(team_id)s, but the parameter is
    # only added here when filter_by_team is truthy - confirm callers supply it.
    params = {"team_id": action.team.pk} if filter_by_team else {}

    steps = action.steps.all()
    if not steps:
        # If no steps, it shouldn't match this part of the query
        return "1=2", {}

    step_clauses = []
    for index, step in enumerate(steps):
        step_prefix = f"{action.pk}_{index}{prepend}"
        conditions: List[str] = []

        # element filter (autocapture steps only)
        if step.event == AUTOCAPTURE_EVENT:
            from ee.clickhouse.models.property import filter_element  # prevent circular import

            el_condition, element_params = filter_element(model_to_dict(step), prepend=step_prefix)
            params.update(element_params)
            if el_condition:
                conditions.append(el_condition)

        # event conditions (ie URL)
        event_conditions, event_params = filter_event(step, step_prefix, index, table_name)
        params.update(event_params)
        conditions.extend(event_conditions)

        if step.properties:
            from ee.clickhouse.models.property import parse_prop_grouped_clauses

            step_property_group = Filter(data={"properties": step.properties}).property_groups
            prop_query, prop_params = parse_prop_grouped_clauses(
                team_id=team_id,
                property_group=step_property_group,
                prepend=f"action_props_{action.pk}_{step.pk}",
                table_name=table_name,
                person_properties_mode=person_properties_mode,
            )
            # Drops the first "AND" of the generated clause - presumably its
            # leading connector; TODO confirm against parse_prop_grouped_clauses.
            conditions.append(prop_query.replace("AND", "", 1))
            params.update(prop_params)

        if conditions:
            step_clauses.append(" AND ".join(conditions))

    if use_loop:
        loop_separator = ") OR uuid IN (SELECT uuid FROM events WHERE team_id = %(team_id)s AND "
        formatted_query = "SELECT uuid FROM events WHERE {} AND team_id = %(team_id)s".format(
            loop_separator.join(step_clauses)
        )
    else:
        formatted_query = "(({}))".format(") OR (".join(step_clauses))

    return formatted_query, params
def _format_all_query(team_id: int, filter: Filter, **kwargs) -> Tuple[str, Dict]:
    """Build the "all users" cohort query over the ``events`` table.

    Args:
        team_id: Team whose events are selected; interpolated into the SQL text.
        filter: Supplies the date range and the base property groups.
        **kwargs: An optional ``entity`` entry; when it is an ``Entity`` its
            property groups are AND-combined with the filter's.

    Returns:
        ``(query, params)`` where params merge the date and property parameters.
    """
    entity = kwargs.pop("entity", None)

    parsed_date_from, parsed_date_to, date_params = parse_timestamps(
        filter=filter, team_id=team_id, table="all_events."
    )

    property_group = filter.property_groups
    if entity and isinstance(entity, Entity):
        property_group = property_group.combine_property_group(PropertyOperatorType.AND, entity.property_groups)

    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=team_id, property_group=property_group, prepend="all_cohort_", table_name="all_events",
    )

    query = f""" SELECT DISTINCT distinct_id, {ALL_USERS_COHORT_ID} as value FROM events all_events WHERE team_id = {team_id} {parsed_date_from} {parsed_date_to} {prop_filters} """

    merged_params = dict(date_params)
    merged_params.update(prop_filter_params)
    return query, merged_params
def stats(self, request: request.Request, **kwargs) -> response.Response:
    """Return element statistics for the request's filter.

    Runs ``GET_ELEMENTS`` with the date range and property clause derived
    from the request and responds with one entry per result row containing
    the row's count (second column) and the serialized elements parsed from
    its chain (first column). ``hash`` is always ``None``.
    """
    filter = Filter(request=request, team=self.team)

    date_from, date_to, date_params = parse_timestamps(filter, team_id=self.team.pk)
    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=self.team.pk, property_group=filter.property_groups
    )

    elements_sql = GET_ELEMENTS.format(date_from=date_from, date_to=date_to, query=prop_filters)
    query_params = {"team_id": self.team.pk, **prop_filter_params, **date_params}
    rows = sync_execute(elements_sql, query_params)

    payload = []
    for row in rows:
        serialized = [ElementSerializer(element).data for element in chain_to_elements(row[0])]
        payload.append({"count": row[1], "hash": None, "elements": serialized})

    return response.Response(payload)
def test_prop_cohort_basic(self):
    """A cohort defined by two person properties yields exactly one event row.

    Only the person with distinct_id ``some_id`` carries both ``$some_prop``
    and ``$another_prop``; each of the two persons with properties has one
    ``$pageview`` event.
    """
    _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"})
    _create_person(
        distinct_ids=["some_id"],
        team_id=self.team.pk,
        properties={"$some_prop": "something", "$another_prop": "something"},
    )
    _create_person(distinct_ids=["no_match"], team_id=self.team.pk)

    _create_event(
        event="$pageview", team=self.team, distinct_id="some_id", properties={"attr": "some_val"},
    )
    _create_event(
        event="$pageview", team=self.team, distinct_id="some_other_id", properties={"attr": "some_val"},
    )

    cohort = Cohort.objects.create(
        team=self.team,
        groups=[{"properties": {"$some_prop": "something", "$another_prop": "something"}}],
        name="cohort1",
    )

    cohort_filter = Filter(data={"properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}]})
    clause, clause_params = parse_prop_grouped_clauses(
        team_id=self.team.pk, property_group=cohort_filter.property_groups
    )
    final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(clause)

    rows = sync_execute(final_query, {**clause_params, "team_id": self.team.pk})
    self.assertEqual(len(rows), 1)
def test_prop_cohort_with_negation(self):
    """A negated (``is_not``) person-property cohort returns no event rows here.

    Also asserts that the generated SQL reads from ``person_distinct_id2``.
    One person is created in this team and one in a second, freshly
    bootstrapped team.
    """
    other_team = Organization.objects.bootstrap(None)[2]

    _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"})
    _create_person(distinct_ids=["some_id"], team_id=other_team.pk, properties={"$another_prop": "something"})

    _create_event(
        event="$pageview", team=self.team, distinct_id="some_id", properties={"attr": "some_val"},
    )
    _create_event(
        event="$pageview", team=self.team, distinct_id="some_other_id", properties={"attr": "some_val"},
    )

    negated_cohort = Cohort.objects.create(
        team=self.team,
        groups=[
            {"properties": [{"type": "person", "key": "$some_prop", "operator": "is_not", "value": "something"}]}
        ],
        name="cohort1",
    )

    cohort_filter = Filter(
        data={"properties": [{"key": "id", "value": negated_cohort.pk, "type": "cohort"}]}, team=self.team
    )
    clause, clause_params = parse_prop_grouped_clauses(
        team_id=self.team.pk, property_group=cohort_filter.property_groups
    )
    final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(clause)
    self.assertIn("\nFROM person_distinct_id2\n", final_query)

    rows = sync_execute(final_query, {**clause_params, "team_id": self.team.pk})
    self.assertEqual(len(rows), 0)
def get_person_ids_by_cohort_id(team: Team, cohort_id: int, limit: Optional[int] = None, offset: Optional[int] = None):
    """Return the ids of persons in a cohort, as strings.

    Args:
        team: Team that owns the cohort.
        cohort_id: Primary key of the cohort to resolve.
        limit: When truthy, adds ``ORDER BY _timestamp ASC LIMIT`` to the query.
        offset: When truthy, adds an ``OFFSET`` to the query.

    Returns:
        A list with ``str()`` of the first column of every result row.
    """
    from ee.clickhouse.models.property import parse_prop_grouped_clauses

    cohort_filter = Filter(data={"properties": [{"key": "id", "value": cohort_id, "type": "cohort"}]})
    filter_query, filter_params = parse_prop_grouped_clauses(
        team_id=team.pk, property_group=cohort_filter.property_groups, table_name="pdi"
    )

    # Pagination fragments are only emitted for truthy values (so 0 and None add nothing).
    offset_clause = ""
    if offset:
        offset_clause = "OFFSET %(offset)s"
    limit_clause = ""
    if limit:
        limit_clause = "ORDER BY _timestamp ASC LIMIT %(limit)s"

    person_ids_sql = GET_PERSON_IDS_BY_FILTER.format(
        distinct_query=filter_query,
        query="",
        GET_TEAM_PERSON_DISTINCT_IDS=get_team_distinct_ids_query(team.pk),
        offset=offset_clause,
        limit=limit_clause,
    )
    query_params = {**filter_params, "team_id": team.pk, "offset": offset, "limit": limit}

    rows = sync_execute(person_ids_sql, query_params)
    return [str(row[0]) for row in rows]
def _get_search_clause(self) -> Tuple[str, Dict]:
    """Build the SQL fragment used for free-text person search.

    The search term matches when either the person's ``email`` property
    contains it (``icontains``) or a row in ``person_distinct_id`` has it
    as ``distinct_id``.

    Returns:
        ``(clause, params)``; ``("", {})`` when ``self._filter`` is not a
        ``Filter`` or has no search term.
    """
    if not isinstance(self._filter, Filter):
        return "", {}

    search_term = self._filter.search
    if not search_term:
        return "", {}

    email_match_group = PropertyGroup(
        type=PropertyOperatorType.AND,
        values=[Property(key="email", operator="icontains", value=search_term, type="person")],
    )
    search_clause, params = parse_prop_grouped_clauses(
        self._team_id,
        email_match_group,
        prepend="search",
        has_person_id_joined=False,
        group_properties_joined=False,
        person_properties_mode=PersonPropertiesMode.DIRECT,
        _top_level=False,
    )

    distinct_id_clause = """ id IN ( SELECT person_id FROM person_distinct_id where distinct_id = %(distinct_id)s ) """
    params["distinct_id"] = search_term

    return f"AND (({search_clause}) OR ({distinct_id_clause}))", params
def _get_person_filters(self) -> Tuple[str, Dict]:
    """Return the property clause and parameters for ``self._inner_person_properties``.

    The clause is generated in ``PersonPropertiesMode.DIRECT`` with neither
    the person id nor group properties marked as joined.
    """
    person_filters = parse_prop_grouped_clauses(
        self._team_id,
        self._inner_person_properties,
        has_person_id_joined=False,
        group_properties_joined=False,
        person_properties_mode=PersonPropertiesMode.DIRECT,
    )
    return person_filters
def _build_filters(self, entity: Entity, index: int) -> str:
    """Return the property clause for ``entity`` and record its parameters.

    Args:
        entity: Entity whose property groups are translated.
        index: Used (as a string) to prefix the generated parameter names.

    Returns:
        The SQL clause; its parameters are merged into ``self.params``.
    """
    clause, clause_params = parse_prop_grouped_clauses(
        team_id=self._team.pk,
        property_group=entity.property_groups,
        prepend=str(index),
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
        person_id_joined_alias="aggregation_target",
    )
    self.params.update(clause_params)
    return clause
def _filter_persons(filter: Filter, team: Team):
    """Return the ids (as strings) of persons in ``team`` matching ``filter``.

    Queries the ``person`` table directly with a clause generated against
    the person properties column.
    """
    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        property_group=filter.property_groups,
        team_id=team.pk,
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
    )
    query_params = {"team_id": team.pk, **prop_filter_params}

    # Note this query does not handle person rows changing over time
    rows = sync_execute(
        f"SELECT id, properties AS person_props FROM person WHERE team_id = %(team_id)s {prop_filters}",
        query_params,
    )

    person_ids = []
    for person_id, _person_props in rows:
        person_ids.append(str(person_id))
    return person_ids
def _filter_events(filter: Filter, team: Team, order_by: Optional[str] = None):
    """Return serialized events of ``team`` matching ``filter``.

    Args:
        filter: Supplies the property groups used to restrict the events.
        team: Team whose events are queried.
        order_by: Optional ordering expression placed after ``ORDER BY``.

    Returns:
        The ``.data`` of ``ClickhouseEventSerializer`` for the matching rows.
    """
    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        property_group=filter.property_groups, team_id=team.pk
    )

    # NOTE(review): order_by is interpolated into the SQL text as-is - confirm
    # callers never pass untrusted input here.
    order_clause = f"ORDER BY {order_by}" if order_by else ""
    events_sql = GET_EVENTS_WITH_PROPERTIES.format(filters=prop_filters, order_by=order_clause)

    query_params = {"team_id": team.pk, **prop_filter_params}
    rows = sync_execute(events_sql, query_params)

    serializer = ClickhouseEventSerializer(rows, many=True, context={"elements": None, "people": None})
    return serializer.data
def _get_prop_groups(self, prop_group: Optional[PropertyGroup]) -> Tuple[str, Dict]:
    """Return the clause and parameters for the "outer" part of ``prop_group``.

    The group is first split by the column optimizer's property optimizer;
    only its ``outer`` portion is translated here, against the event table
    alias and with denormalized properties allowed.

    Returns:
        ``(clause, params)``, or ``("", {})`` when ``prop_group`` is falsy.
    """
    if not prop_group:
        return "", {}

    property_optimizer = self._column_optimizer.property_optimizer
    outer_properties = property_optimizer.parse_property_groups(prop_group).outer

    person_id_alias = f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id"
    return parse_prop_grouped_clauses(
        team_id=self._team_id,
        property_group=outer_properties,
        prepend="global",
        table_name=self.EVENT_TABLE_ALIAS,
        allow_denormalized_props=True,
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
        person_id_joined_alias=person_id_alias,
    )
def _query_events_list(
    self, filter: Filter, team: Team, request: request.Request, long_date_from: bool = False, limit: int = 100
) -> List:
    """Query the events list for ``team`` using the request's parameters.

    Args:
        filter: Supplies the property groups used to restrict events.
        team: Team whose events are queried.
        request: Its GET parameters override the default ``after``/``before``
            window and may carry an ``action_id``.
        long_date_from: Forwarded to ``determine_event_conditions``.
        limit: Requested page size; the query asks for one row more.

    Returns:
        The rows from ``sync_execute``, or ``[]`` when the requested action
        does not exist for this team or has no steps.
    """
    fetch_limit = limit + 1  # one extra row beyond the requested limit
    limit_sql = "LIMIT %(limit)s"

    # NOTE(review): ordering is read from self.request while everything else uses
    # the `request` argument - confirm they are always the same object.
    order = "DESC" if self._parse_order_by(self.request)[0] == "-timestamp" else "ASC"

    # Default window is the last day up to 5 seconds ahead; GET params win.
    condition_source = {
        "after": (now() - timedelta(days=1)).isoformat(),
        "before": (now() + timedelta(seconds=5)).isoformat(),
        **request.GET.dict(),
    }
    conditions, condition_params = determine_event_conditions(team, condition_source, long_date_from)

    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=team.pk, property_group=filter.property_groups, has_person_id_joined=False
    )

    if request.GET.get("action_id"):
        try:
            action = Action.objects.get(pk=request.GET["action_id"], team_id=team.pk)
        except Action.DoesNotExist:
            return []
        if action.steps.count() == 0:
            return []

        action_query, action_params = format_action_filter(team_id=team.pk, action=action)
        prop_filters += " AND {}".format(action_query)
        prop_filter_params = {**prop_filter_params, **action_params}

    query_params = {"team_id": team.pk, "limit": fetch_limit, **condition_params}
    if prop_filters != "":
        events_sql = SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL.format(
            conditions=conditions, limit=limit_sql, filters=prop_filters, order=order
        )
        query_params.update(prop_filter_params)
    else:
        events_sql = SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL.format(
            conditions=conditions, limit=limit_sql, order=order
        )

    return sync_execute(events_sql, query_params)
def format_event_filter(self, entity: Entity, prepend: str, team_id: int) -> Tuple[str, Dict[str, Any]]:
    """Return the combined entity and property filter SQL for ``entity``.

    The entity filter (built without a team condition) is followed by a
    space and the property clause; the two parameter dicts are merged with
    property parameters taking precedence on key clashes.
    """
    entity_sql, entity_params = format_entity_filter(
        team_id=team_id, entity=entity, prepend=prepend, filter_by_team=False
    )
    property_sql, property_params = parse_prop_grouped_clauses(
        team_id=team_id,
        property_group=entity.property_groups,
        prepend=prepend,
        allow_denormalized_props=True,
        has_person_id_joined=True,
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
    )

    combined_sql = entity_sql + f" {property_sql}"
    combined_params = {**entity_params, **property_params}
    return combined_sql, combined_params
def get_query(self) -> Tuple[str, Dict, Callable]:
    """Build the breakdown trends query.

    Returns:
        ``(sql, params, parse_function)``. When the breakdown has no values,
        a dummy zero-row ``SELECT`` with empty params and a parser returning
        ``[]`` is returned instead. ``self.params`` is replaced/extended as a
        side effect.
    """
    interval_annotation = get_trunc_func_ch(self.filter.interval)
    num_intervals, seconds_in_interval, round_interval = get_time_diff(
        self.filter.interval, self.filter.date_from, self.filter.date_to, self.team_id
    )
    _, parsed_date_to, date_params = parse_timestamps(filter=self.filter, team_id=self.team_id)

    # Filter-level and entity-level properties are AND-ed; only the "outer"
    # part selected by the property optimizer is rendered here.
    props_to_filter = self.filter.property_groups.combine_property_group(
        PropertyOperatorType.AND, self.entity.property_groups
    )
    outer_properties = self.column_optimizer.property_optimizer.parse_property_groups(props_to_filter).outer
    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=self.team_id,
        property_group=outer_properties,
        table_name="e",
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
    )
    aggregate_operation, _, math_params = process_math(self.entity)

    action_query = ""
    action_params: Dict = {}
    if self.entity.type == TREND_FILTER_TYPE_ACTIONS:
        action = self.entity.get_action()
        action_query, action_params = format_action_filter(team_id=self.team_id, action=action, table_name="e")

    self.params = {
        **self.params,
        **math_params,
        **prop_filter_params,
        **action_params,
        "event": self.entity.id,
        "key": self.filter.breakdown,
        **date_params,
    }

    # Either the action clause or a plain event-name match is emitted, never both.
    breakdown_filter_params = {
        "parsed_date_from": date_from_clause(interval_annotation, round_interval),
        "parsed_date_to": parsed_date_to,
        "actions_query": "AND {}".format(action_query) if action_query else "",
        "event_filter": "AND event = %(event)s" if not action_query else "",
        "filters": prop_filters if props_to_filter.values else "",
    }

    if self.filter.breakdown_type == "cohort":
        breakdown_parts = self._breakdown_cohort_params()
    else:
        breakdown_parts = self._breakdown_prop_params(
            "count(*)" if self.entity.math == "dau" else aggregate_operation, math_params,
        )
    _params, breakdown_filter, _breakdown_filter_params, breakdown_value = breakdown_parts

    if len(_params["values"]) == 0:
        # If there are no breakdown values, we are sure that there's no relevant events, so instead of adjusting
        # a "real" SELECT for this, we only include the below dummy SELECT.
        # It's a drop-in replacement for a "real" one, simply always returning 0 rows.
        # See https://github.com/PostHog/posthog/pull/5674 for context.
        return (
            "SELECT [now()] AS date, [0] AS data, '' AS breakdown_value LIMIT 0",
            {},
            lambda _: [],
        )

    person_join_condition, person_join_params = self._person_join_condition()
    groups_join_condition, groups_join_params = GroupsJoinQuery(
        self.filter, self.team_id, self.column_optimizer
    ).get_join_query()

    self.params = {**self.params, **_params, **person_join_params, **groups_join_params}
    breakdown_filter_params = {**breakdown_filter_params, **_breakdown_filter_params}

    # Every branch below needs the rendered breakdown filter.
    breakdown_filter = breakdown_filter.format(**breakdown_filter_params)

    # Keyword arguments shared by all the SQL templates used below.
    shared_format_args = dict(
        breakdown_filter=breakdown_filter,
        person_join=person_join_condition,
        groups_join=groups_join_condition,
        aggregate_operation=aggregate_operation,
        breakdown_value=breakdown_value,
    )

    if self.filter.display in TRENDS_DISPLAY_BY_VALUE:
        content_sql = BREAKDOWN_AGGREGATE_QUERY_SQL.format(**shared_format_args)
        time_range = enumerate_time_range(self.filter, seconds_in_interval)
        return (
            content_sql,
            self.params,
            self._parse_single_aggregate_result(self.filter, self.entity, {"days": time_range}),
        )

    if self.entity.math in [WEEKLY_ACTIVE, MONTHLY_ACTIVE]:
        active_user_params = get_active_user_params(self.filter, self.entity, self.team_id)
        conditions = BREAKDOWN_ACTIVE_USER_CONDITIONS_SQL.format(**breakdown_filter_params, **active_user_params)
        inner_sql = BREAKDOWN_ACTIVE_USER_INNER_SQL.format(
            **shared_format_args,
            interval_annotation=interval_annotation,
            conditions=conditions,
            GET_TEAM_PERSON_DISTINCT_IDS=get_team_distinct_ids_query(self.team_id),
            **active_user_params,
            **breakdown_filter_params,
        )
    elif self.filter.display == TRENDS_CUMULATIVE and self.entity.math == "dau":
        inner_sql = BREAKDOWN_CUMULATIVE_INNER_SQL.format(
            **shared_format_args, interval_annotation=interval_annotation, **breakdown_filter_params,
        )
    else:
        inner_sql = BREAKDOWN_INNER_SQL.format(**shared_format_args, interval_annotation=interval_annotation)

    breakdown_query = BREAKDOWN_QUERY_SQL.format(
        interval=interval_annotation, num_intervals=num_intervals, inner_sql=inner_sql,
    )
    self.params["seconds_in_interval"] = seconds_in_interval
    self.params["num_intervals"] = num_intervals

    return breakdown_query, self.params, self._parse_trend_result(self.filter, self.entity)
def get_breakdown_prop_values(
    filter: Filter,
    entity: Entity,
    aggregate_operation: str,
    team_id: int,
    limit: int = BREAKDOWN_VALUES_LIMIT,
    extra_params: Optional[Dict] = None,
    column_optimizer: Optional[EnterpriseColumnOptimizer] = None,
):
    """
    Returns the top N breakdown prop values for event/person breakdown

    e.g. for Browser with limit 3 might return ['Chrome', 'Safari', 'Firefox', 'Other']

    Args:
        filter: Supplies the date range, breakdown settings, offset and base
            property groups.
        entity: Its property groups are AND-combined with the filter's, and it
            drives the entity filtering part of the query.
        aggregate_operation: SQL aggregate expression placed into the query.
        team_id: Team whose events are queried.
        limit: Passed to the query as the ``limit`` parameter.
        extra_params: Optional additional query parameters; these override the
            property, entity, person-join and group-join parameters on key
            clashes (date parameters are applied last). ``None`` means none.
        column_optimizer: Optimizer to reuse; one is created from ``filter``
            and ``team_id`` when omitted.

    Returns:
        The first column of the first row returned by the query.
    """
    # A None default avoids sharing one mutable dict object across calls.
    extra_params = extra_params or {}

    column_optimizer = column_optimizer or EnterpriseColumnOptimizer(filter, team_id)
    parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id)

    # Only the "outer" part of the combined property groups is rendered here.
    props_to_filter = filter.property_groups.combine_property_group(
        PropertyOperatorType.AND, entity.property_groups
    )
    outer_properties = column_optimizer.property_optimizer.parse_property_groups(props_to_filter).outer
    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=team_id,
        property_group=outer_properties,
        table_name="e",
        prepend="e_brkdwn",
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
        allow_denormalized_props=True,
    )

    entity_params, entity_format_params = get_entity_filtering_params(
        entity=entity, team_id=team_id, table_name="e"
    )

    value_expression = _to_value_expression(
        filter.breakdown_type, filter.breakdown, filter.breakdown_group_type_index
    )

    # The person join is only added when the person query reports it is used.
    person_join_clauses = ""
    person_join_params: Dict = {}
    person_query = PersonQuery(filter, team_id, column_optimizer=column_optimizer, entity=entity)
    if person_query.is_used:
        person_subquery, person_join_params = person_query.get_query()
        person_join_clauses = f""" INNER JOIN ({get_team_distinct_ids_query(team_id)}) AS pdi ON e.distinct_id = pdi.distinct_id INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id """

    groups_join_condition, groups_join_params = GroupsJoinQuery(
        filter, team_id, column_optimizer
    ).get_join_query()

    elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
        value_expression=value_expression,
        parsed_date_from=parsed_date_from,
        parsed_date_to=parsed_date_to,
        prop_filters=prop_filters,
        aggregate_operation=aggregate_operation,
        person_join_clauses=person_join_clauses,
        groups_join_clauses=groups_join_condition,
        **entity_format_params,
    )

    return sync_execute(
        elements_query,
        {
            "key": filter.breakdown,
            "limit": limit,
            "team_id": team_id,
            "offset": filter.offset,
            **prop_filter_params,
            **entity_params,
            **person_join_params,
            **groups_join_params,
            **extra_params,
            **date_params,
        },
    )[0][0]