def _person_query(self):
    """Build the person subquery, additionally selecting the created_at column."""
    wanted_extras = ["created_at"]
    return ClickhousePersonQuery(self._filter, self._team_id, self._column_optimizer, extra_fields=wanted_extras)
def _get_actor_subquery(self) -> Tuple[str, Dict[str, Any]]:
    """Return (join SQL, params) attaching funnel actors to groups or persons.

    When aggregating by groups the join comes from GroupsJoinQuery; otherwise
    a person subquery (scoped to the correlation property values) is joined
    on funnel_actors.actor_id.
    """
    if self.is_aggregating_by_groups:
        return GroupsJoinQuery(
            self._filter, self._team.pk, join_key="funnel_actors.actor_id"
        ).get_join_query()

    correlation_entity = Entity(
        {"id": "person", "type": "events", "properties": self._filter.correlation_property_values}
    )
    person_query, join_params = ClickhousePersonQuery(
        self._filter, self._team.pk, entity=correlation_entity
    ).get_query()
    join_sql = f"""
        JOIN ({person_query}) person
            ON person.id = funnel_actors.actor_id
    """
    return join_sql, join_params
def _person_query(self):
    """Build the person subquery for this query's entity, with any extra person fields."""
    query_kwargs = dict(extra_fields=self._extra_person_fields, entity=self._entity)
    return ClickhousePersonQuery(self._filter, self._team_id, self._column_optimizer, **query_kwargs)
def run_query(team: Team, filter: Filter, **kwargs):
    """Execute the person query for `team`/`filter` and summarize the result.

    Returns {"rows": N, "columns": M} when any rows come back, otherwise
    {"rows": 0} (no "columns" key, since there is no row to measure).
    """
    query, params = ClickhousePersonQuery(filter, team.pk, **kwargs).get_query()
    rows = sync_execute(query, {**params, "team_id": team.pk})
    # Idiomatic truthiness check instead of len(rows) > 0 (PEP 8).
    if rows:
        return {"rows": len(rows), "columns": len(rows[0])}
    return {"rows": 0}
def __init__(self, entity: Entity, *args, **kwargs):
    """Remember the entity, run base initialization, then rebuild the person
    subquery so that it is scoped to this entity."""
    self._entity = entity
    super().__init__(*args, **kwargs)
    extra_person_fields = kwargs.get("extra_person_fields", [])
    self._person_query = ClickhousePersonQuery(
        self._filter,
        self._team_id,
        self._column_optimizer,
        extra_fields=extra_person_fields,
        entity=entity,
    )
def _person_join_condition(self) -> Tuple[str, Dict]:
    """Return the person-table join clause and its params, when one is needed.

    Falls back to just the distinct_ids join for "dau" math, and to no join
    at all when person data is not referenced.
    """
    person_query = ClickhousePersonQuery(self.filter, self.team_id, self.column_optimizer, entity=self.entity)
    if not person_query.is_used:
        # Only join distinct_ids for DAU math; otherwise nothing is needed.
        return (EVENT_JOIN_PERSON_SQL, {}) if self.entity.math == "dau" else ("", {})

    query, params = person_query.get_query()
    join_sql = f"""
        {EVENT_JOIN_PERSON_SQL}
        INNER JOIN ({query}) person ON person.id = pdi.person_id
    """
    return join_sql, params
def _run_query(self, filter: Filter, join_person_tables=False) -> List:
    """Run the filter as a raw events query, optionally joining pdi/person tables,
    and assert the generated SQL never reads the raw JSON properties column."""
    query, params = parse_prop_clauses(
        filter.properties,
        self.team.pk,
        allow_denormalized_props=True,
        person_properties_mode=PersonPropertiesMode.EXCLUDE,
    )

    joins = ""
    if join_person_tables:
        person_subquery, person_join_params = ClickhousePersonQuery(filter, self.team.pk).get_query()
        params.update(person_join_params)
        joins = f"""
            INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON events.distinct_id = pdi.distinct_id
            INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
        """

    final_query = f"SELECT uuid FROM events {joins} WHERE team_id = %(team_id)s {query}"

    # Make sure we don't accidentally use json on the properties field
    self.assertNotIn("json", final_query.lower())

    return sync_execute(final_query, {**params, "team_id": self.team.pk})
def _person_join_condition(self) -> Tuple[str, Dict]:
    """Return the person join clause and params, using team-scoped distinct ids.

    Joins the full person subquery when person data is referenced; for "dau"
    math only the distinct_ids join is emitted; otherwise no join at all.
    """
    person_query = ClickhousePersonQuery(self.filter, self.team_id, self.column_optimizer, entity=self.entity)
    event_join = EVENT_JOIN_PERSON_SQL.format(
        GET_TEAM_PERSON_DISTINCT_IDS=get_team_distinct_ids_query(self.team_id)
    )

    if person_query.is_used:
        query, params = person_query.get_query()
        join_sql = f"""
            {event_join}
            INNER JOIN ({query}) person ON person.id = pdi.person_id
        """
        return join_sql, params

    if self.entity.math == "dau":
        # Only join distinct_ids
        return event_join, {}

    return "", {}
def _get_aggregation_join_query(self):
    """Join funnel actors to persons, or to groups when aggregating by a group type."""
    if self._filter.aggregation_group_type_index is not None:
        return GroupsJoinQuery(self._filter, self._team.pk, join_key="funnel_actors.actor_id").get_join_query()

    optimizer = ColumnOptimizer(self._filter, self._team.pk)
    person_query, person_query_params = ClickhousePersonQuery(self._filter, self._team.pk, optimizer).get_query()
    join_sql = f"""
        JOIN ({person_query}) person
            ON person.id = funnel_actors.actor_id
    """
    return join_sql, person_query_params
def __init__(
    self,
    filter: Union[Filter, PathFilter, RetentionFilter],
    team_id: int,
    round_interval=False,
    should_join_distinct_ids=False,
    should_join_persons=False,
    # Extra events/person table columns to fetch since parent query needs them
    extra_fields: List[ColumnName] = [],
    extra_person_fields: List[ColumnName] = [],
    **kwargs,
) -> None:
    """Initialize shared event-query state and decide which joins are required.

    Args:
        filter: insight/path/retention filter driving the query.
        team_id: team scoping every clause.
        round_interval: whether date truncation should round the interval.
        should_join_distinct_ids / should_join_persons: force the respective
            joins on; when False they are auto-detected from the filter.
        extra_fields / extra_person_fields: additional event/person columns the
            parent query needs selected.
    """
    self._filter = filter
    self._team_id = team_id
    self._column_optimizer = ColumnOptimizer(self._filter, self._team_id)
    # Copy the lists so the shared mutable default arguments ([]) can never be
    # mutated through self._extra_fields / self._extra_person_fields.
    self._extra_fields = list(extra_fields)
    self._extra_person_fields = list(extra_person_fields)
    self._person_query = ClickhousePersonQuery(
        self._filter, self._team_id, self._column_optimizer, extra_fields=self._extra_person_fields
    )
    self.params: Dict[str, Any] = {
        "team_id": self._team_id,
    }
    self._should_join_distinct_ids = should_join_distinct_ids
    self._should_join_persons = should_join_persons
    # Only auto-detect joins the caller did not explicitly request.
    if not self._should_join_distinct_ids:
        self._determine_should_join_distinct_ids()
    if not self._should_join_persons:
        self._determine_should_join_persons()
    self._should_round_interval = round_interval
def person_query(team: Team, filter: Filter, **kwargs):
    """Return only the SQL text of the person query (its params are discarded)."""
    sql, _params = ClickhousePersonQuery(filter, team.pk, **kwargs).get_query()
    return sql
def get_properties_query(self) -> Tuple[str, Dict[str, Any]]:
    """Build the person-property funnel-correlation query and its parameters.

    Returns (sql, params). The query counts, per (property name, value) pair,
    how many funnel persons completed the target step (success) versus not
    (failure), plus one TOTAL_IDENTIFIER row with the overall counts.

    Raises:
        ValidationError: when no correlation property names are configured.
    """
    if not self._filter.correlation_property_names:
        raise ValidationError("Property Correlation expects atleast one Property to run correlation on")

    # CTE producing one row per funnel person: (person_id, steps completed).
    funnel_persons_query, funnel_persons_params = self.get_funnel_persons_cte()
    # Expression extracting the requested person properties as (name, value)
    # tuples — see the arrayJoin explanation in the SQL comment below.
    person_prop_query, person_prop_params = self._get_properties_prop_clause()
    person_query, person_query_params = ClickhousePersonQuery(
        self._filter, self._team.pk, ColumnOptimizer(self._filter, self._team.pk)
    ).get_query()

    query = f"""
        WITH
            funnel_people as ({funnel_persons_query}),
            %(target_step)s AS target_step
        SELECT
            concat(prop.1, '::', prop.2) as name,
            -- We generate a unique identifier for each property value as: PropertyName::Value
            countDistinctIf(person_id, steps = target_step) AS success_count,
            countDistinctIf(person_id, steps <> target_step) AS failure_count
        FROM (
            SELECT
                person_id,
                funnel_people.steps as steps,
                /*
                    We can extract multiple property values at the same time, since we're
                    already querying the person table.

                    This gives us something like:
                    --------------------
                    person1, steps, [property_value_0, property_value_1, property_value_2]
                    person2, steps, [property_value_0, property_value_1, property_value_2]

                    To group by property name, we need to extract the property from the array.
                    ArrayJoin helps us do that.

                    It transforms the above into:

                    --------------------

                    person1, steps, property_value_0
                    person1, steps, property_value_1
                    person1, steps, property_value_2

                    person2, steps, property_value_0
                    person2, steps, property_value_1
                    person2, steps, property_value_2

                    To avoid clashes and clarify the values, we also zip with the property name,
                    to generate tuples like: (property_name, property_value), which we then group by
                */
                {person_prop_query}
            FROM funnel_people
            JOIN ({person_query}) person
                ON person.id = funnel_people.person_id
        ) person_with_props
        -- Group by the tuple items: (property_name, property_value) generated by zip
        GROUP BY prop.1, prop.2
        HAVING prop.1 NOT IN %(exclude_property_names)s
        UNION ALL
        SELECT
            '{self.TOTAL_IDENTIFIER}' as name,
            countDistinctIf(person_id, steps = target_step) AS success_count,
            countDistinctIf(person_id, steps <> target_step) AS failure_count
        FROM funnel_people
    """
    params = {
        **funnel_persons_params,
        **person_prop_params,
        **person_query_params,
        # target_step = number of entities, i.e. a fully completed funnel.
        "target_step": len(self._filter.entities),
        "property_names": self._filter.correlation_property_names,
        "exclude_property_names": self._filter.correlation_property_exclude_names,
    }

    return query, params
def get_breakdown_prop_values(
    filter: Filter,
    entity: Entity,
    aggregate_operation: str,
    team_id: int,
    limit: int = BREAKDOWN_VALUES_LIMIT,
    extra_params: Optional[Dict] = None,
    column_optimizer: Optional[ColumnOptimizer] = None,
):
    """
    Returns the top N breakdown prop values for event/person breakdown

    e.g. for Browser with limit 3 might return ['Chrome', 'Safari', 'Firefox', 'Other']

    Args:
        filter: insight filter (breakdown type/key, date range, properties).
        entity: the series whose events are being broken down.
        aggregate_operation: SQL aggregate used to rank the values.
        team_id: team scoping every clause.
        limit: maximum number of breakdown values returned.
        extra_params: additional query parameters merged into the execution params.
        column_optimizer: reused optimizer; a fresh one is built when omitted.
    """
    # Fixed mutable default argument: build a fresh dict instead of sharing
    # one {} across calls.
    extra_params = extra_params or {}
    column_optimizer = column_optimizer or ColumnOptimizer(filter, team_id)
    parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id)

    # Combine filter-level and entity-level property groups, keeping only the
    # clauses that can be evaluated on the outer (events) query.
    props_to_filter = filter.property_groups.combine_property_group(PropertyOperatorType.AND, entity.property_groups)
    outer_properties = column_optimizer.property_optimizer.parse_property_groups(props_to_filter).outer
    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=team_id,
        property_group=outer_properties,
        table_name="e",
        prepend="e_brkdwn",
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
        allow_denormalized_props=True,
    )

    entity_params, entity_format_params = get_entity_filtering_params(entity=entity, team_id=team_id, table_name="e")

    value_expression = _to_value_expression(filter.breakdown_type, filter.breakdown, filter.breakdown_group_type_index)

    person_join_clauses = ""
    person_join_params: Dict = {}
    person_query = ClickhousePersonQuery(filter, team_id, column_optimizer=column_optimizer, entity=entity)
    if person_query.is_used:
        person_subquery, person_join_params = person_query.get_query()
        person_join_clauses = f"""
        INNER JOIN ({get_team_distinct_ids_query(team_id)}) AS pdi ON e.distinct_id = pdi.distinct_id
        INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
        """

    groups_join_condition, groups_join_params = GroupsJoinQuery(filter, team_id, column_optimizer).get_join_query()

    elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
        value_expression=value_expression,
        parsed_date_from=parsed_date_from,
        parsed_date_to=parsed_date_to,
        prop_filters=prop_filters,
        aggregate_operation=aggregate_operation,
        person_join_clauses=person_join_clauses,
        groups_join_clauses=groups_join_condition,
        **entity_format_params,
    )

    return sync_execute(
        elements_query,
        {
            "key": filter.breakdown,
            "limit": limit,
            "team_id": team_id,
            "offset": filter.offset,
            **prop_filter_params,
            **entity_params,
            **person_join_params,
            **groups_join_params,
            **extra_params,
            **date_params,
        },
    )[0][0]
class ClickhouseEventQuery(metaclass=ABCMeta):
    """Abstract base for ClickHouse queries over the events table.

    Decides which auxiliary tables (person_distinct_id, person, groups) need
    to be joined based on the filter's cohort properties and breakdown, and
    provides the shared join/date/property-clause builders for subclasses.
    """

    # SQL aliases used when joining the auxiliary tables.
    DISTINCT_ID_TABLE_ALIAS = "pdi"
    PERSON_TABLE_ALIAS = "person"
    EVENT_TABLE_ALIAS = "e"

    _filter: Union[Filter, PathFilter, RetentionFilter]
    _team_id: int
    _column_optimizer: ColumnOptimizer
    _person_query: ClickhousePersonQuery
    _should_join_distinct_ids = False
    _should_join_persons = False
    _should_round_interval = False
    _extra_fields: List[ColumnName]
    _extra_person_fields: List[ColumnName]

    def __init__(
        self,
        filter: Union[Filter, PathFilter, RetentionFilter],
        team_id: int,
        round_interval=False,
        should_join_distinct_ids=False,
        should_join_persons=False,
        # Extra events/person table columns to fetch since parent query needs them
        # NOTE(review): mutable default arguments ([]) below — safe only while
        # nothing ever mutates them; consider Optional[...] = None.
        extra_fields: List[ColumnName] = [],
        extra_person_fields: List[ColumnName] = [],
        **kwargs,
    ) -> None:
        """Store filter/team state and auto-detect which joins are required
        unless the caller forced them on explicitly."""
        self._filter = filter
        self._team_id = team_id
        self._column_optimizer = ColumnOptimizer(self._filter, self._team_id)
        self._person_query = ClickhousePersonQuery(
            self._filter, self._team_id, self._column_optimizer, extra_fields=extra_person_fields
        )
        self.params: Dict[str, Any] = {
            "team_id": self._team_id,
        }
        self._should_join_distinct_ids = should_join_distinct_ids
        self._should_join_persons = should_join_persons
        self._extra_fields = extra_fields
        self._extra_person_fields = extra_person_fields
        # Only auto-detect joins the caller did not explicitly request.
        if not self._should_join_distinct_ids:
            self._determine_should_join_distinct_ids()
        if not self._should_join_persons:
            self._determine_should_join_persons()
        self._should_round_interval = round_interval

    @abstractmethod
    def get_query(self) -> Tuple[str, Dict[str, Any]]:
        """Return the full (sql, params) for this query. Implemented by subclasses."""
        pass

    @abstractmethod
    def _determine_should_join_distinct_ids(self) -> None:
        """Set self._should_join_distinct_ids. Implemented by subclasses."""
        pass

    def _get_disintct_id_query(self) -> str:
        # NOTE(review): method name misspells "distinct"; kept as-is because
        # external callers reference this exact name.
        """Return the pdi join clause, or "" when distinct ids are not needed."""
        if self._should_join_distinct_ids:
            return f"""
            INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS {self.DISTINCT_ID_TABLE_ALIAS}
            ON events.distinct_id = {self.DISTINCT_ID_TABLE_ALIAS}.distinct_id
            """
        else:
            return ""

    def _determine_should_join_persons(self) -> None:
        """Detect whether the person table must be joined (person columns used,
        person-dependent cohort filters, or a person breakdown).

        Joining persons always implies joining distinct_ids as well.
        """
        if self._person_query.is_used:
            self._should_join_distinct_ids = True
            self._should_join_persons = True
            return

        # :KLUDGE: The following is mostly making sure if cohorts are included as well.
        # Can be simplified significantly after https://github.com/PostHog/posthog/issues/5854
        if any(self._should_property_join_persons(prop) for prop in self._filter.properties):
            self._should_join_distinct_ids = True
            self._should_join_persons = True
            return

        if any(
            self._should_property_join_persons(prop)
            for entity in self._filter.entities
            for prop in entity.properties
        ):
            self._should_join_distinct_ids = True
            self._should_join_persons = True
            return

        if self._filter.breakdown_type == "person":
            self._should_join_distinct_ids = True
            self._should_join_persons = True
            return

    def _should_property_join_persons(self, prop: Property) -> bool:
        # Only cohort filters can force a person join here.
        return prop.type == "cohort" and self._does_cohort_need_persons(prop)

    def _does_cohort_need_persons(self, prop: Property) -> bool:
        """True when evaluating this cohort filter requires person data
        (precalculated, static, or property-based cohorts)."""
        try:
            cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=self._team_id)
        except Cohort.DoesNotExist:
            # Missing cohort can never match, so no person join is needed.
            return False
        if is_precalculated_query(cohort):
            return True
        if cohort.is_static:
            return True
        for group in cohort.groups:
            if group.get("properties"):
                return True
        return False

    def _get_person_query(self) -> Tuple[str, Dict]:
        """Return (person join clause, params), or ("", {}) when not joining persons."""
        if self._should_join_persons:
            person_query, params = self._person_query.get_query()
            return (
                f"""
            INNER JOIN ({person_query}) {self.PERSON_TABLE_ALIAS}
            ON {self.PERSON_TABLE_ALIAS}.id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id
            """,
                params,
            )
        else:
            return "", {}

    def _get_groups_query(self) -> Tuple[str, Dict]:
        """Return the groups join clause and params (may be empty when unused)."""
        return GroupsJoinQuery(self._filter, self._team_id, self._column_optimizer).get_join_query()

    def _get_date_filter(self) -> Tuple[str, Dict]:
        """Return the date-range WHERE fragments and their params for the filter."""
        parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=self._filter, team_id=self._team_id)
        query = f"""
        {parsed_date_from}
        {parsed_date_to}
        """
        return query, date_params

    def _get_props(self, filters: List[Property]) -> Tuple[str, Dict]:
        """Render the given property filters to SQL, handling cohort props via
        a person_id subquery and everything else via parse_prop_clauses."""
        final = []
        params: Dict[str, Any] = {}

        for idx, prop in enumerate(filters):
            if prop.type == "cohort":
                person_id_query, cohort_filter_params = self._get_cohort_subquery(prop)
                params = {**params, **cohort_filter_params}
                final.append(f"AND {person_id_query}")
            else:
                # prepend keeps parameter names unique across filters.
                filter_query, filter_params = parse_prop_clauses(
                    [prop],
                    self._team_id,
                    prepend=f"global_{idx}",
                    allow_denormalized_props=True,
                    person_properties_mode=PersonPropertiesMode.EXCLUDE,
                )
                final.append(filter_query)
                params.update(filter_params)
        return " ".join(final), params

    def _get_cohort_subquery(self, prop) -> Tuple[str, Dict[str, Any]]:
        """Return a (condition SQL, params) pair matching pdi.person_id against
        the cohort referenced by `prop` (precalculated table when available)."""
        try:
            cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=self._team_id)
        except Cohort.DoesNotExist:
            return "0 = 11", {}  # If cohort doesn't exist, nothing can match

        is_precalculated = is_precalculated_query(cohort)

        person_id_query, cohort_filter_params = (
            format_precalculated_cohort_query(
                cohort.pk, 0, custom_match_field=f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id"
            )
            if is_precalculated
            else format_person_query(cohort, 0, custom_match_field=f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id")
        )

        return person_id_query, cohort_filter_params
def get_breakdown_prop_values(
    filter: Filter,
    entity: Entity,
    aggregate_operation: str,
    team_id: int,
    limit: int = 25,
    extra_params: Optional[Dict] = None,
    column_optimizer: Optional[ColumnOptimizer] = None,
):
    """Returns the top N breakdown prop values for event/person breakdown.

    Args:
        filter: insight filter (breakdown type/key, date range, properties).
        entity: the series whose events are being broken down.
        aggregate_operation: SQL aggregate used to rank the values.
        team_id: team scoping every clause.
        limit: maximum number of breakdown values returned.
        extra_params: additional query parameters merged into the execution params.
        column_optimizer: passed through to ClickhousePersonQuery.
            NOTE(review): unlike other call sites, None is not replaced with a
            fresh ColumnOptimizer here — presumably the person query tolerates
            it; confirm.
    """
    # Fixed mutable default argument: build a fresh dict instead of sharing
    # one {} across calls.
    extra_params = extra_params or {}
    parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id)
    prop_filters, prop_filter_params = parse_prop_clauses(
        filter.properties + entity.properties,
        team_id,
        table_name="e",
        prepend="e_brkdwn",
        person_properties_mode=PersonPropertiesMode.EXCLUDE,
        allow_denormalized_props=True,
    )

    entity_params, entity_format_params = get_entity_filtering_params(entity, team_id, table_name="e")

    # Pick the column holding the breakdown property based on breakdown type.
    if filter.breakdown_type == "person":
        value_expression, _ = get_property_string_expr("person", cast(str, filter.breakdown), "%(key)s", "person_props")
    elif filter.breakdown_type == "group":
        value_expression, _ = get_property_string_expr(
            "groups", cast(str, filter.breakdown), "%(key)s", f"group_properties_{filter.breakdown_group_type_index}"
        )
    else:
        value_expression, _ = get_property_string_expr("events", cast(str, filter.breakdown), "%(key)s", "properties")

    person_join_clauses = ""
    person_join_params: Dict = {}
    person_query = ClickhousePersonQuery(filter, team_id, column_optimizer=column_optimizer, entity=entity)
    if person_query.is_used:
        person_subquery, person_join_params = person_query.get_query()
        person_join_clauses = f"""
        INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON e.distinct_id = pdi.distinct_id
        INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
        """

    groups_join_condition, groups_join_params = GroupsJoinQuery(filter, team_id, column_optimizer).get_join_query()

    elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
        value_expression=value_expression,
        parsed_date_from=parsed_date_from,
        parsed_date_to=parsed_date_to,
        prop_filters=prop_filters,
        aggregate_operation=aggregate_operation,
        person_join_clauses=person_join_clauses,
        groups_join_clauses=groups_join_condition,
        **entity_format_params,
    )

    return sync_execute(
        elements_query,
        {
            "key": filter.breakdown,
            "limit": limit,
            "team_id": team_id,
            "offset": filter.offset,
            **prop_filter_params,
            **entity_params,
            **person_join_params,
            **groups_join_params,
            **extra_params,
            **date_params,
        },
    )[0][0]