def _get_breakdown_select_prop(self) -> str: if self._filter.breakdown: self.params.update({"breakdown": self._filter.breakdown}) if self._filter.breakdown_type == "person": # :TRICKY: We only support string breakdown for event/person properties assert isinstance(self._filter.breakdown, str) expression, _ = get_property_string_expr( "person", self._filter.breakdown, "%(breakdown)s", "person_props") return f", {expression} AS prop" elif self._filter.breakdown_type == "event": # :TRICKY: We only support string breakdown for event/person properties assert isinstance(self._filter.breakdown, str) expression, _ = get_property_string_expr( "events", self._filter.breakdown, "%(breakdown)s", "properties") return f", {expression} AS prop" elif self._filter.breakdown_type == "cohort": return ", value AS prop" elif self._filter.breakdown_type == "group": # :TRICKY: We only support string breakdown for group properties assert isinstance(self._filter.breakdown, str) properties_field = f"group_properties_{self._filter.breakdown_group_type_index}" expression, _ = get_property_string_expr( "groups", self._filter.breakdown, "%(breakdown)s", properties_field) return f", {expression} AS prop" return ""
def _breakdown_prop_params(self, aggregate_operation: str, math_params: Dict):
    """Resolve the top breakdown values plus the SQL expression used to join on them.

    Returns a 4-tuple: (values params dict, join SQL template, format params for
    that template, and the raw breakdown-value expression).
    """
    values_arr = get_breakdown_prop_values(
        self.filter,
        self.entity,
        aggregate_operation,
        self.team_id,
        extra_params=math_params,
        column_optimizer=self.column_optimizer,
    )

    # :TRICKY: We only support string breakdown for event/person properties
    assert isinstance(self.filter.breakdown, str)

    breakdown_type = self.filter.breakdown_type
    if breakdown_type == "person":
        table, column = "person", "person_props"
    elif breakdown_type == "group":
        table, column = "groups", f"group_properties_{self.filter.breakdown_group_type_index}"
    else:
        # Default: read the property off the events table.
        table, column = "events", "properties"

    breakdown_value, _ = get_property_string_expr(table, self.filter.breakdown, "%(key)s", column)

    return (
        {"values": values_arr},
        BREAKDOWN_PROP_JOIN_SQL,
        {"breakdown_value_expr": breakdown_value},
        breakdown_value,
    )
def _get_properties_prop_clause(self):
    """Build the SELECT fragment (and query params) exposing correlation properties as `prop`.

    Reads properties either from the joined person properties or, when a group
    aggregation index is set, from the corresponding `group_properties_<index>`
    column.
    """
    group_properties_field = f"groups_{self._filter.aggregation_group_type_index}.group_properties_{self._filter.aggregation_group_type_index}"
    aggregation_properties_alias = (
        PersonQuery.PERSON_PROPERTIES_ALIAS
        if self._filter.aggregation_group_type_index is None
        else group_properties_field
    )
    if "$all" in cast(list, self._filter.correlation_property_names):
        # "$all" means correlate on every property: explode the JSON blob into
        # (key, value) tuples via arrayJoin/arrayZip.
        map_expr = trim_quotes_expr(f"JSONExtractRaw({aggregation_properties_alias}, x)")
        return (
            f"""
        arrayMap(x -> x.1, JSONExtractKeysAndValuesRaw({aggregation_properties_alias})) as person_prop_keys,
        arrayJoin(
            arrayZip(
                person_prop_keys,
                arrayMap(x -> {map_expr}, person_prop_keys)
            )
        ) as prop
    """,
            {},
        )
    else:
        person_property_expressions = []
        person_property_params = {}
        # One named query parameter per requested property so each value can be
        # extracted with get_property_string_expr (which may pick a
        # materialized column).
        for index, property_name in enumerate(cast(list, self._filter.correlation_property_names)):
            param_name = f"property_name_{index}"
            if self._filter.aggregation_group_type_index is not None:
                expression, _ = get_property_string_expr(
                    "groups", property_name, f"%({param_name})s", group_properties_field
                )
            else:
                expression, _ = get_property_string_expr(
                    "person",
                    property_name,
                    f"%({param_name})s",
                    PersonQuery.PERSON_PROPERTIES_ALIAS,
                )
            person_property_params[param_name] = property_name
            person_property_expressions.append(expression)
        return (
            f"""
        arrayJoin(arrayZip(
                %(property_names)s,
                [{','.join(person_property_expressions)}]
        )) as prop
    """,
            person_property_params,
        )
def get_property_values_for_key(key: str, team: Team, value: Optional[str] = None):
    """Return distinct values seen for event property `key` over the last 7 days.

    When `value` is given, only values matching it as a substring (SQL LIKE)
    are returned.
    """
    property_field, _ = get_property_string_expr("events", key, "%(key)s", "properties")

    # Restrict the scan to the trailing 7 days, on whole-day boundaries.
    parsed_date_from = "AND timestamp >= '{}'".format(
        relative_date_parse("-7d").strftime("%Y-%m-%d 00:00:00"))
    parsed_date_to = "AND timestamp <= '{}'".format(
        timezone.now().strftime("%Y-%m-%d 23:59:59"))

    query_params = {"team_id": team.pk, "key": key}
    if value:
        sql_template = SELECT_PROP_VALUES_SQL_WITH_FILTER
        query_params["value"] = "%{}%".format(value)
    else:
        sql_template = SELECT_PROP_VALUES_SQL

    return sync_execute(
        sql_template.format(
            parsed_date_from=parsed_date_from,
            parsed_date_to=parsed_date_to,
            property_field=property_field,
        ),
        query_params,
    )
def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Any]]:
    """Translate an entity's `math` setting into (aggregate SQL, join SQL, params).

    Defaults to a plain `count(*)` with no extra join when no special math is
    requested.

    Raises:
        ValidationError: if a property-based math function is requested without
            `math_property` being set.
    """
    params: Dict[str, Any] = {}
    join_sql = ""
    aggregation = "count(*)"

    if entity.math == "dau":
        # Unique-user counts need the person join to resolve person_id.
        aggregation = "count(DISTINCT person_id)"
        join_sql = EVENT_JOIN_PERSON_SQL
    elif entity.math == "unique_group":
        validate_group_type_index("math_group_type_index", entity.math_group_type_index, required=True)
        aggregation = f"count(DISTINCT $group_{entity.math_group_type_index})"
    elif entity.math in MATH_FUNCTIONS:
        if entity.math_property is None:
            raise ValidationError(
                {
                    "math_property": "This field is required when `math` is set."
                }, code="required")
        prop_param = f"e_{entity.index}_math_prop"
        prop_expr, _ = get_property_string_expr("events", entity.math_property, f"%({prop_param})s", "properties")
        aggregation = f"{MATH_FUNCTIONS[entity.math]}(toFloat64OrNull({prop_expr}))"
        params["join_property_key"] = entity.math_property
        params[prop_param] = entity.math_property

    return aggregation, join_sql, params
def filter_event(step: ActionStep, prepend: str = "event", index: int = 0, table_name: str = "") -> Tuple[List[str], Dict]:
    """Build WHERE conditions (and their params) matching one action step's event and URL."""
    from ee.clickhouse.models.property import get_property_string_expr

    event_param = f"{prepend}_{index}"
    params: Dict = {event_param: step.event}
    conditions: List[str] = []

    # Qualify the properties column with the table alias when one is given.
    prefix = f"{table_name}." if table_name != "" else table_name

    if step.url:
        url_expr, _ = get_property_string_expr("events", "$current_url", "'$current_url'", f"{prefix}properties")
        prop_name = f"{prepend}_prop_val_{index}"
        if step.url_matching == ActionStep.EXACT:
            conditions.append(f"{url_expr} = %({prop_name})s")
            params[prop_name] = step.url
        elif step.url_matching == ActionStep.REGEX:
            conditions.append(f"match({url_expr}, %({prop_name})s)")
            params[prop_name] = step.url
        else:
            # Default matching mode: substring via LIKE.
            conditions.append(f"{url_expr} LIKE %({prop_name})s")
            params[prop_name] = f"%{step.url}%"

    conditions.append(f"event = %({event_param})s")
    return conditions, params
def _get_properties_select_clause(self) -> str:
    """SELECT fragments for the session id, optional elements chain, and extra event columns."""
    session_id_clause, _ = get_property_string_expr(
        "events", "$session_id", "'$session_id'", "properties")

    clause = f""", {session_id_clause} as session_id """

    if self._column_optimizer.should_query_elements_chain_column:
        clause += ", events.elements_chain as elements_chain"

    extra_columns = (
        f", events.{column_name} as {column_name}"
        for column_name in self._column_optimizer.event_columns_to_query
    )
    return clause + " ".join(extra_columns)
def _to_value_expression(
    breakdown_type: Optional[BREAKDOWN_TYPES],
    breakdown: Union[str, List[Union[str, int]], None],
    breakdown_group_type_index: Optional[GroupTypeIndex],
) -> str:
    """Return the SELECT expression yielding the breakdown value, aliased as `value`."""
    if breakdown_type == "group":
        # Group breakdowns only support a single string property.
        value_expression, _ = get_property_string_expr(
            table="groups",
            property_name=cast(str, breakdown),
            var="%(key)s",
            column=f"group_properties_{breakdown_group_type_index}",
        )
        return f"{value_expression} AS value"

    # Person and event breakdowns support single or multi-property expressions.
    source_table = "person" if breakdown_type == "person" else "events"
    return get_single_or_multi_property_string_expr(breakdown, table=source_table, query_alias="value")
def _get_properties_prop_clause(self):
    """Build the SELECT fragment (and query params) exposing person properties as `prop`."""
    if "$all" in cast(list, self._filter.correlation_property_names):
        # "$all": correlate on every person property; explode the JSON blob
        # into (key, value) tuples with arrayJoin/arrayZip.
        return (
            f"""
        arrayMap(x -> x.1, JSONExtractKeysAndValuesRaw({ClickhousePersonQuery.PERSON_PROPERTIES_ALIAS})) as person_prop_keys,
        arrayJoin(
            arrayZip(
                person_prop_keys,
                arrayMap(x -> trim(BOTH '"' FROM JSONExtractRaw({ClickhousePersonQuery.PERSON_PROPERTIES_ALIAS}, x)), person_prop_keys)
            )
        ) as prop
    """,
            {},
        )
    else:
        person_property_expressions = []
        person_property_params = {}
        # One named query parameter per requested property so each value can be
        # extracted with get_property_string_expr (which may pick a
        # materialized column).
        for index, property_name in enumerate(
                cast(list, self._filter.correlation_property_names)):
            param_name = f"property_name_{index}"
            expression, _ = get_property_string_expr(
                "person",
                property_name,
                f"%({param_name})s",
                ClickhousePersonQuery.PERSON_PROPERTIES_ALIAS,
            )
            person_property_params[param_name] = property_name
            person_property_expressions.append(expression)
        return (
            f"""
        arrayJoin(arrayZip(
                %(property_names)s,
                [{','.join(person_property_expressions)}]
        )) as prop
    """,
            person_property_params,
        )
def get_person_property_values_for_key(key: str, team: Team, value: Optional[str] = None):
    """Return distinct values seen for person property `key`.

    When `value` is given, only values matching it as a substring (SQL LIKE)
    are returned.
    """
    property_field, _ = get_property_string_expr("person", key, "%(key)s", "properties")

    query_params = {"team_id": team.pk, "key": key}
    if value:
        sql = SELECT_PERSON_PROP_VALUES_SQL_WITH_FILTER.format(property_field=property_field)
        query_params["value"] = "%{}%".format(value)
    else:
        sql = SELECT_PERSON_PROP_VALUES_SQL.format(property_field=property_field)

    return sync_execute(sql, query_params)
def _get_breakdown_select_prop(self) -> str: if self._filter.breakdown: self.params.update({"breakdown": self._filter.breakdown}) if self._filter.breakdown_type == "person": return get_single_or_multi_property_string_expr( self._filter.breakdown, table="person", query_alias="prop") elif self._filter.breakdown_type == "event": return get_single_or_multi_property_string_expr( self._filter.breakdown, table="events", query_alias="prop") elif self._filter.breakdown_type == "cohort": return "value AS prop" elif self._filter.breakdown_type == "group": # :TRICKY: We only support string breakdown for group properties assert isinstance(self._filter.breakdown, str) properties_field = f"group_properties_{self._filter.breakdown_group_type_index}" expression, _ = get_property_string_expr( table="groups", property_name=self._filter.breakdown, var="%(breakdown)s", column=properties_field) return f"{expression} AS prop" return ""
def get_event_property_query(self) -> Tuple[str, Dict[str, Any]]:
    """Build the query correlating (event, property) pairs with funnel success/failure.

    Returns:
        (SQL string, query params). The query counts, per property key/value
        pair, how many funnel actors converted vs. dropped off, plus an
        overall total row.

    Raises:
        ValidationError: if no correlation event names were provided on the filter.
    """
    if not self._filter.correlation_event_names:
        raise ValidationError("Event Property Correlation expects atleast one event name to run correlation on")

    funnel_persons_query, funnel_persons_params = self.get_funnel_actors_cte()

    event_join_query = self._get_events_join_query()

    if self.support_autocapture_elements():
        # Autocapture: correlate on the elements chain, tagged with the
        # autocapture event type so different interaction kinds stay distinct.
        event_type_expression, _ = get_property_string_expr(
            "events",
            self.AUTOCAPTURE_EVENT_TYPE,
            f"'{self.AUTOCAPTURE_EVENT_TYPE}'",
            "properties",
        )
        array_join_query = f"""
            'elements_chain' as prop_key,
            concat({event_type_expression}, '{self.ELEMENTS_DIVIDER}', elements_chain) as prop_value,
            tuple(prop_key, prop_value) as prop
        """
    else:
        # Generic events: explode every property into (key, value) tuples.
        array_join_query = f"""
            arrayMap(x -> x.1, JSONExtractKeysAndValuesRaw(properties)) as prop_keys,
            arrayMap(x -> trim(BOTH '"' FROM JSONExtractRaw(properties, x)), prop_keys) as prop_values,
            arrayJoin(arrayZip(prop_keys, prop_values)) as prop
        """

    query = f"""
        WITH funnel_actors as ({funnel_persons_query}),
            toDateTime(%(date_to)s) AS date_to,
            toDateTime(%(date_from)s) AS date_from,
            %(target_step)s AS target_step,
            %(funnel_step_names)s as funnel_step_names

        SELECT concat(event_name, '::', prop.1, '::', prop.2) as name,
               countDistinctIf(actor_id, steps = target_step) as success_count,
               countDistinctIf(actor_id, steps <> target_step) as failure_count
        FROM (
            SELECT
                actors.actor_id as actor_id,
                actors.steps as steps,
                events.event as event_name,
                -- Same as what we do in $all property queries
                {array_join_query}
            FROM events AS event
                {event_join_query}
                AND event.event IN %(event_names)s
        )
        GROUP BY name
        -- Discard high cardinality / low hits properties
        -- This removes the long tail of random properties with empty, null, or very small values
        HAVING (success_count + failure_count) > 2
        AND prop.1 NOT IN %(exclude_property_names)s

        UNION ALL
        -- To get the total success/failure numbers, we do an aggregation on
        -- the funnel people CTE and count distinct actor_ids
        SELECT
            '{self.TOTAL_IDENTIFIER}' as name,

            countDistinctIf(
                actors.actor_id,
                actors.steps = target_step
            ) AS success_count,

            countDistinctIf(
                actors.actor_id,
                actors.steps <> target_step
            ) AS failure_count
        FROM funnel_actors AS actors
    """
    params = {
        **funnel_persons_params,
        "funnel_step_names": self._get_funnel_step_names(),
        "target_step": len(self._filter.entities),
        "event_names": self._filter.correlation_event_names,
        "exclude_property_names": self._filter.correlation_event_exclude_property_names,
    }

    return query, params
def _get_screen_name_parsing(self):
    """Return the SQL expression extracting the `$screen_name` event property."""
    screen_name_expr, _ = get_property_string_expr("events", "$screen_name", "'$screen_name'", "properties")
    return screen_name_expr
def _get_current_url_parsing(self):
    """SQL expression for `$current_url` with the trailing slash trimmed (a bare "/" is kept)."""
    url_expr, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties")
    return f"if(length({url_expr}) > 1, trim( TRAILING '/' FROM {url_expr}), {url_expr})"
def get_query(self, entities=None, entity_name="events", skip_entity_filter=False) -> Tuple[str, Dict[str, Any]]:
    """Assemble the full events SELECT for this filter.

    Args:
        entities: optional explicit entity list passed through to the entity filter.
        entity_name: params key prefix used by the entity filter.
        skip_entity_filter: when True, no entity condition is added to the WHERE clause.

    Returns:
        (SQL string, query params). Params are also accumulated on self.params.
    """
    _fields = [
        f"{self.EVENT_TABLE_ALIAS}.event as event",
        f"{self.EVENT_TABLE_ALIAS}.team_id as team_id",
        f"{self.EVENT_TABLE_ALIAS}.distinct_id as distinct_id",
        f"{self.EVENT_TABLE_ALIAS}.timestamp as timestamp",
        # elements_chain is only queried when the column optimizer says a
        # filter/series actually needs it.
        (f"{self.EVENT_TABLE_ALIAS}.elements_chain as elements_chain"
         if self._column_optimizer.should_query_elements_chain_column else ""),
        f"{get_aggregation_target_field(self._filter.aggregation_group_type_index, self.EVENT_TABLE_ALIAS, self.DISTINCT_ID_TABLE_ALIAS)} as aggregation_target",
    ]
    _fields += [
        f"{self.EVENT_TABLE_ALIAS}.{field} AS {field}"
        for field in self._extra_fields
    ]
    # Requested event properties come back as quoted columns named after the property.
    _fields += [
        get_property_string_expr("events", field, f"'{field}'", "properties",
                                 table_alias=self.EVENT_TABLE_ALIAS)[0] + f' as "{field}"'
        for field in self._extra_event_properties
    ]
    _fields.extend(
        f'{self.EVENT_TABLE_ALIAS}."{column_name}" as "{column_name}"'
        for column_name in self._column_optimizer.event_columns_to_query)
    _fields.extend(
        f"groups_{group_index}.group_properties_{group_index} as group_properties_{group_index}"
        for group_index in self._column_optimizer.group_types_to_query)

    if self._should_join_persons:
        _fields.extend(
            f"{self.PERSON_TABLE_ALIAS}.{column_name} as {column_name}"
            for column_name in self._person_query.fields)

    # Drop the empty-string placeholders left by disabled optional fields.
    _fields = list(filter(None, _fields))

    date_query, date_params = self._get_date_filter()
    self.params.update(date_params)

    prop_query, prop_params = self._get_prop_groups(
        self._filter.property_groups)
    self.params.update(prop_params)

    if skip_entity_filter:
        entity_query = ""
        entity_params: Dict[str, Any] = {}
    else:
        entity_query, entity_params = self._get_entity_query(
            entities, entity_name)
    self.params.update(entity_params)

    person_query, person_params = self._get_person_query()
    self.params.update(person_params)

    groups_query, groups_params = self._get_groups_query()
    self.params.update(groups_params)

    query = f"""
        SELECT {', 
'.join(_fields)}
        FROM events {self.EVENT_TABLE_ALIAS}
        {self._get_distinct_id_query()}
        {person_query}
        {groups_query}
        WHERE team_id = %(team_id)s
        {entity_query}
        {date_query}
        {prop_query}
        """

    return query, self.params
def _get_current_url_parsing(self):
    """SQL expression for `$current_url` with a trailing slash removed via regex (a bare "/" is kept)."""
    url_expr, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties")
    return f"if(length({url_expr}) > 1, replaceRegexpAll({url_expr}, '/$', ''), {url_expr})"
def get_query(self) -> Tuple[str, Dict[str, Any]]:
    """Assemble the events SELECT for this entity + filter.

    Returns:
        (SQL string, query params). Params are also accumulated on self.params.
    """
    # The field list is built as one string: timestamp first, then optional
    # column-optimizer columns, extracted event properties, person/distinct id
    # fields, and any extra event/person fields.
    _fields = (
        f"{self.EVENT_TABLE_ALIAS}.timestamp as timestamp"
        + (
            " ".join(
                f", {self.EVENT_TABLE_ALIAS}.{column_name} as {column_name}"
                for column_name in self._column_optimizer.event_columns_to_query
            )
        )
        + " ".join(
            [
                ", "
                + get_property_string_expr("events", property, f"'{property}'", "properties", table_alias="e")[0]
                + f" as {property}"
                for property in self._extra_event_properties
            ]
        )
        + (f", {self.DISTINCT_ID_TABLE_ALIAS}.person_id as person_id" if self._should_join_distinct_ids else "")
        + (f", {self.EVENT_TABLE_ALIAS}.distinct_id as distinct_id" if self._aggregate_users_by_distinct_id else "")
        + (
            " ".join(
                f", {self.EVENT_TABLE_ALIAS}.{column_name} as {column_name}"
                for column_name in self._extra_fields
            )
        )
        + (
            " ".join(
                f", {self.PERSON_TABLE_ALIAS}.{column_name} as {column_name}"
                for column_name in self._extra_person_fields
            )
        )
    )

    date_query, date_params = self._get_date_filter()
    self.params.update(date_params)

    # Combine the filter's property groups with the entity's own (AND semantics).
    prop_query, prop_params = self._get_prop_groups(
        self._filter.property_groups.combine_property_group(PropertyOperatorType.AND, self._entity.property_groups)
    )
    self.params.update(prop_params)

    entity_query, entity_params = self._get_entity_query()
    self.params.update(entity_params)

    person_query, person_params = self._get_person_query()
    self.params.update(person_params)

    groups_query, groups_params = self._get_groups_query()
    self.params.update(groups_params)

    query = f"""
        SELECT {_fields} FROM events {self.EVENT_TABLE_ALIAS}
        {self._get_distinct_id_query()}
        {person_query}
        {groups_query}
        WHERE team_id = %(team_id)s
        {entity_query}
        {date_query}
        {prop_query}
        """

    return query, self.params
def get_query(self) -> Tuple[str, Dict[str, Any]]:
    """Assemble the paths events SELECT, including optional funnel-path constraints.

    Returns:
        (SQL string, query params). Params are also accumulated on self.params.
    """
    funnel_paths_timestamp = ""
    funnel_paths_join = ""
    funnel_paths_filter = ""

    if self._filter.funnel_paths == FUNNEL_PATH_AFTER_STEP or self._filter.funnel_paths == FUNNEL_PATH_BEFORE_STEP:
        # used when looking for paths up to a dropoff point to account for events happening between the latest even and when the person is deemed dropped off
        funnel_window = (
            f"+ INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}"
        )
        operator = ">=" if self._filter.funnel_paths == FUNNEL_PATH_AFTER_STEP else "<="

        funnel_paths_timestamp = f"{self.FUNNEL_PERSONS_ALIAS}.timestamp AS target_timestamp"
        funnel_paths_join = f"JOIN {self.FUNNEL_PERSONS_ALIAS} ON {self.FUNNEL_PERSONS_ALIAS}.actor_id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id"
        # The funnel window is only appended for BEFORE_STEP with a negative
        # (dropoff) funnel step.
        funnel_paths_filter = f"AND {self.EVENT_TABLE_ALIAS}.timestamp {operator} target_timestamp {funnel_window if self._filter.funnel_paths == FUNNEL_PATH_BEFORE_STEP and self._filter.funnel_step and self._filter.funnel_step < 0 else ''}"
    elif self._filter.funnel_paths == FUNNEL_PATH_BETWEEN_STEPS:
        # Between steps: constrain events to the actor's [min, max] step timestamps.
        funnel_paths_timestamp = f"{self.FUNNEL_PERSONS_ALIAS}.min_timestamp as min_timestamp, {self.FUNNEL_PERSONS_ALIAS}.max_timestamp as max_timestamp"
        funnel_paths_join = f"JOIN {self.FUNNEL_PERSONS_ALIAS} ON {self.FUNNEL_PERSONS_ALIAS}.actor_id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id"
        funnel_paths_filter = f"AND {self.EVENT_TABLE_ALIAS}.timestamp >= min_timestamp AND {self.EVENT_TABLE_ALIAS}.timestamp <= max_timestamp"

    # We don't use ColumnOptimizer to decide what to query because Paths query doesn't surface any filter properties
    _fields = [
        f"{self.EVENT_TABLE_ALIAS}.timestamp AS timestamp",
        f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id as person_id" if self._should_join_distinct_ids else "",
        funnel_paths_timestamp,
    ]
    _fields += [
        f"{self.EVENT_TABLE_ALIAS}.{field} AS {field}" for field in self._extra_fields
    ]
    _fields += [
        get_property_string_expr("events", field, f"'{field}'",
                                 "properties", table_alias=self.EVENT_TABLE_ALIAS)[0] + f" as {field}"
        for field in self._extra_event_properties
    ]

    # Map $screen / $pageview events to their parsed screen name / URL; all
    # other events fall through to the raw event name. Disabled branches
    # become `if(0, '', ...)` no-ops so the nesting depth stays constant.
    event_conditional = (
        f"if({self.EVENT_TABLE_ALIAS}.event = '{SCREEN_EVENT}', {self._get_screen_name_parsing()}, "
        if self._should_query_screen() else "if(0, '', ")
    event_conditional += (
        f"if({self.EVENT_TABLE_ALIAS}.event = '{PAGEVIEW_EVENT}', {self._get_current_url_parsing()}, "
        if self._should_query_url() else "if(0, '', ")
    event_conditional += f"{self.EVENT_TABLE_ALIAS}.event)) AS path_item_ungrouped"

    _fields.append(event_conditional)

    grouping_fields, grouping_params = self._get_grouping_fields()
    _fields.extend(grouping_fields)
    self.params.update(grouping_params)

    # remove empty strings
    _fields = list(filter(None, _fields))

    date_query, date_params = self._get_date_filter()
    self.params.update(date_params)

    prop_query, prop_params = self._get_prop_groups(
        self._filter.property_groups)
    self.params.update(prop_params)

    event_query, event_params = self._get_event_query()
    self.params.update(event_params)

    person_query, person_params = self._get_person_query()
    self.params.update(person_params)

    groups_query, groups_params = self._get_groups_query()
    self.params.update(groups_params)

    query = f"""
        SELECT {','.join(_fields)} FROM events {self.EVENT_TABLE_ALIAS}
        {self._get_distinct_id_query()}
        {person_query}
        {groups_query}
        {funnel_paths_join}
        WHERE team_id = %(team_id)s
        {event_query}
        {date_query}
        {prop_query}
        {funnel_paths_filter}
        ORDER BY {self.DISTINCT_ID_TABLE_ALIAS}.person_id, {self.EVENT_TABLE_ALIAS}.timestamp
        """

    return query, self.params
def get_breakdown_prop_values(
    filter: Filter,
    entity: Entity,
    aggregate_operation: str,
    team_id: int,
    limit: int = 25,
    extra_params=None,
    column_optimizer: Optional[ColumnOptimizer] = None,
):
    """Returns the top N breakdown prop values for event/person breakdown.

    Args:
        filter: the insight filter carrying breakdown settings and properties.
        entity: the entity (series) whose events are aggregated.
        aggregate_operation: SQL aggregate used to rank breakdown values.
        team_id: the team scoping the query.
        limit: maximum number of breakdown values to return.
        extra_params: additional query params merged into the execution params.
        column_optimizer: optional shared ColumnOptimizer instance.
    """
    # Fix: `extra_params` previously defaulted to a shared mutable `{}`
    # (mutable-default-argument pitfall); default to None and normalize instead.
    if extra_params is None:
        extra_params = {}

    parsed_date_from, parsed_date_to, date_params = parse_timestamps(
        filter=filter, team_id=team_id)

    prop_filters, prop_filter_params = parse_prop_clauses(
        filter.properties + entity.properties,
        team_id,
        table_name="e",
        prepend="e_brkdwn",
        person_properties_mode=PersonPropertiesMode.EXCLUDE,
        allow_denormalized_props=True,
    )

    entity_params, entity_format_params = get_entity_filtering_params(
        entity, team_id, table_name="e")

    # Pick the expression extracting the breakdown value for its source table.
    if filter.breakdown_type == "person":
        value_expression, _ = get_property_string_expr(
            "person", cast(str, filter.breakdown), "%(key)s", "person_props")
    elif filter.breakdown_type == "group":
        value_expression, _ = get_property_string_expr(
            "groups", cast(str, filter.breakdown), "%(key)s",
            f"group_properties_{filter.breakdown_group_type_index}")
    else:
        value_expression, _ = get_property_string_expr(
            "events", cast(str, filter.breakdown), "%(key)s", "properties")

    # Only join persons when something in the query actually needs person data.
    person_join_clauses = ""
    person_join_params: Dict = {}
    person_query = ClickhousePersonQuery(filter, team_id, column_optimizer=column_optimizer, entity=entity)
    if person_query.is_used:
        person_subquery, person_join_params = person_query.get_query()
        person_join_clauses = f"""
        INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON e.distinct_id = pdi.distinct_id
        INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
        """

    groups_join_condition, groups_join_params = GroupsJoinQuery(
        filter, team_id, column_optimizer).get_join_query()

    elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
        value_expression=value_expression,
        parsed_date_from=parsed_date_from,
        parsed_date_to=parsed_date_to,
        prop_filters=prop_filters,
        aggregate_operation=aggregate_operation,
        person_join_clauses=person_join_clauses,
        groups_join_clauses=groups_join_condition,
        **entity_format_params,
    )

    return sync_execute(
        elements_query,
        {
            "key": filter.breakdown,
            "limit": limit,
            "team_id": team_id,
            "offset": filter.offset,
            **prop_filter_params,
            **entity_params,
            **person_join_params,
            **groups_join_params,
            **extra_params,
            **date_params,
        },
    )[0][0]