Esempio n. 1
0
 def _person_query(self):
     """Build a ``ClickhousePersonQuery`` for this object's filter and team.

     The query is constructed from the stored filter, team id and column
     optimizer, and additionally requests the ``created_at`` field.
     """
     extra_person_columns = ["created_at"]
     query = ClickhousePersonQuery(
         self._filter,
         self._team_id,
         self._column_optimizer,
         extra_fields=extra_person_columns,
     )
     return query
Esempio n. 2
0
    def _get_actor_subquery(self) -> Tuple[str, Dict[str, Any]]:
        """Return the JOIN clause, and its query params, that attaches actors to ``funnel_actors``.

        When aggregating by groups the join comes from ``GroupsJoinQuery``;
        otherwise a person subquery is joined on ``person.id``.
        """
        if self.is_aggregating_by_groups:
            groups_join = GroupsJoinQuery(
                self._filter,
                self._team.pk,
                join_key="funnel_actors.actor_id",
            )
            join_sql, join_params = groups_join.get_join_query()
            return join_sql, join_params

        # The person query is scoped by a synthetic entity carrying the
        # correlation property values from the filter.
        persons = ClickhousePersonQuery(
            self._filter,
            self._team.pk,
            entity=Entity({
                "id": "person",
                "type": "events",
                "properties": self._filter.correlation_property_values,
            }),
        )
        person_sql, join_params = persons.get_query()

        join_sql = f"""
                JOIN ({person_sql}) person
                ON person.id = funnel_actors.actor_id
            """
        return join_sql, join_params
Esempio n. 3
0
 def _person_query(self):
     """Build a ``ClickhousePersonQuery`` scoped to this object's entity.

     Uses the stored filter, team id and column optimizer, and forwards the
     stored extra person fields and entity as keyword arguments.
     """
     person_query_kwargs = {
         "extra_fields": self._extra_person_fields,
         "entity": self._entity,
     }
     return ClickhousePersonQuery(
         self._filter,
         self._team_id,
         self._column_optimizer,
         **person_query_kwargs,
     )
Esempio n. 4
0
def run_query(team: Team, filter: Filter, **kwargs):
    """Execute the person query for ``filter`` and summarise the result's shape.

    Extra keyword arguments are forwarded to ``ClickhousePersonQuery``.

    Returns:
        ``{"rows": <count>, "columns": <width of first row>}`` when rows come
        back, otherwise ``{"rows": 0}``.
    """
    person_query = ClickhousePersonQuery(filter, team.pk, **kwargs)
    sql, sql_params = person_query.get_query()
    rows = sync_execute(sql, {**sql_params, "team_id": team.pk})

    row_count = len(rows)
    if row_count == 0:
        return {"rows": 0}
    return {"rows": row_count, "columns": len(rows[0])}
Esempio n. 5
0
 def __init__(self, entity: Entity, *args, **kwargs):
     """Store the entity, initialise the parent, then build an entity-scoped person query.

     Args:
         entity: Entity stored on the instance and passed to the person query.
         *args: Forwarded unchanged to the parent initialiser.
         **kwargs: Forwarded unchanged to the parent initialiser;
             ``extra_person_fields`` (default: empty list) is also read here
             and handed to the person query as its extra fields.
     """
     self._entity = entity
     super().__init__(*args, **kwargs)

     requested_person_fields = kwargs.get("extra_person_fields", [])
     self._person_query = ClickhousePersonQuery(
         self._filter,
         self._team_id,
         self._column_optimizer,
         extra_fields=requested_person_fields,
         entity=entity,
     )
Esempio n. 6
0
 def _person_join_condition(self) -> Tuple[str, Dict]:
     """Return the SQL join fragment (and params) needed to reach person data.

     Three outcomes:
       * the person query is used: distinct-id join plus the person subquery;
       * otherwise, the entity math is ``"dau"``: only the distinct-id join;
       * otherwise: no join at all.
     """
     persons = ClickhousePersonQuery(
         self.filter,
         self.team_id,
         self.column_optimizer,
         entity=self.entity,
     )

     if not persons.is_used:
         if self.entity.math == "dau":
             # Only join distinct_ids
             return EVENT_JOIN_PERSON_SQL, {}
         return "", {}

     person_sql, person_params = persons.get_query()
     join_sql = f"""
         {EVENT_JOIN_PERSON_SQL}
         INNER JOIN ({person_sql}) person
         ON person.id = pdi.person_id
         """
     return join_sql, person_params
Esempio n. 7
0
    def _run_query(self, filter: Filter, join_person_tables=False) -> List:
        """Select event uuids matching ``filter.properties`` and return the raw rows.

        Args:
            filter: Filter whose properties become the WHERE conditions.
            join_person_tables: When true, also join the distinct-id table and
                the person subquery built from ``filter``.

        The generated SQL is asserted not to contain the text "json".
        """
        where_clause, params = parse_prop_clauses(
            filter.properties,
            self.team.pk,
            allow_denormalized_props=True,
            person_properties_mode=PersonPropertiesMode.EXCLUDE,
        )

        if join_person_tables:
            person_subquery, person_join_params = ClickhousePersonQuery(filter, self.team.pk).get_query()
            joins = f"""
                INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON events.distinct_id = pdi.distinct_id
                INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
            """
            params.update(person_join_params)
        else:
            joins = ""

        final_query = f"SELECT uuid FROM events {joins} WHERE team_id = %(team_id)s {where_clause}"
        # Make sure we don't accidentally use json on the properties field
        self.assertNotIn("json", final_query.lower())

        execute_params = {**params, "team_id": self.team.pk}
        return sync_execute(final_query, execute_params)
Esempio n. 8
0
 def _person_join_condition(self) -> Tuple[str, Dict]:
     """Return the SQL join fragment (and params) needed to reach person data.

     The distinct-id join is rendered from ``EVENT_JOIN_PERSON_SQL`` using the
     team's distinct-ids query. Three outcomes:
       * the person query is used: distinct-id join plus the person subquery;
       * otherwise, the entity math is ``"dau"``: only the distinct-id join;
       * otherwise: no join at all.
     """
     persons = ClickhousePersonQuery(
         self.filter,
         self.team_id,
         self.column_optimizer,
         entity=self.entity,
     )
     distinct_ids_sql = get_team_distinct_ids_query(self.team_id)
     event_join = EVENT_JOIN_PERSON_SQL.format(GET_TEAM_PERSON_DISTINCT_IDS=distinct_ids_sql)

     if not persons.is_used:
         if self.entity.math == "dau":
             # Only join distinct_ids
             return event_join, {}
         return "", {}

     person_sql, person_params = persons.get_query()
     join_sql = f"""
         {event_join}
         INNER JOIN ({person_sql}) person
         ON person.id = pdi.person_id
         """
     return join_sql, person_params
Esempio n. 9
0
    def _get_aggregation_join_query(self):
        """Return the JOIN clause, and its params, for the aggregation target.

        With a group type index set on the filter, the join comes from
        ``GroupsJoinQuery``; otherwise a person subquery is joined to
        ``funnel_actors`` on the actor id.
        """
        if self._filter.aggregation_group_type_index is not None:
            groups_join = GroupsJoinQuery(self._filter, self._team.pk, join_key="funnel_actors.actor_id")
            return groups_join.get_join_query()

        optimizer = ColumnOptimizer(self._filter, self._team.pk)
        person_sql, person_params = ClickhousePersonQuery(self._filter, self._team.pk, optimizer).get_query()

        join_sql = f"""
                JOIN ({person_sql}) person
                    ON person.id = funnel_actors.actor_id
            """
        return join_sql, person_params
Esempio n. 10
0
    def __init__(
        self,
        filter: Union[Filter, PathFilter, RetentionFilter],
        team_id: int,
        round_interval=False,
        should_join_distinct_ids=False,
        should_join_persons=False,
        # Extra events/person table columns to fetch since parent query needs them
        extra_fields: List[ColumnName] = [],
        extra_person_fields: List[ColumnName] = [],
        **kwargs,
    ) -> None:
        """Store the query inputs and work out which joins are needed.

        Args:
            filter: Filter the query is built from.
            team_id: Team id; also seeded into ``self.params`` as ``team_id``.
            round_interval: Stored as ``_should_round_interval``.
            should_join_distinct_ids: Force the distinct-id join; when false,
                ``_determine_should_join_distinct_ids`` decides.
            should_join_persons: Force the person join; when false,
                ``_determine_should_join_persons`` decides.
            extra_fields: Extra events-table columns the parent query needs.
            extra_person_fields: Extra person-table columns the parent query
                needs; also given to the person query as its extra fields.
            **kwargs: Accepted but not used here.
        """
        # The defaults above are single list objects shared by every call.
        # Copy them so that state kept on this instance never aliases the
        # shared default (or a list the caller still owns).
        extra_fields = list(extra_fields)
        extra_person_fields = list(extra_person_fields)

        self._filter = filter
        self._team_id = team_id
        self._column_optimizer = ColumnOptimizer(self._filter, self._team_id)
        self._person_query = ClickhousePersonQuery(
            self._filter,
            self._team_id,
            self._column_optimizer,
            extra_fields=extra_person_fields)
        self.params: Dict[str, Any] = {
            "team_id": self._team_id,
        }

        self._should_join_distinct_ids = should_join_distinct_ids
        self._should_join_persons = should_join_persons
        self._extra_fields = extra_fields
        self._extra_person_fields = extra_person_fields

        # Explicitly requested joins are kept; otherwise infer them.
        if not self._should_join_distinct_ids:
            self._determine_should_join_distinct_ids()

        if not self._should_join_persons:
            self._determine_should_join_persons()

        self._should_round_interval = round_interval
Esempio n. 11
0
def person_query(team: Team, filter: Filter, **kwargs):
    """Return only the SQL text of the person query built for ``filter`` and ``team``.

    Extra keyword arguments are forwarded to ``ClickhousePersonQuery``; the
    remaining items returned by ``get_query`` are discarded.
    """
    query_builder = ClickhousePersonQuery(filter, team.pk, **kwargs)
    query_result = query_builder.get_query()
    return query_result[0]
Esempio n. 12
0
    def get_properties_query(self) -> Tuple[str, Dict[str, Any]]:
        """Build the SQL and params that count funnel successes/failures per person property value.

        Returns:
            A ``(query, params)`` tuple. The query yields one row per
            ``PropertyName::Value`` pair plus a final row named
            ``self.TOTAL_IDENTIFIER`` with the overall counts.

        Raises:
            ValidationError: if the filter has no correlation property names.
        """
        property_names = self._filter.correlation_property_names
        if not property_names:
            raise ValidationError(
                "Property Correlation expects atleast one Property to run correlation on"
            )

        funnel_persons_query, funnel_persons_params = self.get_funnel_persons_cte()
        person_prop_query, person_prop_params = self._get_properties_prop_clause()

        column_optimizer = ColumnOptimizer(self._filter, self._team.pk)
        person_query, person_query_params = ClickhousePersonQuery(
            self._filter, self._team.pk, column_optimizer
        ).get_query()

        query = f"""
            WITH
                funnel_people as ({funnel_persons_query}),
                %(target_step)s AS target_step
            SELECT
                concat(prop.1, '::', prop.2) as name,
                -- We generate a unique identifier for each property value as: PropertyName::Value
                countDistinctIf(person_id, steps = target_step) AS success_count,
                countDistinctIf(person_id, steps <> target_step) AS failure_count
            FROM (
                SELECT
                    person_id,
                    funnel_people.steps as steps,
                    /*
                        We can extract multiple property values at the same time, since we're
                        already querying the person table.
                        This gives us something like:
                        --------------------
                        person1, steps, [property_value_0, property_value_1, property_value_2]
                        person2, steps, [property_value_0, property_value_1, property_value_2]

                        To group by property name, we need to extract the property from the array. ArrayJoin helps us do that.
                        It transforms the above into:

                        --------------------

                        person1, steps, property_value_0
                        person1, steps, property_value_1
                        person1, steps, property_value_2

                        person2, steps, property_value_0
                        person2, steps, property_value_1
                        person2, steps, property_value_2

                        To avoid clashes and clarify the values, we also zip with the property name, to generate
                        tuples like: (property_name, property_value), which we then group by
                    */
                    {person_prop_query}
                FROM funnel_people
                JOIN ({person_query}) person
                ON person.id = funnel_people.person_id
            ) person_with_props
            -- Group by the tuple items: (property_name, property_value) generated by zip
            GROUP BY prop.1, prop.2
            HAVING prop.1 NOT IN %(exclude_property_names)s
            UNION ALL
            SELECT
                '{self.TOTAL_IDENTIFIER}' as name,
                countDistinctIf(person_id, steps = target_step) AS success_count,
                countDistinctIf(person_id, steps <> target_step) AS failure_count
            FROM funnel_people
        """

        # Later sources win on key clashes, matching the merge order of the
        # sub-query params followed by the explicit entries.
        params: Dict[str, Any] = {}
        params.update(funnel_persons_params)
        params.update(person_prop_params)
        params.update(person_query_params)
        params["target_step"] = len(self._filter.entities)
        params["property_names"] = self._filter.correlation_property_names
        params["exclude_property_names"] = self._filter.correlation_property_exclude_names

        return query, params
Esempio n. 13
0
def get_breakdown_prop_values(
    filter: Filter,
    entity: Entity,
    aggregate_operation: str,
    team_id: int,
    limit: int = BREAKDOWN_VALUES_LIMIT,
    extra_params={},
    column_optimizer: Optional[ColumnOptimizer] = None,
):
    """
    Fetch the top N breakdown property values for an event/person breakdown.

    e.g. for Browser with limit 3 might return ['Chrome', 'Safari', 'Firefox', 'Other']

    A ``ColumnOptimizer`` is created from ``filter`` and ``team_id`` when none
    is supplied. The return value is the first column of the first row of the
    executed top-elements query.
    """
    if not column_optimizer:
        column_optimizer = ColumnOptimizer(filter, team_id)

    parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id)

    # Filter-level and entity-level property groups are AND-ed together; only
    # the "outer" part of the optimised groups is applied here.
    combined_groups = filter.property_groups.combine_property_group(
        PropertyOperatorType.AND, entity.property_groups)
    parsed_groups = column_optimizer.property_optimizer.parse_property_groups(combined_groups)
    outer_properties = parsed_groups.outer

    prop_filters, prop_filter_params = parse_prop_grouped_clauses(
        team_id=team_id,
        property_group=outer_properties,
        table_name="e",
        prepend="e_brkdwn",
        person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN,
        allow_denormalized_props=True,
    )

    entity_params, entity_format_params = get_entity_filtering_params(
        entity=entity, team_id=team_id, table_name="e")

    value_expression = _to_value_expression(
        filter.breakdown_type,
        filter.breakdown,
        filter.breakdown_group_type_index,
    )

    person_join_params: Dict
    person_query = ClickhousePersonQuery(
        filter,
        team_id,
        column_optimizer=column_optimizer,
        entity=entity,
    )
    if person_query.is_used:
        person_subquery, person_join_params = person_query.get_query()
        person_join_clauses = f"""
            INNER JOIN ({get_team_distinct_ids_query(team_id)}) AS pdi ON e.distinct_id = pdi.distinct_id
            INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
        """
    else:
        person_join_clauses = ""
        person_join_params = {}

    groups_join = GroupsJoinQuery(filter, team_id, column_optimizer)
    groups_join_condition, groups_join_params = groups_join.get_join_query()

    elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
        value_expression=value_expression,
        parsed_date_from=parsed_date_from,
        parsed_date_to=parsed_date_to,
        prop_filters=prop_filters,
        aggregate_operation=aggregate_operation,
        person_join_clauses=person_join_clauses,
        groups_join_clauses=groups_join_condition,
        **entity_format_params,
    )

    query_params = {
        "key": filter.breakdown,
        "limit": limit,
        "team_id": team_id,
        "offset": filter.offset,
        **prop_filter_params,
        **entity_params,
        **person_join_params,
        **groups_join_params,
        **extra_params,
        **date_params,
    }
    rows = sync_execute(elements_query, query_params)
    return rows[0][0]
Esempio n. 14
0
class ClickhouseEventQuery(metaclass=ABCMeta):
    """Abstract base class for objects that assemble a ClickHouse query over events.

    Subclasses implement ``get_query`` and ``_determine_should_join_distinct_ids``.
    This class supplies helpers that render the distinct-id, person and groups
    joins, the date filter, and property/cohort filter clauses.
    """

    # Aliases used for the joined tables in the generated SQL.
    DISTINCT_ID_TABLE_ALIAS = "pdi"
    PERSON_TABLE_ALIAS = "person"
    EVENT_TABLE_ALIAS = "e"

    _filter: Union[Filter, PathFilter, RetentionFilter]
    _team_id: int
    _column_optimizer: ColumnOptimizer
    _person_query: ClickhousePersonQuery
    _should_join_distinct_ids = False
    _should_join_persons = False
    _should_round_interval = False
    _extra_fields: List[ColumnName]
    _extra_person_fields: List[ColumnName]

    def __init__(
        self,
        filter: Union[Filter, PathFilter, RetentionFilter],
        team_id: int,
        round_interval=False,
        should_join_distinct_ids=False,
        should_join_persons=False,
        # Extra events/person table columns to fetch since parent query needs them
        extra_fields: List[ColumnName] = [],
        extra_person_fields: List[ColumnName] = [],
        **kwargs,
    ) -> None:
        """Store the query inputs and work out which joins are needed.

        Args:
            filter: Filter the query is built from.
            team_id: Team id; also seeded into ``self.params`` as ``team_id``.
            round_interval: Stored as ``_should_round_interval``.
            should_join_distinct_ids: Force the distinct-id join; when false,
                ``_determine_should_join_distinct_ids`` decides.
            should_join_persons: Force the person join; when false,
                ``_determine_should_join_persons`` decides.
            extra_fields: Extra events-table columns the parent query needs.
            extra_person_fields: Extra person-table columns the parent query
                needs; also given to the person query as its extra fields.
            **kwargs: Accepted but not used here.
        """
        # The defaults above are single list objects shared by every call.
        # Copy them so that state kept on this instance never aliases the
        # shared default (or a list the caller still owns).
        extra_fields = list(extra_fields)
        extra_person_fields = list(extra_person_fields)

        self._filter = filter
        self._team_id = team_id
        self._column_optimizer = ColumnOptimizer(self._filter, self._team_id)
        self._person_query = ClickhousePersonQuery(
            self._filter,
            self._team_id,
            self._column_optimizer,
            extra_fields=extra_person_fields)
        self.params: Dict[str, Any] = {
            "team_id": self._team_id,
        }

        self._should_join_distinct_ids = should_join_distinct_ids
        self._should_join_persons = should_join_persons
        self._extra_fields = extra_fields
        self._extra_person_fields = extra_person_fields

        # Explicitly requested joins are kept; otherwise infer them.
        if not self._should_join_distinct_ids:
            self._determine_should_join_distinct_ids()

        if not self._should_join_persons:
            self._determine_should_join_persons()

        self._should_round_interval = round_interval

    @abstractmethod
    def get_query(self) -> Tuple[str, Dict[str, Any]]:
        """Return the final ``(sql, params)`` pair; implemented by subclasses."""
        pass

    @abstractmethod
    def _determine_should_join_distinct_ids(self) -> None:
        """Decide whether the distinct-id join is needed; implemented by subclasses."""
        pass

    def _get_disintct_id_query(self) -> str:
        """Return the distinct-id INNER JOIN fragment, or "" when it is not needed."""
        # NOTE(review): the join condition references the table as ``events``
        # while EVENT_TABLE_ALIAS is "e" -- confirm which name callers use.
        if self._should_join_distinct_ids:
            return f"""
            INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS {self.DISTINCT_ID_TABLE_ALIAS}
            ON events.distinct_id = {self.DISTINCT_ID_TABLE_ALIAS}.distinct_id
            """
        else:
            return ""

    def _determine_should_join_persons(self) -> None:
        """Enable the person join (and with it the distinct-id join) when required.

        The checks run in order and stop at the first one that matches:
        the person query is used, a filter-level property needs persons, an
        entity-level property needs persons, or the breakdown type is "person".
        """
        # :KLUDGE: The property checks are mostly making sure if cohorts are included as well.
        #   Can be simplified significantly after https://github.com/PostHog/posthog/issues/5854
        needs_persons = (
            self._person_query.is_used
            or any(
                self._should_property_join_persons(prop)
                for prop in self._filter.properties)
            or any(
                self._should_property_join_persons(prop)
                for entity in self._filter.entities
                for prop in entity.properties)
            or self._filter.breakdown_type == "person"
        )

        if needs_persons:
            self._should_join_distinct_ids = True
            self._should_join_persons = True

    def _should_property_join_persons(self, prop: Property) -> bool:
        """Return whether ``prop`` is a cohort property that needs the person join."""
        return prop.type == "cohort" and self._does_cohort_need_persons(prop)

    def _does_cohort_need_persons(self, prop: Property) -> bool:
        """Return whether the cohort referenced by ``prop.value`` needs the person join.

        A cohort missing for this team never needs it. Otherwise it does when
        the cohort is precalculated, is static, or has any group with
        properties.
        """
        try:
            cohort: Cohort = Cohort.objects.get(pk=prop.value,
                                                team_id=self._team_id)
        except Cohort.DoesNotExist:
            return False
        if is_precalculated_query(cohort):
            return True
        if cohort.is_static:
            return True
        return any(group.get("properties") for group in cohort.groups)

    def _get_person_query(self) -> Tuple[str, Dict]:
        """Return the person INNER JOIN fragment and its params, or ``("", {})``."""
        if self._should_join_persons:
            person_query, params = self._person_query.get_query()
            return (
                f"""
            INNER JOIN ({person_query}) {self.PERSON_TABLE_ALIAS}
            ON {self.PERSON_TABLE_ALIAS}.id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id
            """,
                params,
            )
        else:
            return "", {}

    def _get_groups_query(self) -> Tuple[str, Dict]:
        """Return the groups join produced by ``GroupsJoinQuery`` for this filter and team."""
        return GroupsJoinQuery(self._filter, self._team_id,
                               self._column_optimizer).get_join_query()

    def _get_date_filter(self) -> Tuple[str, Dict]:
        """Return the date-range SQL fragment and its params from ``parse_timestamps``."""
        parsed_date_from, parsed_date_to, date_params = parse_timestamps(
            filter=self._filter, team_id=self._team_id)

        query = f"""
        {parsed_date_from}
        {parsed_date_to}
        """

        return query, date_params

    def _get_props(self, filters: List[Property]) -> Tuple[str, Dict]:
        """Render ``filters`` as SQL conditions joined by spaces, with merged params.

        Cohort properties become ``AND <cohort subquery>``; every other
        property is rendered by ``parse_prop_clauses`` with person properties
        excluded and a ``global_<index>`` parameter prefix.
        """
        final = []
        params: Dict[str, Any] = {}

        for idx, prop in enumerate(filters):
            if prop.type == "cohort":
                person_id_query, cohort_filter_params = self._get_cohort_subquery(
                    prop)
                params.update(cohort_filter_params)
                final.append(f"AND {person_id_query}")
            else:
                filter_query, filter_params = parse_prop_clauses(
                    [prop],
                    self._team_id,
                    prepend=f"global_{idx}",
                    allow_denormalized_props=True,
                    person_properties_mode=PersonPropertiesMode.EXCLUDE,
                )
                final.append(filter_query)
                params.update(filter_params)
        return " ".join(final), params

    def _get_cohort_subquery(self, prop) -> Tuple[str, Dict[str, Any]]:
        """Return the SQL condition and params restricting rows to members of a cohort.

        The condition matches on the distinct-id table's ``person_id`` column.
        A cohort that does not exist for this team yields an always-false
        condition.
        """
        try:
            cohort: Cohort = Cohort.objects.get(pk=prop.value,
                                                team_id=self._team_id)
        except Cohort.DoesNotExist:
            return "0 = 11", {}  # If cohort doesn't exist, nothing can match

        match_field = f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id"

        if is_precalculated_query(cohort):
            person_id_query, cohort_filter_params = format_precalculated_cohort_query(
                cohort.pk,
                0,
                custom_match_field=match_field)
        else:
            person_id_query, cohort_filter_params = format_person_query(
                cohort,
                0,
                custom_match_field=match_field)

        return person_id_query, cohort_filter_params
Esempio n. 15
0
def get_breakdown_prop_values(
    filter: Filter,
    entity: Entity,
    aggregate_operation: str,
    team_id: int,
    limit: int = 25,
    extra_params={},
    column_optimizer: Optional[ColumnOptimizer] = None,
):
    """Fetch the top N breakdown property values for an event/person breakdown.

    The breakdown value is read from the person, group or event properties
    column depending on ``filter.breakdown_type``. The return value is the
    first column of the first row of the executed top-elements query.
    """
    parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id)

    all_properties = filter.properties + entity.properties
    prop_filters, prop_filter_params = parse_prop_clauses(
        all_properties,
        team_id,
        table_name="e",
        prepend="e_brkdwn",
        person_properties_mode=PersonPropertiesMode.EXCLUDE,
        allow_denormalized_props=True,
    )

    entity_params, entity_format_params = get_entity_filtering_params(entity, team_id, table_name="e")

    # Pick the source table and properties column for the breakdown value.
    breakdown_type = filter.breakdown_type
    if breakdown_type == "person":
        source_table, properties_column = "person", "person_props"
    elif breakdown_type == "group":
        source_table = "groups"
        properties_column = f"group_properties_{filter.breakdown_group_type_index}"
    else:
        source_table, properties_column = "events", "properties"

    value_expression, _ = get_property_string_expr(
        source_table, cast(str, filter.breakdown), "%(key)s", properties_column)

    person_join_params: Dict
    person_query = ClickhousePersonQuery(
        filter,
        team_id,
        column_optimizer=column_optimizer,
        entity=entity,
    )
    if person_query.is_used:
        person_subquery, person_join_params = person_query.get_query()
        person_join_clauses = f"""
            INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON e.distinct_id = pdi.distinct_id
            INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id
        """
    else:
        person_join_clauses = ""
        person_join_params = {}

    groups_join = GroupsJoinQuery(filter, team_id, column_optimizer)
    groups_join_condition, groups_join_params = groups_join.get_join_query()

    elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
        value_expression=value_expression,
        parsed_date_from=parsed_date_from,
        parsed_date_to=parsed_date_to,
        prop_filters=prop_filters,
        aggregate_operation=aggregate_operation,
        person_join_clauses=person_join_clauses,
        groups_join_clauses=groups_join_condition,
        **entity_format_params,
    )

    query_params = {
        "key": filter.breakdown,
        "limit": limit,
        "team_id": team_id,
        "offset": filter.offset,
        **prop_filter_params,
        **entity_params,
        **person_join_params,
        **groups_join_params,
        **extra_params,
        **date_params,
    }
    rows = sync_execute(elements_query, query_params)
    return rows[0][0]