def _retrieve_people(self, filter: RetentionFilter, team: Team): period = filter.period trunc, fields = self._get_trunc_func("timestamp", period) is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME entity_condition, _ = self.get_entity_condition( filter.target_entity, "events") returning_condition, _ = self.get_entity_condition( filter.returning_entity, "first_event_date") _entity_condition = returning_condition if filter.selected_interval > 0 else entity_condition events = Event.objects.filter(team_id=team.pk).add_person_id(team.pk) filtered_events = events.filter( filter.recurring_date_filter_Q()).filter( filter.properties_to_Q(team_id=team.pk)) inner_events = (Event.objects.filter(team_id=team.pk).filter( filter.properties_to_Q(team_id=team.pk)).add_person_id( team.pk).filter(**{ "person_id": OuterRef("id") }).filter(entity_condition).values("person_id").annotate( first_date=Min(trunc)).filter( filter.reference_date_filter_Q("first_date")).distinct( ) if is_first_time_retention else Event.objects.filter( team_id=team.pk).filter( filter.reference_date_filter_Q()).filter( filter.properties_to_Q( team_id=team.pk)).add_person_id( team.pk).filter( **{ "person_id": OuterRef("id") }).filter(entity_condition)) filtered_events = (filtered_events.filter(_entity_condition).filter( Exists( Person.objects.filter(**{ "id": OuterRef("person_id"), }).filter(Exists(inner_events)).only("id"))).values( "person_id").distinct()).all() people = Person.objects.filter( team=team, id__in=[ p["person_id"] for p in filtered_events[filter.offset:filter.offset + 100] ], ) people = people.prefetch_related( Prefetch("persondistinctid_set", to_attr="distinct_ids_cache")) from posthog.api.person import PersonSerializer return PersonSerializer(people, many=True).data
def _determine_query_params(self, filter: RetentionFilter, team: Team): period = filter.period is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME events: QuerySet = QuerySet() entity_condition, entity_condition_strigified = self.get_entity_condition( filter.target_entity, "first_event_date") returning_condition, returning_condition_stringified = self.get_entity_condition( filter.returning_entity, "events") events = Event.objects.filter(team_id=team.pk).add_person_id( team.pk).annotate(event_date=F("timestamp")) trunc, fields = self._get_trunc_func("timestamp", period) if is_first_time_retention: filtered_events = events.filter( filter.properties_to_Q(team_id=team.pk)) first_date = (filtered_events.filter(entity_condition).values( "person_id", "event", "action").annotate(first_date=Min(trunc)).filter( filter.custom_date_filter_Q("first_date")).distinct()) final_query = (filtered_events.filter( filter.date_filter_Q).filter(returning_condition).values_list( "person_id", "event_date", "event", "action").union( first_date.values_list("first_date", "person_id", "event", "action"))) else: filtered_events = events.filter(filter.date_filter_Q).filter( filter.properties_to_Q(team_id=team.pk)) first_date = (filtered_events.filter(entity_condition).annotate( first_date=trunc).values("first_date", "person_id", "event", "action").distinct()) final_query = ( filtered_events.filter(returning_condition).values_list( "person_id", "event_date", "event", "action").union( first_date.values_list("first_date", "person_id", "event", "action"))) start_params = ((filter.date_from, filter.date_from) if period == "Month" or period == "Hour" else (filter.date_from, )) event_query, events_query_params = final_query.query.sql_with_params() reference_event_query, first_date_params = first_date.query.sql_with_params( ) event_params = (filter.target_entity.id, filter.returning_entity.id, filter.target_entity.id) return ( { "event_query": event_query, "reference_event_query": reference_event_query, "fields": fields, "return_condition": returning_condition_stringified, "target_condition": entity_condition_strigified, }, start_params + events_query_params + first_date_params + event_params, )
def _execute_sql( self, filter: RetentionFilter, team: Team, ) -> Dict[Tuple[int, int], Dict[str, Any]]: period = filter.period is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME events: QuerySet = QuerySet() entity_condition, entity_condition_strigified = self.get_entity_condition( filter.target_entity, "first_event_date") returning_condition, returning_condition_stringified = self.get_entity_condition( filter.returning_entity, "events") events = Event.objects.filter(team_id=team.pk).add_person_id( team.pk).annotate(event_date=F("timestamp")) trunc, fields = self._get_trunc_func("timestamp", period) if is_first_time_retention: filtered_events = events.filter( filter.properties_to_Q(team_id=team.pk)) first_date = (filtered_events.filter(entity_condition).values( "person_id", "event", "action").annotate(first_date=Min(trunc)).filter( filter.custom_date_filter_Q("first_date")).distinct()) final_query = (filtered_events.filter( filter.date_filter_Q).filter(returning_condition).values_list( "person_id", "event_date", "event", "action").union( first_date.values_list("first_date", "person_id", "event", "action"))) else: filtered_events = events.filter(filter.date_filter_Q).filter( filter.properties_to_Q(team_id=team.pk)) first_date = (filtered_events.filter(entity_condition).annotate( first_date=trunc).values("first_date", "person_id", "event", "action").distinct()) final_query = ( filtered_events.filter(returning_condition).values_list( "person_id", "event_date", "event", "action").union( first_date.values_list("first_date", "person_id", "event", "action"))) event_query, events_query_params = final_query.query.sql_with_params() reference_event_query, first_date_params = first_date.query.sql_with_params( ) final_query = """ SELECT {fields} COUNT(DISTINCT "events"."person_id"), array_agg(DISTINCT "events"."person_id") as people FROM ({event_query}) events LEFT JOIN ({reference_event_query}) first_event_date ON (events.person_id = first_event_date.person_id) WHERE event_date >= first_date AND {target_condition} AND {return_condition} OR ({target_condition} AND event_date = first_date) GROUP BY date, first_date """.format( event_query=event_query, reference_event_query=reference_event_query, fields=fields, return_condition=returning_condition_stringified, target_condition=entity_condition_strigified, ) event_params = (filter.target_entity.id, filter.returning_entity.id, filter.target_entity.id) start_params = ((filter.date_from, filter.date_from) if period == "Month" or period == "Hour" else (filter.date_from, )) with connection.cursor() as cursor: cursor.execute( final_query, start_params + events_query_params + first_date_params + event_params, ) data = namedtuplefetchall(cursor) scores: dict = {} for datum in data: key = round(datum.first_date, 1) if not scores.get(key, None): scores.update({key: {}}) for person in datum.people: if not scores[key].get(person, None): scores[key].update({person: 1}) else: scores[key][person] += 1 by_dates = {} for row in data: people = sorted( row.people, key=lambda p: scores[round(row.first_date, 1)][int(p)], reverse=True, ) random_key = "".join( random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10)) cache_key = generate_cache_key("{}{}{}".format( random_key, str(round(row.first_date, 0)), str(team.pk))) cache.set( cache_key, people, 600, ) by_dates.update({ (int(row.first_date), int(row.date)): { "count": row.count, "people": people[0:100], "offset": 100, "next": cache_key if len(people) > 100 else None, } }) return by_dates