def build_all_sessions_query(self, events: QuerySet, _date_gte=Q()) -> Tuple[str, QueryParams]:
    """Build a raw SQL query that tags every event with session ids.

    Each event is annotated (via window functions) with the previous event
    and timestamp for the same distinct_id; the wrapping SQL then starts a
    new session whenever a user was idle for >= 30 minutes (or has no
    previous event), and running SUMs of that flag produce a global and a
    per-user session id.

    Returns the SQL string plus the params of the inner ORM query.
    NOTE(review): annotation said Tuple[Query, ...] but a str is returned.
    """
    # NOTE(review): Q() as a default argument is shared across calls; it is
    # never mutated here, so this is safe in practice, but None would be
    # the more defensive default.
    sessions = (
        events.filter(_date_gte)
        .annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )
        )
        .annotate(
            previous_event=Window(
                expression=Lag("event", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )
        )
    )
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    # new_session = 1 on a >= 30 minute gap (or the user's first event);
    # the two SUM(...) OVER (...) turn the flag into session ids.
    all_sessions = "\
        SELECT *,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT id, team_id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
        OR previous_timestamp IS NULL \
        THEN 1 ELSE 0 END AS new_session \
        FROM ({}) AS inner_sessions\
        ) AS outer_sessions".format(sessions_sql)
    return all_sessions, sessions_sql_params
def calculate_sessions(
    self, events: QuerySet, session_type: Optional[str], filter: Filter, team: Team, offset: int
) -> List[Dict[str, Any]]:
    """Return session analytics for the given events.

    session_type selects the result: "avg" -> average session duration,
    "dist" -> duration distribution, anything else (incl. None) -> a paged
    session list. A session ends after a 30-minute idle gap per distinct_id.
    """
    # format date filter for session view
    _date_gte = Q()
    if session_type is None:
        # if _date_from is not explicitely set we only want to get the last day worth of data
        # otherwise the query is very slow
        if filter._date_from and filter.date_to:
            _date_gte = Q(
                timestamp__gte=filter.date_from,
                timestamp__lte=filter.date_to + relativedelta(days=1),
            )
        else:
            dt = now()
            dt = dt.replace(hour=0, minute=0, second=0, microsecond=0)
            _date_gte = Q(timestamp__gte=dt, timestamp__lte=dt + relativedelta(days=1))
    else:
        if not filter.date_from:
            # Default the range start to midnight of the team's first event.
            filter._date_from = (
                Event.objects.filter(team_id=team)
                .order_by("timestamp")[0]
                .timestamp.replace(hour=0, minute=0, second=0, microsecond=0)
                .isoformat()
            )
    # Previous event/timestamp per distinct_id for session-boundary detection.
    sessions = (
        events.filter(_date_gte)
        .annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )
        )
        .annotate(
            previous_event=Window(
                expression=Lag("event", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )
        )
    )
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    # new_session flags >= 30 min gaps; running SUMs produce session ids.
    all_sessions = "\
        SELECT *,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
        OR previous_timestamp IS NULL \
        THEN 1 ELSE 0 END AS new_session \
        FROM ({}) AS inner_sessions\
        ) AS outer_sessions".format(sessions_sql)

    result: List = []
    if session_type == "avg":
        result = self._session_avg(all_sessions, sessions_sql_params, filter)
    elif session_type == "dist":
        result = self._session_dist(all_sessions, sessions_sql_params)
    else:
        result = self._session_list(all_sessions, sessions_sql_params, team, filter, offset)
    return result
def calculate_sessions(
    self,
    events: QuerySet,
    session_type: Optional[str],
    date_filter: Dict[str, datetime],
    team: Team,
    request: request.Request,
) -> List[Dict[str, Any]]:
    """Session analytics dispatcher: "avg" -> average-duration series,
    "dist" -> duration distribution, anything else -> a session list.
    A session ends after a 30-minute idle gap per distinct_id.
    """
    # format date filter for session view
    _date_gte = Q()
    if session_type is None:
        # Session list: restrict to a single day (the explicit date_from,
        # otherwise today) to keep the query fast.
        if request.GET.get("date_from", None):
            _date_gte = Q(
                timestamp__gte=date_filter["timestamp__gte"],
                timestamp__lte=date_filter["timestamp__gte"] + relativedelta(days=1),
            )
        else:
            dt = datetime.now()
            dt = dt.replace(hour=0, minute=0, second=0, microsecond=0)
            _date_gte = Q(timestamp__gte=dt, timestamp__lte=dt + relativedelta(days=1))
    # Previous event/timestamp per distinct_id for session-boundary detection.
    sessions = (
        events.filter(_date_gte)
        .annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )
        )
        .annotate(
            previous_event=Window(
                expression=Lag("event", default=None),
                partition_by=F("distinct_id"),
                order_by=F("timestamp").asc(),
            )
        )
    )
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    # new_session flags >= 30 min gaps; running SUMs produce session ids.
    all_sessions = "\
        SELECT *,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
        OR previous_timestamp IS NULL \
        THEN 1 ELSE 0 END AS new_session \
        FROM ({}) AS inner_sessions\
        ) AS outer_sessions".format(sessions_sql)

    result: List = []
    interval = request.GET.get("interval", None)
    if session_type == "avg":
        result = self._session_avg(all_sessions, sessions_sql_params, date_filter, interval)
    elif session_type == "dist":
        result = self._session_dist(all_sessions, sessions_sql_params)
    else:
        result = self._session_list(all_sessions, sessions_sql_params, team, request)
    return result
def calculate_sessions(self, events: QuerySet, session_type: Optional[str], date_filter: Dict[str, datetime], team: Team, request: request.Request) -> List[Dict[str, Any]]:
    """Session analytics dispatcher: 'avg' -> average duration, 'dist' ->
    duration distribution, anything else -> a session list. Returns [] for
    an empty event set. A session ends after a 30-minute idle gap.
    """
    if not events:
        return []
    # format date filter for session view
    _date_gte = Q()
    if session_type is None:
        if request.GET.get('date_from', None):
            _date_gte = Q(timestamp__gte=date_filter['timestamp__gte'],
                          timestamp__lte=date_filter['timestamp__gte'] + relativedelta(days=1))
        else:
            # Default to the calendar day of the most recent event.
            dt = events.order_by('-timestamp').values('timestamp')[0]['timestamp']
            if dt:
                dt = dt.replace(hour=0, minute=0, second=0, microsecond=0)
                _date_gte = Q(timestamp__gte=dt, timestamp__lte=dt + relativedelta(days=1))
    # Previous event/timestamp per distinct_id for session-boundary detection.
    sessions = events.filter(_date_gte)\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))\
        .annotate(previous_event=Window(
            expression=Lag('event', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    # new_session flags >= 30 min gaps; running SUMs produce session ids.
    all_sessions = '\
        SELECT *,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT id, distinct_id, event, elements_hash, timestamp, properties, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
        OR previous_timestamp IS NULL \
        THEN 1 ELSE 0 END AS new_session \
        FROM ({}) AS inner_sessions\
        ) AS outer_sessions'.format(sessions_sql)

    result: List = []
    if session_type == 'avg':
        result = self._session_avg(all_sessions, sessions_sql_params, date_filter)
    elif session_type == 'dist':
        result = self._session_dist(all_sessions, sessions_sql_params)
    else:
        result = self._session_list(all_sessions, sessions_sql_params, team, date_filter, request)
    return result
def test_lag(self):
    """Lag('salary') partitioned by department: every row sees the
    next-lowest salary in its department, or None for the lowest earner."""
    window = Window(
        expression=Lag(expression='salary', offset=1),
        partition_by=F('department'),
        order_by=[F('salary').asc(), F('name').asc()],
    )
    qs = Employee.objects.annotate(lag=window).order_by('department')
    expected = [
        ('Williams', 37000, 'Accounting', None),
        ('Jenson', 45000, 'Accounting', 37000),
        ('Jones', 45000, 'Accounting', 45000),
        ('Adams', 50000, 'Accounting', 45000),
        ('Moore', 34000, 'IT', None),
        ('Wilkinson', 60000, 'IT', 34000),
        ('Johnson', 80000, 'Management', None),
        ('Miller', 100000, 'Management', 80000),
        ('Smith', 38000, 'Marketing', None),
        ('Johnson', 40000, 'Marketing', 38000),
        ('Brown', 53000, 'Sales', None),
        ('Smith', 55000, 'Sales', 53000),
    ]
    self.assertQuerysetEqual(
        qs,
        expected,
        transform=lambda row: (row.name, row.salary, row.department, row.lag),
    )
def _navigation_base(filter_class_function, reverse_url_function, user, obj, url_name):
    """Build prev/next navigation context for *obj* within the user's cached
    search results.

    :param filter_class_function: maps the cached search_type to a filter form class
    :param reverse_url_function: maps (pk, url_name) to a URL
    :param user: owner of the cached search parameters
    :param obj: the element currently being displayed
    :param url_name: passed through to reverse_url_function
    :return: dict with "current_element" plus, when a cached search exists and
             obj is found in it, titles/URLs of the neighbouring elements
             (None at either end of the list).
    """
    context = {"current_element": obj}
    search_parameters = SearchParametersCache(user, obj.__class__.__name__).cached_data
    if not search_parameters:
        return context

    search_type = search_parameters.get("search_type")
    filter_form_class = filter_class_function(search_type)
    # Instantiate the filter form once and reuse its queryset; the original
    # built it twice, repeating the same filtering work.
    filtered_qs = filter_form_class(data=search_parameters).qs
    order_by = filtered_qs.query.order_by
    order_by_expressions = convert_order_by_strings_to_expressions(order_by) or None

    # Window functions expose each row's neighbours so a single query tells
    # us the prev/next acronym and pk around the current object.
    qs = filtered_qs.annotate(
        previous_acronym=Window(
            expression=Lag("acronym"),
            order_by=order_by_expressions,
        ),
        next_acronym=Window(
            expression=Lead("acronym"),
            order_by=order_by_expressions,
        ),
        previous_id=Window(
            expression=Lag("id"),
            order_by=order_by_expressions,
        ),
        next_id=Window(
            expression=Lead("id"),
            order_by=order_by_expressions,
        )
    ).values_list(
        "id", "acronym", "previous_acronym", "previous_id", "next_acronym", "next_id",
        named=True
    ).order_by(*order_by)

    current_row = _get_current_row(qs, obj)
    if current_row:
        context.update({
            "next_element_title": current_row.next_acronym,
            "next_url": reverse_url_function(current_row.next_id, url_name) if current_row.next_id else None,
            "previous_element_title": current_row.previous_acronym,
            "previous_url": reverse_url_function(current_row.previous_id, url_name) if current_row.previous_id else None
        })
    return context
def list(self, request):
    """Top $pageview -> $pageview transitions among the first four events of
    each session, returned as {source, target, value} dicts (top 15 by count).
    A session ends after a 30-minute idle gap per distinct_id.
    """
    team = request.user.team_set.get()
    resp = []
    date_query = request_to_date_query(request.GET)
    # Pageview events annotated with the previous timestamp per distinct_id.
    sessions = Event.objects.filter(
        team=team,
        event='$pageview',
        **date_query
    )\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    cursor = connection.cursor()
    # Layered raw SQL:
    #   inner_sessions  - the compiled ORM query above
    #   events_notated  - flags new_session on >= 30 min gaps
    #   sessionified    - running SUM turns the flag into a session id
    #   final           - numbers events within each session (first 4 kept)
    #   counts          - LAG pairs each event with its predecessor
    cursor.execute(
        '\
        SELECT source_event, target_event, count(*) from (\
        SELECT event_number || \'_\' || current_url as target_event,LAG(event_number || \'_\' || current_url, 1) OVER (\
        PARTITION BY session\
        ) AS source_event from \
        (\
        SELECT properties->> \'$current_url\' as current_url, sessionified.session\
        ,ROW_NUMBER() OVER (\
        PARTITION BY distinct_id\
        ,session ORDER BY timestamp\
        ) AS event_number\
        FROM (\
        SELECT events_notated.*, SUM(new_session) OVER (\
        ORDER BY distinct_id\
        ,timestamp\
        ) AS session\
        FROM (\
        SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
        FROM ({}) AS inner_sessions \
        ) as events_notated \
        ) as sessionified\
        ) as final\
        where event_number <= 4\
        ) as counts\
        where source_event is not null and target_event is not null and SUBSTRING(source_event, 3) != SUBSTRING(target_event, 3)\
        group by source_event, target_event order by count desc limit 15\
        '.format(sessions_sql), sessions_sql_params)
    rows = cursor.fetchall()
    for row in rows:
        resp.append({'source': row[0], 'target': row[1], 'value': row[2]})
    resp = sorted(resp, key=lambda x: x['value'], reverse=True)
    return Response(resp)
def test_lag_decimalfield(self):
    """Lag over a DecimalField: each row carries the previous bonus within
    its department (ordered by bonus then name)."""
    window = Window(
        expression=Lag(expression='bonus', offset=1),
        partition_by=F('department'),
        order_by=[F('bonus').asc(), F('name').asc()],
    )
    qs = Employee.objects.annotate(lag=window).order_by(
        'department', F('bonus').asc(), F('name').asc())
    expected = [
        ('Williams', 92.5, 'Accounting', None),
        ('Jenson', 112.5, 'Accounting', 92.5),
        ('Jones', 112.5, 'Accounting', 112.5),
        ('Adams', 125, 'Accounting', 112.5),
        ('Moore', 85, 'IT', None),
        ('Wilkinson', 150, 'IT', 85),
        ('Johnson', 200, 'Management', None),
        ('Miller', 250, 'Management', 200),
        ('Smith', 95, 'Marketing', None),
        ('Johnson', 100, 'Marketing', 95),
        ('Brown', 132.5, 'Sales', None),
        ('Smith', 137.5, 'Sales', 132.5),
    ]
    self.assertQuerysetEqual(
        qs,
        expected,
        transform=lambda row: (row.name, row.bonus, row.department, row.lag),
    )
def test_null_source_lag(self):
    """Lag must reject a null source expression."""
    with self.assertRaisesMessage(ValueError, "Lag requires a non-null source expression"):
        Lag(expression=None)
def handle(self, *args, **options):
    """Rebuild SummaryByCountyFrequency rows for the requested frequency.

    Deletes the existing summary rows for the frequency, then recomputes
    case and death deltas from the daily tables using a LAG window ordered
    by (n_county, d_date): yesterday's row minus the row at the frequency's
    look-back start date.
    """
    # Retrieve option from command
    option = options.get('frequency')[0]

    # Resolve the frequency row. This used to be a bare `except:` around the
    # *entire* command body, which swallowed every exception (including
    # programming errors) and misreported it as an invalid option; only a
    # missing Frequency row is actually a user error.
    try:
        frequency_lookup = Frequency.objects.get(t_frequency=option)
    except Frequency.DoesNotExist:
        self.stdout.write(
            self.style.ERROR(
                'Please try a different option. "%s" is invalid.' % option))
        return

    # Empty the Summary table for the specified frequency
    SummaryByCountyFrequency.objects.filter(
        n_frequency=frequency_lookup).delete()

    # Will always look at yesterday's date for retrieving information
    end_date = datetime.now().date() - timedelta(1)
    # How far back to pull data for, based on the command option
    start_date = self._calculate_start_date(option)

    # Rows from DailyCountyKnownCases at the start or end date, with the
    # previous row's case count attached via LAG.
    known_cases = DailyCountyKnownCases.objects.annotate(
        q_cases_lag=Window(
            expression=Lag('q_cases', offset=1, default=0),
            order_by=('n_county', 'd_date')),
    ).filter(Q(d_date=start_date) | Q(d_date=end_date))

    summary_row_inserts = 0
    for known_case in known_cases:
        # Only yesterday's row carries the lag value we need.
        if known_case.d_date == end_date:
            SummaryByCountyFrequency(
                n_county=known_case.n_county,
                n_frequency=frequency_lookup,
                d_updated=datetime.now().date(),
                q_cases_change=known_case.q_cases - known_case.q_cases_lag,
                q_total_cases=known_case.q_cases,
                # Death figures are filled in by the update pass below.
                q_deaths_change=0,
                q_total_deaths=0).save()
            summary_row_inserts += 1

    self.stdout.write(
        self.style.SUCCESS('Inserted %s %s known cases summary rows' %
                           (summary_row_inserts, option.lower())))

    ###################################################
    # Update death data on SummaryByCountyFrequency
    ###################################################
    summary_row_updates = 0
    deaths = DailyCountyDeaths.objects.annotate(q_deaths_lag=Window(
        expression=Lag('q_deaths', offset=1, default=0),
        order_by=('n_county', 'd_date')),
    ).filter(Q(d_date=start_date) | Q(d_date=end_date))

    for death in deaths:
        # Only yesterday's row carries the lag value we need.
        if death.d_date == end_date:
            SummaryByCountyFrequency.objects.filter(
                n_county=death.n_county,
                n_frequency=frequency_lookup).update(
                    q_deaths_change=death.q_deaths - death.q_deaths_lag,
                    q_total_deaths=death.q_deaths)
            summary_row_updates += 1

    self.stdout.write(
        self.style.SUCCESS('Updated %s summary rows' % summary_row_updates))

def _calculate_start_date(self, option):
    """Return the look-back start date for the given frequency option."""
    today = datetime.now().date()
    if option == 'Daily':
        return today - timedelta(2)
    if option == 'Bi-Weekly':
        # Monday looks back to Friday; the Friday run looks back to Monday.
        return today - timedelta(3 if datetime.now().weekday() == 0 else 4)
    if option == 'Weekly':
        return today - timedelta(8)
    if option == 'Monthly':
        # First day of the current month.
        return (today - timedelta(today.day)) + timedelta(1)
    # Previously this fell through to an unbound-variable NameError that the
    # bare except silently converted into the "invalid option" message.
    raise ValueError('Unsupported frequency: %r' % option)
def test_lag_negative_offset(self):
    """Lag must reject a non-positive offset."""
    with self.assertRaisesMessage(ValueError, "Lag requires a positive integer for the offset"):
        Lag(expression="salary", offset=-1)
def calculate_sessions(self, events, session_type):
    """Session analytics over *events*: 'avg' -> session count plus average
    duration, anything else -> a duration-distribution histogram.
    A session ends after a 30-minute idle gap per distinct_id.
    """
    # Previous event/timestamp per distinct_id for session-boundary detection.
    sessions = events\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))\
        .annotate(previous_event=Window(
            expression=Lag('event', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    # TODO: add midnight condition
    # new_session flags >= 30 min gaps; running SUMs produce session ids.
    all_sessions = '\
        SELECT distinct_id, timestamp,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
        OR previous_timestamp IS NULL \
        THEN 1 ELSE 0 END AS new_session \
        FROM ({}) AS inner_sessions\
        ) AS outer_sessions'.format(sessions_sql)

    def overall_average_length(query):
        # Session length = seconds between first and last event of a session.
        return 'SELECT COUNT(*) as sessions,\
            AVG(length) AS average_session_length\
            FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
            AS length FROM ({}) as count GROUP BY 1) agg'.format(
            query)

    def distribution(query):
        # Bucket session lengths (seconds) into the ten labelled ranges below.
        return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
            COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
            COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
            COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
            COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
            COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
            COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
            COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
            COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
            COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
            FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
            AS length FROM ({}) as count GROUP BY 1) agg'.format(
            query)

    result = []
    if session_type == 'avg':
        cursor = connection.cursor()
        cursor.execute(overall_average_length(all_sessions), sessions_sql_params)
        calculated = cursor.fetchall()
        # NOTE(review): with zero sessions AVG(...) is NULL and round(None)
        # raises — presumably callers guarantee at least one event; confirm.
        avg_length = round(calculated[0][1], 0)
        avg_formatted = friendly_time(avg_length)
        result = [{
            'label': 'Number of Sessions',
            'count': calculated[0][0]
        }, {
            'label': 'Average Duration of Session',
            'count': avg_formatted
        }]
    else:
        dist_labels = [
            '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
            '10-30 seconds', '30-60 seconds', '1-3 minutes', '3-10 minutes',
            '10-30 minutes', '30-60 minutes', '1+ hours'
        ]
        cursor = connection.cursor()
        cursor.execute(distribution(all_sessions), sessions_sql_params)
        calculated = cursor.fetchall()
        result = [{
            'label': dist_labels[index],
            'count': calculated[0][index]
        } for index in range(len(dist_labels))]
    return result
def with_prev_attributes(self):
    """Annotate each row with the previous row's attributes for the same
    station, in datetime order (None for a station's first row)."""
    return self.annotate(
        prev_attributes=Window(
            expression=Lag('attributes'),
            partition_by=F('station'),
            order_by=F('datetime').asc(),
        )
    )
def get_neighbour_pks(model, pk, filterset=None, ordering=None):
    '''
    Given a model and pk that identify an object (model instance) will, given
    an ordering (defaulting to the model's ordering) and optionally a
    filterset (from url_filter), return the prior and next neighbours in the
    list of all objects by that ordering, or in the filtered list when a
    filterset is provided.

    :returns: a 4-tuple (prior_pk, next_pk, row_number, list_length); all four
              elements are None when the neighbours cannot be determined.
    :param model: The model the object is an instance of
    :param pk: The primary key of the model instance being considered
    :param filterset: An optional filterset (see https://github.com/miki725/django-url-filter)
    :param ordering: An optional ordering (otherwise the default model ordering
                     is used). See:
                     https://docs.djangoproject.com/en/2.0/ref/models/options/#ordering
    '''
    # If a filterset is provided ensure it's of the same model as specified.
    # (Fixed: this used to return a 2-tuple, breaking the documented
    # 4-tuple contract.)
    if filterset and not filterset.Meta.model == model:
        return (None,) * 4

    # Get the ordering list for the model (a list of fields).
    if ordering is None:
        ordering = model._meta.ordering

    order_by = []
    for f in ordering:
        if f.startswith("-"):
            order_by.append(F(f[1:]).desc())
        else:
            order_by.append(F(f).asc())

    # A default order. We need an order or the window functions crash.
    if len(order_by) == 0:
        order_by = ['pk']

    # Define the window functions for each neighbour.
    window_lag = Window(expression=Lag("pk"), order_by=order_by)
    window_lead = Window(expression=Lead("pk"), order_by=order_by)
    window_rownnum = Window(expression=RowNumber(), order_by=order_by)

    # Get a queryset annotated with neighbours. If annotated attrs clash with
    # existing attrs an exception will be raised:
    # https://code.djangoproject.com/ticket/11256
    try:
        # Start with all objects
        qs = model.objects.all()

        # Now apply a filterset if we have one
        if filterset is not None:
            # We respect the filterset, BUT we need to wrap it inside a sub
            # query so that we can apply a DISTINCT ON pk to avoid duplicate
            # tuples that the window functions can introduce when we are
            # matching multiple remote objects.
            #
            # FIXME: this cannot guarantee the provided pk is included in the
            # results when the filter excludes it (relevant across related
            # object filters, i.e. JOINs).
            qs = qs.filter(pk__in=Subquery(filterset.filter().distinct(
                'pk').order_by('pk').values('pk')))

        # Now order the objects properly
        qs = qs.order_by(*order_by)

        # Now annotate the queryset with the prior and next PKs
        qs = qs.annotate(neighbour_prior=window_lag,
                         neighbour_next=window_lead,
                         row_number=window_rownnum)
    except Exception:
        # Narrowed from a bare `except:`. (Fixed: this used to return a bare
        # None, breaking the documented 4-tuple contract.)
        return (None,) * 4

    # We cannot simply filter on pk: LAG/LEAD would then operate on a one-row
    # result and come back empty. So we run the window query over the whole
    # table and pick out our row with a wrapping raw query.
    #
    # str(qs.query) does not return reliable SQL (see Django tickets 30132,
    # 25705, 25092, 24991, 17741); the SQL compiler does.
    sql, params = qs.query.get_compiler(using=qs.db).as_sql()

    # Now we wrap the SQL
    sql = "SELECT * FROM ({}) ao WHERE {}={}".format(sql, model._meta.pk.name, pk)

    # And create a new QuerySet
    ao = model.objects.raw(sql, params)

    try:
        if ao:
            if len(ao) == 1:
                return (ao[0].neighbour_prior, ao[0].neighbour_next,
                        ao[0].row_number, qs.count())
            raise ValueError(
                "Query error: object appears more than once in neighbour hunt."
            )
        return (None,) * 4
    except Exception:
        return (None,) * 4
def list(self, request):
    """Paths endpoint: top 20 source -> target transitions among the first
    four events of each session, as {source, target, target_id, source_id,
    value} dicts. A session ends after a 30-minute idle gap per distinct_id.
    """
    team = request.user.team_set.get()
    resp = []
    date_query = request_to_date_query(request.GET)
    event, path_type, event_filter, start_comparator = self._determine_path_type(request)
    properties = request.GET.get('properties')
    start_point = request.GET.get('start')
    # Team events in range — excluding built-in events when no explicit
    # event is requested, optionally property-filtered — annotated with the
    # previous timestamp per distinct_id for session detection.
    sessions = Event.objects.add_person_id(team.pk).filter(
        team=team, **(event_filter), **date_query
    )\
        .filter(~Q(event__in=['$autocapture', '$pageview', '$identify', '$pageleave']) if event is None else Q())\
        .filter(Filter(data={'properties': json.loads(properties)}).properties_to_Q() if properties else Q())\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    if event == "$autocapture":
        # Autocapture paths need element info joined in for labelling.
        sessions_sql = self._add_elements(query_string=sessions_sql)

    # Flag session starts: >= 30 min idle gap or a user's first event.
    events_notated = '\
        SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
        FROM ({}) AS inner_sessions\
        '.format(sessions_sql)

    # Running SUM of the flag yields a session id.
    sessionified = '\
        SELECT events_notated.*, SUM(new_session) OVER (\
        ORDER BY distinct_id\
        ,timestamp\
        ) AS session\
        FROM ({}) as events_notated\
        '.format(events_notated)

    if start_point:
        # Restrict sessions to those beginning at the requested start point.
        sessionified = self._apply_start_point(
            start_comparator=start_comparator,
            query_string=sessionified,
            start_point=start_point)

    # Number events within each (distinct_id, session).
    final = '\
        SELECT {} as path_type, id, sessionified.session\
        ,ROW_NUMBER() OVER (\
        PARTITION BY distinct_id\
        ,session ORDER BY timestamp\
        ) AS event_number\
        FROM ({}) as sessionified\
        '.format(path_type, sessionified)

    # LAG pairs each of the first 4 events with its predecessor.
    counts = '\
        SELECT event_number || \'_\' || path_type as target_event, id as target_id, LAG(event_number || \'_\' || path_type, 1) OVER (\
        PARTITION BY session\
        ) AS source_event , LAG(id, 1) OVER (\
        PARTITION BY session\
        ) AS source_id from \
        ({}) as final\
        where event_number <= 4\
        '.format(final)

    cursor = connection.cursor()
    cursor.execute(
        '\
        SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from ({}) as counts\
        where source_event is not null and target_event is not null\
        group by source_event, target_event order by count desc limit 20\
        '.format(counts), sessions_sql_params)
    rows = cursor.fetchall()
    for row in rows:
        resp.append({
            'source': row[0],
            'target': row[1],
            'target_id': row[2],
            'source_id': row[3],
            'value': row[4]
        })
    resp = sorted(resp, key=lambda x: x['value'], reverse=True)
    return Response(resp)
def diff_vs_previous_order(self):
    """Annotate each order with the id of the same customer's previous
    order (None for a customer's first order), in created_at order."""
    previous_order_window = models.Window(
        expression=Lag('id'),
        partition_by=[models.F('customer_id')],
        order_by=models.F('created_at').asc(),
    )
    return self.annotate(prev_order_id=previous_order_window)
def _window_helper(attr):
    """Expression for *attr* minus its lagged value over date-descending
    order. NOTE(review): with a descending order, Lag picks the
    chronologically *later* row — confirm that is intended."""
    lagged = Window(expression=Lag(attr), order_by=F('date').desc())
    return F(attr) - lagged
def get_months_plot(queryset, field) -> list:
    """Monthly sums of *field* plus `repayment`, the delta between each
    month's sum and the previous month's (via a Lag window).

    NOTE(review): TruncMonth is applied to 'date__month' (an extracted
    integer) rather than 'date' itself — looks suspicious; confirm intended.
    """
    return queryset.annotate(month=TruncMonth('date__month'))\
        .values('month').annotate(c=Sum(field)).values('month', 'c')\
        .annotate(prev=Window(Lag('c'))).annotate(repayment=F('c')-F('prev'))\
        .values('month', 'c', 'repayment')
def list(self, request):
    """Paths endpoint (single inline-SQL variant): top 15 source -> target
    transitions among the first four events of each session.
    A session ends after a 30-minute idle gap per distinct_id.
    """
    team = request.user.team_set.get()
    resp = []
    date_query = request_to_date_query(request.GET)
    event, path_type = self._determine_path_type(request)
    # Events annotated with the previous timestamp per distinct_id.
    sessions = Event.objects.filter(
        team=team,
        **({"event":event} if event else {'event__regex':'^[^\$].*'}),  #anything without $ (default)
        **date_query
    )\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    if event == "$autocapture":
        # Join each event's first element (lowest "order") so autocapture
        # steps can be labelled "<tag> text".
        element = 'SELECT \'<\'|| e."tag_name" || \'> \' || e."text" as tag_name_source, e."text" as text_source FROM "posthog_element" e JOIN \
            ( SELECT group_id, MIN("posthog_element"."order") as minOrder FROM "posthog_element" GROUP BY group_id) e2 ON e.order = e2.minOrder AND e.group_id = e2.group_id where e.group_id = v2.group_id'
        element_group = 'SELECT g."id" as group_id FROM "posthog_elementgroup" g where v1."elements_hash" = g."hash"'
        sessions_sql = 'SELECT * FROM ({}) as v1 JOIN LATERAL ({}) as v2 on true JOIN LATERAL ({}) as v3 on true'.format(
            sessions_sql, element_group, element)
    cursor = connection.cursor()
    # Layered raw SQL: flag new sessions (30-min gap), sessionify with a
    # running SUM, number events per session (first 4 kept), LAG-pair
    # consecutive events, then aggregate the top 15 transitions.
    cursor.execute(
        '\
        SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from (\
        SELECT event_number || \'_\' || path_type as target_event, id as target_id, LAG(event_number || \'_\' || path_type, 1) OVER (\
        PARTITION BY session\
        ) AS source_event , LAG(id, 1) OVER (\
        PARTITION BY session\
        ) AS source_id from \
        (\
        SELECT {} as path_type, id, sessionified.session\
        ,ROW_NUMBER() OVER (\
        PARTITION BY distinct_id\
        ,session ORDER BY timestamp\
        ) AS event_number\
        FROM (\
        SELECT events_notated.*, SUM(new_session) OVER (\
        ORDER BY distinct_id\
        ,timestamp\
        ) AS session\
        FROM (\
        SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
        FROM ({}) AS inner_sessions \
        ) as events_notated \
        ) as sessionified\
        ) as final\
        where event_number <= 4\
        ) as counts\
        where source_event is not null and target_event is not null and SUBSTRING(source_event, 3) != SUBSTRING(target_event, 3)\
        group by source_event, target_event order by count desc limit 15\
        '.format(path_type, sessions_sql), sessions_sql_params)
    rows = cursor.fetchall()
    for row in rows:
        resp.append({
            'source': row[0],
            'target': row[1],
            'target_id': row[2],
            'source_id': row[3],
            'value': row[4]
        })
    resp = sorted(resp, key=lambda x: x['value'], reverse=True)
    return Response(resp)
def calculate_sessions(self, events: QuerySet, session_type: str, date_filter) -> List[Dict[str, Any]]:
    """'avg' -> per-day average session duration time series (plus an overall
    average entry); anything else -> a session-duration distribution
    histogram. A session ends after a 30-minute idle gap per distinct_id.
    """
    # Previous event/timestamp per distinct_id for session-boundary detection.
    sessions = events\
        .annotate(previous_timestamp=Window(
            expression=Lag('timestamp', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))\
        .annotate(previous_event=Window(
            expression=Lag('event', default=None),
            partition_by=F('distinct_id'),
            order_by=F('timestamp').asc()
        ))
    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
    # TODO: add midnight condition
    # new_session flags >= 30 min gaps; running SUMs produce session ids.
    all_sessions = '\
        SELECT distinct_id, timestamp,\
        SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
        SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
        FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
        OR previous_timestamp IS NULL \
        THEN 1 ELSE 0 END AS new_session \
        FROM ({}) AS inner_sessions\
        ) AS outer_sessions'.format(sessions_sql)

    def distribution(query):
        # Bucket session lengths (seconds) into the ten labelled ranges below.
        return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
            COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
            COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
            COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
            COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
            COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
            COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
            COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
            COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
            COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
            FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
            AS length FROM ({}) as count GROUP BY 1) agg'.format(
            query)

    def average_length_time(query):
        # Per-day session stats: average/total length and session count.
        return 'SELECT date_trunc(\'day\', timestamp) as start_time,\
            AVG(length) AS average_session_length_per_day,\
            SUM(length) AS total_session_length_per_day, \
            COUNT(1) as num_sessions_per_day\
            FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
            AS length,\
            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time'.format(
            query)

    result: List = []
    if session_type == 'avg':
        cursor = connection.cursor()
        cursor.execute(average_length_time(all_sessions), sessions_sql_params)
        time_series_avg = cursor.fetchall()
        time_series_avg_friendly = []
        date_range = pd.date_range(date_filter['timestamp__gte'].date(),
                                   date_filter['timestamp__lte'].date(),
                                   freq='D')
        # NOTE(review): pairs day N of the range with result row N — this
        # assumes a DB row exists for every day; a day without sessions
        # would shift the alignment. Confirm upstream guarantees.
        time_series_avg_friendly = [
            (day,
             round(time_series_avg[index][1] if index < len(time_series_avg) else 0))
            for index, day in enumerate(date_range)
        ]
        time_series_data = append_data(time_series_avg_friendly, math=None)

        # calculate average
        totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0
        avg_formatted = friendly_time(overall_average)
        avg_split = avg_formatted.split(' ')

        time_series_data.update({
            'label': 'Average Duration of Session ({})'.format(avg_split[1]),
            'count': int(avg_split[0])
        })
        time_series_data.update(
            {"chartLabel": 'Average Duration of Session (seconds)'})
        result = [time_series_data]
    else:
        dist_labels = [
            '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
            '10-30 seconds', '30-60 seconds', '1-3 minutes', '3-10 minutes',
            '10-30 minutes', '30-60 minutes', '1+ hours'
        ]
        cursor = connection.cursor()
        cursor.execute(distribution(all_sessions), sessions_sql_params)
        calculated = cursor.fetchall()
        result = [{
            'label': dist_labels[index],
            'count': calculated[0][index]
        } for index in range(len(dist_labels))]
    return result
def calculate_paths(self, filter: PathFilter, team: Team):
    """Compute the user-paths graph: top source->target event transitions.

    Events are grouped into 30-minute-gap sessions per person, numbered
    within each session, and consecutive pairs (up to the 4th event of a
    session) are counted. Returns at most 20 edges, sorted by count desc.

    Returns:
        List of dicts with source/target labels ('<n>_<path_type>'),
        representative event ids, and the transition count ('value').
    """
    date_query = request_to_date_query({"date_from": filter._date_from, "date_to": filter._date_to}, exact=False)
    resp = []
    prop_type = filter.prop_type
    event, event_filter = filter.target_event
    start_comparator = filter.comparator

    # Base queryset: person-stitched events in range; when no target event
    # is given, internal/autocaptured events are excluded. Window() adds
    # each event's predecessor timestamp per person.
    sessions = (
        Event.objects.add_person_id(team.pk)
        .filter(team=team, **(event_filter), **date_query)
        .filter(
            ~Q(event__in=["$autocapture", "$pageview", "$identify", "$pageleave", "$screen"])
            if event is None
            else Q()
        )
        .filter(
            properties_to_Q(filter.properties, team_id=team.pk, filter_test_accounts=filter.filter_test_accounts)
            if filter and (filter.properties or filter.filter_test_accounts)
            else Q()
        )
        .annotate(
            previous_timestamp=Window(
                expression=Lag("timestamp", default=None),
                partition_by=F("person_id"),
                order_by=F("timestamp").asc(),
            )
        )
    )

    sessions_sql, sessions_sql_params = sessions.query.sql_with_params()

    # Autocapture paths need the captured elements joined in.
    if event == "$autocapture":
        sessions_sql = self._add_elements(query_string=sessions_sql)

    # Layer 1: flag session starts (gap >= 30 min or no previous event).
    events_notated = "\
    SELECT *, CASE WHEN EXTRACT('EPOCH' FROM (timestamp - previous_timestamp)) >= (60 * 30) OR previous_timestamp IS NULL THEN 1 ELSE 0 END AS new_session\
    FROM ({}) AS inner_sessions\
    ".format(
        sessions_sql
    )

    # Layer 2: running sum of the flags -> a global session id.
    sessionified = "\
    SELECT events_notated.*, SUM(new_session) OVER (\
        ORDER BY person_id\
                ,timestamp\
        ) AS session\
    FROM ({}) as events_notated\
    ".format(
        events_notated
    )

    if filter and filter.start_point:
        sessionified = self._apply_start_point(
            start_comparator=start_comparator, query_string=sessionified, start_point=filter.start_point,
        )

    # Layer 3: number events within each person's session.
    final = "\
    SELECT {} as path_type, id, sessionified.session\
        ,ROW_NUMBER() OVER (\
                PARTITION BY person_id\
                ,session ORDER BY timestamp\
                ) AS event_number\
    FROM ({}) as sessionified\
    ".format(
        prop_type, sessionified
    )

    # Layer 4: pair each event with its predecessor in the same session;
    # only the first 4 steps of a session contribute edges.
    counts = "\
    SELECT event_number || '_' || path_type as target_event, id as target_id, LAG(event_number || '_' || path_type, 1) OVER (\
            PARTITION BY session\
            ) AS source_event , LAG(id, 1) OVER (\
            PARTITION BY session\
            ) AS source_id from \
    ({}) as final\
    where event_number <= 4\
    ".format(
        final
    )

    # Layer 5: aggregate transitions and keep the 20 most frequent.
    query = "\
    SELECT source_event, target_event, MAX(target_id), MAX(source_id), count(*) from ({}) as counts\
    where source_event is not null and target_event is not null\
    group by source_event, target_event order by count desc limit 20\
    ".format(
        counts
    )

    cursor = connection.cursor()
    cursor.execute(query, sessions_sql_params)
    rows = cursor.fetchall()

    for row in rows:
        resp.append(
            {"source": row[0], "target": row[1], "target_id": row[2], "source_id": row[3], "value": row[4],}
        )

    resp = sorted(resp, key=lambda x: x["value"], reverse=True)
    return resp
def parse(self, fpn, path, zug_id, info):
    """Parse a timetable XML ('Zug' element) into FahrplanZug DB records.

    Creates the train with its metadata, its schedule entries
    (FahrplanZugEintrag), links its rolling stock (FahrzeugVariante),
    renders a preview image, and computes total time in motion.

    Args:
        fpn: parsed XML tree containing the 'Zug' element.
        path: full source path; zug_id is stripped off to get the root dir.
        zug_id: path-like identifier of the train within the timetable.
        info: dict with at least 'name' and 'autor' (iterable of authors).

    Raises:
        FahrzeugVariante.DoesNotExist: a referenced vehicle is unknown.
    """
    root_path = path.replace(zug_id, '')
    zug_tag = fpn.find('Zug')

    zug = FahrplanZug(path=zug_id, name=info['name'])
    zug.gattung = zug_tag.get('Gattung')
    # NOTE(review): split('_') assigns a list -- presumably the model field
    # expects it; confirm against FahrplanZug.nummer.
    zug.nummer = zug_tag.get('Nummer').split('_')
    zug.zug_lauf = zug_tag.get('Zuglauf')
    zug.fahrplan_gruppe = zug_tag.get('FahrplanGruppe')
    zug.deko = (zug_tag.get('Dekozug') == '1')
    zug.is_reisezug = (zug_tag.get('Zugtyp') == '1')
    self.stripRouteNumber(zug)
    zug.speed_anfang = self.toSpeed(self.getAsFloat(zug_tag, 'spAnfang'))
    zug.speed_zug = self.toSpeed(self.getAsFloat(zug_tag, 'spZugNiedriger'))
    zug.fahrzeug_tree = self.processVarianten(fpn.find('Zug/FahrzeugVarianten'))
    zug.save()
    zug.autor.add(*info['autor'])

    # Schedule entries; position only advances for entries actually kept.
    pos = 0
    for eintrag in fpn.findall("Zug/FahrplanEintrag"):
        an = self.getDateTime(eintrag, 'Ank')
        ab = self.getDateTime(eintrag, 'Abf')
        ort = eintrag.get('Betrst')
        bedarf = eintrag.get('FplEintrag') == '2'           # request stop
        kopf = eintrag.get('FzgVerbandAktion') is not None  # consist change / reversal
        ereignis = eintrag.find('Ereignis') is not None
        if ort is None:
            self.logger.warning("Ignoring eintrag with empty place")
            continue
        eintrag_obj = FahrplanZugEintrag(position=pos, ort=ort, ab=ab, an=an,
                                         zug=zug, bedarfshalt=bedarf,
                                         kopf_machen=kopf, ereignis=ereignis)
        eintrag_obj.save()
        zug.eintraege.add(eintrag_obj)
        pos += 1

    # Rolling stock. Loop variable renamed so it no longer shadows the
    # `path` parameter.
    for fahrzeug in fpn.iter('FahrzeugInfo'):
        fahrzeug_path = fahrzeug.find('Datei').get('Dateiname')
        try:
            fahrzeug_obj = FahrzeugVariante.objects.get(
                root_file__iexact=fahrzeug_path,
                haupt_id=fahrzeug.get('IDHaupt'),
                neben_id=fahrzeug.get('IDNeben'))
            zug.fahrzeuge.add(fahrzeug_obj)
            # First cab car / first powered car become the train's defaults.
            if not zug.steuerfahrzeug and fahrzeug_obj.fuehrerstand:
                zug.steuerfahrzeug = fahrzeug_obj
            if not zug.triebfahrzeug and len(fahrzeug_obj.antrieb) > 0:
                zug.triebfahrzeug = fahrzeug_obj
        except FahrzeugVariante.DoesNotExist:
            # Lazy %s formatting also survives missing ID attributes, where
            # the old '+' concatenation raised TypeError on None while the
            # original exception was being handled.
            self.logger.error("Could not find Fahrzeug Variant %s/%s:%s",
                              fahrzeug_path, fahrzeug.get('IDHaupt'),
                              fahrzeug.get('IDNeben'))
            raise

    imgpath = os.path.join('trn', zug.path.replace('\\', '') + ".png")
    zugrenderer = self.ZugRenderer(root_path, self.getRenderer(20))
    zug.bild = zugrenderer.renderImage(zug, imgpath, 1)

    # Total time in motion: (arrival-or-departure - previous departure) per
    # entry, ordered by position; skip the first diff (no predecessor).
    zeit_diff = FahrplanZugEintrag.objects.filter(
        zug_id=zug_id).exclude(Q(ab=None) & Q(an=None)).annotate(
            zeit_previous=Window(expression=Lag('ab'),
                                 order_by=F('position').asc()),
            zeit_diff=Coalesce('an', 'ab') - F('zeit_previous')
        ).order_by().values_list('zeit_diff', flat=True)
    zug.zeit_bewegung = sum(zeit_diff[1:], timedelta())
    zug.save()