Exemple #1
0
    def _format_total_volume_query(self, entity: Entity, filter: Filter,
                                   team_id: int) -> List[Dict[str, Any]]:
        """Build the per-interval volume series for a single entity."""
        entity_events = process_entity_for_events(
            entity=entity,
            team_id=team_id,
            order_by="-timestamp",
        )
        interval_buckets, filtered_events = aggregate_by_interval(
            events=entity_events,
            team_id=team_id,
            entity=entity,
            filter=filter,
        )

        results: List[Dict[str, Any]] = []
        for bucket in interval_buckets.values():
            series = append_data(dates_filled=list(bucket.items()),
                                 interval=filter.interval)
            # "By value" displays show one aggregate number instead of a series.
            if filter.display in TRENDS_DISPLAY_BY_VALUE:
                series["aggregated_value"] = get_aggregate_total(
                    filtered_events, entity)
            results.append(series)
        return results
Exemple #2
0
    def _serialize_breakdown(self, entity: Entity, filter: Filter,
                             team_id: int) -> List[Dict[str, Any]]:
        """Build the per-breakdown-value series for a single entity."""
        entity_events = process_entity_for_events(
            entity=entity,
            team_id=team_id,
            order_by="-timestamp",
        )
        breakdown_key = (
            "properties__{}".format(filter.breakdown) if filter.breakdown else None
        )
        interval_buckets, filtered_events = aggregate_by_interval(
            events=entity_events,
            team_id=team_id,
            entity=entity,
            filter=filter,
            breakdown=breakdown_key,
        )

        results: List[Dict[str, Any]] = []
        for breakdown_value, bucket in interval_buckets.items():
            series = append_data(dates_filled=list(bucket.items()),
                                 interval=filter.interval)
            # "Total" is the synthetic all-rows bucket; it carries no label.
            if breakdown_value != "Total":
                series.update(breakdown_label(entity, breakdown_value))
            if filter.display in TRENDS_DISPLAY_BY_VALUE:
                series["aggregated_value"] = get_aggregate_breakdown_total(
                    filtered_events, filter, entity, team_id,
                    series["breakdown_value"])
            results.append(series)

        return results
Exemple #3
0
    def _serialize_entity(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]:
        """Serialize one entity's trend data into chart-ready dicts."""
        # Default to daily granularity when none was requested.
        if filter.interval is None:
            filter.interval = "day"

        template: Dict[str, Any] = {
            "action": entity.to_dict(),
            "label": entity.name,
            "count": 0,
            "data": [],
            "labels": [],
            "days": [],
        }
        events = process_entity_for_events(entity=entity, team_id=team_id, order_by="-timestamp",)
        events = events.filter(filter_events(team_id, filter, entity))
        interval_buckets = aggregate_by_interval(
            filtered_events=events,
            team_id=team_id,
            entity=entity,
            filter=filter,
            breakdown="properties__{}".format(filter.breakdown) if filter.breakdown else None,
        )

        serialized_entities = []
        for breakdown_value, bucket in interval_buckets.items():
            serialized = copy.deepcopy(template)
            # "Total" is the synthetic all-rows bucket; it carries no label.
            if breakdown_value != "Total":
                serialized.update(breakdown_label(entity, breakdown_value))
            serialized.update(append_data(dates_filled=list(bucket.items()), interval=filter.interval))
            # Cumulative display charts the running total rather than per-interval counts.
            if filter.display == TRENDS_CUMULATIVE:
                serialized["data"] = np.cumsum(serialized["data"])
            serialized_entities.append(serialized)

        return serialized_entities
Exemple #4
0
 def _format_normal_query(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]:
     """Build the plain (non-breakdown) time series for one entity."""
     entity_events = process_entity_for_events(entity=entity, team_id=team_id, order_by="-timestamp",)
     entity_events = entity_events.filter(filter_events(team_id, filter, entity))
     interval_buckets = aggregate_by_interval(filtered_events=entity_events, team_id=team_id, entity=entity, filter=filter,)
     return [
         append_data(dates_filled=list(bucket.items()), interval=filter.interval)
         for bucket in interval_buckets.values()
     ]
Exemple #5
0
    def calculate_avg(self, filter: Filter, team: Team):
        """Return the average session length per interval for *team*.

        Builds a ClickHouse query from the filter's date range, property
        clauses and entity conditions, executes it, and returns a
        one-element list holding the (display-scaled) time series with the
        overall average merged in — or [] when there were no sessions.
        """

        parsed_date_from, parsed_date_to, _ = parse_timestamps(filter, team.pk)

        filters, params = parse_prop_clauses(
            filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
        )

        interval_notation = get_trunc_func_ch(filter.interval)
        num_intervals, seconds_in_interval, _ = get_time_diff(
            filter.interval or "day", filter.date_from, filter.date_to, team.pk
        )

        entity_conditions, entity_params = entity_query_conditions(filter, team)
        if not entity_conditions:
            # Default condition: include every event except feature-flag calls.
            entity_conditions = ["event != '$feature_flag_called'"]

        params = {**params, **entity_params}
        entity_query = " OR ".join(entity_conditions)

        avg_query = SESSIONS_NO_EVENTS_SQL.format(
            team_id=team.pk,
            date_from=parsed_date_from,
            date_to=parsed_date_to,
            filters=filters,
            sessions_limit="",
            entity_filter=f"AND ({entity_query})",
        )
        per_period_query = AVERAGE_PER_PERIOD_SQL.format(sessions=avg_query, interval=interval_notation)

        # NULL series fills intervals with no sessions so the chart has a
        # point for every bucket in the requested range.
        null_sql = NULL_SQL.format(
            date_to=filter.date_to.strftime("%Y-%m-%d 00:00:00"),
            interval=interval_notation,
            num_intervals=num_intervals,
            seconds_in_interval=seconds_in_interval,
        )

        final_query = AVERAGE_SQL.format(sessions=per_period_query, null_sql=null_sql)

        params = {**params, "team_id": team.pk}
        response = sync_execute(final_query, params)
        values = self.clean_values(filter, response)
        time_series_data = append_data(values, interval=filter.interval, math=None)
        # Scale raw second values into a friendlier unit for charting.
        scaled_data, _ = scale_time_series(time_series_data["data"])
        time_series_data.update({"data": scaled_data})
        # calculate average
        total = sum(val[1] for val in values)

        if total == 0:
            return []

        # Average only over intervals that actually had sessions.
        valid_days = sum(1 if val[1] else 0 for val in values)
        overall_average = (total / valid_days) if valid_days else 0

        result = self._format_avg(overall_average)
        time_series_data.update(result)

        return [time_series_data]
Exemple #6
0
    def _session_avg(
        self, base_query: str, params: Tuple[Any, ...], date_filter: Dict[str, datetime], interval: Optional[str]
    ) -> List[Dict[str, Any]]:
        """Average session length per interval, plus the overall average."""
        # Map the requested interval to (SQL date_trunc unit, pandas frequency);
        # anything unrecognized falls back to daily buckets.
        interval_map = {
            "minute": ("minute", "min"),
            "hour": ("hour", "H"),
            "week": ("week", "W"),
            "month": ("month", "M"),
        }
        interval, interval_freq = interval_map.get(interval, ("day", "D"))

        average_length_time = "SELECT date_trunc('{interval}', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
            base_query, interval=interval
        )

        cursor = connection.cursor()
        cursor.execute(average_length_time, params)
        rows = cursor.fetchall()

        date_range = pd.date_range(date_filter["timestamp__gte"], date_filter["timestamp__lte"], freq=interval_freq,)
        frame = pd.DataFrame([{"date": r[0], "count": r[1], "breakdown": "Total"} for r in rows])
        # Snap bucket starts to the labels pandas uses for W/M frequencies.
        if interval == "week":
            frame["date"] = frame["date"].apply(lambda x: x - pd.offsets.Week(weekday=6))
        elif interval == "month":
            frame["date"] = frame["date"].apply(lambda x: x - pd.offsets.MonthEnd(n=0))

        # Reindex onto the full range so empty buckets appear as zeros.
        reindexed = pd.DataFrame(frame.groupby("date").mean(), index=date_range)
        reindexed = reindexed.fillna(0)
        values = [(key, round(row[0])) if len(row) > 0 else (key, 0) for key, row in reindexed.iterrows()]

        time_series_data = append_data(values, interval=interval, math=None)
        # Overall average = total session seconds / number of sessions.
        totals = [sum(col) for col in list(zip(*rows))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0
        avg_split = friendly_time(overall_average).split(" ")

        time_series_data.update(
            {"label": "Average Duration of Session ({})".format(avg_split[1]), "count": int(avg_split[0]),}
        )
        time_series_data.update({"chartLabel": "Average Duration of Session (seconds)"})
        return [time_series_data]
Exemple #7
0
    def calculate_avg(self, filter: Filter, team: Team):
        """Compute the average-session-length time series for *team*."""
        # Fall back to the last 7 days when no explicit range was given.
        if not filter._date_from:
            filter._date_from = relative_date_parse("-7d")
        if not filter._date_to:
            filter._date_to = timezone.now()

        parsed_date_from, parsed_date_to = parse_timestamps(filter)

        filters, params = parse_prop_clauses("uuid", filter.properties, team)

        interval_notation = get_interval_annotation_ch(filter.interval)
        num_intervals, seconds_in_interval = get_time_diff(
            filter.interval or "day", filter.date_from, filter.date_to)

        sessions_sql = SESSIONS_NO_EVENTS_SQL.format(
            team_id=team.pk,
            date_from=parsed_date_from,
            date_to=parsed_date_to,
            filters="{}".format(filters) if filter.properties else "",
            sessions_limit="",
        )
        per_period_sql = AVERAGE_PER_PERIOD_SQL.format(
            sessions=sessions_sql, interval=interval_notation)

        # NULL series fills intervals with no sessions so the chart has a
        # point for every bucket in the requested range.
        null_sql = NULL_SQL.format(
            date_to=(filter.date_to
                     or timezone.now()).strftime("%Y-%m-%d 00:00:00"),
            interval=interval_notation,
            num_intervals=num_intervals,
            seconds_in_interval=seconds_in_interval,
        )

        final_query = AVERAGE_SQL.format(sessions=per_period_sql,
                                         null_sql=null_sql)

        response = sync_execute(final_query, {**params, "team_id": team.pk})
        values = self.clean_values(filter, response)
        time_series_data = append_data(values,
                                       interval=filter.interval,
                                       math=None)

        # No sessions at all -> nothing to chart.
        total = sum(point[1] for point in values)
        if total == 0:
            return []

        # Average only over intervals that actually had sessions.
        intervals_with_data = sum(1 if point[1] else 0 for point in values)
        overall_average = (total / intervals_with_data) if intervals_with_data else 0

        time_series_data.update(self._format_avg(overall_average))

        return [time_series_data]
Exemple #8
0
def serialize_entity(
    entity: Entity, filter: Filter, params: dict, team_id: int
) -> List[Dict[str, Any]]:
    """Serialize one entity's trend (volume or stickiness) into chart dicts."""
    raw_interval = params.get("interval")
    interval = "day" if raw_interval is None else raw_interval

    template: Dict[str, Any] = {
        "action": entity.to_dict(),
        "label": entity.name,
        "count": 0,
        "data": [],
        "labels": [],
        "days": [],
    }
    response = []
    # Stickiness needs the natural event order; volume sorts newest-first.
    events = process_entity_for_events(
        entity=entity,
        team_id=team_id,
        order_by=None if params.get("shown_as") == "Stickiness" else "-timestamp",
    )
    events = events.filter(filter_events(team_id, filter, entity))
    if params.get("shown_as", "Volume") == "Volume":
        interval_buckets = aggregate_by_interval(
            filtered_events=events,
            team_id=team_id,
            entity=entity,
            filter=filter,
            interval=interval,
            params=params,
            breakdown="properties__{}".format(params.get("breakdown"))
            if params.get("breakdown")
            else None,
        )
        for breakdown_value, bucket in interval_buckets.items():
            serialized = copy.deepcopy(template)
            # "Total" is the synthetic all-rows bucket; it carries no label.
            if breakdown_value != "Total":
                serialized.update(breakdown_label(entity, breakdown_value))
            serialized.update(
                append_data(dates_filled=list(bucket.items()), interval=interval)
            )
            # Cumulative display charts the running total.
            if filter.display == TRENDS_CUMULATIVE:
                serialized["data"] = np.cumsum(serialized["data"])
            response.append(serialized)
    elif params.get("shown_as") == TRENDS_STICKINESS:
        serialized = copy.deepcopy(template)
        serialized.update(
            stickiness(
                filtered_events=events, entity=entity, filter=filter, team_id=team_id
            )
        )
        response.append(serialized)

    return response
Exemple #9
0
    def _serialize_entity(self, entity: Entity, filter: Filter,
                          request: request.Request,
                          team: Team) -> List[Dict[str, Any]]:
        """Serialize one entity's trend (volume or stickiness) for the API response."""
        interval = request.GET.get('interval')
        if interval is None:
            interval = 'day'

        template: Dict[str, Any] = {
            'action': {
                'id': entity.id,
                'name': entity.name,
                'type': entity.type
            },
            'label': entity.name,
            'count': 0,
            'data': [],
            'labels': [],
            'days': []
        }
        response = []
        # Stickiness needs the natural event order; volume sorts newest-first.
        events = self._process_entity_for_events(
            entity=entity,
            team=team,
            order_by=None
            if request.GET.get('shown_as') == 'Stickiness' else '-timestamp')
        events = events.filter(self._filter_events(filter))

        if request.GET.get('shown_as', 'Volume') == 'Volume':
            interval_buckets = self._aggregate_by_interval(
                filtered_events=events,
                entity=entity,
                filter=filter,
                interval=interval,
                request=request,
                breakdown='properties__{}'.format(request.GET['breakdown'])
                if request.GET.get('breakdown') else None)
            for breakdown_value, bucket in interval_buckets.items():
                serialized = copy.deepcopy(template)
                # "Total" is the synthetic all-rows bucket; it carries no label.
                if breakdown_value != 'Total':
                    serialized['label'] = '{} - {}'.format(
                        entity.name,
                        breakdown_value if breakdown_value else 'undefined')
                    serialized['breakdown_value'] = breakdown_value
                serialized.update(
                    append_data(dates_filled=list(bucket.items()),
                                interval=interval))
                response.append(serialized)
        elif request.GET['shown_as'] == 'Stickiness':
            serialized = copy.deepcopy(template)
            serialized.update(
                self._stickiness(filtered_events=events, filter=filter))
            response.append(serialized)

        return response
Exemple #10
0
    def _serialize_entity(self, entity: Entity, filter: Filter,
                          request: request.Request,
                          team: Team) -> List[Dict[str, Any]]:
        """Serialize one entity's trend (volume or stickiness) for the API response."""
        interval = request.GET.get('interval')
        if interval is None:
            interval = 'day'

        template: Dict[str, Any] = {
            'action': entity.to_dict(),
            'label': entity.name,
            'count': 0,
            'data': [],
            'labels': [],
            'days': []
        }
        response = []
        # Stickiness needs the natural event order; volume sorts newest-first.
        events = self._process_entity_for_events(
            entity=entity,
            team=team,
            order_by=None
            if request.GET.get('shown_as') == 'Stickiness' else '-timestamp')
        events = events.filter(self._filter_events(filter, entity))
        if request.GET.get('shown_as', 'Volume') == 'Volume':
            interval_buckets = self._aggregate_by_interval(
                filtered_events=events,
                team=team,
                entity=entity,
                filter=filter,
                interval=interval,
                request=request,
                breakdown='properties__{}'.format(request.GET['breakdown'])
                if request.GET.get('breakdown') else None,
            )
            for breakdown_value, bucket in interval_buckets.items():
                serialized = copy.deepcopy(template)
                # "Total" is the synthetic all-rows bucket; it carries no label.
                if breakdown_value != 'Total':
                    serialized.update(self._breakdown_label(entity, breakdown_value))
                serialized.update(
                    append_data(dates_filled=list(bucket.items()),
                                interval=interval))
                # Cumulative display charts the running total.
                if filter.display == TRENDS_CUMULATIVE:
                    serialized['data'] = np.cumsum(serialized['data'])
                response.append(serialized)
        elif request.GET['shown_as'] == TRENDS_STICKINESS:
            serialized = copy.deepcopy(template)
            serialized.update(
                self._stickiness(filtered_events=events,
                                 entity=entity,
                                 filter=filter))
            response.append(serialized)

        return response
Exemple #11
0
    def _session_avg(self, base_query: str, params: Tuple[Any, ...],
                     date_filter: Dict[str, datetime]) -> List[Dict[str, Any]]:
        """Return the daily average-session-length time series plus the
        overall average duration.

        Sessions come from *base_query*; the SQL groups them per day and
        yields (start_time, avg_length, total_length, num_sessions) rows.
        """
        average_length_time = "SELECT date_trunc('day', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
            base_query)

        cursor = connection.cursor()
        cursor.execute(average_length_time, params)
        time_series_avg = cursor.fetchall()
        date_range = pd.date_range(
            date_filter["timestamp__gte"].date(),
            date_filter["timestamp__lte"].date(),
            freq="D",
        )
        # Key each row's average by its calendar day. The previous
        # index-based pairing silently shifted every subsequent day's value
        # whenever a day in the range had no sessions (the query emits no
        # row for such days). start_time is a date_trunc'd timestamp, so
        # .date() normalizes it for comparison with the pandas range.
        avg_by_day = {row[0].date(): row[1] for row in time_series_avg}
        time_series_avg_friendly = [(day, round(avg_by_day.get(day.date(), 0)))
                                    for day in date_range]

        time_series_data = append_data(time_series_avg_friendly, math=None)

        # Overall average = total session seconds / total number of sessions.
        totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0
        avg_formatted = friendly_time(overall_average)
        avg_split = avg_formatted.split(" ")

        time_series_data.update({
            "label":
            "Average Duration of Session ({})".format(avg_split[1]),
            "count":
            int(avg_split[0]),
        })
        time_series_data.update(
            {"chartLabel": "Average Duration of Session (seconds)"})
        result = [time_series_data]
        return result
Exemple #12
0
    def calculate_sessions(self, events: QuerySet, session_type: str,
                           date_filter) -> List[Dict[str, Any]]:
        """Compute session statistics over *events*.

        Sessions are reconstructed in SQL with window functions: a new
        session starts whenever a user's gap between consecutive events is
        at least 30 minutes. For session_type 'avg' the result is a daily
        average-session-length time series; any other value produces a
        histogram of session counts over fixed duration buckets.
        """
        # Attach each event's previous timestamp/event per distinct_id, in
        # time order, so the session-boundary rule can be expressed in SQL.
        sessions = events\
            .annotate(previous_timestamp=Window(
                expression=Lag('timestamp', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))\
            .annotate(previous_event=Window(
                expression=Lag('event', default=None),
                partition_by=F('distinct_id'),
                order_by=F('timestamp').asc()
            ))

        sessions_sql, sessions_sql_params = sessions.query.sql_with_params()
        # TODO: add midnight condition

        # Number sessions with running sums over the new-session flag:
        # globally (for grouping) and per user.
        all_sessions = '\
            SELECT distinct_id, timestamp,\
                SUM(new_session) OVER (ORDER BY distinct_id, timestamp) AS global_session_id,\
                SUM(new_session) OVER (PARTITION BY distinct_id ORDER BY timestamp) AS user_session_id\
                FROM (SELECT *, CASE WHEN EXTRACT(\'EPOCH\' FROM (timestamp - previous_timestamp)) >= (60 * 30)\
                    OR previous_timestamp IS NULL \
                    THEN 1 ELSE 0 END AS new_session \
                    FROM ({}) AS inner_sessions\
                ) AS outer_sessions'.format(sessions_sql)

        def distribution(query):
            # Histogram of session lengths (seconds) over fixed buckets;
            # bucket boundaries correspond to dist_labels below.
            return 'SELECT COUNT(CASE WHEN length = 0 THEN 1 ELSE NULL END) as first,\
                        COUNT(CASE WHEN length > 0 AND length <= 3 THEN 1 ELSE NULL END) as second,\
                        COUNT(CASE WHEN length > 3 AND length <= 10 THEN 1 ELSE NULL END) as third,\
                        COUNT(CASE WHEN length > 10 AND length <= 30 THEN 1 ELSE NULL END) as fourth,\
                        COUNT(CASE WHEN length > 30 AND length <= 60 THEN 1 ELSE NULL END) as fifth,\
                        COUNT(CASE WHEN length > 60 AND length <= 180 THEN 1 ELSE NULL END) as sixth,\
                        COUNT(CASE WHEN length > 180 AND length <= 600 THEN 1 ELSE NULL END) as seventh,\
                        COUNT(CASE WHEN length > 600 AND length <= 1800 THEN 1 ELSE NULL END) as eighth,\
                        COUNT(CASE WHEN length > 1800 AND length <= 3600 THEN 1 ELSE NULL END) as ninth,\
                        COUNT(CASE WHEN length > 3600 THEN 1 ELSE NULL END) as tenth\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length FROM ({}) as count GROUP BY 1) agg'.format(
                query)

        def average_length_time(query):
            # Per-day (start_time, avg_length, total_length, num_sessions).
            return 'SELECT date_trunc(\'day\', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT(\'EPOCH\' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time'.format(
                query)

        result: List = []
        if session_type == 'avg':

            cursor = connection.cursor()
            cursor.execute(average_length_time(all_sessions),
                           sessions_sql_params)
            time_series_avg = cursor.fetchall()
            time_series_avg_friendly = []
            date_range = pd.date_range(date_filter['timestamp__gte'].date(),
                                       date_filter['timestamp__lte'].date(),
                                       freq='D')
            # NOTE(review): rows are paired with days positionally; this
            # assumes the query returned one row per day in the range —
            # days with no sessions would misalign the series. Verify.
            time_series_avg_friendly = [
                (day,
                 round(time_series_avg[index][1]
                       if index < len(time_series_avg) else 0))
                for index, day in enumerate(date_range)
            ]

            time_series_data = append_data(time_series_avg_friendly, math=None)

            # calculate average: total session seconds / number of sessions
            totals = [sum(x) for x in list(zip(*time_series_avg))[2:4]]
            overall_average = (totals[0] / totals[1]) if totals else 0
            avg_formatted = friendly_time(overall_average)
            avg_split = avg_formatted.split(' ')

            time_series_data.update({
                'label':
                'Average Duration of Session ({})'.format(avg_split[1]),
                'count':
                int(avg_split[0])
            })
            time_series_data.update(
                {"chartLabel": 'Average Duration of Session (seconds)'})

            result = [time_series_data]
        else:
            # Distribution mode: one labelled count per duration bucket.
            dist_labels = [
                '0 seconds (1 event)', '0-3 seconds', '3-10 seconds',
                '10-30 seconds', '30-60 seconds', '1-3 minutes',
                '3-10 minutes', '10-30 minutes', '30-60 minutes', '1+ hours'
            ]
            cursor = connection.cursor()
            cursor.execute(distribution(all_sessions), sessions_sql_params)
            calculated = cursor.fetchall()
            result = [{
                'label': dist_labels[index],
                'count': calculated[0][index]
            } for index in range(len(dist_labels))]

        return result
Exemple #13
0
    def _session_avg(self, base_query: Query, params: QueryParams,
                     filter: Filter) -> List[Dict[str, Any]]:
        """Average session length per interval, scaled for display, plus the
        overall average."""
        # Map the filter interval to (SQL date_trunc unit, frequency code);
        # anything unrecognized falls back to daily buckets.
        interval_map = {
            "minute": ("minute", "min"),
            "hour": ("hour", "H"),
            "week": ("week", "W"),
            "month": ("month", "M"),
        }
        interval, _freq = interval_map.get(filter.interval, ("day", "D"))

        average_length_time = "SELECT date_trunc('{interval}', timestamp) as start_time,\
                        AVG(length) AS average_session_length_per_day,\
                        SUM(length) AS total_session_length_per_day, \
                        COUNT(1) as num_sessions_per_day\
                        FROM (SELECT global_session_id, EXTRACT('EPOCH' FROM (MAX(timestamp) - MIN(timestamp)))\
                            AS length,\
                            MIN(timestamp) as timestamp FROM ({}) as count GROUP BY 1) as agg group by 1 order by start_time".format(
            base_query, interval=interval)

        cursor = connection.cursor()
        cursor.execute(average_length_time, params)
        rows = cursor.fetchall()
        if not rows:
            return []

        date_range = get_daterange(filter.date_from,
                                   filter.date_to,
                                   frequency=interval)
        data_array = [{
            "date": r[0],
            "count": r[1],
            "breakdown": "Total"
        } for r in rows]

        # Snap bucket starts to the labels used by the date range: weeks to
        # the preceding Sunday, months to the last day of the month.
        if interval == "week":
            for point in data_array:
                point["date"] -= datetime.timedelta(days=point["date"].weekday() + 1)
        elif interval == "month":
            for point in data_array:
                point["date"] = (point["date"].replace(day=1) + datetime.timedelta(
                    days=32)).replace(day=1) - datetime.timedelta(days=1)

        # Fill buckets with no sessions with zero.
        datewise_data = {point["date"]: point["count"] for point in data_array}
        values = [(key, datewise_data.get(key, 0)) for key in date_range]

        time_series_data = append_data(values,
                                       interval=filter.interval,
                                       math=None)
        scaled_data, label = scale_time_series(time_series_data["data"])
        time_series_data["data"] = scaled_data
        # Overall average = total session seconds / number of sessions.
        totals = [sum(col) for col in list(zip(*rows))[2:4]]
        overall_average = (totals[0] / totals[1]) if totals else 0
        avg_split = friendly_time(overall_average).split(" ")

        time_series_data.update({
            "label":
            "Average Session Length ({})".format(avg_split[1]),
            "count":
            int(avg_split[0]),
            "aggregated_value":
            int(avg_split[0]),
        })
        time_series_data.update(
            {"chartLabel": "Average Session Length ({})".format(label)})
        return [time_series_data]