def setup_periodic_tasks(sender: Celery, **kwargs):
    """Register the periodic task schedule on ``sender``.

    Entries are added in a fixed order. Several are conditional on
    ``settings.DEBUG``, ``settings.MULTI_TENANCY`` (treated as ``False`` when
    the setting is absent), ``settings.ASYNC_EVENT_PROPERTY_USAGE`` and
    ``is_clickhouse_enabled()``. Extra keyword arguments are accepted and
    ignored.
    """
    schedule = sender.add_periodic_task
    multi_tenancy = getattr(settings, "MULTI_TENANCY", False)

    if not settings.DEBUG:
        schedule(1.0, redis_celery_queue_depth.s(), name="1 sec queue probe", priority=0)

    # Heartbeat every 10sec to make sure the worker is alive
    schedule(10.0, redis_heartbeat.s(), name="10 sec heartbeat", priority=0)

    # Update events table partitions twice a week
    twice_weekly = crontab(day_of_week="mon,fri", hour=0, minute=0)
    schedule(twice_weekly, update_event_partitions.s())

    if multi_tenancy and not is_clickhouse_enabled():
        schedule(crontab(minute=0, hour="*/12"), run_session_recording_retention.s())

    if multi_tenancy:
        # Cloud (posthog-cloud) cron jobs: every day at midnight UTC
        schedule(crontab(hour=0, minute=0), calculate_billing_daily_usage.s())
    else:
        # Send weekly status report on self-hosted instances
        schedule(crontab(day_of_week="mon", hour=0, minute=0), status_report.s())

    # Send weekly email report (~ 8:00 SF / 16:00 UK / 17:00 EU)
    schedule(crontab(day_of_week="mon", hour=15, minute=0), send_weekly_email_report.s())

    schedule(crontab(day_of_week="fri", hour=0, minute=0), clean_stale_partials.s())

    # delete old plugin logs every 4 hours
    schedule(crontab(minute=0, hour="*/4"), delete_old_plugin_logs.s())

    # sync all Organization.available_features every hour
    schedule(crontab(minute=30, hour="*"), sync_all_organization_available_features.s())

    schedule(
        UPDATE_CACHED_DASHBOARD_ITEMS_INTERVAL_SECONDS,
        check_cached_items.s(),
        name="check dashboard items",
    )

    if is_clickhouse_enabled():
        # ClickHouse health gauges, each registered with a 120 interval
        schedule(120, clickhouse_lag.s(), name="clickhouse table lag")
        schedule(120, clickhouse_row_count.s(), name="clickhouse events table row count")
        schedule(120, clickhouse_part_count.s(), name="clickhouse table parts count")
        schedule(120, clickhouse_mutation_count.s(), name="clickhouse table mutations count")
    else:
        schedule(
            ACTION_EVENT_MAPPING_INTERVAL_SECONDS,
            calculate_event_action_mappings.s(),
            name="calculate event action mappings",
            expires=ACTION_EVENT_MAPPING_INTERVAL_SECONDS,
        )

    schedule(120, calculate_cohort.s(), name="recalculate cohorts")

    if settings.ASYNC_EVENT_PROPERTY_USAGE:
        schedule(
            EVENT_PROPERTY_USAGE_INTERVAL_SECONDS,
            calculate_event_property_usage.s(),
            name="calculate event property usage",
        )
def preflight_check(request: HttpRequest) -> JsonResponse:
    """Return a JSON summary of service health and instance configuration.

    Every caller receives the basic liveness flags. Authenticated callers
    additionally receive the instance configuration keys added below.
    """
    payload = {
        "django": True,
        # settings.TEST forces these three checks to report truthy.
        "redis": is_redis_alive() or settings.TEST,
        "plugins": is_plugin_server_alive() or settings.TEST,
        "celery": is_celery_alive() or settings.TEST,
        "db": is_postgres_alive(),
        # Always False under E2E testing; enables E2E testing of signup flow.
        "initiated": False if settings.E2E_TESTING else User.objects.exists(),
        "cloud": settings.MULTI_TENANCY,
        "available_social_auth_providers": get_available_social_auth_providers(),
    }

    if request.user.is_authenticated:
        payload.update(
            {
                "ee_available": settings.EE_AVAILABLE,
                "is_clickhouse_enabled": is_clickhouse_enabled(),
                "db_backend": settings.PRIMARY_DB.value,
                "available_timezones": get_available_timezones_with_offsets(),
                "opt_out_capture": os.environ.get("OPT_OUT_CAPTURE", False),
                "posthog_version": VERSION,
                "email_service_available": is_email_available(with_absolute_urls=True),
                "is_debug": settings.DEBUG,
                "is_event_property_usage_enabled": settings.ASYNC_EVENT_PROPERTY_USAGE,
                "licensed_users_available": get_licensed_users_available(),
                "site_url": settings.SITE_URL,
            }
        )

    return JsonResponse(payload)
def test_pagination(self):
    """150 events for one person are served as a page of 100 and a page of 50."""
    person_factory(team=self.team, distinct_ids=["1"])
    for offset in range(150):
        # Spread events out so every one has a distinct timestamp.
        event_time = timezone.now() - relativedelta(months=11) + relativedelta(days=offset, seconds=offset)
        event_factory(team=self.team, event="some event", distinct_id="1", timestamp=event_time)

    first_page = self.client.get("/api/event/?distinct_id=1").json()
    self.assertEqual(len(first_page["results"]), 100)
    self.assertIn("http://testserver/api/event/?distinct_id=1&before=", first_page["next"])

    second_page = self.client.get(first_page["next"]).json()

    from posthog.ee import is_clickhouse_enabled

    if is_clickhouse_enabled():
        from ee.clickhouse.client import sync_execute

        # Check that all 150 events are present in ClickHouse.
        total_events = sync_execute("select count(*) from events")[0][0]
        self.assertEqual(total_events, 150)

    self.assertEqual(len(second_page["results"]), 50)
def calculate_people(self, use_clickhouse=None):
    """Recompute this cohort's membership rows with a single SQL statement.

    Static cohorts are skipped. On success the calculation bookkeeping
    fields are reset and saved. On failure the error counter is incremented
    and the exception is reported via ``capture_exception``; under
    ``settings.DEBUG`` it is re-raised instead.

    Args:
        use_clickhouse: Whether to build the person query through the
            ClickHouse code path. ``None`` (the default) resolves to
            ``is_clickhouse_enabled()`` at call time. Previously the default
            was evaluated once, when the function was defined.
    """
    if self.is_static:
        return

    if use_clickhouse is None:
        use_clickhouse = is_clickhouse_enabled()

    try:
        if not use_clickhouse:
            self.is_calculating = True
            self.save()

        persons_query = self._clickhouse_persons_query() if use_clickhouse else self._postgres_persons_query()

        try:
            sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
        except EmptyResultSet:
            # Nothing matches: only clear the existing rows.
            query = DELETE_QUERY.format(cohort_id=self.pk)
            params = {}
        else:
            # Inject the cohort id as an extra selected column so the SELECT
            # can feed the insert part of UPDATE_QUERY directly.
            query = "{}{}".format(DELETE_QUERY, UPDATE_QUERY).format(
                cohort_id=self.pk,
                values_query=sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1),
            )

        cursor = connection.cursor()
        with transaction.atomic():
            cursor.execute(query, params)
            self.is_calculating = False
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
            self.save()
    except Exception as err:
        if settings.DEBUG:
            raise
        self.is_calculating = False
        self.errors_calculating = F("errors_calculating") + 1
        self.save()
        capture_exception(err)
def handle(self, *args, **options):
    """Prepare the test databases and test environment.

    Uses Django's ``DiscoverRunner`` to set up the databases and the test
    environment. When ClickHouse is enabled, it also creates the ClickHouse
    database (best effort) and applies the ``ee.clickhouse.migrations``
    migrations.
    """
    from django.test.runner import DiscoverRunner as TestRunner

    test_runner = TestRunner(interactive=False)
    test_runner.setup_databases()
    test_runner.setup_test_environment()

    if is_clickhouse_enabled():
        from infi.clickhouse_orm import Database  # type: ignore

        from posthog.settings import (
            CLICKHOUSE_DATABASE,
            CLICKHOUSE_HTTP_URL,
            CLICKHOUSE_PASSWORD,
            CLICKHOUSE_REPLICATION,
            CLICKHOUSE_USER,
            CLICKHOUSE_VERIFY,
        )

        database = Database(
            CLICKHOUSE_DATABASE,
            db_url=CLICKHOUSE_HTTP_URL,
            username=CLICKHOUSE_USER,
            password=CLICKHOUSE_PASSWORD,
            verify_ssl_cert=CLICKHOUSE_VERIFY,
        )

        try:
            database.create_database()
        except Exception:
            # Best effort: creation failures are ignored and migrate() runs
            # regardless. Only Exception is caught, so KeyboardInterrupt and
            # SystemExit still propagate.
            pass

        database.migrate("ee.clickhouse.migrations", replicated=CLICKHOUSE_REPLICATION)
def insert_cohort_from_query(
    cohort_id: int, insight_type: str, filter_data: Dict[str, Any], entity_data: Dict[str, Any]
) -> None:
    """Fill a cohort with the people matched by an insight query.

    Does nothing unless ClickHouse is enabled. Stickiness insights use the
    stickiness inserter; every other insight type uses the entity inserter.
    Afterwards the cohort's people are inserted into Postgres.
    """
    if not is_clickhouse_enabled():
        return

    from ee.clickhouse.queries.clickhouse_stickiness import insert_stickiness_people_into_cohort
    from ee.clickhouse.queries.util import get_earliest_timestamp
    from ee.clickhouse.views.actions import insert_entity_people_into_cohort
    from ee.clickhouse.views.cohort import insert_cohort_people_into_pg
    from posthog.models.entity import Entity
    from posthog.models.filters.filter import Filter
    from posthog.models.filters.stickiness_filter import StickinessFilter

    cohort = Cohort.objects.get(pk=cohort_id)
    entity = Entity(data=entity_data)

    if insight_type != INSIGHT_STICKINESS:
        insert_entity_people_into_cohort(cohort, entity, Filter(data=filter_data))
    else:
        stickiness_filter = StickinessFilter(
            data=filter_data, team=cohort.team, get_earliest_timestamp=get_earliest_timestamp
        )
        insert_stickiness_people_into_cohort(cohort, entity, stickiness_filter)

    insert_cohort_people_into_pg(cohort=cohort)
def __call__(self, request: HttpRequest):
    """Tag the ClickHouse client with request metadata while the request runs.

    Sets ``client._request_information`` before delegating to
    ``self.get_response`` and resets it to ``None`` once a response has been
    produced.
    """
    from ee.clickhouse import client

    route = resolve(request.path)

    # Truthy only with ClickHouse enabled, a logged-in user, and that user
    # being staff, impersonated, or the instance running in DEBUG.
    should_save = (
        is_clickhouse_enabled()
        and request.user.pk
        and (request.user.is_staff or is_impersonated_session(request) or settings.DEBUG)
    )
    request_info = {
        "save": should_save,
        "user_id": request.user.pk,
        "kind": "request",
        "id": f"{route.route} ({route.func.__name__})",
    }
    client._request_information = request_info

    response: HttpResponse = self.get_response(request)

    client._request_information = None
    return response
def demo(request: Request):
    """Render the demo page for the requesting user's organization.

    Fetches the organization's demo team, creating it if it does not exist,
    and makes it the user's current team. Ensures ``$pageview`` is among the
    team's event names. With ClickHouse enabled, demo data is created when
    the team has no events.

    Raises:
        AttributeError: If the user has no organization.
    """
    user = cast(User, request.user)
    organization = user.organization
    if not organization:
        raise AttributeError("This user has no organization.")

    try:
        team = organization.teams.get(is_demo=True)
    except Team.DoesNotExist:
        team = create_demo_team(organization, user, request)

    user.current_team = team
    user.save()

    if "$pageview" not in team.event_names:
        pageview_usage = {"event": "$pageview", "usage_count": None, "volume": None}
        team.event_names.append("$pageview")
        team.event_names_with_usage.append(pageview_usage)
        team.save()

    if is_clickhouse_enabled():
        # :TRICKY: Lazily backfill missing event data.
        from ee.clickhouse.models.event import get_events_by_team

        if not get_events_by_team(team_id=team.pk):
            create_demo_data(team, dashboards=False)

    return render_template("demo.html", request=request, context={"api_token": team.api_token})
def _calculate_funnel(filter: Filter, key: str, team_id: int) -> List[Dict[str, Any]]:
    """Run the funnel query for ``filter`` and update matching dashboard items.

    Dashboard items with the given team and ``filters_hash`` are marked as
    refreshing before the query and stamped with ``last_refresh`` after it.
    With ClickHouse enabled, the funnel implementation is chosen from the
    filter's order type and visualization type.
    """
    dashboard_items = DashboardItem.objects.filter(team_id=team_id, filters_hash=key)
    dashboard_items.update(refreshing=True)

    team = Team(pk=team_id)

    if not is_clickhouse_enabled():
        result = Funnel(filter=filter, team=team).run()
    else:
        # Pick the step-ordering implementation first.
        funnel_order_class: Type[ClickhouseFunnelBase]
        order_type = filter.funnel_order_type
        if order_type == FunnelOrderType.UNORDERED:
            funnel_order_class = ClickhouseFunnelUnordered
        elif order_type == FunnelOrderType.STRICT:
            funnel_order_class = ClickhouseFunnelStrict
        else:
            funnel_order_class = ClickhouseFunnel

        # Trends and time-to-convert wrap the ordering class; any other
        # visualization runs the ordering class directly.
        viz_type = filter.funnel_viz_type
        if viz_type == FunnelVizType.TRENDS:
            runner = ClickhouseFunnelTrends(team=team, filter=filter, funnel_order_class=funnel_order_class)
        elif viz_type == FunnelVizType.TIME_TO_CONVERT:
            runner = ClickhouseFunnelTimeToConvert(team=team, filter=filter, funnel_order_class=funnel_order_class)
        else:
            runner = funnel_order_class(team=team, filter=filter)
        result = runner.run()

    dashboard_items.update(last_refresh=timezone.now(), refreshing=False)
    return result
def test_insight_funnels_basic_get(self):
    """GET on the funnel endpoint returns both steps (or a loading marker)."""
    event_factory(team=self.team, event="user signed up", distinct_id="1")
    event_factory(team=self.team, event="user did things", distinct_id="1")

    steps = [
        {"id": "user signed up", "type": "events", "order": 0},
        {"id": "user did things", "type": "events", "order": 1},
    ]
    url = f"/api/insight/funnel/?funnel_window_days=14&events={json.dumps(steps)}"
    response = self.client.get(url).json()

    # clickhouse funnels don't have a loading system
    if not is_clickhouse_enabled():
        self.assertEqual(response["result"]["loading"], True)
        return

    result = response["result"]
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0]["name"], "user signed up")
    self.assertEqual(result[1]["name"], "user did things")
def capture_internal(event, distinct_id, ip, site_url, now, sent_at, team_id):
    """Hand a captured event to the ingestion pipeline.

    With ClickHouse enabled the event is passed to ``log_event`` together
    with a freshly generated UUID. Otherwise a
    ``process_event_with_plugins`` Celery task is sent to the plugins queue.
    """
    event_uuid = UUIDT()

    if not is_clickhouse_enabled():
        task_args = [distinct_id, ip, site_url, event, team_id, now.isoformat(), sent_at]
        celery_app.send_task(
            name="posthog.tasks.process_event.process_event_with_plugins",
            queue=settings.PLUGINS_CELERY_QUEUE,
            args=task_args,
        )
        return

    log_event(
        distinct_id=distinct_id,
        ip=ip,
        site_url=site_url,
        data=event,
        team_id=team_id,
        now=now,
        sent_at=sent_at,
        event_uuid=event_uuid,
    )
def earliest_timestamp_func(team_id: int):
    """Return the earliest event timestamp for a team from the active backend."""
    if not is_clickhouse_enabled():
        from posthog.models.event import Event

        return Event.objects.earliest_timestamp(team_id)

    from ee.clickhouse.queries.util import get_earliest_timestamp

    return get_earliest_timestamp(team_id)
def fetch_plugin_log_entries(
    *,
    team_id: Optional[int] = None,
    plugin_config_id: Optional[int] = None,
    after: Optional[timezone.datetime] = None,
    before: Optional[timezone.datetime] = None,
    search: Optional[str] = None,
    limit: Optional[int] = None,
) -> List[Union[PluginLogEntry, PluginLogEntryRaw]]:
    """Fetch plugin log entries, newest first, from the active backend.

    All filters are optional and are combined with AND.

    Args:
        team_id: Only entries for this team.
        plugin_config_id: Only entries for this plugin config.
        after: Only entries with a timestamp strictly after this value.
        before: Only entries with a timestamp strictly before this value.
        search: Case-insensitive substring to look for in the message.
        limit: Maximum number of entries to return; a falsy value means no limit.

    Returns:
        ``PluginLogEntryRaw`` tuples when ClickHouse is enabled, otherwise
        ``PluginLogEntry`` model instances.
    """
    if is_clickhouse_enabled():
        clickhouse_where_parts: List[str] = []
        clickhouse_kwargs: Dict[str, Any] = {}
        if team_id is not None:
            clickhouse_where_parts.append("team_id = %(team_id)s")
            clickhouse_kwargs["team_id"] = team_id
        if plugin_config_id is not None:
            clickhouse_where_parts.append("plugin_config_id = %(plugin_config_id)s")
            clickhouse_kwargs["plugin_config_id"] = plugin_config_id
        if after is not None:
            clickhouse_where_parts.append("timestamp > toDateTime64(%(after)s, 6)")
            # A "+00:00" offset suffix is stripped before the value is sent.
            clickhouse_kwargs["after"] = after.isoformat().replace("+00:00", "")
        if before is not None:
            clickhouse_where_parts.append("timestamp < toDateTime64(%(before)s, 6)")
            clickhouse_kwargs["before"] = before.isoformat().replace("+00:00", "")
        if search:
            clickhouse_where_parts.append("message ILIKE %(search)s")
            clickhouse_kwargs["search"] = f"%{search}%"

        # Emit WHERE only when there is at least one condition; an empty
        # "WHERE  ORDER BY ..." is invalid SQL.
        where_clause = f"WHERE {' AND '.join(clickhouse_where_parts)}" if clickhouse_where_parts else ""
        # int() guarantees only a number is ever interpolated into the query.
        limit_clause = f"LIMIT {int(limit)}" if limit else ""

        clickhouse_query = f"""
            SELECT id, team_id, plugin_id, plugin_config_id, timestamp, source, type, message, instance_id
            FROM plugin_log_entries
            {where_clause}
            ORDER BY timestamp DESC
            {limit_clause}
        """
        return [PluginLogEntryRaw(*result) for result in cast(list, sync_execute(clickhouse_query, clickhouse_kwargs))]

    filter_kwargs: Dict[str, Any] = {}
    if team_id is not None:
        filter_kwargs["team_id"] = team_id
    if plugin_config_id is not None:
        filter_kwargs["plugin_config_id"] = plugin_config_id
    if after is not None:
        filter_kwargs["timestamp__gt"] = after
    if before is not None:
        filter_kwargs["timestamp__lt"] = before
    if search:
        filter_kwargs["message__icontains"] = search

    query = PluginLogEntry.objects.order_by("-timestamp").filter(**filter_kwargs)
    if limit:
        query = query[:limit]
    return list(query)
def queries(self, request: Request) -> Response:
    """Return the currently running Postgres queries, plus ClickHouse ones if enabled."""
    results = {"postgres_running": self.get_postgres_running_queries()}

    if is_clickhouse_enabled():
        from ee.clickhouse.system_status import get_clickhouse_running_queries, get_clickhouse_slow_log

        running = get_clickhouse_running_queries()
        slow_log = get_clickhouse_slow_log()
        results.update({"clickhouse_running": running, "clickhouse_slow_log": slow_log})

    return Response({"results": results})
def bulk_import_events(self):
    """Bulk-insert ``self.events`` and ``self.snapshots`` into the active backend."""
    if not is_clickhouse_enabled():
        event_rows = [Event(**kw, team=self.team) for kw in self.events]
        Event.objects.bulk_create(event_rows)

        snapshot_rows = [SessionRecordingEvent(**kw, team=self.team) for kw in self.snapshots]
        SessionRecordingEvent.objects.bulk_create(snapshot_rows)
        return

    from ee.clickhouse.demo import bulk_create_events, bulk_create_session_recording_events

    bulk_create_events(self.events, team=self.team)
    bulk_create_session_recording_events(self.snapshots, team_id=self.team.pk)
def calculate_actions_from_last_calculation() -> None:
    """Recalculate events for every non-deleted action that is not already calculating.

    Each action is recalculated starting from its ``last_calculated_at``.
    Per-action and overall durations are logged. Returns immediately when
    ClickHouse is enabled.
    """
    # In EE, actions are not precalculated
    if is_clickhouse_enabled():
        return

    overall_started = time.time()
    pending_actions = cast(Sequence[Action], Action.objects.filter(is_calculating=False, deleted=False))

    for action in pending_actions:
        action_started = time.time()
        action.calculate_events(start=action.last_calculated_at)
        total_time = time.time() - action_started
        logger.info(f"Calculating action {action.pk} took {total_time:.2f} seconds")

    total_time_overall = time.time() - overall_started
    logger.info(f"Calculated new event-action pairs in {total_time_overall:.2f} s")
def _calculate_funnel(filter: Filter, key: str, team_id: int) -> List[Dict[str, Any]]:
    """Run the funnel query for ``filter`` and update matching dashboard items.

    The funnel class is imported by path: the ClickHouse implementation when
    ClickHouse is enabled, the default one otherwise.
    """
    dashboard_items = DashboardItem.objects.filter(team_id=team_id, filters_hash=key)
    dashboard_items.update(refreshing=True)

    if is_clickhouse_enabled():
        module_path, class_name = "ee.clickhouse.queries.clickhouse_funnel", "ClickhouseFunnel"
    else:
        module_path, class_name = "posthog.queries.funnel", "Funnel"
    insight_class = import_from(module_path, class_name)

    funnel = insight_class(filter=filter, team=Team(pk=team_id))
    result = funnel.run()

    dashboard_items.update(last_refresh=timezone.now(), refreshing=False)
    return result
def _calculate_by_filter(filter: FilterType, key: str, team_id: int, cache_type: CacheType) -> List[Dict[str, Any]]:
    """Run the insight registered for ``cache_type`` and update matching dashboard items.

    The insight class is looked up in ``CH_TYPE_TO_IMPORT`` when ClickHouse
    is enabled and in ``TYPE_TO_IMPORT`` otherwise, then imported by path.
    """
    dashboard_items = DashboardItem.objects.filter(team_id=team_id, filters_hash=key)
    dashboard_items.update(refreshing=True)

    import_table = CH_TYPE_TO_IMPORT if is_clickhouse_enabled() else TYPE_TO_IMPORT
    insight_class_path = import_table[cache_type]
    module_path, class_name = insight_class_path[0], insight_class_path[1]
    insight_class = import_from(module_path, class_name)

    result = insight_class().run(filter, Team(pk=team_id))

    dashboard_items.update(last_refresh=timezone.now(), refreshing=False)
    return result
def _get_properties_volume(team: Team) -> List[Tuple[str, int]]:
    """Return ``(property key, count)`` rows for the team's events of the last 30 days.

    With ClickHouse enabled the ``GET_PROPERTIES_VOLUME`` query is executed.
    Otherwise a raw Postgres query counts the JSON keys of
    ``posthog_event.properties``, ordered by count descending.
    """
    timestamp = now() - timedelta(days=30)

    if is_clickhouse_enabled():
        from ee.clickhouse.client import sync_execute
        from ee.clickhouse.sql.events import GET_PROPERTIES_VOLUME

        return sync_execute(GET_PROPERTIES_VOLUME, {"team_id": team.pk, "timestamp": timestamp})

    # The context manager closes the cursor even if execute() or fetchall()
    # raises.
    with connection.cursor() as cursor:
        cursor.execute(
            "SELECT json_build_array(jsonb_object_keys(properties)) ->> 0 as key1, count(1) FROM posthog_event WHERE team_id = %s AND timestamp > %s group by key1 order by count desc",
            [team.pk, timestamp],
        )
        return cursor.fetchall()
def _get_events_volume(team: Team) -> List[Tuple[str, int]]:
    """Return ``(event name, count)`` pairs for the team's events of the last 30 days.

    With ClickHouse enabled this is the result of ``GET_EVENTS_VOLUME``.
    Otherwise it is the ``values_list`` result of an ORM aggregation,
    returned without being converted to a list.
    """
    cutoff = now() - timedelta(days=30)

    if not is_clickhouse_enabled():
        per_event = Event.objects.filter(team=team, timestamp__gt=cutoff).values("event")
        return per_event.annotate(count=Count("id")).values_list("event", "count")

    from ee.clickhouse.client import sync_execute
    from ee.clickhouse.sql.events import GET_EVENTS_VOLUME

    query_args = {"team_id": team.pk, "timestamp": cutoff}
    return sync_execute(GET_EVENTS_VOLUME, query_args)
def calculate_cohorts() -> None:
    """Queue recalculation for up to ``PARALLEL_COHORTS`` stale cohorts.

    A cohort qualifies when it is not deleted, not static, not currently
    calculating, has at most 20 calculation errors and was last calculated
    at least ``MAX_AGE_MINUTES`` minutes ago. The least recently calculated
    cohorts are taken first.
    """
    # This task will be run every minute
    # Every minute, grab a few cohorts off the list and execute them
    stale_before = timezone.now() - relativedelta(minutes=MAX_AGE_MINUTES)
    candidates = Cohort.objects.filter(
        deleted=False,
        is_calculating=False,
        last_calculation__lte=stale_before,
        errors_calculating__lte=20,
    ).exclude(is_static=True)
    oldest_first = candidates.order_by(F("last_calculation").asc(nulls_first=True))

    for cohort in oldest_first[0:PARALLEL_COHORTS]:
        calculate_cohort.delay(cohort.id)
        if is_clickhouse_enabled():
            calculate_cohort_ch.delay(cohort.id)
def calculate_people_ch(self):
    """Recalculate this cohort's people in ClickHouse and record the outcome.

    Does nothing when ClickHouse is disabled. On success the calculation
    bookkeeping is reset. On failure the error counter is incremented and
    the exception is reported via ``capture_exception`` rather than raised.
    """
    if not is_clickhouse_enabled():
        return

    from ee.clickhouse.models.cohort import recalculate_cohortpeople

    try:
        recalculate_cohortpeople(self)

        self.is_calculating = False
        self.last_calculation = timezone.now()
        self.errors_calculating = 0
        self.save()
    except Exception as exc:
        self.is_calculating = False
        self.errors_calculating = F("errors_calculating") + 1
        self.save()
        capture_exception(exc)
def clickhouse_row_count():
    """Report the row count of every table in ``CLICKHOUSE_TABLES`` as a gauge.

    Does nothing unless ClickHouse is enabled and ``settings.EE_AVAILABLE``
    is truthy. A failure for one table is ignored so the remaining tables
    are still measured.
    """
    if not (is_clickhouse_enabled() and settings.EE_AVAILABLE):
        return

    from ee.clickhouse.client import sync_execute
    from posthog.internal_metrics import gauge

    for table in CLICKHOUSE_TABLES:
        try:
            query = """select count(1) freq from {table};""".format(table=table)
            rows = sync_execute(query)[0][0]
            gauge("posthog_celery_clickhouse_table_row_count", rows, tags={"table": table})
        except Exception:
            # Best-effort metric. Only Exception is caught, so SystemExit and
            # KeyboardInterrupt (e.g. a shutdown request) still propagate.
            pass
def clickhouse_part_count():
    """Report the number of rows in ``system.parts`` per table as a gauge.

    Does nothing unless ClickHouse is enabled and ``settings.EE_AVAILABLE``
    is truthy.
    """
    if not (is_clickhouse_enabled() and settings.EE_AVAILABLE):
        return

    from ee.clickhouse.client import sync_execute
    from posthog.internal_metrics import gauge

    parts_query = """ select table, count(1) freq from system.parts group by table order by freq desc; """
    for table, parts in sync_execute(parts_query):
        gauge(f"posthog_celery_clickhouse_table_parts_count", parts, tags={"table": table})
def clickhouse_lag():
    """Report ``now() - max(_timestamp)`` for every table in ``CLICKHOUSE_TABLES`` as a gauge.

    Does nothing unless ClickHouse is enabled and ``settings.EE_AVAILABLE``
    is truthy. A failure for one table is ignored so the remaining tables
    are still measured.
    """
    if not (is_clickhouse_enabled() and settings.EE_AVAILABLE):
        return

    from ee.clickhouse.client import sync_execute
    from posthog.internal_metrics import gauge

    for table in CLICKHOUSE_TABLES:
        try:
            query = (
                """select max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag from {table};"""
            ).format(table=table)
            # The third selected column is the lag.
            lag = sync_execute(query)[0][2]
            # NOTE(review): the double underscore in this metric name differs
            # from the sibling gauges; kept unchanged because it is a runtime
            # identifier.
            gauge("posthog_celery_clickhouse__table_lag_seconds", lag, tags={"table": table})
        except Exception:
            # Best-effort metric. Only Exception is caught, so SystemExit and
            # KeyboardInterrupt (e.g. a shutdown request) still propagate.
            pass
def clickhouse_mutation_count():
    """Report the number of rows in ``system.mutations`` per table as a gauge.

    Does nothing unless ClickHouse is enabled and ``settings.EE_AVAILABLE``
    is truthy.
    """
    if not (is_clickhouse_enabled() and settings.EE_AVAILABLE):
        return

    from ee.clickhouse.client import sync_execute
    from posthog.internal_metrics import gauge

    mutations_query = """ SELECT table, count(1) AS freq FROM system.mutations GROUP BY table ORDER BY freq DESC """
    for table, muts in sync_execute(mutations_query):
        gauge(f"posthog_celery_clickhouse_table_mutations_count", muts, tags={"table": table})
def insert_users_by_list(self, items: List[str]) -> None:
    """Add the people matching ``items`` to this cohort, in batches of 1000.

    The lookup below matches ``items`` against person distinct IDs of this
    cohort's team, skipping people already in the cohort.
    NOTE(review): the previous docstring said items "can be distinct_id or
    email", but no e-mail lookup is visible here. Confirm with callers.

    On success the calculation bookkeeping is reset. On failure the error
    counter is incremented and the exception is reported via
    ``capture_exception``; under ``settings.DEBUG`` it is re-raised instead.
    """
    batchsize = 1000
    use_clickhouse = is_clickhouse_enabled()
    if use_clickhouse:
        from ee.clickhouse.models.cohort import insert_static_cohort

    try:
        cursor = connection.cursor()
        for start in range(0, len(items), batchsize):
            batch = items[start : start + batchsize]

            in_team = Person.objects.filter(team_id=self.team_id)
            matching = in_team.filter(
                Q(persondistinctid__team_id=self.team_id, persondistinctid__distinct_id__in=batch)
            )
            persons_query = matching.exclude(cohort__id=self.id)

            if use_clickhouse:
                person_uuids = list(persons_query.values_list("uuid", flat=True))
                insert_static_cohort(person_uuids, self.pk, self.team)

            sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
            # Inject the cohort id as an extra selected column so the SELECT
            # can feed UPDATE_QUERY directly.
            values_query = sql.replace('FROM "posthog_person"', ', {} FROM "posthog_person"'.format(self.pk), 1)
            query = UPDATE_QUERY.format(cohort_id=self.pk, values_query=values_query)
            cursor.execute(query, params)

        self.is_calculating = False
        self.last_calculation = timezone.now()
        self.errors_calculating = 0
        self.save()
    except Exception as err:
        if settings.DEBUG:
            raise err
        self.is_calculating = False
        self.errors_calculating = F("errors_calculating") + 1
        self.save()
        capture_exception(err)
def _calculate_funnel(filter: Filter, key: str, team_id: int) -> List[Dict[str, Any]]:
    """Run the funnel query for ``filter`` and update matching dashboard items.

    With ClickHouse enabled, the funnel class is chosen from the filter's
    visualization type; otherwise the default ``Funnel`` is used.
    """
    dashboard_items = DashboardItem.objects.filter(team_id=team_id, filters_hash=key)
    dashboard_items.update(refreshing=True)

    insight_class: Union[Type[Funnel]]
    if not is_clickhouse_enabled():
        insight_class = Funnel
    else:
        viz_type = filter.funnel_viz_type
        if viz_type == FunnelVizType.TRENDS:
            insight_class = ClickhouseFunnelTrends
        elif viz_type == FunnelVizType.TIME_TO_CONVERT:
            insight_class = ClickhouseFunnelTimeToConvert
        else:
            insight_class = ClickhouseFunnel

    funnel = insight_class(filter=filter, team=Team(pk=team_id))
    result = funnel.run()

    dashboard_items.update(last_refresh=timezone.now(), refreshing=False)
    return result
def test_insight_funnels_basic_post(self):
    """POST on the funnel endpoint returns both steps with counts (or a loading marker)."""
    person_factory(team=self.team, distinct_ids=["1"])
    event_factory(team=self.team, event="user signed up", distinct_id="1")
    event_factory(team=self.team, event="user did things", distinct_id="1")

    payload = {
        "events": [
            {"id": "user signed up", "type": "events", "order": 0},
            {"id": "user did things", "type": "events", "order": 1},
        ],
        "funnel_window_days": 14,
    }
    response = self.client.post("/api/insight/funnel/", payload).json()

    # clickhouse funnels don't have a loading system
    if not is_clickhouse_enabled():
        self.assertEqual(response["result"]["loading"], True)
        return

    result = response["result"]
    self.assertEqual(len(result), 2)
    self.assertEqual(result[0]["name"], "user signed up")
    self.assertEqual(result[0]["count"], 1)
    self.assertEqual(result[1]["name"], "user did things")
    self.assertEqual(result[1]["count"], 1)
def create_people(self):
    """Create ``self.n_people`` people, each with one generated distinct ID.

    People and their distinct IDs are bulk-created through the ORM. With
    ClickHouse enabled, both are also created through the ClickHouse person
    helpers.
    """
    self.people = [self.make_person(index) for index in range(self.n_people)]
    self.distinct_ids = [str(UUIDT()) for _ in self.people]
    Person.objects.bulk_create(self.people)

    pids = []
    for person, distinct_id in zip(self.people, self.distinct_ids):
        pids.append(PersonDistinctId(team=self.team, person=person, distinct_id=distinct_id))
    PersonDistinctId.objects.bulk_create(pids)

    if not is_clickhouse_enabled():
        return

    from ee.clickhouse.models.person import create_person, create_person_distinct_id

    for person in self.people:
        create_person(
            team_id=person.team.pk,
            properties=person.properties,
            is_identified=person.is_identified,
        )

    for pid in pids:
        # use dummy number for id
        create_person_distinct_id(0, pid.team.pk, pid.distinct_id, str(pid.person.uuid))