def demo(request: Request):
    user = cast(User, request.user)
    organization = user.organization
    if not organization:
        raise AttributeError("This user has no organization.")
    try:
        team = organization.teams.get(is_demo=True)
    except Team.DoesNotExist:
        team = create_demo_team(organization, user, request)
    user.current_team = team
    user.save()
    EventDefinition.objects.get_or_create(team=team, name="$pageview")

    if is_clickhouse_enabled():
        # :TRICKY: Lazily backfill missing event data.
        from ee.clickhouse.models.event import get_events_by_team

        result = get_events_by_team(team_id=team.pk)
        if not result:
            create_demo_data(team, dashboards=False)

    return render_template("demo.html", request=request, context={"api_token": team.api_token})
def handle(self, *args, **options):
    from django.test.runner import DiscoverRunner as TestRunner

    test_runner = TestRunner(interactive=False)
    test_runner.setup_databases()
    test_runner.setup_test_environment()

    if is_clickhouse_enabled():
        from infi.clickhouse_orm import Database  # type: ignore

        from posthog.settings import (
            CLICKHOUSE_DATABASE,
            CLICKHOUSE_HTTP_URL,
            CLICKHOUSE_PASSWORD,
            CLICKHOUSE_REPLICATION,
            CLICKHOUSE_USER,
            CLICKHOUSE_VERIFY,
        )

        database = Database(
            CLICKHOUSE_DATABASE,
            db_url=CLICKHOUSE_HTTP_URL,
            username=CLICKHOUSE_USER,
            password=CLICKHOUSE_PASSWORD,
            verify_ssl_cert=CLICKHOUSE_VERIFY,
        )

        try:
            database.create_database()
        except Exception:
            # The database likely already exists; continue to migrations.
            pass
        database.migrate("ee.clickhouse.migrations", replicated=CLICKHOUSE_REPLICATION)
def preflight_check(request: HttpRequest) -> JsonResponse:
    response = {
        "django": True,
        "redis": is_redis_alive() or settings.TEST,
        "plugins": is_plugin_server_alive() or settings.TEST,
        "celery": is_celery_alive() or settings.TEST,
        "db": is_postgres_alive(),
        "initiated": Organization.objects.exists(),
        "cloud": settings.MULTI_TENANCY,
        "realm": get_instance_realm(),
        "available_social_auth_providers": get_available_social_auth_providers(),
        "can_create_org": get_can_create_org(),
        "email_service_available": is_email_available(with_absolute_urls=True),
    }

    if request.user.is_authenticated:
        response = {
            **response,
            "ee_available": settings.EE_AVAILABLE,
            "is_clickhouse_enabled": is_clickhouse_enabled(),
            "db_backend": settings.PRIMARY_DB.value,
            "available_timezones": get_available_timezones_with_offsets(),
            "opt_out_capture": os.environ.get("OPT_OUT_CAPTURE", False),
            "posthog_version": VERSION,
            "is_debug": settings.DEBUG,
            "is_event_property_usage_enabled": settings.ASYNC_EVENT_PROPERTY_USAGE,
            "licensed_users_available": get_licensed_users_available(),
            "site_url": settings.SITE_URL,
            "instance_preferences": settings.INSTANCE_PREFERENCES,
        }

    return JsonResponse(response)
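# The gate used throughout these snippets. A minimal sketch of what it plausibly
# checks, inferred from the "db_backend": settings.PRIMARY_DB.value entry above;
# the real helper lives in posthog.utils and may differ. AnalyticsDBMS is assumed
# to be an enum with a CLICKHOUSE member.
def is_clickhouse_enabled_sketch() -> bool:
    from posthog.constants import AnalyticsDBMS  # assumed location
    from posthog.settings import EE_AVAILABLE, PRIMARY_DB

    return EE_AVAILABLE and PRIMARY_DB == AnalyticsDBMS.CLICKHOUSE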
def _calculate_funnel(filter: Filter, key: str, team_id: int) -> List[Dict[str, Any]]:
    team = Team(pk=team_id)

    if is_clickhouse_enabled():
        funnel_order_class: Type[ClickhouseFunnelBase] = ClickhouseFunnel
        if filter.funnel_order_type == FunnelOrderType.UNORDERED:
            funnel_order_class = ClickhouseFunnelUnordered
        elif filter.funnel_order_type == FunnelOrderType.STRICT:
            funnel_order_class = ClickhouseFunnelStrict

        if filter.funnel_viz_type == FunnelVizType.TRENDS:
            result = ClickhouseFunnelTrends(team=team, filter=filter, funnel_order_class=funnel_order_class).run()
        elif filter.funnel_viz_type == FunnelVizType.TIME_TO_CONVERT:
            result = ClickhouseFunnelTimeToConvert(team=team, filter=filter, funnel_order_class=funnel_order_class).run()
        else:
            result = funnel_order_class(team=team, filter=filter).run()
    else:
        result = Funnel(filter=filter, team=team).run()

    return result
def identify(
    distinct_id: str,
    team_id: int,
    # TODO: I believe the handling of properties here isn't totally true to how
    # it is handled in reality. We could update `identify` to reflect reality,
    # but really we should switch to the `/e/` endpoint and remove any room for
    # discrepancies.
    properties: Optional[Dict[str, Any]] = None,
):
    """
    Simulate what the plugin-server does, so we end up with the database in the
    right state.
    """
    properties = properties or {}

    # Only the import location differs between the two backends.
    if is_clickhouse_enabled():
        from ee.clickhouse.models.person import Person, PersonDistinctId
    else:
        from posthog.models.person import Person, PersonDistinctId

    person = Person.objects.create(team_id=team_id, properties=properties)
    PersonDistinctId.objects.create(distinct_id=distinct_id, team_id=team_id, person_id=person.id)

    capture_event(
        event=EventData(
            event="$identify",
            team_id=team_id,
            distinct_id=distinct_id,
            timestamp=datetime.now(),
            properties={"distinct_id": distinct_id},
        )
    )
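# Example call for the helper above: a minimal sketch, assuming a team with pk=1
# already exists in the test database (ids and properties are illustrative).
identify("distinct-id-1", team_id=1, properties={"email": "alice@example.com"})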
def insert_cohort_from_query(
    cohort_id: int, insight_type: str, filter_data: Dict[str, Any], entity_data: Dict[str, Any]
) -> None:
    if is_clickhouse_enabled():
        from ee.clickhouse.queries.clickhouse_stickiness import insert_stickiness_people_into_cohort
        from ee.clickhouse.queries.util import get_earliest_timestamp
        from ee.clickhouse.views.actions import insert_entity_people_into_cohort
        from ee.clickhouse.views.cohort import insert_cohort_people_into_pg
        from posthog.models.entity import Entity
        from posthog.models.filters.filter import Filter
        from posthog.models.filters.stickiness_filter import StickinessFilter

        cohort = Cohort.objects.get(pk=cohort_id)
        entity = Entity(data=entity_data)
        if insight_type == INSIGHT_STICKINESS:
            _stickiness_filter = StickinessFilter(
                data=filter_data, team=cohort.team, get_earliest_timestamp=get_earliest_timestamp
            )
            insert_stickiness_people_into_cohort(cohort, entity, _stickiness_filter)
        else:
            _filter = Filter(data=filter_data)
            insert_entity_people_into_cohort(cohort, entity, _filter)

        insert_cohort_people_into_pg(cohort=cohort)
def earliest_timestamp_func(team_id: int):
    if is_clickhouse_enabled():
        from ee.clickhouse.queries.util import get_earliest_timestamp

        return get_earliest_timestamp(team_id)

    from posthog.models.event import Event

    return Event.objects.earliest_timestamp(team_id)
def analyze_ch_query(self, request: Request) -> Response:
    response = {}
    if is_clickhouse_enabled():
        from ee.clickhouse.system_status import analyze_query

        response["results"] = analyze_query(request.data["query"])
    return Response(response)
def send_org_usage_report():
    if is_clickhouse_enabled():
        from ee.tasks.org_usage_report import send_all_org_usage_reports as send_reports_clickhouse

        send_reports_clickhouse()
    else:
        from posthog.tasks.org_usage_report import send_all_org_usage_reports as send_reports_postgres

        send_reports_postgres()
def calculate_action(action_id: int) -> None:
    if is_clickhouse_enabled():  # In EE, actions are not precalculated
        return
    start_time = time.time()
    action: Action = Action.objects.get(pk=action_id)
    action.calculate_events()
    total_time = time.time() - start_time
    logger.info(f"Calculating action {action.pk} took {total_time:.2f} seconds")
def queries(self, request: Request) -> Response:
    queries = {"postgres_running": self.get_postgres_running_queries()}

    if is_clickhouse_enabled():
        from ee.clickhouse.system_status import get_clickhouse_running_queries, get_clickhouse_slow_log

        queries["clickhouse_running"] = get_clickhouse_running_queries()
        queries["clickhouse_slow_log"] = get_clickhouse_slow_log()

    return Response({"results": queries})
def test_pagination_bounded_date_range(self):
    with freeze_time("2021-10-10T12:03:03.829294Z"):
        person_factory(team=self.team, distinct_ids=["1"])
        now = timezone.now() - relativedelta(months=11)
        after = now.astimezone(pytz.utc).isoformat()
        before = (now + relativedelta(days=23)).astimezone(pytz.utc).isoformat()
        params = {"distinct_id": "1", "after": after, "before": before, "limit": 10}
        params_string = urlencode(params)
        for idx in range(0, 25):
            event_factory(
                team=self.team,
                event="some event",
                distinct_id="1",
                timestamp=now + relativedelta(days=idx, seconds=-idx),
            )

        response = self.client.get(f"/api/event/?{params_string}").json()
        self.assertEqual(len(response["results"]), 10)
        self.assertIn("before=", unquote(response["next"]))
        self.assertIn(f"after={after}", unquote(response["next"]))

        # Same query against the project-scoped endpoint.
        params = {"distinct_id": "1", "after": after, "before": before, "limit": 10}
        params_string = urlencode(params)
        response = self.client.get(f"/api/projects/{self.team.id}/events/?{params_string}").json()
        self.assertEqual(len(response["results"]), 10)
        self.assertIn("before=", unquote(response["next"]))
        self.assertIn(f"after={after}", unquote(response["next"]))

        page2 = self.client.get(response["next"]).json()
        from posthog.utils import is_clickhouse_enabled

        if is_clickhouse_enabled():
            from ee.clickhouse.client import sync_execute

            self.assertEqual(
                sync_execute("select count(*) from events where team_id = %(team_id)s", {"team_id": self.team.pk})[0][0],
                25,
            )

        self.assertEqual(len(page2["results"]), 10)
        self.assertIn("before=", unquote(page2["next"]))
        self.assertIn(f"after={after}", unquote(page2["next"]))

        page3 = self.client.get(page2["next"]).json()
        self.assertEqual(len(page3["results"]), 3)
        self.assertIsNone(page3["next"])
def bulk_import_events(self):
    if is_clickhouse_enabled():
        from ee.clickhouse.demo import bulk_create_events, bulk_create_session_recording_events

        bulk_create_events(self.events, team=self.team)
        bulk_create_session_recording_events(self.snapshots, team_id=self.team.pk)
    else:
        Event.objects.bulk_create([Event(**kw, team=self.team) for kw in self.events])
        SessionRecordingEvent.objects.bulk_create(
            [SessionRecordingEvent(**kw, team=self.team) for kw in self.snapshots]
        )
def test_insight_funnels_basic_get(self):
    event_factory(team=self.team, event="user signed up", distinct_id="1")
    event_factory(team=self.team, event="user did things", distinct_id="1")
    response = self.client.get(
        f"/api/projects/{self.team.id}/insights/funnel/?funnel_window_days=14&events={json.dumps([{'id': 'user signed up', 'type': 'events', 'order': 0}, {'id': 'user did things', 'type': 'events', 'order': 1}])}"
    ).json()

    # ClickHouse funnels don't have a loading system
    if is_clickhouse_enabled():
        self.assertEqual(len(response["result"]), 2)
        self.assertEqual(response["result"][0]["name"], "user signed up")
        self.assertEqual(response["result"][1]["name"], "user did things")
    else:
        self.assertEqual(response["result"]["loading"], True)
def calculate_actions_from_last_calculation() -> None:
    if is_clickhouse_enabled():  # In EE, actions are not precalculated
        return
    start_time_overall = time.time()
    for action in cast(Sequence[Action], Action.objects.filter(is_calculating=False, deleted=False)):
        start_time = time.time()
        action.calculate_events(start=action.last_calculated_at)
        total_time = time.time() - start_time
        logger.info(f"Calculating action {action.pk} took {total_time:.2f} seconds")
    total_time_overall = time.time() - start_time_overall
    logger.info(f"Calculated new event-action pairs in {total_time_overall:.2f} s")
def calculate_people_ch(self):
    if is_clickhouse_enabled():
        from ee.clickhouse.models.cohort import recalculate_cohortpeople
        from posthog.tasks.calculate_cohort import calculate_cohort

        try:
            recalculate_cohortpeople(self)
            calculate_cohort(self.id)
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
        except Exception:
            self.errors_calculating = F("errors_calculating") + 1
            raise
        finally:
            self.is_calculating = False
            self.save()
def _get_events_volume(team: Team) -> List[Tuple[str, int]]:
    timestamp = now() - timedelta(days=30)

    if is_clickhouse_enabled():
        from ee.clickhouse.client import sync_execute
        from ee.clickhouse.sql.events import GET_EVENTS_VOLUME

        return sync_execute(GET_EVENTS_VOLUME, {"team_id": team.pk, "timestamp": timestamp})

    return (
        Event.objects.filter(team=team, timestamp__gt=timestamp)
        .values("event")
        .annotate(count=Count("id"))
        .values_list("event", "count")
    )
def calculate_cohorts() -> None:
    # This task runs every minute: grab a few cohorts off the list and recalculate them.
    for cohort in (
        Cohort.objects.filter(
            deleted=False,
            is_calculating=False,
            last_calculation__lte=timezone.now() - relativedelta(minutes=MAX_AGE_MINUTES),
            errors_calculating__lte=20,
        )
        .exclude(is_static=True)
        .order_by(F("last_calculation").asc(nulls_first=True))[0 : settings.CALCULATE_X_COHORTS_PARALLEL]
    ):
        if is_clickhouse_enabled():
            calculate_cohort_ch.delay(cohort.id)
        else:
            calculate_cohort.delay(cohort.id)
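# How the "runs every minute" cadence is typically wired up: a minimal sketch of a
# Celery beat entry. The task path is assumed for illustration, not confirmed by
# the source.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE_SKETCH = {
    "calculate-cohorts": {
        "task": "posthog.tasks.calculate_cohort.calculate_cohorts",  # assumed task name
        "schedule": crontab(),  # a bare crontab() fires every minute
    },
}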
def clickhouse_row_count():
    if is_clickhouse_enabled() and settings.EE_AVAILABLE:
        from ee.clickhouse.client import sync_execute
        from posthog.internal_metrics import gauge

        for table in CLICKHOUSE_TABLES:
            try:
                rows = sync_execute(f"select count(1) freq from {table};")[0][0]
                gauge("posthog_celery_clickhouse_table_row_count", rows, tags={"table": table})
            except Exception:
                # The table may not exist on this instance; skip it.
                pass
def clickhouse_part_count():
    if is_clickhouse_enabled() and settings.EE_AVAILABLE:
        from ee.clickhouse.client import sync_execute
        from posthog.internal_metrics import gauge

        QUERY = """
            select table, count(1) freq
            from system.parts
            group by table
            order by freq desc;
        """
        rows = sync_execute(QUERY)
        for table, parts in rows:
            gauge("posthog_celery_clickhouse_table_parts_count", parts, tags={"table": table})
def calculate_people(self, use_clickhouse=is_clickhouse_enabled()):
    # NB: the default above is evaluated once, when the class body is executed.
    if self.is_static:
        return
    try:
        if not use_clickhouse:
            self.is_calculating = True
            self.save()
            persons_query = self._postgres_persons_query()
        else:
            persons_query = self._clickhouse_persons_query()

        try:
            sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
        except EmptyResultSet:
            query = DELETE_QUERY.format(cohort_id=self.pk)
            params = {}
        else:
            query = f"""
                {DELETE_QUERY};
                {UPDATE_QUERY};
            """.format(
                cohort_id=self.pk,
                values_query=sql.replace(
                    'FROM "posthog_person"',
                    ', {} FROM "posthog_person"'.format(self.pk),
                    1,
                ),
            )

        cursor = connection.cursor()
        with transaction.atomic():
            cursor.execute(query, params)

        if not use_clickhouse:
            self.last_calculation = timezone.now()
            self.errors_calculating = 0
    except Exception:
        if not use_clickhouse:
            self.errors_calculating = F("errors_calculating") + 1
        raise
    finally:
        if not use_clickhouse:
            self.is_calculating = False
            self.save()
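# The use_clickhouse=is_clickhouse_enabled() default above is a classic Python
# pitfall: it is evaluated once, when the class body first runs, so later changes
# to the setting are never picked up. A minimal sketch of the lazier per-call
# pattern (illustrative, not from the source):
def calculate_people_lazy(self, use_clickhouse: Optional[bool] = None):
    if use_clickhouse is None:
        use_clickhouse = is_clickhouse_enabled()  # resolved on each call instead
    ...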
def insert_users_by_list(self, items: List[str]) -> None:
    """
    Items can be distinct_id or email.
    Important! Does not insert into clickhouse.
    """
    batchsize = 1000
    use_clickhouse = is_clickhouse_enabled()
    if use_clickhouse:
        from ee.clickhouse.models.cohort import insert_static_cohort
    try:
        cursor = connection.cursor()
        for i in range(0, len(items), batchsize):
            batch = items[i : i + batchsize]
            persons_query = (
                Person.objects.filter(team_id=self.team_id)
                .filter(Q(persondistinctid__team_id=self.team_id, persondistinctid__distinct_id__in=batch))
                .exclude(cohort__id=self.id)
            )
            if use_clickhouse:
                insert_static_cohort([p for p in persons_query.values_list("uuid", flat=True)], self.pk, self.team)
            sql, params = persons_query.distinct("pk").only("pk").query.sql_with_params()
            query = UPDATE_QUERY.format(
                cohort_id=self.pk,
                values_query=sql.replace(
                    'FROM "posthog_person"',
                    ', {} FROM "posthog_person"'.format(self.pk),
                    1,
                ),
            )
            cursor.execute(query, params)

        self.is_calculating = False
        self.last_calculation = timezone.now()
        self.errors_calculating = 0
        self.save()
    except Exception as err:
        if settings.DEBUG:
            raise err
        self.is_calculating = False
        self.errors_calculating = F("errors_calculating") + 1
        self.save()
        capture_exception(err)
def clickhouse_lag():
    if is_clickhouse_enabled() and settings.EE_AVAILABLE:
        from ee.clickhouse.client import sync_execute
        from posthog.internal_metrics import gauge

        for table in CLICKHOUSE_TABLES:
            try:
                QUERY = """
                    select max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag
                    from {table};
                """
                query = QUERY.format(table=table)
                lag = sync_execute(query)[0][2]
                gauge("posthog_celery_clickhouse__table_lag_seconds", lag, tags={"table": table})
            except Exception:
                # The table may be missing or empty; skip it.
                pass
def _simplify_property(
    self, team: "Team", property: "Property", is_clickhouse_enabled=is_clickhouse_enabled()
) -> List["Property"]:
    # NB: like calculate_people above, the default is evaluated once at import time.
    if property.type == "cohort" and is_clickhouse_enabled:
        from ee.clickhouse.models.cohort import simplified_cohort_filter_properties
        from posthog.models import Cohort

        try:
            cohort = Cohort.objects.get(pk=property.value, team_id=team.pk)
        except Cohort.DoesNotExist:
            # :TODO: Handle non-existing resource in-query instead
            return [property]

        return simplified_cohort_filter_properties(cohort, team)

    return [property]
def test_pagination(self):
    with freeze_time("2021-10-10T12:03:03.829294Z"):
        person_factory(team=self.team, distinct_ids=["1"])
        for idx in range(0, 250):
            event_factory(
                team=self.team,
                event="some event",
                distinct_id="1",
                timestamp=timezone.now() - relativedelta(months=11) + relativedelta(days=idx, seconds=idx),
            )

        response = self.client.get("/api/event/?distinct_id=1").json()
        self.assertEqual(len(response["results"]), 100)
        self.assertIn("http://testserver/api/event/?distinct_id=1&before=", unquote(response["next"]))

        response = self.client.get(f"/api/projects/{self.team.id}/events/?distinct_id=1").json()
        self.assertEqual(len(response["results"]), 100)
        self.assertIn(
            f"http://testserver/api/projects/{self.team.id}/events/?distinct_id=1&before=",
            unquote(response["next"]),
        )

        page2 = self.client.get(response["next"]).json()
        from posthog.utils import is_clickhouse_enabled

        if is_clickhouse_enabled():
            from ee.clickhouse.client import sync_execute

            self.assertEqual(
                sync_execute("select count(*) from events where team_id = %(team_id)s", {"team_id": self.team.pk})[0][0],
                250,
            )

        self.assertEqual(len(page2["results"]), 100)
        self.assertEqual(
            unquote(page2["next"]),
            f"http://testserver/api/projects/{self.team.id}/events/?distinct_id=1&before=2020-12-30T12:03:53.829294+00:00",
        )

        page3 = self.client.get(page2["next"]).json()
        self.assertEqual(len(page3["results"]), 50)
        self.assertIsNone(page3["next"])
def create(self, validated_data: Dict, *args: Any, **kwargs: Any) -> Cohort:
    request = self.context["request"]
    validated_data["created_by"] = request.user

    if not validated_data.get("is_static"):
        validated_data["is_calculating"] = True
    cohort = Cohort.objects.create(team_id=self.context["team_id"], **validated_data)

    if cohort.is_static:
        self._handle_static(cohort, request)
    else:
        if is_clickhouse_enabled():
            calculate_cohort_ch.delay(cohort.id)
        else:
            calculate_cohort.delay(cohort.id)

    posthoganalytics.capture(request.user.distinct_id, "cohort created", cohort.get_analytics_metadata())
    return cohort
def clickhouse_mutation_count():
    if is_clickhouse_enabled() and settings.EE_AVAILABLE:
        from ee.clickhouse.client import sync_execute
        from posthog.internal_metrics import gauge

        QUERY = """
            SELECT table, count(1) AS freq
            FROM system.mutations
            WHERE is_done = 0
            GROUP BY table
            ORDER BY freq DESC
        """
        rows = sync_execute(QUERY)
        for table, muts in rows:
            gauge("posthog_celery_clickhouse_table_mutations_count", muts, tags={"table": table})
def update(self, cohort: Cohort, validated_data: Dict, *args: Any, **kwargs: Any) -> Cohort:  # type: ignore
    request = self.context["request"]
    cohort.name = validated_data.get("name", cohort.name)
    cohort.description = validated_data.get("description", cohort.description)
    cohort.groups = validated_data.get("groups", cohort.groups)
    cohort.is_static = validated_data.get("is_static", cohort.is_static)
    deleted_state = validated_data.get("deleted", None)

    is_deletion_change = deleted_state is not None and cohort.deleted != deleted_state
    if is_deletion_change:
        cohort.deleted = deleted_state

    if not cohort.is_static and not is_deletion_change:
        cohort.is_calculating = True
    cohort.save()

    if not deleted_state:
        if cohort.is_static:
            # A static cohort can't be updated via the trend/stickiness flow
            if request.FILES.get("csv"):
                self._calculate_static_by_csv(request.FILES["csv"], cohort)
        else:
            if is_clickhouse_enabled():
                calculate_cohort_ch.delay(cohort.id)
            else:
                calculate_cohort.delay(cohort.id)

    posthoganalytics.capture(
        request.user.distinct_id,
        "cohort updated",
        {**cohort.get_analytics_metadata(), "updated_by_creator": request.user == cohort.created_by},
    )
    return cohort
def create_people(self):
    self.people = [self.make_person(i) for i in range(self.n_people)]
    self.distinct_ids = [str(UUIDT()) for _ in self.people]
    Person.objects.bulk_create(self.people)

    pids = [
        PersonDistinctId(team=self.team, person=person, distinct_id=distinct_id)
        for person, distinct_id in zip(self.people, self.distinct_ids)
    ]
    PersonDistinctId.objects.bulk_create(pids)

    if is_clickhouse_enabled():
        from ee.clickhouse.models.person import create_person, create_person_distinct_id

        for person in self.people:
            create_person(team_id=person.team.pk, properties=person.properties, is_identified=person.is_identified)
        for pid in pids:
            create_person_distinct_id(pid.team.pk, pid.distinct_id, str(pid.person.uuid))  # use dummy number for id
def __call__(self, request: HttpRequest):
    """Install monkey-patch on demand.

    If the monkey-patch has not yet been run for this process (assuming
    multiple preforked processes), do it now.
    """
    from ee.clickhouse import client

    route = resolve(request.path)
    route_id = f"{route.route} ({route.func.__name__})"
    client._request_information = {
        "save": (
            is_clickhouse_enabled()
            and request.user.pk
            and (request.user.is_staff or is_impersonated_session(request) or settings.DEBUG)
        ),
        "user_id": request.user.pk,
        "kind": "request",
        "id": route_id,
    }

    response: HttpResponse = self.get_response(request)

    if "api/" in route_id and "capture" not in route_id:
        incr("http_api_request_response", tags={"id": route_id, "status_code": response.status_code})

    client._request_information = None

    return response
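# A minimal sketch of how a __call__-style middleware like the one above is hooked
# into Django; the class name and settings path here are hypothetical, only
# get_response is implied by the source.
class QueryContextMiddleware:
    def __init__(self, get_response):
        self.get_response = get_response  # Django hands us the next handler in the chain

    # __call__ as defined above

# In settings (hypothetical module path):
# MIDDLEWARE = [
#     ...,
#     "posthog.middleware.QueryContextMiddleware",
# ]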