コード例 #1
0
ファイル: test_columns.py プロジェクト: PostHog/posthog
    def test_column_types(self):
        """Check what ``_get_column_types`` reports for ``mat_myprop`` at each stage.

        The column is expected to be reported as ``MATERIALIZED`` right after
        ``materialize``, as ``DEFAULT`` after ``backfill_materialized_columns``,
        and as ``MATERIALIZED`` again once ``mark_all_materialized`` has run.
        The extraction expression is expected to stay the same throughout.
        """
        expected_expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), '^\"|\"$', '')"

        def assert_column_kind(kind):
            # Compare the (kind, expression) pair against the current column state.
            self.assertEqual((kind, expected_expr),
                             self._get_column_types("mat_myprop"))

        materialize("events", "myprop")
        assert_column_kind("MATERIALIZED")

        backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
        assert_column_kind("DEFAULT")

        mark_all_materialized()
        assert_column_kind("MATERIALIZED")
コード例 #2
0
ファイル: analyze.py プロジェクト: PostHog/posthog
def materialize_properties_task(
    columns_to_materialize: Optional[List[Suggestion]] = None,
    time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS,
    maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE,
    min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME,
    backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS,
    dry_run: bool = False,
) -> None:
    """
    Creates materialized columns for event and person properties based off of slow queries

    Args:
        columns_to_materialize: Suggestions to consider, each unpacking to
            ``(table, property_name, cost)``. When ``None`` they are computed
            with ``analyze(get_queries(time_to_analyze_hours, min_query_time))``.
        time_to_analyze_hours: Forwarded to ``get_queries`` when suggestions
            have to be computed.
        maximum: Upper bound on how many of the not-yet-materialized
            suggestions are processed in this run.
        min_query_time: Forwarded to ``get_queries`` when suggestions have to
            be computed.
        backfill_period_days: Length of the backfill window in days; no
            backfill is started unless this is greater than zero.
        dry_run: When true, only log what would happen; neither ``materialize``
            nor the backfill is invoked.
    """
    if columns_to_materialize is None:
        slow_queries = get_queries(time_to_analyze_hours, min_query_time)
        columns_to_materialize = analyze(slow_queries)

    # Drop every suggestion whose property is already materialized on its table.
    pending = []
    for candidate in columns_to_materialize:
        table, property_name, _ = candidate
        if property_name in get_materialized_columns(table):
            continue
        pending.append(candidate)

    if pending:
        logger.info(
            f"Calculated columns that could be materialized. count={len(pending)}"
        )
    else:
        logger.info("Found no columns to materialize.")

    # Property names handled in this run, grouped by table, for the backfill below.
    handled_by_table: Dict[TableWithProperties, List[PropertyName]] = {
        "events": [],
        "person": [],
    }
    for table, property_name, cost in pending[:maximum]:
        logger.info(
            f"Materializing column. table={table}, property_name={property_name}, cost={cost}"
        )

        if not dry_run:
            materialize(table, property_name)
        handled_by_table[table].append(property_name)

    should_backfill = backfill_period_days > 0 and not dry_run
    if not should_backfill:
        return

    logger.info(
        f"Starting backfill for new materialized columns. period_days={backfill_period_days}"
    )
    backfill_window = timedelta(days=backfill_period_days)
    for table in ("events", "person"):
        backfill_materialized_columns(table, handled_by_table[table],
                                      backfill_window)
コード例 #3
0
ファイル: test_columns.py プロジェクト: GalDayan/posthog
    def test_column_types(self):
        """Check what ``_get_column_types`` reports for ``events.mat_myprop`` at each stage.

        The column is expected to be reported as ``MATERIALIZED`` right after
        ``materialize``, as ``DEFAULT`` after ``backfill_materialized_columns``,
        and as ``MATERIALIZED`` again once ``mark_all_materialized`` has run.
        The extraction expression is expected to stay the same throughout.
        """
        # :KLUDGE: ClickHouse replaces our trim(BOTH '"' FROM properties) with this
        expected_expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), concat('^[', regexpQuoteMeta('\"'), ']*|[', regexpQuoteMeta('\"'), ']*$'), '')"

        def assert_column_kind(kind):
            # Compare the (kind, expression) pair against the current column state.
            self.assertEqual((kind, expected_expr),
                             self._get_column_types("events", "mat_myprop"))

        materialize("events", "myprop")
        assert_column_kind("MATERIALIZED")

        backfill_materialized_columns("events", ["myprop"], timedelta(days=50))
        assert_column_kind("DEFAULT")

        mark_all_materialized()
        assert_column_kind("MATERIALIZED")
コード例 #4
0
ファイル: test_columns.py プロジェクト: GalDayan/posthog
    def test_backfilling_data(self):
        """Check that backfilling fills ``mat_prop`` / ``mat_another`` for existing events.

        Six events are created before the columns are materialized and one more
        after the backfill has been started. Once no mutations are reported as
        running (or the polling budget is used up), every event is expected to
        expose its ``prop`` / ``another`` values through the materialized columns.
        """
        # Start from a clean slate in case the columns are left over from another test.
        for drop_statement in (
                "ALTER TABLE events DROP COLUMN IF EXISTS mat_prop",
                "ALTER TABLE events DROP COLUMN IF EXISTS mat_another",
        ):
            sync_execute(drop_statement)

        # Events that exist before the columns are created, in creation order.
        events_before_backfill = [
            {
                "event": "some_event",
                "timestamp": "2020-01-01 00:00:00",
                "properties": {"prop": 1},
            },
            {
                "event": "some_event",
                "timestamp": "2021-05-02 00:00:00",
                "properties": {"prop": 2, "another": 5},
            },
            {
                "event": "some_event",
                "timestamp": "2021-05-03 00:00:00",
                "properties": {"prop": 3},
            },
            # Deliberately created without a ``properties`` argument.
            {
                "event": "another_event",
                "timestamp": "2021-05-04 00:00:00",
            },
            {
                "event": "third_event",
                "timestamp": "2021-05-05 00:00:00",
                "properties": {"prop": 4},
            },
            {
                "event": "fourth_event",
                "timestamp": "2021-05-06 00:00:00",
                "properties": {"another": 6},
            },
        ]
        for event_kwargs in events_before_backfill:
            _create_event(distinct_id="1", team=self.team, **event_kwargs)

        materialize("events", "prop")
        materialize("events", "another")

        # Before any backfill the helper is expected to count zero rows per column.
        for column in ("mat_prop", "mat_another"):
            self.assertEqual(self._count_materialized_rows(column), 0)

        with freeze_time("2021-05-10T14:00:01Z"):
            # NOTE(review): mutations_sync=0 presumably makes the backfill
            # mutation asynchronous, hence the polling loop below - confirm.
            backfill_materialized_columns(
                "events", ["prop", "another"],
                timedelta(days=50),
                test_settings={"mutations_sync": "0"})

        # Created after the backfill was started.
        _create_event(
            distinct_id="1",
            team=self.team,
            event="fifth_event",
            timestamp="2021-05-07 00:00:00",
            properties={"another": 7},
        )

        # Poll until no mutations are reported as running, giving up after 100 sleeps.
        polls = 0
        while self._get_count_of_mutations_running() > 0 and polls < 100:
            sleep(0.1)
            polls += 1

        for column in ("mat_prop", "mat_another"):
            self.assertGreaterEqual(self._count_materialized_rows(column), 4)

        rows = sync_execute(
            "SELECT mat_prop, mat_another FROM events ORDER BY timestamp")
        expected_rows = [
            ("1", ""),
            ("2", "5"),
            ("3", ""),
            ("", ""),
            ("4", ""),
            ("", "6"),
            ("", "7"),
        ]
        self.assertEqual(rows, expected_rows)