Example #1
0
    def _generate_query_with_params(
            ingest_view: DirectIngestPreProcessedIngestView,
            ingest_view_export_args: GcsfsIngestViewExportArgs
    ) -> Tuple[str, List[bigquery.ScalarQueryParameter]]:
        """Generates a date bounded query that represents the data that has changed for this view between the specified
        date bounds in the provided export args.

        If there is no lower bound, this produces a query for a historical query up to the upper bound date. Otherwise,
        it diffs two historical queries to produce a delta query, using the SQL 'EXCEPT DISTINCT' function.

        Returns the query, along with the query params that must be passed to the BigQuery query job.
        """

        query_params = [
            bigquery.ScalarQueryParameter(UPPER_BOUND_TIMESTAMP_PARAM_NAME,
                                          bigquery.enums.SqlTypeNames.DATETIME.value,
                                          ingest_view_export_args.upper_bound_datetime_to_export)
        ]
        query = ingest_view.date_parametrized_view_query(UPPER_BOUND_TIMESTAMP_PARAM_NAME)
        if ingest_view_export_args.upper_bound_datetime_prev:
            query_params.append(
                bigquery.ScalarQueryParameter(LOWER_BOUND_TIMESTAMP_PARAM_NAME,
                                              bigquery.enums.SqlTypeNames.DATETIME.value,
                                              ingest_view_export_args.upper_bound_datetime_prev)
            )
            query = query.rstrip().rstrip(';')
            filter_query = \
                ingest_view.date_parametrized_view_query(LOWER_BOUND_TIMESTAMP_PARAM_NAME).rstrip().rstrip(';')
            query = f'(\n{query}\n) EXCEPT DISTINCT (\n{filter_query}\n)'

        return query, query_params
    def _debug_generate_unified_query(
            ingest_view: DirectIngestPreProcessedIngestView,
            ingest_view_export_args: GcsfsIngestViewExportArgs
    ) -> Tuple[str, List[bigquery.ScalarQueryParameter]]:
        """Generates a single query that is date bounded such that it represents the data that has changed for this view
        between the specified date bounds in the provided export args.

        If there is no lower bound, this produces a query for a historical query up to the upper bound date. Otherwise,
        it diffs two historical queries to produce a delta query, using the SQL 'EXCEPT DISTINCT' function.

        Important Note: This query is meant for debug use only. In the actual DirectIngest flow, query results for
        individual dates are persisted into temporary tables, and those temporary tables are then diff'd using SQL's
        `EXCEPT DISTINCT` function.
        """

        query_params = [
            bigquery.ScalarQueryParameter(UPPER_BOUND_TIMESTAMP_PARAM_NAME,
                                          bigquery.enums.SqlTypeNames.DATETIME.value,
                                          ingest_view_export_args.upper_bound_datetime_to_export)
        ]
        query = ingest_view.date_parametrized_view_query(UPPER_BOUND_TIMESTAMP_PARAM_NAME)
        if ingest_view_export_args.upper_bound_datetime_prev:
            query_params.append(
                bigquery.ScalarQueryParameter(LOWER_BOUND_TIMESTAMP_PARAM_NAME,
                                              bigquery.enums.SqlTypeNames.DATETIME.value,
                                              ingest_view_export_args.upper_bound_datetime_prev)
            )
            query = DirectIngestIngestViewExportManager.create_date_diff_query(
                upper_bound_query=query,
                upper_bound_prev_query=ingest_view.date_parametrized_view_query(LOWER_BOUND_TIMESTAMP_PARAM_NAME),
                do_reverse_date_diff=ingest_view.do_reverse_date_diff)
            query = DirectIngestPreProcessedIngestView.add_order_by_suffix(
                query=query, order_by_cols=ingest_view.order_by_cols)
        return query, query_params
Example #3
0
    def test_direct_ingest_preprocessed_view_with_reference_table(self):
        region_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN `{{project_id}}.reference_tables.my_table`
USING (col1);"""

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols='col1, col2')

        self.assertEqual(
            ['file_tag_first'],
            [c.file_tag for c in view.raw_table_dependency_configs])

        expected_view_query = """WITH
file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN `recidiviz-456.reference_tables.my_table`
USING (col1) 
ORDER BY col1, col2;"""

        self.assertEqual(expected_view_query, view.view_query)

        expected_date_parametrized_view_query = """WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE 
            update_datetime <= @my_param
    )

    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN `recidiviz-456.reference_tables.my_table`
USING (col1) 
ORDER BY col1, col2;"""

        self.assertEqual(expected_date_parametrized_view_query,
                         view.date_parametrized_view_query('my_param'))
    def test_direct_ingest_preprocessed_view_other_materialized_subquery(self) -> None:
        region_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath('us_xx_raw_data_files.yaml'),
        )

        view_query_template = """
CREATE TEMP TABLE my_subquery AS (SELECT * FROM {file_tag_first});
SELECT * FROM my_subquery;"""

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols='col1, col2',
            is_detect_row_deletion_view=False,
            primary_key_tables_for_entity_deletion=[],
            materialize_raw_data_table_views=True
        )
        expected_view_query = """CREATE TEMP TABLE file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
);

CREATE TEMP TABLE my_subquery AS (SELECT * FROM file_tag_first_generated_view);
SELECT * FROM my_subquery 
ORDER BY col1, col2;"""

        self.assertEqual(expected_view_query, view.view_query)

        expected_parametrized_view_query = """CREATE TEMP TABLE file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE 
            update_datetime <= @my_update_timestamp_param_name
    )

    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
);

CREATE TEMP TABLE my_subquery AS (SELECT * FROM file_tag_first_generated_view);
SELECT * FROM my_subquery 
ORDER BY col1, col2;"""

        self.assertEqual(expected_parametrized_view_query,
                         view.date_parametrized_view_query('my_update_timestamp_param_name'))
 def _generate_query_and_params_for_date(
         ingest_view: DirectIngestPreProcessedIngestView,
         update_timestamp: datetime.datetime
 ) -> Tuple[str, List[bigquery.ScalarQueryParameter]]:
     """Generates a single query for the provided |ingest view| that is date bounded by |update_timestamp|."""
     query_params = [
         bigquery.ScalarQueryParameter(UPDATE_TIMESTAMP_PARAM_NAME,
                                       bigquery.enums.SqlTypeNames.DATETIME.value,
                                       update_timestamp)
     ]
     query = ingest_view.date_parametrized_view_query(UPDATE_TIMESTAMP_PARAM_NAME)
     logging.info('Generated bound query with params \nquery: [%s]\nparams: [%s]', query, query_params)
     return query, query_params
    def test_direct_ingest_preprocessed_view_with_subqueries(self):
        region_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath(
                'us_xx_raw_data_files.yaml'),
        )

        view_query_template = """WITH
foo AS (SELECT * FROM bar)
SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {file_tag_second}
USING (col1);"""

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config)

        self.assertEqual(
            ['file_tag_first', 'file_tag_second'],
            [c.file_tag for c in view.raw_table_dependency_configs])

        expected_view_query = """WITH
file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
),
file_tag_second_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_second_latest`
),
foo AS (SELECT * FROM bar)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN file_tag_second_generated_view
USING (col1);"""

        self.assertEqual(expected_view_query, view.view_query)

        expected_parametrized_view_query = """WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE 
            update_datetime <= @update_timestamp
    )

    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
file_tag_second_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY col_name_2a
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.file_tag_second`
        WHERE 
            update_datetime <= @update_timestamp
    )

    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
foo AS (SELECT * FROM bar)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN file_tag_second_generated_view
USING (col1);"""

        self.assertEqual(expected_parametrized_view_query,
                         view.date_parametrized_view_query())

        # Also check that appending whitespace before the WITH prefix produces the same results
        view_query_template = '\n ' + view_query_template

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config)

        self.assertEqual(
            ['file_tag_first', 'file_tag_second'],
            [c.file_tag for c in view.raw_table_dependency_configs])

        self.assertEqual(expected_view_query, view.view_query)
        self.assertEqual(expected_parametrized_view_query,
                         view.date_parametrized_view_query())
    def test_direct_ingest_preprocessed_view_detect_row_deletion(self) -> None:
        region_config = DirectIngestRegionRawFileConfig(
            region_code='us_xx',
            yaml_config_file_path=fixtures.as_filepath('us_xx_raw_data_files.yaml'),
        )

        view_query_template = """SELECT * FROM {file_tag_first}
LEFT OUTER JOIN {tagFullHistoricalExport}
USING (col1);"""

        view = DirectIngestPreProcessedIngestView(
            ingest_view_name='ingest_view_tag',
            view_query_template=view_query_template,
            region_raw_table_config=region_config,
            order_by_cols='col1, col2',
            is_detect_row_deletion_view=True,
            primary_key_tables_for_entity_deletion=['tagFullHistoricalExport'],
        )

        self.assertEqual(['file_tag_first', 'tagFullHistoricalExport'],
                         [c.file_tag for c in view.raw_table_dependency_configs])

        expected_view_query = """WITH
file_tag_first_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.file_tag_first_latest`
),
tagFullHistoricalExport_generated_view AS (
    SELECT * FROM `recidiviz-456.us_xx_raw_data_up_to_date_views.tagFullHistoricalExport_latest`
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN tagFullHistoricalExport_generated_view
USING (col1) 
ORDER BY col1, col2;"""

        self.assertEqual(expected_view_query, view.view_query)

        expected_parametrized_view_query = """WITH
file_tag_first_generated_view AS (
    WITH rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY col_name_1a, col_name_1b
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.file_tag_first`
        WHERE 
            update_datetime <= @my_update_timestamp_param_name
    )

    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
),
tagFullHistoricalExport_generated_view AS (
    WITH max_update_datetime AS (
        SELECT 
            MAX(update_datetime) AS update_datetime
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE 
            update_datetime <= @my_update_timestamp_param_name
    ),
    max_file_id AS (
        SELECT
            MAX(file_id) AS file_id
        FROM
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE 
            update_datetime = (SELECT update_datetime FROM max_update_datetime)
    ),
    rows_with_recency_rank AS (
        SELECT 
            * EXCEPT (file_id, update_datetime), 
            ROW_NUMBER() OVER (PARTITION BY COL_1
                               ORDER BY update_datetime DESC) AS recency_rank
        FROM 
            `recidiviz-456.us_xx_raw_data.tagFullHistoricalExport`
        WHERE 
            file_id = (SELECT file_id FROM max_file_id)
    )
    SELECT * 
    EXCEPT (recency_rank)
    FROM rows_with_recency_rank
    WHERE recency_rank = 1
)
SELECT * FROM file_tag_first_generated_view
LEFT OUTER JOIN tagFullHistoricalExport_generated_view
USING (col1) 
ORDER BY col1, col2;"""

        self.assertEqual(expected_parametrized_view_query,
                         view.date_parametrized_view_query('my_update_timestamp_param_name'))