def _decommission_dataflow_metric_table(bq_client: BigQueryClientImpl,
                                        table_ref: TableListItem) -> None:
    """Decommissions a deprecated Dataflow metric table. Moves all remaining rows
    to cold storage and deletes the table in the DATAFLOW_METRICS_DATASET."""
    logging.info("Decommissioning Dataflow metric table: [%s]",
                 table_ref.table_id)

    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = dataflow_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET
    table_id = table_ref.table_id

    # Move all rows in the table to cold storage
    insert_query = (
        """SELECT * FROM `{project_id}.{dataflow_metrics_dataset}.{table_id}`"""
        .format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=dataflow_metrics_dataset,
            table_id=table_id,
        ))

    insert_job = bq_client.insert_into_table_from_query(
        destination_dataset_id=cold_storage_dataset,
        destination_table_id=table_id,
        query=insert_query,
        allow_field_additions=True,
        write_disposition=WriteDisposition.WRITE_APPEND,
    )
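
    # allow_field_additions above is assumed to let the cold storage table's schema
    # pick up any columns from the deprecated table that it does not already have,
    # since the two schemas may have drifted apart over time.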

    # Wait for the insert job to complete before deleting the table
    insert_job.result()

    bq_client.delete_table(dataset_id=dataflow_metrics_dataset,
                           table_id=table_id)


def load_from_temp_to_permanent_table(bq_client: BigQueryClientImpl,
                                      project_id: str) -> None:
    """Query temporary table and persist view to permanent table"""
    num_rows_before = bq_client.get_table(
        dataset_ref=bigquery.DatasetReference(
            project=project_id,
            dataset_id=DATASET_ID,
        ),
        table_id=FINAL_DESTINATION_TABLE,
    ).num_rows

    insert_job = bq_client.insert_into_table_from_query(
        destination_dataset_id=DATASET_ID,
        destination_table_id=FINAL_DESTINATION_TABLE,
        query=INSERT_QUERY_TEMPLATE.format(
            project_id=project_id,
            dataset_id=DATASET_ID,
            temp_table=TEMP_DESTINATION_TABLE,
            final_table=FINAL_DESTINATION_TABLE,
        ),
        write_disposition=WriteDisposition.WRITE_APPEND,
    )
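
    # INSERT_QUERY_TEMPLATE is defined elsewhere in the module; it is assumed to select
    # only the rows in {temp_table} that are not already present in {final_table}, which
    # is why the appended rows are described as "non-duplicate" in the log message below.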

    insert_job.result()

    # Compare the table's row count before and after the append to report how many
    # new rows were actually loaded.
    num_rows_after = bq_client.get_table(
        dataset_ref=bigquery.DatasetReference(project=project_id, dataset_id=DATASET_ID),
        table_id=FINAL_DESTINATION_TABLE,
    ).num_rows

    logging.info(
        "Loaded [%d] non-duplicate rows into table [%s]",
        num_rows_after - num_rows_before,
        FINAL_DESTINATION_TABLE,
    )

    bq_client.delete_table(dataset_id=DATASET_ID,
                           table_id=TEMP_DESTINATION_TABLE)


def main(dry_run: bool) -> None:
    """Moves all metrics with a PERSON methodology or a metric_period_months value greater than 1 to cold storage."""
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        logging.info("Migrating data to cold storage for table [%s]", table_id)

        filter_clause = "WHERE methodology = 'PERSON'"

        # Every metric except this one has a metric_period_months column
        if table_id != DATAFLOW_METRICS_TO_TABLES[
                ReincarcerationRecidivismRateMetric]:
            filter_clause += " OR metric_period_months != 1"
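        # For most tables the resulting filter is
        #   "WHERE methodology = 'PERSON' OR metric_period_months != 1";
        # for the recidivism rate table it is just "WHERE methodology = 'PERSON'".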

        # Query for rows to be moved to the cold storage table
        insert_query = """
            SELECT * FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {filter_clause}
        """.format(
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            filter_clause=filter_clause,
        )

        if dry_run:
            logging.info(
                "[DRY RUN] Would insert rows into [%s].[%s] from [%s].[%s] that match this query: %s",
                cold_storage_dataset,
                table_id,
                dataflow_metrics_dataset,
                table_id,
                insert_query,
            )
        else:
            # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary
            insert_job = bq_client.insert_into_table_from_query(
                destination_dataset_id=cold_storage_dataset,
                destination_table_id=table_id,
                query=insert_query,
                allow_field_additions=True,
                write_disposition=WriteDisposition.WRITE_APPEND,
            )

            # Wait for the insert job to complete before running the delete job
            insert_job.result()

        if dry_run:
            logging.info(
                "[DRY RUN] Would delete rows from [%s].[%s] %s",
                dataflow_metrics_dataset,
                table_id,
                filter_clause,
            )
        else:
            # Delete these rows from the Dataflow metrics table
            delete_job = bq_client.delete_from_table_async(
                dataflow_metrics_dataset,
                table_id,
                filter_clause=filter_clause)

            # Wait for the delete job to complete before moving on
            delete_job.result()

        logging.info("Done migrating data for table [%s]", table_id)


def move_old_dataflow_metrics_to_cold_storage() -> None:
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset. We only keep the
    MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow metric table at once. All other
    output is moved to cold storage.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        source_data_join_clause = """LEFT JOIN
                          (SELECT DISTINCT job_id AS keep_job_id FROM
                          `{project_id}.{reference_views_dataset}.most_recent_job_id_by_metric_and_state_code_materialized`)
                        ON job_id = keep_job_id
                        LEFT JOIN 
                          (SELECT DISTINCT created_on AS keep_created_date FROM
                          `{project_id}.{dataflow_metrics_dataset}.{table_id}`
                          ORDER BY created_on DESC
                          LIMIT {day_count_limit})
                        ON created_on = keep_created_date
                        """.format(
                            project_id=table_ref.project,
                            dataflow_metrics_dataset=table_ref.dataset_id,
                            reference_views_dataset=REFERENCE_VIEWS_DATASET,
                            table_id=table_id,
                            day_count_limit=MAX_DAYS_IN_DATAFLOW_METRICS_TABLE
                        )
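
        # The first join tags rows produced by the most recent job for each metric and
        # state; the second tags rows from the {day_count_limit} most recent created_on
        # dates in this table. Rows matching neither join are the ones moved below.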

        # Exclude the columns left over from the join above from being added to the
        # metric tables in cold storage
        columns_to_exclude_from_transfer = ['keep_job_id', 'keep_created_date']

        # This filter will return the rows that should be moved to cold storage
        insert_filter_clause = "WHERE keep_job_id IS NULL AND keep_created_date IS NULL"

        # Query for rows to be moved to the cold storage table
        insert_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            {insert_filter_clause}
        """.format(
            columns_to_exclude=', '.join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
            insert_filter_clause=insert_filter_clause
        )

        # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary
        insert_job = bq_client.insert_into_table_from_query(
            destination_dataset_id=cold_storage_dataset,
            destination_table_id=table_id,
            query=insert_query,
            allow_field_additions=True,
            write_disposition=WriteDisposition.WRITE_APPEND)

        # Wait for the insert job to complete before running the replace job
        insert_job.result()

        # This will return the rows that were not moved to cold storage and should remain in the table
        replace_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            WHERE keep_job_id IS NOT NULL OR keep_created_date IS NOT NULL
        """.format(
            columns_to_exclude=', '.join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
        )

        # Replace the Dataflow table with only the rows that should remain
        replace_job = bq_client.create_table_from_query_async(
            dataflow_metrics_dataset,
            table_ref.table_id,
            query=replace_query,
            overwrite=True)

        # Wait for the replace job to complete before moving on
        replace_job.result()


def move_old_dataflow_metrics_to_cold_storage() -> None:
    """Moves old output in Dataflow metrics tables to tables in a cold storage dataset.
    We only keep the MAX_DAYS_IN_DATAFLOW_METRICS_TABLE days worth of data in a Dataflow
    metric table at once. All other output is moved to cold storage, unless it is the
    most recent job_id for a metric in a state where that metric is regularly calculated,
    and where the year and month of the output falls into the window of what is regularly
    calculated for that metric and state. See the production_calculation_pipeline_templates.yaml
    file for a list of regularly scheduled calculations.

    If a metric has been entirely decommissioned, handles the deletion of the corresponding table.
    """
    bq_client = BigQueryClientImpl()
    dataflow_metrics_dataset = DATAFLOW_METRICS_DATASET
    cold_storage_dataset = dataflow_config.DATAFLOW_METRICS_COLD_STORAGE_DATASET
    dataflow_metrics_tables = bq_client.list_tables(dataflow_metrics_dataset)

    month_range_for_metric_and_state = _get_month_range_for_metric_and_state()

    for table_ref in dataflow_metrics_tables:
        table_id = table_ref.table_id

        if table_id not in dataflow_config.DATAFLOW_TABLES_TO_METRIC_TYPES:
            # This metric has been deprecated. Handle the deletion of the table
            _decommission_dataflow_metric_table(bq_client, table_ref)
            continue

        is_unbounded_date_pipeline = any(
            pipeline in table_id
            for pipeline in dataflow_config.ALWAYS_UNBOUNDED_DATE_PIPELINES)

        # This means there are no currently scheduled pipelines writing metrics to
        # this table with specific month ranges
        no_active_month_range_pipelines = not month_range_for_metric_and_state[
            table_id]
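
        # Tables written by unbounded-date pipelines, and tables with no scheduled
        # month-range pipelines, use the standard join clause with no per-state month
        # limits; all other tables also join against the month limits assembled below.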

        if is_unbounded_date_pipeline or no_active_month_range_pipelines:
            source_data_join_clause = SOURCE_DATA_JOIN_CLAUSE_STANDARD_TEMPLATE.format(
                project_id=table_ref.project,
                dataflow_metrics_dataset=table_ref.dataset_id,
                materialized_metrics_dataset=DATAFLOW_METRICS_MATERIALIZED_DATASET,
                table_id=table_id,
                day_count_limit=dataflow_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE,
            )
        else:
            month_limit_by_state = "\nUNION ALL\n".join([
                f"SELECT '{state_code}' as state_code, {month_limit} as month_limit"
                for state_code, month_limit in
                month_range_for_metric_and_state[table_id].items()
            ])
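            # The generated SQL looks roughly like (state codes and limits are
            # illustrative):
            #   SELECT 'US_XX' as state_code, 36 as month_limit
            #   UNION ALL
            #   SELECT 'US_YY' as state_code, 240 as month_limit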

            source_data_join_clause = (
                SOURCE_DATA_JOIN_CLAUSE_WITH_MONTH_LIMIT_TEMPLATE.format(
                    project_id=table_ref.project,
                    dataflow_metrics_dataset=table_ref.dataset_id,
                    materialized_metrics_dataset=DATAFLOW_METRICS_MATERIALIZED_DATASET,
                    table_id=table_id,
                    day_count_limit=dataflow_config.MAX_DAYS_IN_DATAFLOW_METRICS_TABLE,
                    month_limit_by_state=month_limit_by_state,
                ))

        # Exclude the columns left over from the join above from being added to the
        # metric tables in cold storage
        columns_to_exclude_from_transfer = ["keep_job_id", "keep_created_date"]

        # This filter will return the rows that should be moved to cold storage
        insert_filter_clause = "WHERE keep_job_id IS NULL AND keep_created_date IS NULL"

        # Query for rows to be moved to the cold storage table
        insert_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            {insert_filter_clause}
        """.format(
            columns_to_exclude=", ".join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
            insert_filter_clause=insert_filter_clause,
        )

        # Move data from the Dataflow metrics dataset into the cold storage table, creating the table if necessary
        insert_job = bq_client.insert_into_table_from_query(
            destination_dataset_id=cold_storage_dataset,
            destination_table_id=table_id,
            query=insert_query,
            allow_field_additions=True,
            write_disposition=WriteDisposition.WRITE_APPEND,
        )

        # Wait for the insert job to complete before running the replace job
        insert_job.result()

        # This will return the rows that were not moved to cold storage and should remain in the table
        replace_query = """
            SELECT * EXCEPT({columns_to_exclude}) FROM
            `{project_id}.{dataflow_metrics_dataset}.{table_id}`
            {source_data_join_clause}
            WHERE keep_job_id IS NOT NULL OR keep_created_date IS NOT NULL
        """.format(
            columns_to_exclude=", ".join(columns_to_exclude_from_transfer),
            project_id=table_ref.project,
            dataflow_metrics_dataset=table_ref.dataset_id,
            table_id=table_id,
            source_data_join_clause=source_data_join_clause,
        )

        # Replace the Dataflow table with only the rows that should remain
        replace_job = bq_client.create_table_from_query_async(
            dataflow_metrics_dataset,
            table_ref.table_id,
            query=replace_query,
            overwrite=True,
        )

        # Wait for the replace job to complete before moving on
        replace_job.result()