Example #1
def run_export(project_id: str, dry_run: bool, state_codes: List[str],
               target_dataset: str) -> None:
    """Exports rows for the given state codes from every table in the state
    dataset of the given project into same-named tables in the target dataset.

    Args:
        project_id: GCP project that owns both datasets (used for logging).
        dry_run: If True, only log what would be exported; copy nothing.
        state_codes: State codes whose rows should be included in the export.
        target_dataset: Destination dataset id; tables are skipped when falsy.

    Raises:
        ValueError: If the state base dataset does not exist.
    """
    bq_client = BigQueryClientImpl()
    source_dataset_ref = bq_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not bq_client.dataset_exists(source_dataset_ref):
        raise ValueError(f'Dataset {source_dataset_ref.dataset_id} does not exist')

    for table in bq_client.list_tables(source_dataset_ref.dataset_id):
        logging.info("******************************")
        query = state_table_export_query_str(table, state_codes)
        # The generated query is logged even when empty so each table's
        # outcome is visible in the log stream.
        logging.info(query)

        # Guard clauses: nothing to do without a query or a destination.
        if not query:
            continue
        if not target_dataset:
            continue

        if dry_run:
            logging.info(
                "[DRY RUN] Exporting to target project.dataset.table [%s.%s.%s]",
                project_id, target_dataset, table.table_id)
        else:
            logging.info(
                "Exporting to target project.dataset.table [%s.%s.%s]",
                project_id, target_dataset, table.table_id)
            copy_table_to_dataset(target_dataset, table.table_id,
                                  query, bq_client)
Example #2
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None:
    """Exports rows for the given state code from every table in the state
    dataset into CSV files named after the tables in the given GCS bucket.

    Args:
        dry_run: If True, only log the export configurations; export nothing.
        state_code: State whose rows should be exported (case-insensitive).
        target_bucket: GCS bucket that will receive the CSV files.

    Raises:
        ValueError: If the state base dataset does not exist.
    """
    today = datetime.date.today()

    bq_client = BigQueryClientImpl()
    dataset_ref = bq_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not bq_client.dataset_exists(dataset_ref):
        raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist")

    export_configs = []
    for table in bq_client.list_tables(dataset_ref.dataset_id):
        logging.info("******************************")
        export_query = state_table_export_query_str(table,
                                                    [state_code.upper()])
        # The generated query is logged even when empty so each table's
        # outcome is visible in the log stream.
        logging.info(export_query)

        if not export_query:
            continue

        # Destination path: <bucket>/<date dir>/<table>_<date>_export.csv
        export_dir = gcs_export_directory(target_bucket, today,
                                          state_code.lower())
        export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv"
        output_path = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)

        config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id="export_temporary_tables",
            intermediate_table_name=
            f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}",
            output_uri=output_path.uri(),
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(config)
        prefix = "[DRY RUN] " if dry_run else ""
        logging.info(
            prefix + "Created export configuration to export table to GCS: %s",
            config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
        return

    logging.info("Exporting [%d] tables to GCS", len(export_configs))
    bq_client.export_query_results_to_cloud_storage(
        export_configs, print_header=True)
    def test_state_table_export_query_str(self) -> None:
        """The query for a regular state table selects all its columns and
        filters rows by the upper-cased state code."""
        table_item = bigquery.table.TableListItem({
            "tableReference": {
                "projectId": "recidiviz-456",
                "datasetId": "state",
                "tableId": "state_person_race"
            }
        })

        actual_query = state_table_export_query_str(table_item,
                                                    state_codes=['us_pa'])

        expected_query = (
            "SELECT state_person_race.state_code,state_person_race.race,state_person_race.race_raw_text,"
            "state_person_race.person_race_id,state_person_race.person_id "
            "FROM `recidiviz-456.state.state_person_race` state_person_race "
            "WHERE state_code IN ('US_PA');")
        self.assertEqual(actual_query, expected_query)
    def test_state_table_export_query_str_association_table(self) -> None:
        """For an association table the query derives state_code via an
        aliased column rather than a plain column reference."""
        table_item = bigquery.table.TableListItem({
            "tableReference": {
                "projectId":
                "recidiviz-456",
                "datasetId":
                "state",
                "tableId":
                "state_incarceration_sentence_supervision_period_association"
            }
        })

        actual_query = state_table_export_query_str(table_item,
                                                    state_codes=['us_pa'])

        expected_query = (
            "SELECT state_incarceration_sentence_supervision_period_association.incarceration_sentence_id,"
            "state_incarceration_sentence_supervision_period_association.supervision_period_id,"
            "state_incarceration_sentence_supervision_period_association.state_code AS state_code "
            "FROM `recidiviz-456.state.state_incarceration_sentence_supervision_period_association` "
            "state_incarceration_sentence_supervision_period_association "
            "WHERE state_code IN ('US_PA');")
        self.assertEqual(actual_query, expected_query)