Example #1
def table_exists(dataset_ref: bigquery.dataset.DatasetReference,
                 table_id: str) -> bool:
    """Check whether or not a BigQuery Table or View exists in a Dataset."""
    table_ref = dataset_ref.table(table_id)

    try:
        client().get_table(table_ref)
        return True
    except exceptions.NotFound:
        logging.warning("Table [%s] does not exist in dataset [%s]", table_id,
                        str(dataset_ref))
        return False
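
For context, a minimal usage sketch of this helper. The project, dataset, and table IDs below are hypothetical, and the surrounding module's client() helper is assumed to be configured:

import logging

from google.cloud import bigquery

# Hypothetical identifiers for illustration only.
dataset_ref = bigquery.DatasetReference("my-project", "my_dataset")

if table_exists(dataset_ref, "my_table"):
    logging.info("Table is present; safe to query or export.")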
Example #2
def create_or_update_view(dataset_ref: bigquery.dataset.DatasetReference,
                          view: bqview.BigQueryView):
    """Create a View if it does not exist, or update its query if it does.

    Args:
        dataset_ref: The BigQuery dataset to store the view in.
        view: The View to create or update.
    """
    view_ref = dataset_ref.table(view.view_id)
    bq_view = bigquery.Table(view_ref)
    bq_view.view_query = view.view_query

    if table_exists(dataset_ref, view.view_id):
        logging.info("Updating existing view [%s]", str(bq_view))
        client().update_table(bq_view, ['view_query'])
    else:
        logging.info("Creating view %s", str(bq_view))
        client().create_table(bq_view)
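
An alternative sketch of the same upsert pattern that skips the existence pre-check: attempt the create and fall back to an update on a 409 Conflict. The function name is hypothetical; it uses only documented google-cloud-bigquery calls:

from google.api_core import exceptions
from google.cloud import bigquery


def upsert_view(client: bigquery.Client,
                view_ref: bigquery.TableReference,
                query: str) -> bigquery.Table:
    """Create a view, or update only its query if it already exists."""
    view = bigquery.Table(view_ref)
    view.view_query = query
    try:
        return client.create_table(view)
    except exceptions.Conflict:
        # The view already exists; patch just the view_query property.
        return client.update_table(view, ["view_query"])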
Example #3
def export_to_cloud_storage(dataset_ref: bigquery.dataset.DatasetReference,
                            bucket: str, view: bqview.BigQueryView,
                            state_code: str):
    """Exports the table corresponding to the given view to the bucket.

    Extracts the entire table and exports it in JSON format to the given
    bucket in Cloud Storage.

    This is a synchronous function that waits for the query job to complete
    before returning.

    Args:
        dataset_ref: The dataset where the view and table exist.
        bucket: The bucket in Cloud Storage where the export should go.
        view: The view whose corresponding table to export.
        state_code: The state code of the data being exported.
    """
    source_tablename = _table_name_for_view(view, state_code)

    if table_exists(dataset_ref, source_tablename):
        destination_filename = _destination_filename_for_view(view, state_code)
        destination_uri = "gs://{}/{}".format(bucket, destination_filename)

        table_ref = dataset_ref.table(source_tablename)

        job_config = bigquery.ExtractJobConfig()
        job_config.destination_format = \
            bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

        extract_job = client().extract_table(
            table_ref,
            destination_uri,
            # Location must match that of the source table.
            location=LOCATION,
            job_config=job_config)
        # Waits for job to complete
        extract_job.result()
    else:
        logging.error("Table [%s] does not exist in dataset [%s]",
                      source_tablename, str(dataset_ref))
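
As a variation, a hedged sketch of the same extract pattern configured for compressed CSV output instead of newline-delimited JSON (the function and parameter names are illustrative):

from google.cloud import bigquery


def export_table_as_csv(client: bigquery.Client,
                        table_ref: bigquery.TableReference,
                        destination_uri: str) -> None:
    """Synchronously export a table to Cloud Storage as gzipped CSV."""
    job_config = bigquery.ExtractJobConfig()
    job_config.destination_format = bigquery.DestinationFormat.CSV
    job_config.compression = bigquery.Compression.GZIP

    extract_job = client.extract_table(
        table_ref, destination_uri, job_config=job_config)
    extract_job.result()  # Blocks until the export job completes.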
Example #4
def start_table_load(
        dataset_ref: bigquery.dataset.DatasetReference,
        table_name: str, schema_type: SchemaType) -> \
        Optional[Tuple[bigquery.job.LoadJob, bigquery.table.TableReference]]:
    """Loads a table from CSV data in GCS to BigQuery.

    Given a table name, retrieve the export URI and schema from export_config,
    then load the table into BigQuery.

    This starts the job, but does not wait until it completes.

    Tables are created if they do not exist, and overwritten if they do exist.

    Because we are using bigquery.WriteDisposition.WRITE_TRUNCATE, the table's
    data will be completely wiped and overwritten with the contents of the CSV.

    Args:
        dataset_ref: The BigQuery dataset to load the table into. Gets created
            if it does not already exist.
        table_name: Table to import. Table must be defined
            in the export_config.*_TABLES_TO_EXPORT for the given module
        schema_type: The schema of the table being loaded, either
            SchemaType.JAILS or SchemaType.STATE.
    Returns:
        (load_job, table_ref) where load_job is the LoadJob object containing
            job details, and table_ref is the destination TableReference object.
            If the job fails to start, returns None.
    """
    if schema_type == SchemaType.JAILS:
        export_schema = export_config.COUNTY_TABLE_EXPORT_SCHEMA
    elif schema_type == SchemaType.STATE:
        export_schema = export_config.STATE_TABLE_EXPORT_SCHEMA
    else:
        logging.error("Unknown schema type: %s", schema_type)
        return None

    bq_utils.create_dataset_if_necessary(dataset_ref)

    uri = export_config.gcs_export_uri(table_name)
    table_ref = dataset_ref.table(table_name)

    try:
        bq_schema = [
            bigquery.SchemaField(field['name'], field['type'], field['mode'])
            for field in export_schema[table_name]
        ]
    except KeyError:
        logging.exception(
            "Unknown table name '%s'. Is it listed in "
            "the TABLES_TO_EXPORT for the %s module?", schema_type, table_name)
        return None

    job_config = bigquery.LoadJobConfig()
    job_config.schema = bq_schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    load_job = bq_utils.client().load_table_from_uri(uri,
                                                     table_ref,
                                                     job_config=job_config)

    logging.info("Started load job %s for table %s.%s.%s", load_job.job_id,
                 table_ref.project, table_ref.dataset_id, table_ref.table_id)

    return load_job, table_ref
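
Because start_table_load returns before the job finishes, the caller must wait on it separately. A minimal sketch of that follow-up step (the helper name is hypothetical; the error handling is illustrative):

import logging

from google.api_core import exceptions
from google.cloud import bigquery


def wait_for_table_load(load_job: bigquery.job.LoadJob,
                        table_ref: bigquery.table.TableReference) -> bool:
    """Block until the load job completes; return True on success."""
    try:
        load_job.result()  # Raises if the job finished with an error.
    except exceptions.GoogleAPICallError:
        logging.exception("Load job %s for table %s.%s failed",
                          load_job.job_id, table_ref.dataset_id,
                          table_ref.table_id)
        return False
    return True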