Example #1
import logging
from string import Template

from google.cloud import bigquery

# `config` and `file_to_string` are assumed to be defined elsewhere in this module.
def execute_transformation_query(bq_client):
    """Executes transformation query to a new destination table.
    Args:
        bq_client: Object representing a reference to a BigQuery Client
    """
    dataset_ref = bq_client.get_dataset(bigquery.DatasetReference(
        project=config.billing_project_id,
        dataset_id=config.output_dataset_id))
    table_ref = dataset_ref.table(config.output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.time_partitioning = bigquery.TimePartitioning(
        field='usage_start_time',
        expiration_ms=None)
    sql = Template(file_to_string(config.sql_file_path))
    sql = sql.safe_substitute(
        billing_table='{}.{}.{}'.format(config.billing_project_id,
                                        config.billing_dataset_id,
                                        config.billing_table_name),
        allocation_method=config.allocation_method)
    logging.info('Attempting query on all dates...')
    # Execute Query
    query_job = bq_client.query(
        sql,
        job_config=job_config)

    query_job.result()  # Waits for the query to finish
    logging.info('Transformation query complete. All partitions are updated.')
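The SQL file above is rendered with Python's built-in string.Template; $billing_table and $allocation_method are the placeholders being filled. A minimal sketch of the substitution step, with hypothetical values standing in for the config module:

from string import Template

sql = Template(
    'SELECT cost FROM `$billing_table` WHERE method = "$allocation_method"')
rendered = sql.safe_substitute(
    billing_table='my-project.billing_ds.gcp_billing_export_v1',  # hypothetical
    allocation_method='proportional')                             # hypothetical
# safe_substitute leaves any unmatched placeholder intact instead of
# raising KeyError, which is why the example uses it over substitute().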
Example #2
import logging
from typing import Dict

from google.cloud import bigquery

# `render_template` is assumed to be defined elsewhere in this module.
def execute_query(bq_client: bigquery.Client, env_vars: Dict[str, str],
                  query_path: object, output_table_name: str,
                  time_partition: bool) -> None:
    """Executes transformation query to a new destination table.
    Args:
        bq_client: bigquery.Client object
        env_vars: Dictionary of key: value, where value is environment variable
        query_path: Object representing location of SQL query to execute
        output_table_name: String representing name of table that holds output
        time_partition: Boolean indicating whether to time-partition output
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(project=bq_client.project,
                                  dataset_id=env_vars['corrected_dataset_id']))
    table_ref = dataset_ref.table(output_table_name)
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    # Time-partitioning the table is only needed for the final output query
    if time_partition:
        job_config.time_partitioning = bigquery.TimePartitioning(
            field='usage_start_time', expiration_ms=None)
    logging.info('Attempting query...')
    # Execute Query
    query_job = bq_client.query(query=render_template(query_path, env_vars),
                                job_config=job_config)

    query_job.result()  # Waits for the query to finish
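render_template is not shown in this example; a minimal sketch of what such a helper might look like, assuming Jinja2-style templating (the helper name comes from the example, the body is an assumption):

import jinja2

def render_template(query_path, env_vars):
    # Assumed helper: read the SQL file and substitute the environment
    # values into its placeholders before execution.
    with open(query_path) as query_file:
        return jinja2.Template(query_file.read()).render(**env_vars)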
Example #3
def csv_in_gcs_to_table(event, context):
    """Background Cloud Function: loads a CSV uploaded to GCS into BigQuery.

    Args:
        event: Dictionary with the event payload (the object name is read
            from event['name']).
        context: Metadata for the event (unused).
    """
    from google.cloud import bigquery

    client = bigquery.Client()

    # The bucket is hardcoded here; the event payload also carries it
    # as event['bucket'].
    bucket_name = "egen-poc-bucket"
    object_name = event['name']
    table_id = 'cparkar-project-310718.egen_poc_dataset.egen_poc_table'

    schema = [
        bigquery.SchemaField('Sno', 'INTEGER'),
        bigquery.SchemaField('State', 'STRING'),
        bigquery.SchemaField('ConfirmedIndianNational', 'INTEGER'),
        bigquery.SchemaField('ConfirmedForeignNational', 'INTEGER'),
        bigquery.SchemaField('Cured', 'INTEGER'),
        bigquery.SchemaField('Deaths', 'INTEGER'),
        bigquery.SchemaField('Confirmed', 'INTEGER')
    ]

    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    job_config.skip_leading_rows = 1

    uri = "gs://{}/{}".format(bucket_name, object_name)

    load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()
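This function is written as a background Cloud Function for GCS object-finalize events; only event['name'] is read from the payload. A hypothetical local invocation that mimics such an event:

# Hypothetical payload: a GCS finalize event carries the object name
# under the 'name' key. The file name is a placeholder.
fake_event = {'name': 'covid_19_india.csv'}
csv_in_gcs_to_table(fake_event, context=None)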
Example #4
import logging

from google.cloud import bigquery

# `config` and `file_to_string` are assumed to be defined elsewhere in this module.
def execute_query(bq_client):
    """Executes transformation query to a new destination table.
    Args:
        bq_client: Object representing a reference to a BigQuery Client
    """
    dataset_ref = bq_client.get_dataset(
        bigquery.DatasetReference(
            project=config.config_vars['project_id'],
            dataset_id=config.config_vars['output_dataset_id']))
    table_ref = dataset_ref.table(config.config_vars['output_table_name'])
    job_config = bigquery.QueryJobConfig()
    job_config.destination = table_ref
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    sql = file_to_string(config.config_vars['sql_file_path'])
    logging.info('Attempting query on all dates...')
    # Execute Query
    query_job = bq_client.query(sql, job_config=job_config)

    query_job.result()  # Waits for the query to finish
    logging.info('Query complete. The table is updated.')
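file_to_string is not shown here; presumably it just reads the SQL file into memory. A minimal sketch of such a helper (the name comes from the example, the body is an assumption):

def file_to_string(sql_path):
    # Assumed helper: return the contents of the file at sql_path as one string.
    with open(sql_path, 'r') as sql_file:
        return sql_file.read()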
Example #5
from typing import List

from google.cloud import bigquery

def csv_in_gcs_to_table(bucket_name: str, object_name: str, dataset_id: str,
                        table_id: str,
                        schema: List[bigquery.SchemaField]) -> None:
    """Upload CSV to BigQuery table.
        If the table already exists, it overwrites the table data.

    Args:
        bucket_name: Bucket name for holding the object
        object_name: Name of object to be uploaded
        dataset_id: Dataset id where the table is located.
        table_id: String holding id of hte table.
        schema: Schema of the table_id
    """
    client = bigquery.Client()
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    uri = "gs://{}/{}".format(bucket_name, object_name)
    load_job = client.load_table_from_uri(uri,
                                          dataset_ref.table(table_id),
                                          job_config=job_config)
    load_job.result()
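A hypothetical call, with placeholder bucket, dataset, and table names and a two-column schema:

from google.cloud import bigquery

csv_in_gcs_to_table(
    bucket_name='my-bucket',          # placeholder
    object_name='exports/users.csv',  # placeholder
    dataset_id='analytics',           # placeholder
    table_id='users',                 # placeholder
    schema=[
        bigquery.SchemaField('id', 'INTEGER'),
        bigquery.SchemaField('name', 'STRING'),
    ])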