def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    project_id = bigquery_client.project
    dataset_id = "test_pybigquery"
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset, exists_ok=True)

    sample_table_id = f"{project_id}.{dataset_id}.sample"
    try:
        # Since the data changes rarely and the tests are mostly read-only,
        # only create the tables if they don't already exist.
        # TODO: Create shared sample data tables in bigquery-public-data that
        # include test values for all data types.
        bigquery_client.get_table(sample_table_id)
    except google.api_core.exceptions.NotFound:
        job1 = load_sample_data(sample_table_id, bigquery_client,
                                bigquery_schema)
        job1.result()

    one_row_table_id = f"{project_id}.{dataset_id}.sample_one_row"
    try:
        bigquery_client.get_table(one_row_table_id)
    except google.api_core.exceptions.NotFound:
        job2 = load_sample_data(
            one_row_table_id,
            bigquery_client,
            bigquery_schema,
            filename="sample_one_row.json",
        )
        job2.result()

    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view, exists_ok=True)
    return dataset_id

def _recreate_table(client: bigquery.Client, table_id: str, schema: list) -> None:
    logger.info(f'Recreating {table_id} table...')
    dataset_ref = client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref, schema=schema)
    client.delete_table(table_ref, not_found_ok=True)
    client.create_table(table)

def load_views(bq: bigquery.Client, dataset: bigquery.Dataset,
               views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)

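# A minimal usage sketch for load_views. The "analytics" dataset, the
# "recent_events" view name, and the SQL below are illustrative
# placeholders, not from the original code. Note that load_views fills
# the {project} and {dataset} placeholders via str.format.
client = bigquery.Client()
dataset = client.get_dataset("analytics")
load_views(client, dataset, {
    "recent_events": "SELECT * FROM `{project}.{dataset}.events` "
                     "WHERE event_date >= CURRENT_DATE() - 7",
})
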
def table_id_us_east1(bigquery_client: bigquery.Client, project_id: str,
                      dataset_id_us_east1: str):
    table_id = prefixer.create_prefix()
    full_table_id = f"{project_id}.{dataset_id_us_east1}.{table_id}"
    table = bigquery.Table(
        full_table_id, schema=[bigquery.SchemaField("string_col", "STRING")])
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(table, not_found_ok=True)

def create_table(
    client: bq.Client,
    dataset_id: str,
    table_id: str,
    schema: List[bq.SchemaField],
):
    table_full_id = get_full_table_name(client, dataset_id, table_id)
    table = bq.Table(table_full_id, schema=schema)
    client.create_table(table)

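# A minimal usage sketch, assuming `bq` is an alias for the
# `google.cloud.bigquery` module and that the module-level helper
# get_full_table_name returns a "project.dataset.table" string. The
# dataset and table names are hypothetical.
from google.cloud import bigquery as bq

client = bq.Client()
create_table(
    client,
    dataset_id="my_dataset",
    table_id="my_table",
    schema=[
        bq.SchemaField("id", "STRING", mode="REQUIRED"),
        bq.SchemaField("created_at", "TIMESTAMP"),
    ],
)
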
def writable_table(bigquery_client: bigquery.Client, project_id: str,
                   random_dataset: bigquery.Dataset):
    full_table_id = (
        f"{project_id}.{random_dataset.dataset_id}"
        f".writable_table_{random.randrange(1_000_000_000)}")
    table = bigquery.Table(full_table_id)
    table.schema = [
        bigquery.SchemaField("field1", "STRING"),
        bigquery.SchemaField("field2", "INTEGER"),
    ]
    bigquery_client.create_table(table)
    yield full_table_id
    bigquery_client.delete_table(full_table_id)

def bigquery_empty_table(
    bigquery_dataset: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
):
    project_id = bigquery_client.project
    # Create new table in its own dataset.
    dataset_id = bigquery_dataset
    table_id = f"{project_id}.{dataset_id}.sample_dml_empty"
    empty_table = bigquery.Table(table_id, schema=bigquery_schema)
    bigquery_client.create_table(empty_table)
    return table_id

def load_bigquery_table_via_bq_apis(bq_client: bigquery.Client, dataset_id,
                                    table_name, imported_data_info, src_uris):
    """Load tables using BigQuery load jobs, using the same configuration
    as the BQ DTS ImportedDataInfo.

    :return: the BigQuery load job
    """
    # https://googlecloudplatform.github.io/google-cloud-python/latest/_modules/google/cloud/bigquery/client.html#Client.load_table_from_uri
    # Step 1 - Translate required fields for BigQuery Python SDK
    tgt_tabledef = imported_data_info['table_defs'][0]

    # Step 2 - Create target table if it doesn't exist
    dataset_ref = bq_client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_name)
    try:
        bq_client.get_table(table_ref)
    except exceptions.NotFound:
        # Step 2a - Attach schema
        tgt_schema = RPCRecordSchema_to_GCloudSchema(tgt_tabledef['schema'])
        tgt_table = bigquery.Table(table_ref, schema=tgt_schema)

        # Step 2b - Attach description
        tgt_table.description = imported_data_info[
            'destination_table_description']

        # Step 2c - Conditionally set partitioning type
        if '$' in table_name:
            tgt_table.partitioning_type = 'DAY'
            # Strip the partition decorator ("$YYYYMMDD") from the table ID.
            tgt_table._properties['tableReference']['tableId'], _, _ = \
                table_name.partition('$')

        # Step 2d - Create BigQuery table
        bq_client.create_table(tgt_table)

    # Step 3a - Create BigQuery load job ID
    current_datetime = datetime.datetime.utcnow().isoformat()
    raw_job_id = f'{table_name}_{current_datetime}'
    clean_job_id = BQ_JOB_ID_MATCHER.sub('___', raw_job_id)

    # Step 3b - Create BigQuery job config
    job_config = DTSTableDefinition_to_BQLoadJobConfig(tgt_tabledef)

    # Step 4 - Execute BigQuery load job using Python SDK
    load_job = bq_client.load_table_from_uri(source_uris=src_uris,
                                             destination=table_ref,
                                             job_id=clean_job_id,
                                             job_config=job_config)
    return load_job

def create_external_table_hive_partitioning(
        bq_client: bigquery.Client, dataset: bigquery.Dataset, table_id: str,
        gcs_directory_path: str) -> bigquery.Table:
    """Creates an external table with AUTO hive partitioning in GCS.

    :param bq_client: Client object to bigquery
    :param dataset: dataset object. Check 'get_or_create_dataset' method
    :param table_id: Table to be created
    :param gcs_directory_path: Directory of GCS with the data. For example,
        if you have a structure like this:
        "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
        You should pass: "******"
    :return: the created bigquery.Table
    """
    table = bigquery.Table(dataset.table(table_id))

    external_config = bigquery.ExternalConfig(bigquery.SourceFormat.PARQUET)
    external_config.source_uris = [f"{gcs_directory_path}/*"]

    hive_part_opt = HivePartitioningOptions()
    hive_part_opt.mode = "AUTO"
    hive_part_opt.source_uri_prefix = gcs_directory_path

    external_config.hive_partitioning = hive_part_opt
    table.external_data_configuration = external_config
    table = bq_client.create_table(table, exists_ok=True)
    return table

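# A minimal usage sketch, assuming a Parquet layout such as
# gs://my-bucket/images_metadata/source_id=abc/date=2018-02-20/part-0.parquet.
# The bucket, dataset, and table names are illustrative placeholders.
client = bigquery.Client()
dataset = client.get_dataset("my_dataset")
table = create_external_table_hive_partitioning(
    client, dataset, "images_metadata", "gs://my-bucket/images_metadata")
# With AUTO mode, the hive partition keys become queryable columns:
rows = client.query(
    "SELECT COUNT(*) AS n FROM `my_dataset.images_metadata` "
    "WHERE date = '2018-02-20'").result()
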
def dest_partitioned_table(request, bq: bigquery.Client, mock_env,
                           dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table

def get_or_create_table(client: bigquery.Client, dataset_id: str,
                        table_id: str) -> bigquery.Table:
    """Create the BigQuery dataset and table, or fetch them if they already exist."""
    logging.info(f'Creating dataset {dataset_id} if not exists...')
    dataset = client.create_dataset(dataset_id, exists_ok=True)  # Create or fetch the dataset.

    logging.info(f'Creating table {dataset_id}.{table_id} if not exists...')
    table_ref = dataset.table(table_id)
    return client.create_table(  # Create or fetch the table.
        bigquery.Table(table_ref, schema=[
            bigquery.SchemaField('id', 'string', description='Tweet ID'),
            bigquery.SchemaField('lang', 'string', description='Tweet language'),
            bigquery.SchemaField('screen_name', 'string', description='Screen name of the user'),
            bigquery.SchemaField('text', 'string', description='Tweet body text'),
            bigquery.SchemaField('created_at', 'timestamp', description='Time the tweet was created'),
        ]),
        exists_ok=True)

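# A minimal usage sketch: create (or fetch) the table, then stream one row
# into it. The 'twitter' dataset, 'tweets' table, and row values are
# made-up placeholders.
client = bigquery.Client()
table = get_or_create_table(client, 'twitter', 'tweets')
errors = client.insert_rows_json(table, [{
    'id': '1234567890',
    'lang': 'en',
    'screen_name': 'example_user',
    'text': 'Hello, BigQuery!',
    'created_at': '2021-01-01T00:00:00Z',
}])
assert not errors, errors
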
def dest_partitioned_table_allow_jagged(bq: bigquery.Client, dest_dataset,
                                        monkeypatch) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    if os.getenv('GCP_PROJECT') is None:
        monkeypatch.setenv("GCP_PROJECT", bq.project)

    extra_field_for_jagged_row_test = bigquery.schema.SchemaField(
        "extra_jagged_row_test_column", "STRING")
    schema.append(extra_field_for_jagged_row_test)

    table: bigquery.Table = bigquery.Table(
        f"{os.getenv('GCP_PROJECT')}"
        f".{dest_dataset.dataset_id}.cf_test_nyc_311_"
        f"{str(uuid.uuid4()).replace('-', '_')}",
        schema=schema,
    )

    table.time_partitioning = bigquery.TimePartitioning()
    table.time_partitioning.type_ = bigquery.TimePartitioningType.HOUR
    table.time_partitioning.field = "created_date"

    table = bq.create_table(table)
    return table

def create_view_task():
    client = Client()

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name

def create_tables(client: bigquery.Client, tableSchemas: dict) -> dict:
    """Create empty BigQuery tables for the given partial Table schemas.

    Returns a dict of `{ftId: bqId}` to the caller.
    """
    ds = create_dataset(client, f'FusionTable_Autoimport_{datetime.now()}')

    def _create_field_schema(col_schema: dict) -> bigquery.SchemaField:
        """Create a SchemaField from the dict."""
        name = to_safe_name(col_schema['name'])
        return bigquery.SchemaField(name, col_schema.get('type'),
                                    col_schema.get('mode', 'NULLABLE'),
                                    col_schema.get('description', ''))

    def _table_from_ft(ft_schema: dict) -> bigquery.Table:
        """Create a local representation of a BigQuery table."""
        # A "TableSchema" is just a sequence of SchemaFields:
        # https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.table.Table.html
        schema = list(map(_create_field_schema, ft_schema['columns']))
        table = bigquery.Table(
            bigquery.TableReference(ds, to_safe_name(ft_schema['name'])),
            schema)
        table.description = ft_schema.get('description', '')
        return table

    return {
        ftId: client.create_table(_table_from_ft(ftSchema))
        for (ftId, ftSchema) in tableSchemas.items()
    }

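# A minimal usage sketch, assuming the same module's create_dataset and
# to_safe_name helpers are available. The Fusion Table ID and column
# definitions below are made-up placeholders.
client = bigquery.Client()
mapping = create_tables(client, {
    'ft_abc123': {
        'name': 'My Fusion Table',
        'description': 'Imported from Fusion Tables',
        'columns': [
            {'name': 'city', 'type': 'STRING'},
            {'name': 'population', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        ],
    },
})
# mapping: {'ft_abc123': <the created bigquery.Table>}
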
def create_view_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))

    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name

def bigquery_empty_table(
    bigquery_dataset: str,
    bigquery_dml_dataset: str,
    bigquery_client: bigquery.Client,
    bigquery_schema: List[bigquery.SchemaField],
):
    project_id = bigquery_client.project

    # Cleanup the sample_dml table, if it exists.
    old_table_id = f"{project_id}.{bigquery_dataset}.sample_dml"
    bigquery_client.delete_table(old_table_id, not_found_ok=True)

    # Create new table in its own dataset.
    dataset_id = bigquery_dml_dataset
    table_id = f"{project_id}.{dataset_id}.sample_dml_{temp_suffix()}"
    empty_table = bigquery.Table(table_id, schema=bigquery_schema)
    bigquery_client.create_table(empty_table)
    yield table_id
    bigquery_client.delete_table(empty_table)

def test_list_rows_empty_table(bigquery_client: bigquery.Client, table_id: str):
    from google.cloud.bigquery.table import RowIterator

    table = bigquery_client.create_table(table_id)

    # It's a bit silly to list rows for an empty table, but this does
    # happen as the result of a DDL query from an IPython magic command.
    rows = bigquery_client.list_rows(table)
    assert isinstance(rows, RowIterator)
    assert tuple(rows) == ()

def bigquery_dataset(bigquery_client: bigquery.Client,
                     bigquery_schema: List[bigquery.SchemaField]):
    project_id = bigquery_client.project
    dataset_id = prefixer.create_prefix()
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset = bigquery_client.create_dataset(dataset)

    sample_table_id = f"{project_id}.{dataset_id}.sample"
    job1 = load_sample_data(sample_table_id, bigquery_client, bigquery_schema)
    job1.result()

    one_row_table_id = f"{project_id}.{dataset_id}.sample_one_row"
    job2 = load_sample_data(
        one_row_table_id,
        bigquery_client,
        bigquery_schema,
        filename="sample_one_row.json",
    )
    job2.result()

    view = bigquery.Table(f"{project_id}.{dataset_id}.sample_view")
    view.view_query = f"SELECT string FROM `{dataset_id}.sample`"
    bigquery_client.create_table(view)

    yield dataset_id
    bigquery_client.delete_dataset(dataset_id, delete_contents=True)

def create_table(client: bigquery.Client, dataset_id: str, table_id: str,
                 schema: list):
    """Creates a table with the given schema in the specified dataset
    of the client's project.

    Args:
        client: BQ API client
        dataset_id: destination dataset
        table_id: table to be created
        schema: schema of the table to be created

    Examples:
        create_table(client, 'my_dataset', 'my_table', my_schema)
    """
    dataset_ref = client.dataset(dataset_id=dataset_id)
    tables_list = [t.table_id for t in list(client.list_tables(dataset_ref))]
    if table_id in tables_list:
        print("THIS TABLE ALREADY EXISTS IN {}:{}".format(
            client.project, dataset_id))
    else:
        table_ref = dataset_ref.table(table_id)
        client.create_table(bigquery.Table(table_ref, schema))

def create_table(client: bigquery.Client, dataset_id: str, table_id: str,
                 fields: List[Dict]):
    dataset = bigquery.Dataset("{}.{}".format(client.project, dataset_id))
    table_ref = dataset.table(table_id)
    table = bigquery.Table(table_ref, schema=fields)
    table.time_partitioning = bigquery.table.TimePartitioning()  # partition by day
    try:
        table = client.create_table(table)
        print("Created table {}.{}.{}".format(table.project, table.dataset_id,
                                              table.table_id))
    except gexceptions.GoogleAPIError as e:
        print("Table {} could not be created: {}. Skipping...".format(
            table_id, e))

def get_or_create_table(client: bigquery.Client) -> bigquery.Table:
    try:
        dataset = client.get_dataset("sensors")
    except NotFound:
        dataset = client.create_dataset("sensors")

    # The default project ID is not set and hence a fully-qualified ID is required.
    table_ref = bigquery.TableReference(dataset, table_id="particulate_matter")
    try:
        return client.get_table(table_ref)
    except NotFound:
        return client.create_table(
            bigquery.Table(
                table_ref,
                schema=[
                    bigquery.SchemaField("humidity", "NUMERIC",
                                         description="Sensor DHT22 humidity in %"),
                    bigquery.SchemaField("max_micro", "NUMERIC", description=""),
                    bigquery.SchemaField("min_micro", "NUMERIC", description=""),
                    bigquery.SchemaField("samples", "NUMERIC", description=""),
                    bigquery.SchemaField("sds_p1", "NUMERIC",
                                         description="Sensor SDS011 PM10 in µg/m³"),
                    bigquery.SchemaField("sds_p2", "NUMERIC",
                                         description="Sensor SDS011 PM2.5 in µg/m³"),
                    bigquery.SchemaField("signal", "NUMERIC",
                                         description="WiFi signal strength in dBm"),
                    bigquery.SchemaField("temperature", "NUMERIC",
                                         description="Sensor DHT22 temperature in °C"),
                    bigquery.SchemaField("datetime", "DATETIME",
                                         description="Datetime of measurement",
                                         mode="REQUIRED"),
                ],
            ))

def create_bq_table(
    client: bigquery.Client,
    dataset: bigquery.Dataset,
    table_id: str,
    table_schema: List[bigquery.SchemaField],
    table_description: str = None,
) -> bigquery.Table:
    """Create an empty table."""
    # TODO: validate 'table_id'.
    # note: the two-step construction of a 'Table' (a reference from the
    # dataset, then the Table object itself) is not intuitive.
    table = bigquery.Table(dataset.table(table_id),
                           schema=table_schema)  # type: bigquery.Table
    table.description = table_description

    # API request
    return client.create_table(table)  # type: bigquery.Table

def dest_table(request, bq: bigquery.Client, dest_dataset) -> bigquery.Table:
    public_table: bigquery.Table = bq.get_table(
        bigquery.TableReference.from_string(
            "bigquery-public-data.new_york_311.311_service_requests"))
    schema = public_table.schema

    table: bigquery.Table = bigquery.Table(
        f"{os.environ.get('TF_VAR_project_id', 'bqutil')}"
        f".{dest_dataset.dataset_id}.cf_e2e_test_nyc_311_"
        f"{os.getenv('SHORT_SHA', 'manual')}",
        schema=schema,
    )

    table = bq.create_table(table)

    def teardown():
        bq.delete_table(table, not_found_ok=True)

    request.addfinalizer(teardown)
    return table

def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """Stage files from a bucket to a dataset.

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue
        schema = get_table_schema(table_name)
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = Table(fq_hpo_table, schema=schema)
        destination = bq_client.create_table(destination)
        job_config = LoadJobConfig()
        job_config.schema = schema
        job_config.skip_leading_rows = 1
        job_config.source_format = 'CSV'
        source_uri = f'gs://{bucket_name}/{blob.name}'
        load_job = bq_client.load_table_from_uri(
            source_uri,
            destination,
            job_config=job_config,
            job_id_prefix=f"{__file__.split('/')[-1].split('.')[0]}_")
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        load_job.result()
    return load_jobs

def create(
    cls,
    client: bigquery.Client,
    project_id: str,
    dataset_id: str,
    schema: List[bigquery.SchemaField] = None,
    partition: bool = False,
):
    if cls.name is None:
        raise NotImplementedError(
            "class attribute `name` must be set in order to create table")

    bq_table = bigquery.Table(
        _fulltable(project_id, dataset_id, cls.name),
        schema=schema,
    )
    if partition:
        # Only attach day-based partitioning when requested; previously the
        # `partition` flag was ignored.
        bq_table.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=cls.partition_field,
        )
    bq_table = client.create_table(bq_table)
    return cls(client, bq_table=bq_table)

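# A minimal usage sketch, assuming `create` is a classmethod on a base
# class (called BaseTable here) that also defines the `name` and
# `partition_field` class attributes and the `_fulltable` helper. The
# subclass, dataset, and schema below are made-up examples.
class EventsTable(BaseTable):
    name = "events"
    partition_field = "event_ts"

client = bigquery.Client()
events = EventsTable.create(
    client,
    project_id=client.project,
    dataset_id="analytics",
    schema=[
        bigquery.SchemaField("event_ts", "TIMESTAMP", mode="REQUIRED"),
        bigquery.SchemaField("payload", "STRING"),
    ],
    partition=True,
)
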
def apply_schema_differences(
    schema_diffs: _SchemaDiffs,
    bigquery_client: BigQueryClient,
) -> None:
    print("Applying changes...")
    for table_identifier, difference in schema_diffs.items():
        if isinstance(difference, MissingTable):
            print("Creating table...")
            table = Table(
                table_identifier,
                schema=difference.local_table.get_schema_fields(),
            )
            if difference.local_table.time_partitioning:
                table.time_partitioning = difference.local_table.time_partitioning

            remote_table = bigquery_client.create_table(table)
            print(remote_table)
        elif isinstance(difference, ExistingTable):
            difference.remote_table.schema = difference.local_table.get_schema_fields()
            print(bigquery_client.update_table(difference.remote_table,
                                               ["schema"]))

def create_bq_table(table_name='CRY', dataset_name='price_data'):
    '''Create table if not exists'''
    client = Client()
    tables = [
        i.table_id
        for i in client.list_tables(client.project + "." + dataset_name)
    ]
    if table_name not in tables:
        if table_name == 'CRY':
            schema = [
                SchemaField("open", "FLOAT64", mode="NULLABLE"),
                SchemaField("high", "FLOAT64", mode="NULLABLE"),
                SchemaField("low", "FLOAT64", mode="NULLABLE"),
                SchemaField("close", "FLOAT64", mode="NULLABLE"),
                SchemaField("volume", "FLOAT64", mode="NULLABLE"),
                SchemaField("market_cap", "FLOAT64", mode="NULLABLE"),
                SchemaField("symbol", "STRING", mode="NULLABLE"),
                SchemaField("date", "TIMESTAMP", mode="NULLABLE"),
            ]
        else:
            schema = [
                SchemaField("open", "FLOAT64", mode="NULLABLE"),
                SchemaField("high", "FLOAT64", mode="NULLABLE"),
                SchemaField("low", "FLOAT64", mode="NULLABLE"),
                SchemaField("close", "FLOAT64", mode="NULLABLE"),
                SchemaField("adjusted_close", "FLOAT64", mode="NULLABLE"),
                SchemaField("volume", "FLOAT64", mode="NULLABLE"),
                SchemaField("dividend_amount", "FLOAT64", mode="NULLABLE"),
                SchemaField("split_coefficient", "FLOAT64", mode="NULLABLE"),
                SchemaField("symbol", "STRING", mode="NULLABLE"),
                SchemaField("date", "TIMESTAMP", mode="NULLABLE"),
            ]
        table = Table(client.project + "." + dataset_name + "." + table_name,
                      schema=schema)
        table = client.create_table(table)
    else:
        print("Table already exists")

def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(
            field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(temp_table_ref,
                                 dest_table_ref,
                                 location='US',
                                 job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)

def bigquery_table(bigquery_client: bigquery.Client) -> bigquery.Table:
    """Create a temporary table for the test, remove on cleanup."""
    table = bigquery_client.create_table(TEST_TABLE, exists_ok=True)
    yield table
    bigquery_client.delete_table(table)

def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(temp_table_ref,
                                     dest_table_ref,
                                     location='US',
                                     job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)
        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template,
                                                   template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(merge_sql,
                                 location='US',
                                 job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)