def _do_temp_table_based_load(self, rows):
    assert isinstance(rows, dict)

    loaded_tmp_tables = []
    try:
        # Load each stream into a uniquely named temporary table first.
        for stream in rows.keys():
            tmp_table_name = "t_{}_{}".format(
                self.tables[stream],
                str(uuid.uuid4()).replace("-", ""))
            job = self._load_to_bq(
                client=self.client,
                dataset=self.dataset,
                table_name=tmp_table_name,
                table_schema=self.schemas[stream],
                table_config=self.table_configs.get(stream, {}),
                key_props=self.key_properties[stream],
                metadata_columns=self.add_metadata_columns,
                truncate=True,
                rows=self.rows[stream])
            loaded_tmp_tables.append((stream, tmp_table_name))

        # Copy the temporary tables to the production tables.
        for stream, tmp_table_name in loaded_tmp_tables:
            truncate = self.truncate if stream not in self.partially_loaded_streams else False

            copy_config = CopyJobConfig()
            if truncate:
                copy_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
                self.logger.info(
                    f"Copy {tmp_table_name} to {self.tables[stream]} by FULL_TABLE")
            else:
                copy_config.write_disposition = WriteDisposition.WRITE_APPEND
                self.logger.info(
                    f"Copy {tmp_table_name} to {self.tables[stream]} by APPEND")

            self.client.copy_table(
                sources=self.dataset.table(tmp_table_name),
                destination=self.dataset.table(self.tables[stream]),
                job_config=copy_config).result()

            self.partially_loaded_streams.add(stream)
            self.rows[stream].close()  # erase the buffered file
            self.rows[stream] = TemporaryFile(mode="w+b")
    finally:
        # Delete the temporary tables whether or not the copy succeeded.
        for stream, tmp_table_name in loaded_tmp_tables:
            self.client.delete_table(
                table=self.dataset.table(tmp_table_name),
                not_found_ok=True)
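# A minimal, standalone sketch of the temp-table-then-copy pattern used above,
# assuming the google-cloud-bigquery client library; the function name and the
# dataset/table parameters here are hypothetical placeholders, not part of the
# original code.
from google.cloud import bigquery
from google.cloud.bigquery import CopyJobConfig, WriteDisposition


def copy_temp_to_production(client: bigquery.Client,
                            dataset: bigquery.DatasetReference,
                            tmp_table_name: str,
                            prod_table_name: str,
                            truncate: bool) -> None:
    # WRITE_TRUNCATE replaces the production table, WRITE_APPEND adds to it.
    copy_config = CopyJobConfig()
    copy_config.write_disposition = (
        WriteDisposition.WRITE_TRUNCATE if truncate
        else WriteDisposition.WRITE_APPEND)
    try:
        client.copy_table(
            sources=dataset.table(tmp_table_name),
            destination=dataset.table(prod_table_name),
            job_config=copy_config).result()  # block until the copy finishes
    finally:
        # The temporary table is disposable either way.
        client.delete_table(dataset.table(tmp_table_name), not_found_ok=True)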
def copy_table(self):
    # Overwrite parent method for alternative approach

    # If the target table was created by a previous run, drop it first
    self.drop_table_if_exists(
        self.table_name,
        self.schema_name + self.schema_suffix,
        self.database_name,
    )

    # Create the target table only if the source table exists
    if self.test_if_table_exists(
            table_name=self.table_name,
            schema_name=self.schema_name,
            project_id=self.database_name,
    ):
        self.log.info("Copying table into temporary dataset!")
        conn = self.dwh_hook.dbconn
        ds_old = conn.get_dataset(self.schema_name)
        ds_new = conn.get_dataset(self.schema_name + self.schema_suffix)
        table_old = conn.get_table(table=TableReference(
            dataset_ref=ds_old, table_id=self.table_name))
        table_new = ds_new.table(self.table_name)
        copy_job = conn.copy_table(
            table_old,
            table_new,
            job_config=CopyJobConfig(write_disposition="WRITE_TRUNCATE"),
        )
        copy_job.result()  # Waits until the job is done
        assert copy_job.state == "DONE", "Unexpected job state: {0}".format(
            copy_job.state)
        self.log.info("Successfully copied {0}!".format(
            copy_job.__dict__["_properties"]["configuration"]["copy"]
            ["destinationTable"]["tableId"]))
def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE
    # and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(
            field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(
        temp_table_ref,
        dest_table_ref,
        location='US',
        job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
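# submit_bigquery_job above is a project-level helper, not part of the BigQuery
# client library. A plausible minimal version, assuming it only needs to block
# until the job finishes and surface any errors, might look like the sketch
# below; the real helper may do more (logging, retries, metrics).
from google.cloud.exceptions import GoogleCloudError


def submit_bigquery_job(job, configuration):
    try:
        result = job.result()  # blocks until the job completes
        logging.info('Job completed: %s', job.job_id)
        return result
    except GoogleCloudError:
        logging.exception('Job failed with configuration: %s',
                          configuration.to_api_repr())
        raise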
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE
    # and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(
            temp_table_ref,
            dest_table_ref,
            location='US',
            job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge into the destination table instead of overwriting it
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)

        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template(
            '', merge_sql_template, template_context)
        print('Merge sql:')
        print(merge_sql)
        merge_job = client.query(
            merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)