def __create_job_config(self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels
    if ems_query_job_config.destination_table is not None:
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value
    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value,
            partitioning.field,
            partitioning.expiration_ms,
            partitioning.require_partition_filter)
    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions
    return job_config
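# A minimal usage sketch, not part of the original source: a hypothetical companion method for
# the same class, showing how the QueryJobConfig built above might be handed to the
# google.cloud.bigquery Client. The `run_query` name and the `self.__client` attribute are
# assumptions; only Client.query() and QueryJob.result() are standard library calls.
def run_query(self, query: str, ems_query_job_config: EmsQueryJobConfig):
    job_config = self.__create_job_config(ems_query_job_config)
    query_job = self.__client.query(query, job_config=job_config)  # assumes the class keeps a Client
    return query_job.result()  # block until the job finishes and return the row iterator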
def enrich_task():
    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    # Copy temporary table to destination
    copy_job_config = CopyJobConfig()
    copy_job_config.write_disposition = 'WRITE_TRUNCATE'

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    copy_job = client.copy_table(temp_table_ref, dest_table_ref,
                                 location='US', job_config=copy_job_config)
    submit_bigquery_job(copy_job, copy_job_config)
    assert copy_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
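# The enrich tasks above and below call a submit_bigquery_job helper that is defined elsewhere
# in the DAG code and not shown here. A minimal sketch of what such a helper might look like,
# assuming it only needs to wait for the job and surface failures (the logging detail is an
# assumption, not the original implementation):
def submit_bigquery_job(job, configuration):
    try:
        logging.info('Submitting job with configuration: %s', configuration.to_api_repr())
        result = job.result()  # blocks until the BigQuery job completes
        logging.info('Job finished: %s', job.job_id)
        return result
    except Exception:
        logging.exception('Job %s failed', job.job_id)
        raise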
def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " + str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition, create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " + create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None

        try:
            dest_project, dest_dataset, dest_table = query["destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )

        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " + query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " + label_group)
            labels[key] = value
        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: " + query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " + query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError(
                "invalid schema_update_options in url query: " + query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError("url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError("url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: " + query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition, query["write_disposition"])
        except AttributeError:
            raise ValueError(
                "invalid write_disposition in url query: " + query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
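# A usage sketch, not part of the original source: parse_url expects a SQLAlchemy URL object
# (it reads url.host, url.database, and url.query), so a connection string can be parsed like
# this. The project, dataset, and query parameters below are illustrative values.
from sqlalchemy.engine.url import make_url

url = make_url(
    "bigquery://some-project/some_dataset"
    "?priority=BATCH&maximum_bytes_billed=1000000000&location=US"
)
project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url)
# project_id == "some-project", dataset_id == "some_dataset", location == "US",
# and job_config carries the BATCH priority and the maximum_bytes_billed cap.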
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    # Need to use a temporary table because bq query sets field modes to NULLABLE and descriptions to null
    # when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)
    table = Table(temp_table_ref)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    table.time_partitioning = TimePartitioning(field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table.schema = schema

    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)

    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    if load_all_partitions:
        # Copy temporary table to destination
        copy_job_config = CopyJobConfig()
        copy_job_config.write_disposition = 'WRITE_TRUNCATE'

        dest_table_name = '{task}'.format(task=task)
        dest_table_ref = client.dataset(
            dataset_name,
            project=destination_dataset_project_id).table(dest_table_name)
        copy_job = client.copy_table(temp_table_ref, dest_table_ref,
                                     location='US', job_config=copy_job_config)
        submit_bigquery_job(copy_job, copy_job_config)
        assert copy_job.state == 'DONE'
    else:
        # Merge
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
        merge_job_config = QueryJobConfig()
        # Finishes faster, query limit for concurrent interactive queries is 50
        merge_job_config.priority = QueryPriority.INTERACTIVE

        merge_sql_path = os.path.join(
            dags_folder,
            'resources/stages/enrich/sqls/merge_{task}.sql'.format(task=task))
        merge_sql_template = read_file(merge_sql_path)

        template_context['params']['source_table'] = temp_table_name
        merge_sql = kwargs['task'].render_template('', merge_sql_template, template_context)
        print('Merge sql:')
        print(merge_sql)

        merge_job = client.query(merge_sql, location='US', job_config=merge_job_config)
        submit_bigquery_job(merge_job, merge_job_config)
        assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
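# The snippets above rely on import blocks that are not shown. Collected here as a sketch,
# assuming a recent google-cloud-bigquery release that exports these names at the package top
# level; project-specific helpers (read_file, read_bigquery_schema_from_file, parse_boolean,
# EmsQueryJobConfig, the delimiters, and the DAG-level variables) come from their own modules.
import json
import logging
import os
import time

from google.cloud.bigquery import (
    Client,
    CopyJobConfig,
    CreateDisposition,
    DatasetReference,
    EncryptionConfiguration,
    QueryJobConfig,
    QueryPriority,
    SchemaUpdateOption,
    Table,
    TableReference,
    TimePartitioning,
    WriteDisposition,
)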