from google.cloud.bigquery import QueryJobConfig, TimePartitioning
from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery.table import TableReference

# EmsQueryJobConfig is defined in the surrounding EMS wrapper module.
def __create_job_config(self, ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels

    if ems_query_job_config.destination_table is not None:
        # Writing to a destination table defaults to daily partitioning.
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value

    # An explicit partitioning spec overrides the daily default above.
    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value,
            partitioning.field,
            partitioning.expiration_ms,
            partitioning.require_partition_filter)

    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions

    return job_config
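For context, a minimal sketch (not part of the snippet above) of how a QueryJobConfig built this way is typically consumed: it is passed to Client.query(), which submits the job with the configured priority, destination, and partitioning. The project id and SQL below are placeholders.

from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

client = bigquery.Client(project="my-project")  # placeholder project id

# Hand-built equivalent of what __create_job_config derives from an EmsQueryJobConfig.
job_config = QueryJobConfig()
job_config.use_legacy_sql = False
job_config.use_query_cache = True

query_job = client.query("SELECT 1", job_config=job_config)
rows = query_job.result()  # blocks until the job finishes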
def test_fill_from_default(self):
    from google.cloud.bigquery import QueryJobConfig

    job_config = QueryJobConfig()
    job_config.dry_run = True
    job_config.maximum_bytes_billed = 1000

    default_job_config = QueryJobConfig()
    default_job_config.use_query_cache = True
    default_job_config.maximum_bytes_billed = 2000

    final_job_config = job_config._fill_from_default(default_job_config)
    self.assertTrue(final_job_config.dry_run)
    self.assertTrue(final_job_config.use_query_cache)
    self.assertEqual(final_job_config.maximum_bytes_billed, 1000)
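The test above exercises the private _fill_from_default helper directly; the supported public path is, I believe, the default_query_job_config argument of bigquery.Client, which merges per-query configs with the default the same way (per-query values win on conflict). A hedged sketch, with a placeholder project id:

from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig

default_job_config = QueryJobConfig()
default_job_config.use_query_cache = True
default_job_config.maximum_bytes_billed = 2000

client = bigquery.Client(
    project="my-project",  # placeholder project id
    default_query_job_config=default_job_config,
)

per_query_config = QueryJobConfig()
per_query_config.maximum_bytes_billed = 1000  # overrides the 2000 default

query_job = client.query("SELECT 1", job_config=per_query_config)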
from datetime import datetime, timedelta, date

from google.cloud import bigquery
from google.cloud.bigquery import QueryJobConfig
from google.cloud.bigquery.table import RowIterator

billing_project = "momovn-dev"

conf = QueryJobConfig()
conf.use_query_cache = True
conf.use_legacy_sql = False

checkpointDate = None
start_date = datetime.strptime('20201002', '%Y%m%d').date()
end_date = datetime.strptime('20201002', '%Y%m%d').date()
day_count = (end_date - start_date).days + 1

for checkpointDate in (start_date + timedelta(n) for n in range(day_count)):
    try:
        # checkpointDate = datetime.strptime(single_date, '%Y%m%d').date()
        checkpointDateWithoutDash = checkpointDate.strftime("%Y%m%d")
        checkpointDateWithDash = checkpointDate.strftime("%Y-%m-%d")
        # The date literal must be quoted; unquoted, BigQuery parses
        # 2020-10-02 as integer subtraction rather than a DATE.
        query = f"""WITH A AS(
            SELECT GPS.reference PHONE
            FROM `momovn-prod.HERMES.HERMES_LOCATIONS` GPS
            WHERE DATE(GPS.event_timestamp, 'Asia/Bangkok') = '{checkpointDateWithDash}')
        SELECT COUNT(DISTINCT T1.USER_ID), 'HERMES LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1
        LEFT JOIN A T2 ON T1.USER_ID = T2.PHONE
        WHERE T2.PHONE IS NULL
        UNION ALL
        SELECT COUNT(DISTINCT T1.USER_ID), 'USER_LOCATION'
        FROM `momovn-prod.BITEAM_INTERN.{checkpointDateWithoutDash}_CHECK_LOCATION` T1
        LEFT JOIN `momovn-prod.HERMES.USER_LOCATIONS_{checkpointDateWithoutDash}` T2
            ON T1.USER_ID = T2.USER_ID"""  # SQL truncated here in the source
    except Exception:
        raise  # handler truncated in the source
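The script never constructs a client, so as a hedged completion (client creation and result handling are assumptions, not shown in the source), one day's query could be executed like this inside the loop:

from google.cloud import bigquery

client = bigquery.Client(project=billing_project)  # assumed; not shown in the source
query_job = client.query(query, job_config=conf)
for row in query_job.result():  # RowIterator over the two UNION ALL branches
    print(checkpointDateWithDash, row[1], row[0])  # label, distinct-user count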
# Imports and helpers below mirror the surrounding dialect module
# (sqlalchemy-bigquery), reproduced so the function is self-contained.
import re

from google.cloud.bigquery import (
    CreateDisposition,
    DatasetReference,
    EncryptionConfiguration,
    QueryJobConfig,
    QueryPriority,
    SchemaUpdateOption,
    TableReference,
    WriteDisposition,
)

GROUP_DELIMITER = re.compile(r"\s*\,\s*")
KEY_VALUE_DELIMITER = re.compile(r"\s*\:\s*")


def parse_boolean(bool_string):
    # Strict boolean parser used by this module: only "true"/"false" are accepted.
    bool_string = bool_string.lower()
    if bool_string == "true":
        return True
    elif bool_string == "false":
        return False
    else:
        raise ValueError()


def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " + str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition, create_disposition)
        except AttributeError:
            raise ValueError(
                "invalid create_disposition in url query: " + create_disposition
            )

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None
        try:
            dest_project, dest_dataset, dest_table = query["destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )
        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table
        )

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"]
        )

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for dry_run: " + query["dry_run"]
            )

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " + label_group)
            labels[key] = value
        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: "
                + query["maximum_bytes_billed"]
            )

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " + query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError(
                "invalid schema_update_options in url query: "
                + query["schema_update_options"]
            )

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError("url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError("url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: "
                + query["use_query_cache"]
            )

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(
                WriteDisposition, query["write_disposition"]
            )
        except AttributeError:
            raise ValueError(
                "invalid write_disposition in url query: " + query["write_disposition"]
            )

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
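A hedged usage sketch: parse_url receives a sqlalchemy.engine.url.URL, as the BigQuery dialect builds it from a connection string; the connection string below is illustrative.

from sqlalchemy.engine.url import make_url

url = make_url(
    "bigquery://some-project/some-dataset"
    "?maximum_bytes_billed=1000000"
    "&priority=BATCH"
    "&use_query_cache=false"
)
project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url)
# Expected: job_config.maximum_bytes_billed == 1000000,
#           job_config.priority is QueryPriority.BATCH,
#           job_config.use_query_cache is False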
def dry_run_query(self, query):
    job_config = QueryJobConfig()
    job_config.dry_run = True
    job_config.use_query_cache = False
    return self.client.query(query=query, job_config=job_config)
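A hedged usage sketch: a dry-run job validates the query and estimates the scan without executing it, so total_bytes_processed is available as soon as the job is created. The wrapper instance and table name are placeholders.

query_job = wrapper.dry_run_query("SELECT * FROM `my-project.my_dataset.my_table`")
print(f"Estimated bytes processed: {query_job.total_bytes_processed}")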