def get_table_reference_from_path(self, table_path):
    # type: (str) -> TableReference
    """
    Returns a TableReference for a given path to a BigQuery table.

    Args:
        table_path: A BigQuery table path in the form project.dataset.table

    Returns:
        A TableReference for the table specified by the path
    """
    _, dataset, table = self.parse_table_path(table_path)
    dataset_ref = DatasetReference(self.project_id, dataset)
    return TableReference(dataset_ref, table)
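A minimal standalone sketch of the same path-to-reference conversion, without the wrapper class above; it assumes the google-cloud-bigquery library is available, and the project/dataset/table names are illustrative only. `TableReference.from_string` performs the equivalent parsing directly.

# Hypothetical usage sketch (not part of the wrapper above).
from google.cloud.bigquery.table import TableReference

table_ref = TableReference.from_string("my-project.my_dataset.my_table")
print(table_ref.project, table_ref.dataset_id, table_ref.table_id)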
def source(self):
    """Union[ \
    google.cloud.bigquery.table.TableReference, \
    google.cloud.bigquery.model.ModelReference \
    ]: Table or Model from which data is to be loaded or extracted.
    """
    source_config = _helpers._get_sub_prop(
        self._properties, ["configuration", "extract", "sourceTable"])
    if source_config:
        return TableReference.from_api_repr(source_config)
    else:
        source_config = _helpers._get_sub_prop(
            self._properties, ["configuration", "extract", "sourceModel"])
        return ModelReference.from_api_repr(source_config)
def test_from_api_repr(self):
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.table import TableReference

    dataset_ref = DatasetReference('project_1', 'dataset_1')
    expected = self._make_one(dataset_ref, 'table_1')
    got = TableReference.from_api_repr(
        {
            'projectId': 'project_1',
            'datasetId': 'dataset_1',
            'tableId': 'table_1',
        })
    self.assertEqual(expected, got)
def test_to_gbq_w_default_project(mock_bigquery_client):
    """If no project is specified, we should be able to use project from
    default credentials.
    """
    import google.api_core.exceptions
    from google.cloud.bigquery.table import TableReference

    mock_bigquery_client.get_table.side_effect = (
        google.api_core.exceptions.NotFound("my_table"))
    gbq.to_gbq(DataFrame(), "my_dataset.my_table")

    mock_bigquery_client.get_table.assert_called_with(
        TableReference.from_string("default-project.my_dataset.my_table"))
    mock_bigquery_client.create_table.assert_called_with(mock.ANY)
    table = mock_bigquery_client.create_table.call_args[0][0]
    assert table.project == "default-project"
def get_schema(self, dataset_id, table_name, project_id=None):
    # type: (str, str, Optional[str]) -> List[SchemaField]
    """Returns the schema of a table.

    Args:
        dataset_id: The dataset to query.
        table_name: The name of the table.
        project_id: The project ID of the table.

    Returns:
        A list of SchemaFields representing the schema.
    """
    dataset_ref = DatasetReference(
        project_id if project_id else self.project_id, dataset_id)
    table = self.gclient.get_table(TableReference(dataset_ref, table_name))
    return table.schema
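A hedged sketch of the direct equivalent without the wrapper, using the google-cloud-bigquery client; the project, dataset, and table names are illustrative assumptions.

# Hypothetical direct equivalent of the helper above.
from google.cloud import bigquery
from google.cloud.bigquery import DatasetReference, TableReference

client = bigquery.Client(project="my-project")
table = client.get_table(
    TableReference(DatasetReference("my-project", "my_dataset"), "my_table"))
for field in table.schema:
    print(field.name, field.field_type)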
def drop_table_if_exists(self, table_name, schema_name, project_id=None):
    if self.test_if_table_exists(
            table_name=table_name,
            schema_name=schema_name,
            project_id=project_id,
    ):
        conn = self.dwh_hook.dbconn
        conn.delete_table(
            conn.get_table(
                TableReference(
                    dataset_ref=FakeDatasetRef(
                        dataset_id=schema_name,
                        project_id=project_id or self.database_name,
                    ),
                    table_id=table_name,
                )))
def _get_table(self, connection, table_name, schema=None):
    if isinstance(connection, Engine):
        connection = connection.connect()

    client = connection.connection._client

    project_id, dataset_id, table_id = self._split_table_name(table_name)
    project_id = project_id or client.project
    dataset_id = dataset_id or schema or self.dataset_id

    table_ref = TableReference.from_string("{}.{}.{}".format(
        project_id, dataset_id, table_id))
    try:
        table = client.get_table(table_ref)
    except NotFound:
        raise NoSuchTableError(table_name)
    return table
def __init__(self, table_name, dataset_name=None, project_name=None):
    self._project = project_name
    self._dataset = dataset_name
    self._table = table_name

    parts = table_name.replace(":", ".").split(".")
    if len(parts) == 3:  # project.dataset.table
        self._project = parts[0]
        self._dataset = parts[1]
        self._table = parts[2]
    elif len(parts) == 2:  # dataset.table
        self._dataset = parts[0]
        self._table = parts[1]

    self._dataset_ref = DatasetReference(dataset_id=self._dataset,
                                         project=self._project)
    self._table_ref = TableReference(dataset_ref=self._dataset_ref,
                                     table_id=self._table)
def sources(self):
    """List[google.cloud.bigquery.table.TableReference]: Table(s) from
    which data is to be loaded.
    """
    source_configs = _helpers._get_sub_prop(
        self._properties, ["configuration", "copy", "sourceTables"])
    if source_configs is None:
        single = _helpers._get_sub_prop(
            self._properties, ["configuration", "copy", "sourceTable"])
        if single is None:
            raise KeyError(
                "Resource missing 'sourceTables' / 'sourceTable'")
        source_configs = [single]

    sources = []
    for source_config in source_configs:
        table_ref = TableReference.from_api_repr(source_config)
        sources.append(table_ref)
    return sources
def _table_reference(self, provided_schema_name, provided_table_name,
                     client_project):
    project_id_from_table, dataset_id_from_table, table_id = \
        self._split_table_name(provided_table_name)

    project_id_from_schema = None
    dataset_id_from_schema = None
    if provided_schema_name is not None:
        provided_schema_name_split = provided_schema_name.split('.')
        if len(provided_schema_name_split) == 0:
            pass
        elif len(provided_schema_name_split) == 1:
            if dataset_id_from_table:
                project_id_from_schema = provided_schema_name_split[0]
            else:
                dataset_id_from_schema = provided_schema_name_split[0]
        elif len(provided_schema_name_split) == 2:
            project_id_from_schema = provided_schema_name_split[0]
            dataset_id_from_schema = provided_schema_name_split[1]
        else:
            raise ValueError("Did not understand schema: {}".format(
                provided_schema_name))

    if (dataset_id_from_schema and dataset_id_from_table
            and dataset_id_from_schema != dataset_id_from_table):
        raise ValueError(
            "dataset_id specified in schema and table_name disagree: "
            "got {} in schema, and {} in table_name".format(
                dataset_id_from_schema, dataset_id_from_table))

    if (project_id_from_schema and project_id_from_table
            and project_id_from_schema != project_id_from_table):
        raise ValueError(
            "project_id specified in schema and table_name disagree: "
            "got {} in schema, and {} in table_name".format(
                project_id_from_schema, project_id_from_table))

    project_id = project_id_from_schema or project_id_from_table or client_project
    dataset_id = dataset_id_from_schema or dataset_id_from_table or self.dataset_id

    table_ref = TableReference.from_string("{}.{}.{}".format(
        project_id, dataset_id, table_id))
    return table_ref
def __load_many(self, dt_ref, tables, gcs_base_dir, file_format, jc, preview):
    """
    :param tables:
    :param gcs_base_dir: to map to table
    """
    jobs = list()
    for tbl in tables:
        data_uri = "{}/{}/*.{}".format(gcs_base_dir, tbl, file_format)
        table_ref = TableReference(dataset_ref=dt_ref.dataset_ref, table_id=tbl)
        print("-- {}{} <= {} ".format("preview: " if preview else "", tbl, data_uri))
        if preview:
            continue
        jobs.append(
            self.connect(dt_ref.project).load_table_from_uri(
                data_uri, table_ref, job_config=jc))
    self.__check_jobs(jobs)
def test_to_gbq_w_project_table(mock_bigquery_client):
    """If a project is included in the table ID, use that instead of the
    client project. See: https://github.com/pydata/pandas-gbq/issues/321
    """
    import google.api_core.exceptions
    from google.cloud.bigquery.table import TableReference

    mock_bigquery_client.get_table.side_effect = (
        google.api_core.exceptions.NotFound("my_table"))
    gbq.to_gbq(
        DataFrame(),
        "project_table.my_dataset.my_table",
        project_id="project_client",
    )

    mock_bigquery_client.get_table.assert_called_with(
        TableReference.from_string("project_table.my_dataset.my_table"))
    mock_bigquery_client.create_table.assert_called_with(mock.ANY)
    table = mock_bigquery_client.create_table.call_args[0][0]
    assert table.project == "project_table"
def _bq_get_data(self):
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(self.source_project_dataset_table)

    self.log.info('Fetching Data from:')
    self.log.info('Dataset: %s, Table: %s',
                  table_ref.dataset_id, table_ref.table_id)

    conn = hook.get_conn()
    cursor = conn.cursor()
    i = 0
    while True:
        response = cursor.get_tabledata(
            dataset_id=table_ref.dataset_id,
            table_id=table_ref.table_id,
            max_results=self.batch_size,
            selected_fields=self.selected_fields,
            start_index=i * self.batch_size,
        )

        if 'rows' not in response:
            self.log.info('Job Finished')
            return

        rows = response['rows']
        self.log.info('Total Extracted rows: %s',
                      len(rows) + i * self.batch_size)

        table_data = [[fields['v'] for fields in dict_row['f']]
                      for dict_row in rows]

        yield table_data
        i += 1
def test_run_async_extract_job_submitsExtractJobAndReturnsJobIdWithProperConfig(
        self, bigquery_module_patch: bigquery):
    project_id = "some-project-id"
    table = "some-project.some-dataset.some-table"
    destination_uris = [
        "gs://some-source-uri/to_object1", "gs://some-source-uri/to_object2"
    ]
    job_prefix = "some_job_prefix"
    bigquery_module_patch.Client.return_value = self.client_mock
    expected_job_id = self.JOB_ID
    self.extract_job_mock = Mock(ExtractJob)
    self.extract_job_mock.job_id = expected_job_id
    self.client_mock.extract_table.return_value = self.extract_job_mock
    ems_job_config = EmsExtractJobConfig(
        compression=Compression.GZIP,
        destination_format=DestinationFormat.CSV,
        field_delimiter="Deli mit R",
        print_header=True,
        labels={"label1": "label1_value"})
    ems_bigquery_client = EmsBigqueryClient(project_id, "Emelet")

    result_job_id = ems_bigquery_client.run_async_extract_job(
        job_id_prefix=job_prefix,
        table=table,
        destination_uris=destination_uris,
        job_config=ems_job_config)

    call_args_list = self.client_mock.extract_table.call_args_list
    args = call_args_list[0][1]
    assert args["location"] == "Emelet"
    assert args["source"] == TableReference.from_string(table_id=table)
    assert args["job_id_prefix"] == job_prefix
    assert args["destination_uris"] == destination_uris
    assert args["job_config"].compression == "GZIP"
    assert args["job_config"].destination_format == "CSV"
    assert args["job_config"].field_delimiter == "Deli mit R"
    assert args["job_config"].print_header == True
    assert args["job_config"].labels == {"label1": "label1_value"}
    assert result_job_id == expected_job_id
def create_tables_from_dict(
        self,
        table_names_to_schemas,  # type: Dict[str, List[SchemaField]]
        dataset_id=None,  # type: Optional[str]
        replace_existing_tables=False,  # type: Optional[bool]
):
    # type: (...) -> None
    """Creates a set of tables from a dictionary of table names to their schemas.

    Args:
        table_names_to_schemas: A dictionary of:
            key: The table name.
            value: A list of SchemaField objects.
        dataset_id: The dataset in which to create tables.
            If not specified, use default dataset.
        replace_existing_tables: If True, delete and re-create tables.
            Otherwise, checks to see if any of the requested tables exist.
            If they do, it will raise a RuntimeError.

    Raises:
        RuntimeError if replace_existing_tables is False and any of the
            tables requested for creation already exist
    """
    dataset_id = dataset_id or self.default_dataset_id
    dataset_ref = DatasetReference(self.project_id, dataset_id)

    # If the flag isn't set to replace existing tables, raise an error if
    # any tables we're trying to create already exist.
    if not replace_existing_tables:
        self._raise_if_tables_exist(table_names_to_schemas.keys(), dataset_id)

    for name, schema in six.iteritems(table_names_to_schemas):
        table_ref = TableReference(dataset_ref, name)
        # Use the Table object so it retains its schema.
        table = bigquery.Table(table_ref, schema=schema)

        if self.table_exists(table) and replace_existing_tables:
            self.delete_table(table)

        self.create_table(table)
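A brief usage sketch for the helper above; `bq_client` (an instance of the wrapper exposing this method), the table names, and the schema fields are hypothetical.

# Hypothetical call to create_tables_from_dict; names are illustrative only.
from google.cloud.bigquery.schema import SchemaField

bq_client.create_tables_from_dict(
    {
        "users": [SchemaField("id", "INTEGER"), SchemaField("email", "STRING")],
        "events": [SchemaField("user_id", "INTEGER"), SchemaField("ts", "TIMESTAMP")],
    },
    dataset_id="analytics",
    replace_existing_tables=True,
)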
def __create_extract_job_mock(self, job_id: str, table: str, has_error: bool,
                              created: datetime = datetime.now()):
    error_result = {
        'reason': 'someReason',
        'location': 'query',
        'message': 'error occurred'
    }
    extract_job_mock = Mock(ExtractJob)
    extract_job_mock.job_id = job_id
    extract_job_mock.destination_uris = ["uri1"]
    extract_job_mock.labels = {"label1": "label1_value"}
    extract_job_mock.source = TableReference.from_string(table)
    extract_job_mock.compression = None
    extract_job_mock.field_delimiter = ","
    extract_job_mock.print_header = True
    extract_job_mock.destination_format = "CSV"
    extract_job_mock.state = "DONE"
    extract_job_mock.error_result = error_result if has_error else None
    extract_job_mock.created = created
    return extract_job_mock
def execute(self, context):
    self.log.info(
        'Executing extract of %s into: %s',
        self.source_project_dataset_table,
        self.destination_cloud_storage_uris,
    )
    hook = BigQueryHook(
        bigquery_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )
    table_ref = TableReference.from_string(
        self.source_project_dataset_table, hook.project_id)

    configuration: Dict[str, Any] = {
        'extract': {
            'sourceTable': table_ref.to_api_repr(),
            'compression': self.compression,
            'destinationUris': self.destination_cloud_storage_uris,
            'destinationFormat': self.export_format,
        }
    }

    if self.labels:
        configuration['labels'] = self.labels

    if self.export_format == 'CSV':
        # Only set fieldDelimiter and printHeader fields if using CSV.
        # Google does not like it if you set these fields for other export
        # formats.
        configuration['extract']['fieldDelimiter'] = self.field_delimiter
        configuration['extract']['printHeader'] = self.print_header

    hook.insert_job(configuration=configuration)
class _Base(unittest.TestCase):
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.table import TableReference

    ENDPOINT = "https://bigquery.googleapis.com"
    PROJECT = "project"
    SOURCE1 = "http://example.com/source1.csv"
    DS_ID = "dataset_id"
    DS_REF = DatasetReference(PROJECT, DS_ID)
    TABLE_ID = "table_id"
    TABLE_REF = TableReference(DS_REF, TABLE_ID)
    JOB_ID = "JOB_ID"
    JOB_TYPE = "unknown"
    KMS_KEY_NAME = "projects/1/locations/us/keyRings/1/cryptoKeys/1"

    def _make_one(self, *args, **kw):
        return self._get_target_class()(*args, **kw)

    def _setUpConstants(self):
        import datetime
        from google.cloud._helpers import UTC

        self.WHEN_TS = 1437767599.006
        self.WHEN = datetime.datetime.utcfromtimestamp(
            self.WHEN_TS).replace(tzinfo=UTC)
        self.ETAG = "ETAG"
        self.FULL_JOB_ID = "%s:%s" % (self.PROJECT, self.JOB_ID)
        self.RESOURCE_URL = "{}/bigquery/v2/projects/{}/jobs/{}".format(
            self.ENDPOINT, self.PROJECT, self.JOB_ID)
        self.USER_EMAIL = "*****@*****.**"

    def _table_ref(self, table_id):
        from google.cloud.bigquery.table import TableReference

        return TableReference(self.DS_REF, table_id)

    def _make_resource(self, started=False, ended=False, location="US"):
        self._setUpConstants()
        return _make_job_resource(
            creation_time_ms=int(self.WHEN_TS * 1000),
            started_time_ms=int(self.WHEN_TS * 1000),
            ended_time_ms=int(self.WHEN_TS * 1000) + 1000000,
            started=started,
            ended=ended,
            etag=self.ETAG,
            endpoint=self.ENDPOINT,
            job_type=self.JOB_TYPE,
            job_id=self.JOB_ID,
            project_id=self.PROJECT,
            user_email=self.USER_EMAIL,
            location=location,
        )

    def _verifyInitialReadonlyProperties(self, job):
        # root elements of resource
        self.assertIsNone(job.etag)
        self.assertIsNone(job.self_link)
        self.assertIsNone(job.user_email)

        # derived from resource['statistics']
        self.assertIsNone(job.created)
        self.assertIsNone(job.started)
        self.assertIsNone(job.ended)

        # derived from resource['status']
        self.assertIsNone(job.error_result)
        self.assertIsNone(job.errors)
        self.assertIsNone(job.state)

    def _verifyReadonlyResourceProperties(self, job, resource):
        from datetime import timedelta

        statistics = resource.get("statistics", {})

        if "creationTime" in statistics:
            self.assertEqual(job.created, self.WHEN)
        else:
            self.assertIsNone(job.created)

        if "startTime" in statistics:
            self.assertEqual(job.started, self.WHEN)
        else:
            self.assertIsNone(job.started)

        if "endTime" in statistics:
            self.assertEqual(job.ended, self.WHEN + timedelta(seconds=1000))
        else:
            self.assertIsNone(job.ended)

        if "etag" in resource:
            self.assertEqual(job.etag, self.ETAG)
        else:
            self.assertIsNone(job.etag)

        if "selfLink" in resource:
            self.assertEqual(job.self_link, self.RESOURCE_URL)
        else:
            self.assertIsNone(job.self_link)

        if "user_email" in resource:
            self.assertEqual(job.user_email, self.USER_EMAIL)
        else:
            self.assertIsNone(job.user_email)
assert location == "some-location" assert dataset_id == "some-dataset" assert arraysize == 1000 assert credentials_path == "/some/path/to.json" assert isinstance(job_config, QueryJobConfig) @pytest.mark.parametrize( "param, value, default", [ ("clustering_fields", ["a", "b", "c"], None), ("create_disposition", "CREATE_IF_NEEDED", None), ( "destination", TableReference( DatasetReference("different-project", "different-dataset"), "table" ), None, ), ( "destination_encryption_configuration", lambda enc: enc.kms_key_name == EncryptionConfiguration("some-configuration").kms_key_name, None, ), ("dry_run", True, None), ("labels", {"a": "b", "c": "d"}, {}), ("maximum_bytes_billed", 1000, None), ("priority", "INTERACTIVE", None), ( "schema_update_options",
def final_func(schema_name, schema_suffix, dwh_conn_id):
    # final: move new data into the final dataset
    conn = EWAHBaseHook.get_hook_from_conn_id(dwh_conn_id).dbconn

    # get dataset objects
    try:  # create final dataset if not exists
        ds_final = conn.get_dataset(schema_name)
    except:
        print("Creating dataset {0}".format(schema_name))
        ds_final = conn.create_dataset(schema_name)
    ds_temp = conn.get_dataset(schema_name + schema_suffix)

    # copy all tables from temp dataset to final dataset
    new_tables = conn.list_tables(ds_temp)
    new_table_ids = [table.table_id for table in conn.list_tables(ds_temp)]
    old_table_ids = [table.table_id for table in conn.list_tables(ds_final)]
    copy_jobs = []
    for table in new_tables:
        print("Copying table {0} from temp to final dataset".format(
            table.table_id))
        try:
            old_table = conn.get_table(table=TableReference(
                dataset_ref=ds_final, table_id=table.table_id))
            conn.delete_table(old_table)
        except:
            # ignore failure, fails if old table does not exist to begin with
            pass
        finally:
            final_table = ds_final.table(table.table_id)
            copy_jobs.append(conn.copy_table(table, final_table))

    # delete tables that don't exist in temp dataset from final dataset
    for table_id in old_table_ids:
        if not table_id in new_table_ids:
            print("Deleting table {0}".format(table_id))
            conn.delete_table(
                conn.get_table(
                    TableReference(dataset_ref=ds_final, table_id=table_id)))

    # make sure all copy jobs succeeded
    while copy_jobs:
        sleep(0.1)
        job = copy_jobs.pop(0)
        job.result()
        assert job.state in ("RUNNING", "DONE")
        if job.state == "RUNNING":
            copy_jobs.append(job)
        else:
            print("Successfully copied {0}".format(
                job.__dict__["_properties"]["configuration"]["copy"]
                ["destinationTable"]["tableId"]))

    # delete temp dataset
    print("Deleting temp dataset.")
    conn.delete_dataset(ds_temp, delete_contents=True, not_found_ok=False)
    print("Done.")
def commit(self):
    # The commit is where the upload is actually done for BigQuery (special case).
    # The _create_or_update_table method can be called multiple times;
    # each time, data is appended to the .avro file. When "committing",
    # this .avro file is uploaded and, depending on the load strategy, used.

    if not hasattr(self, "avro_file_name"):
        # There was no data ever uploaded
        # Do nothing
        self.log.info("Nothing to upload!")
        return

    # Clean up after yourself first
    self.avro_writer.close()

    # Fetch the relevant configuration
    project_id = self.table_creation_config.get("database_name",
                                                self.database_name)
    assert project_id, "Missing Project ID!"
    load_strategy = self.table_creation_config["load_strategy"]
    primary_key = self.table_creation_config["primary_key"]
    schema_name = self.table_creation_config["schema_name"]
    schema_suffix = self.table_creation_config["schema_suffix"]
    table_name_final = self.table_creation_config["table_name"]
    table_suffix = "__ewah_tmp"
    columns_definition = self.table_creation_config["columns_definition"]
    new_schema_name = schema_name + schema_suffix

    is_full_refresh = (load_strategy == EC.LS_INSERT_REPLACE
                       or not self.test_if_table_exists(
                           table_name=table_name_final,
                           schema_name=new_schema_name,
                           project_id=project_id,
                       ))

    conn = self.dwh_hook.dbconn

    ds_new = conn.get_dataset(new_schema_name)

    # Create temp table with .avro file
    if is_full_refresh:
        # temp table is also the final table for full refresh!
        table_name = table_name_final
    else:
        table_name = table_name_final + table_suffix

    # Drop temp table if it already exists
    if self.test_if_table_exists(
            table_name=table_name,
            schema_name=new_schema_name,
            project_id=project_id,
    ):
        # Drop table before re-creating it
        conn.delete_table(
            conn.get_table(
                TableReference(dataset_ref=ds_new, table_id=table_name)))

    # Create temp table with .avro file
    table_obj = Table(".".join([project_id, new_schema_name, table_name]))
    if is_full_refresh and self.partition_field:
        table_obj.time_partitioning = bigquery.TimePartitioning(
            type_=self.partition_type,
            field=self.partition_field,
        )
        if self.require_partition_filter:
            table_obj.require_partition_filter = True
    self.log.info("Uploading data into table now...")
    with open(self.avro_file_name, "rb") as source_file:
        job = conn.load_table_from_file(
            file_obj=source_file,
            destination=table_obj,
            job_id_prefix="ewah_",
            rewind=True,
            job_config=LoadJobConfig(
                autodetect=False,
                source_format="AVRO",
                schema=[
                    SchemaField(name=name, field_type=field["data_type"])
                    for name, field in columns_definition.items()
                ],
            ),
        )

    try:
        job.result()
    except:
        self.log.info("Errors occurred - job errors: {0}".format(job.errors))
        raise
    assert job.state == "DONE", "Invalid job state: {0}".format(job.state)

    if not is_full_refresh:
        # Need to merge new rows into the existing table
        fields_pk = set(primary_key or [])
        fields_all = set(columns_definition.keys() or [])
        fields_non_pk = fields_all - fields_pk

        if load_strategy == EC.LS_UPSERT:
            assert fields_pk
        elif load_strategy == EC.LS_INSERT_ADD:
            fields_pk = []  # Ignore if set
        else:
            raise Exception("Not implemented!")

        merge_statement = """
            MERGE INTO `{target}` AS TARGET
            USING `{source}` AS SOURCE
            ON {condition}
            WHEN MATCHED THEN
                UPDATE SET {update_fields}
            WHEN NOT MATCHED THEN
                INSERT ({insert_fields}) VALUES ({insert_fields})
        """.format(
            target=".".join([project_id, new_schema_name, table_name_final]),
            source=".".join([project_id, new_schema_name, table_name]),
            condition=" AND ".join([
                "TARGET.`{0}` = SOURCE.`{0}`".format(field)
                for field in fields_pk
            ]) or "FALSE",
            insert_fields="`{0}`".format("`, `".join(fields_all)),
            update_fields=", ".join([
                "`{0}` = SOURCE.`{0}`".format(field)
                for field in fields_non_pk
            ]),
        )
        self.log.info("Executing query:\n\n{0}\n\n".format(merge_statement))
        job = conn.query(
            query=merge_statement,
            job_id_prefix="ewah_",
        )
        try:
            job.result()
        except:
            self.log.info("Errors occurred - job errors: {0}".format(job.errors))
            raise
        assert job.state == "DONE", "Invalid job state: {0}".format(job.state)

        # Remove old temp table from dataset
        conn.delete_table(
            conn.get_table(
                TableReference(dataset_ref=ds_new, table_id=table_name)))

    self.log.info("Done!")
def bq_insert(rows: List):
    """
    Inserts rows into BigQuery
    :param rows: list of dictionaries representing rows
    :return:
    """
    from google.cloud import bigquery

    if not rows:
        logging.error("no rows to upload")
        return

    bq = bigquery.Client(project=GCP_PROJECT)
    table_ref = TableReference.from_string(
        f"{GCP_PROJECT}.live.om_state_latencies")
    schema = [
        {"name": "date", "type": "DATE"},
        {"name": "sym", "type": "STRING"},
        {"name": "from_state", "type": "STRING"},
        {"name": "to_state", "type": "STRING"},
        {"name": "count", "type": "INTEGER"},
        {"name": "average", "type": "FLOAT"},
        {"name": "percentile_10", "type": "FLOAT"},
        {"name": "percentile_50", "type": "FLOAT"},
        {"name": "percentile_90", "type": "FLOAT"},
        {"name": "percentile_99", "type": "FLOAT"},
        {"name": "percentile_99_99", "type": "FLOAT"},
    ]

    table = Table(table_ref)
    table.schema = schema
    table = bq.create_table(table, exists_ok=True)

    logging.info("inserting {} rows".format(len(rows)))
    res = bq.insert_rows(table, rows)
    logging.info(res)
def _table_ref(self, table_id):
    from google.cloud.bigquery.table import TableReference

    return TableReference(self.DS_REF, table_id)
def parse_url(url):  # noqa: C901
    query = dict(url.query)  # need mutable query.

    # use_legacy_sql (legacy)
    if "use_legacy_sql" in query:
        raise ValueError("legacy sql is not supported by this dialect")
    # allow_large_results (legacy)
    if "allow_large_results" in query:
        raise ValueError(
            "allow_large_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # flatten_results (legacy)
    if "flatten_results" in query:
        raise ValueError(
            "flatten_results is only allowed for legacy sql, which is not supported by this dialect"
        )
    # maximum_billing_tier (deprecated)
    if "maximum_billing_tier" in query:
        raise ValueError("maximum_billing_tier is a deprecated argument")

    project_id = url.host
    location = None
    dataset_id = url.database or None
    arraysize = None
    credentials_path = None

    # location
    if "location" in query:
        location = query.pop("location")

    # credentials_path
    if "credentials_path" in query:
        credentials_path = query.pop("credentials_path")

    # arraysize
    if "arraysize" in query:
        str_arraysize = query.pop("arraysize")
        try:
            arraysize = int(str_arraysize)
        except ValueError:
            raise ValueError("invalid int in url query arraysize: " + str_arraysize)

    # if only these "non-config" values were present, the dict will now be empty
    if not query:
        # if a dataset_id exists, we need to return a job_config that isn't None
        # so it can be updated with a dataset reference from the client
        if dataset_id:
            return (
                project_id,
                location,
                dataset_id,
                arraysize,
                credentials_path,
                QueryJobConfig(),
            )
        else:
            return project_id, location, dataset_id, arraysize, credentials_path, None

    job_config = QueryJobConfig()

    # clustering_fields list(str)
    if "clustering_fields" in query:
        clustering_fields = GROUP_DELIMITER.split(query["clustering_fields"])
        job_config.clustering_fields = list(clustering_fields)

    # create_disposition
    if "create_disposition" in query:
        create_disposition = query["create_disposition"]
        try:
            job_config.create_disposition = getattr(CreateDisposition,
                                                    create_disposition)
        except AttributeError:
            raise ValueError("invalid create_disposition in url query: " +
                             create_disposition)

    # default_dataset
    if "default_dataset" in query or "dataset_id" in query or "project_id" in query:
        raise ValueError(
            "don't pass default_dataset, dataset_id, project_id in url query, instead use the url host and database"
        )

    # destination
    if "destination" in query:
        dest_project = None
        dest_dataset = None
        dest_table = None

        try:
            dest_project, dest_dataset, dest_table = query[
                "destination"].split(".")
        except ValueError:
            raise ValueError(
                "url query destination parameter should be fully qualified with project, dataset, and table"
            )

        job_config.destination = TableReference(
            DatasetReference(dest_project, dest_dataset), dest_table)

    # destination_encryption_configuration
    if "destination_encryption_configuration" in query:
        job_config.destination_encryption_configuration = EncryptionConfiguration(
            query["destination_encryption_configuration"])

    # dry_run
    if "dry_run" in query:
        try:
            job_config.dry_run = parse_boolean(query["dry_run"])
        except ValueError:
            raise ValueError("invalid boolean in url query for dry_run: " +
                             query["dry_run"])

    # labels
    if "labels" in query:
        label_groups = GROUP_DELIMITER.split(query["labels"])
        labels = {}
        for label_group in label_groups:
            try:
                key, value = KEY_VALUE_DELIMITER.split(label_group)
            except ValueError:
                raise ValueError("malformed url query in labels: " + label_group)
            labels[key] = value
        job_config.labels = labels

    # maximum_bytes_billed
    if "maximum_bytes_billed" in query:
        try:
            job_config.maximum_bytes_billed = int(query["maximum_bytes_billed"])
        except ValueError:
            raise ValueError(
                "invalid int in url query maximum_bytes_billed: " +
                query["maximum_bytes_billed"])

    # priority
    if "priority" in query:
        try:
            job_config.priority = getattr(QueryPriority, query["priority"])
        except AttributeError:
            raise ValueError("invalid priority in url query: " +
                             query["priority"])

    # query_parameters
    if "query_parameters" in query:
        raise NotImplementedError("url query query_parameters not implemented")

    # schema_update_options
    if "schema_update_options" in query:
        schema_update_options = GROUP_DELIMITER.split(
            query["schema_update_options"])
        try:
            job_config.schema_update_options = [
                getattr(SchemaUpdateOption, schema_update_option)
                for schema_update_option in schema_update_options
            ]
        except AttributeError:
            raise ValueError("invalid schema_update_options in url query: " +
                             query["schema_update_options"])

    # table_definitions
    if "table_definitions" in query:
        raise NotImplementedError(
            "url query table_definitions not implemented")

    # time_partitioning
    if "time_partitioning" in query:
        raise NotImplementedError(
            "url query time_partitioning not implemented")

    # udf_resources
    if "udf_resources" in query:
        raise NotImplementedError("url query udf_resources not implemented")

    # use_query_cache
    if "use_query_cache" in query:
        try:
            job_config.use_query_cache = parse_boolean(
                query["use_query_cache"])
        except ValueError:
            raise ValueError(
                "invalid boolean in url query for use_query_cache: " +
                query["use_query_cache"])

    # write_disposition
    if "write_disposition" in query:
        try:
            job_config.write_disposition = getattr(WriteDisposition,
                                                   query["write_disposition"])
        except AttributeError:
            raise ValueError("invalid write_disposition in url query: " +
                             query["write_disposition"])

    return project_id, location, dataset_id, arraysize, credentials_path, job_config
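A hedged usage sketch for parse_url above, assuming it is fed a SQLAlchemy URL object (which provides the .host, .database, and .query attributes the function reads); the connection string and its query parameters are illustrative only.

# Hypothetical call to parse_url with an illustrative connection string.
from sqlalchemy.engine.url import make_url

url = make_url(
    "bigquery://some-project/some-dataset"
    "?location=some-location&arraysize=1000&priority=INTERACTIVE"
)
project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url)
print(project_id, location, dataset_id, arraysize, job_config.priority)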
def test_basic(url_with_everything):
    project_id, location, dataset_id, arraysize, credentials_path, job_config = parse_url(url_with_everything)

    assert project_id == 'some-project'
    assert location == 'some-location'
    assert dataset_id == 'some-dataset'
    assert arraysize == 1000
    assert credentials_path == '/some/path/to.json'
    assert isinstance(job_config, QueryJobConfig)


@pytest.mark.parametrize('param, value', [
    ('clustering_fields', ['a', 'b', 'c']),
    ('create_disposition', 'CREATE_IF_NEEDED'),
    ('destination', TableReference(
        DatasetReference('different-project', 'different-dataset'), 'table')),
    ('destination_encryption_configuration',
     lambda enc: enc.kms_key_name == EncryptionConfiguration('some-configuration').kms_key_name),
    ('dry_run', True),
    ('labels', {'a': 'b', 'c': 'd'}),
    ('maximum_bytes_billed', 1000),
    ('priority', 'INTERACTIVE'),
    ('schema_update_options', ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']),
    ('use_query_cache', True),
    ('write_disposition', 'WRITE_APPEND'),
])
def test_all_values(url_with_everything, param, value):
    job_config = parse_url(url_with_everything)[5]
    config_value = getattr(job_config, param)
    if callable(value):