def test__row_from_mapping_wo_schema(self):
    from google.cloud.bigquery.table import Table, _TABLE_HAS_NO_SCHEMA

    MAPPING = {'full_name': 'Phred Phlyntstone', 'age': 32}
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    table = Table(table_ref)

    with self.assertRaises(ValueError) as exc:
        self._call_fut(MAPPING, table.schema)

    self.assertEqual(exc.exception.args, (_TABLE_HAS_NO_SCHEMA,))

def test_partition_type_setter_w_known_value(self):
    from google.cloud.bigquery.table import SchemaField

    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
    age = SchemaField('age', 'INTEGER', mode='REQUIRED')
    table = self._make_one(table_ref, schema=[full_name, age])
    self.assertIsNone(table.partitioning_type)
    table.partitioning_type = 'DAY'
    self.assertEqual(table.partitioning_type, 'DAY')

def test_encryption_configuration_setter(self):
    from google.cloud.bigquery.table import EncryptionConfiguration

    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    table = self._make_one(table_ref)
    encryption_configuration = EncryptionConfiguration(
        kms_key_name=self.KMS_KEY_NAME)
    table.encryption_configuration = encryption_configuration
    self.assertEqual(table.encryption_configuration.kms_key_name,
                     self.KMS_KEY_NAME)
    table.encryption_configuration = None
    self.assertIsNone(table.encryption_configuration)

def test_partition_type_setter_w_none(self):
    from google.cloud.bigquery.table import SchemaField

    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
    age = SchemaField('age', 'INTEGER', mode='REQUIRED')
    table = self._make_one(table_ref, schema=[full_name, age])
    table._properties['timePartitioning'] = {'type': 'DAY'}
    table.partitioning_type = None
    self.assertIsNone(table.partitioning_type)
    self.assertFalse('timePartitioning' in table._properties)

def test_partition_expiration_w_none_no_partition_set(self):
    from google.cloud.bigquery.table import SchemaField

    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
    age = SchemaField('age', 'INTEGER', mode='REQUIRED')
    table = self._make_one(table_ref, schema=[full_name, age])
    self.assertIsNone(table.partition_expiration)
    table.partition_expiration = None
    self.assertIsNone(table.partitioning_type)
    self.assertIsNone(table.partition_expiration)

def test_ctor_explicit(self):
    from google.cloud.bigquery.dataset import DatasetReference, AccessEntry

    phred = AccessEntry('OWNER', 'userByEmail', '*****@*****.**')
    bharney = AccessEntry('OWNER', 'userByEmail', '*****@*****.**')
    entries = [phred, bharney]
    OTHER_PROJECT = 'foo-bar-123'
    dataset = self._make_one(DatasetReference(OTHER_PROJECT, self.DS_ID))
    dataset.access_entries = entries
    self.assertEqual(dataset.dataset_id, self.DS_ID)
    self.assertEqual(dataset.project, OTHER_PROJECT)
    self.assertEqual(
        dataset.path,
        '/projects/%s/datasets/%s' % (OTHER_PROJECT, self.DS_ID))
    self.assertEqual(dataset.access_entries, entries)
    self.assertIsNone(dataset.created)
    self.assertIsNone(dataset.full_dataset_id)
    self.assertIsNone(dataset.etag)
    self.assertIsNone(dataset.modified)
    self.assertIsNone(dataset.self_link)
    self.assertIsNone(dataset.default_table_expiration_ms)
    self.assertIsNone(dataset.description)
    self.assertIsNone(dataset.friendly_name)
    self.assertIsNone(dataset.location)

def load_to_gbq(date, data_set_id, gcs_dir):
    from google.api_core.exceptions import NotFound

    table_name = conf.TABLE_PREFIX + date.replace("-", "")
    print("Load start [table_name]", table_name)

    client = bigquery.Client()
    data_set_ref = DatasetReference.from_string(data_set_id)
    job_config = bigquery.LoadJobConfig()
    # The source format defaults to CSV, so it must be set explicitly
    # for newline-delimited JSON input.
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.schema = conf.GBQ_SCHEMA

    table_ref = data_set_ref.table(table_name)
    try:
        client.delete_table(table_ref)  # API request
    except NotFound:
        pass  # The table did not exist yet; nothing to delete.

    uri = gcs_dir + table_name + '.json'
    load_job = client.load_table_from_uri(
        uri, table_ref, job_config=job_config)  # API request
    assert load_job.job_type == 'load'

    try:
        load_job.result()  # Waits for table load to complete.
    except Exception as e:
        for error in load_job.errors:
            print("Error detail [message]", error.get("message"))
        raise e

    assert load_job.state == 'DONE'

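# A minimal usage sketch for load_to_gbq above. The dataset ID and GCS
# directory are hypothetical; conf.TABLE_PREFIX and conf.GBQ_SCHEMA are
# assumed to be defined in the surrounding module's config.
if __name__ == "__main__":
    load_to_gbq(
        date="2021-01-31",                    # becomes <TABLE_PREFIX>20210131
        data_set_id="my-project.my_dataset",  # parsed by DatasetReference.from_string
        gcs_dir="gs://my-bucket/exports/",    # must end with "/" for the uri concatenation
    )
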
def test_from_api_repr_normal(self):
    from google.cloud.bigquery.dataset import DatasetReference

    resource = {
        "query": {
            "useLegacySql": True,
            "query": "no property for me",
            "defaultDataset": {
                "projectId": "someproject",
                "datasetId": "somedataset",
            },
            "someNewProperty": "I should be saved, too.",
        },
        "dryRun": True,
    }
    klass = self._get_target_class()
    config = klass.from_api_repr(resource)
    self.assertTrue(config.use_legacy_sql)
    self.assertEqual(
        config.default_dataset,
        DatasetReference("someproject", "somedataset"))
    self.assertTrue(config.dry_run)
    # Make sure unknown properties propagate.
    self.assertEqual(config._properties["query"]["query"],
                     "no property for me")
    self.assertEqual(config._properties["query"]["someNewProperty"],
                     "I should be saved, too.")

def export_csv(self, bucket_name: str, bucket_path: str, dataset: str,
               table: str, sep: str = "\t") -> str:
    bucket_url = f"gs://{bucket_name}/{self.config.lake_path}/{bucket_path}"
    logging.info(
        f"DataWarehouse.export_csv {bucket_url} to {dataset}.{table} ...")
    client = self._get_client()
    dataset_ref = DatasetReference(self.config.gcp_project, dataset)
    to_export = TableReference(dataset_ref, table)
    config = ExtractJobConfig()
    config.field_delimiter = sep
    config.destination_format = bigquery.DestinationFormat.CSV
    extract_job = client.extract_table(
        to_export, bucket_url, job_config=config)
    extract_job.result()  # Wait for the extract job to complete.
    logging.info(
        f"DataWarehouse.export_csv {bucket_url} to {dataset}.{table} Complete!"
    )
    return bucket_url

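# A minimal usage sketch for export_csv above; the warehouse instance,
# bucket, dataset, and table names are hypothetical. The returned URL points
# at the exported file under the configured lake_path prefix.
url = warehouse.export_csv(
    bucket_name="my-data-lake",
    bucket_path="exports/users.csv",
    dataset="analytics",
    table="users",
    sep=",",  # comma-separated instead of the tab default
)
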
def __init__(self, table_name=None, dataset_name=None, project_name=None):
    """table_name: [project.][dataset.]table; may be a wildcard such as
    tbl_*_sth, or tbl_[min-max] to qualify a range of (e.g. date-partitioned)
    table names.
    """
    self.project = project_name
    self.dataset = dataset_name
    self.table = table_name
    self.table_min = None  # lower bound of a [min-max] table-name range
    self.table_max = None  # upper bound of a [min-max] table-name range

    if table_name:
        name_parts = table_name.replace(":", ".").split(".")
        if len(name_parts) == 3:  # project.dataset.table
            self.project = name_parts[0]
            self.dataset = name_parts[1]
            self.table = name_parts[2]
        elif len(name_parts) == 2:  # dataset.table
            self.dataset = name_parts[0]
            self.table = name_parts[1]

        tbl_parts = self.table.split("[")
        if len(tbl_parts) == 2:
            self.table = tbl_parts[0]
            dt_parts = tbl_parts[1].rstrip("]").split("-")
            self.table_min = (self.table.rstrip("*") + dt_parts[0]
                              if dt_parts[0] else None)
            self.table_max = (self.table.rstrip("*") + dt_parts[1]
                              if dt_parts[1] else None)

    if not self.project:
        self.project = DtTblRef.default_project()

    self.dataset_ref = DatasetReference(dataset_id=self.dataset,
                                        project=self.project)
    self.table_ref = TableReference(dataset_ref=self.dataset_ref,
                                    table_id=self.table)

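# A minimal sketch of how the constructor above parses a [min-max] range;
# the project, dataset, and table names are hypothetical.
ref = DtTblRef("my-project.my_dataset.events_[20210101-20210131]")
# ref.table     == "events_"
# ref.table_min == "events_20210101"
# ref.table_max == "events_20210131"
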
def delete_dataset_by_name(self, name, delete_all_tables=False):
    # type: (str, bool) -> None
    """Delete a dataset within the current project.

    Args:
        name: The name of the dataset to delete.
        delete_all_tables: If True, deletes all tables in the dataset before
            attempting to delete the dataset. A dataset can't be deleted
            until it contains no tables.

    Raises:
        RuntimeError if there are still tables in the dataset and
            delete_all_tables is False.
    """
    dataset_ref = DatasetReference(self.project_id, str(name))
    tables_in_dataset = list(self.gclient.list_tables(dataset_ref))
    if delete_all_tables:
        for table_list_item in tables_in_dataset:
            self.delete_table(table_list_item.reference)
    elif tables_in_dataset:
        raise RuntimeError(
            "Dataset {} still contains {} tables so you can't delete it."
            .format(name, len(tables_in_dataset)))
    self.delete_dataset(dataset_ref)

def test_ctor_defaults(self):
    from google.cloud.bigquery.dataset import DatasetReference

    dataset_ref = DatasetReference('project_1', 'dataset_1')
    table_ref = self._make_one(dataset_ref, 'table_1')
    self.assertEqual(table_ref.dataset_id, dataset_ref.dataset_id)
    self.assertEqual(table_ref.table_id, 'table_1')

def test_create_dataset_w_reference(PROJECT, DS_ID, LOCATION):
    path = "/projects/%s/datasets" % PROJECT
    resource = {
        "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID},
        "etag": "etag",
        "id": "%s:%s" % (PROJECT, DS_ID),
        "location": LOCATION,
    }
    client = make_client(location=LOCATION)
    conn = client._connection = make_connection(resource)
    dataset = client.create_dataset(DatasetReference(PROJECT, DS_ID))

    assert dataset.dataset_id == DS_ID
    assert dataset.project == PROJECT
    assert dataset.etag == resource["etag"]
    assert dataset.full_dataset_id == resource["id"]
    assert dataset.location == LOCATION

    conn.api_request.assert_called_once_with(
        method="POST",
        path=path,
        data={
            "datasetReference": {"projectId": PROJECT, "datasetId": DS_ID},
            "labels": {},
            "location": LOCATION,
        },
        timeout=DEFAULT_TIMEOUT,
    )

def test___eq___wrong_type(self):
    from google.cloud.bigquery.dataset import DatasetReference

    dataset_ref = DatasetReference('project_1', 'dataset_1')
    table = self._make_one(dataset_ref, 'table_1')
    other = object()
    self.assertNotEqual(table, other)
    self.assertEqual(table, mock.ANY)  # mock.ANY compares equal to anything

def from_string(cls, full_table_id):
    """Construct a table reference from fully-qualified table ID.

    Args:
        full_table_id (str):
            A fully-qualified table ID in standard SQL format. Must
            include a project ID, dataset ID, and table ID, each
            separated by ``.``.

    Returns:
        TableReference: Table reference parsed from ``full_table_id``.

    Examples:
        >>> TableReference.from_string('my-project.mydataset.mytable')
        TableRef...(DatasetRef...('my-project', 'mydataset'), 'mytable')

    Raises:
        ValueError:
            If ``full_table_id`` is not a fully-qualified table ID in
            standard SQL format.
    """
    from google.cloud.bigquery.dataset import DatasetReference

    parts = full_table_id.split('.')
    if len(parts) != 3:
        raise ValueError(
            'full_table_id must be a fully-qualified table ID in '
            'standard SQL format. e.g. "project.dataset.table", got '
            '{}'.format(full_table_id))
    return cls(DatasetReference(parts[0], parts[1]), parts[2])

def test___hash__not_equals(self):
    from google.cloud.bigquery.dataset import DatasetReference

    dataset = DatasetReference('project_1', 'dataset_1')
    table1 = self._make_one(dataset, 'table1')
    table2 = self._make_one(dataset, 'table2')
    set_one = {table1}
    set_two = {table2}
    self.assertNotEqual(set_one, set_two)

def test_begin_w_bound_client(self):
    from google.cloud.bigquery.dataset import DatasetReference

    PATH = "/projects/%s/jobs" % (self.PROJECT,)
    RESOURCE = self._make_resource()
    # Ensure None for missing server-set props
    del RESOURCE["statistics"]["creationTime"]
    del RESOURCE["etag"]
    del RESOURCE["selfLink"]
    del RESOURCE["user_email"]

    conn = _make_connection(RESOURCE)
    client = _make_client(project=self.PROJECT, connection=conn)
    source_dataset = DatasetReference(self.PROJECT, self.DS_ID)
    source = source_dataset.table(self.SOURCE_TABLE)
    job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client)
    with mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    ) as final_attributes:
        job._begin()

    final_attributes.assert_called_with({"path": PATH}, client, job)

    conn.api_request.assert_called_once_with(
        method="POST",
        path=PATH,
        data={
            "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
            "configuration": {
                "extract": {
                    "sourceTable": {
                        "projectId": self.PROJECT,
                        "datasetId": self.DS_ID,
                        "tableId": self.SOURCE_TABLE,
                    },
                    "destinationUris": [self.DESTINATION_URI],
                }
            },
        },
        timeout=None,
    )
    self._verifyResourceProperties(job, RESOURCE)

def from_string(cls, table_id, default_project=None):
    """Construct a table reference from table ID string.

    Args:
        table_id (str):
            A table ID in standard SQL format. If ``default_project``
            is not specified, this must include a project ID, dataset
            ID, and table ID, each separated by ``.``.
        default_project (str):
            Optional. The project ID to use when ``table_id`` does not
            include a project ID.

    Returns:
        TableReference: Table reference parsed from ``table_id``.

    Examples:
        >>> TableReference.from_string('my-project.mydataset.mytable')
        TableRef...(DatasetRef...('my-project', 'mydataset'), 'mytable')

    Raises:
        ValueError:
            If ``table_id`` is not a fully-qualified table ID in
            standard SQL format.
    """
    from google.cloud.bigquery.dataset import DatasetReference

    output_project_id = default_project
    output_dataset_id = None
    output_table_id = None
    parts = table_id.split(".")

    if len(parts) < 2:
        raise ValueError(
            "table_id must be a fully-qualified table ID in "
            'standard SQL format. e.g. "project.dataset.table", got '
            "{}".format(table_id)
        )
    elif len(parts) == 2:
        if not default_project:
            raise ValueError(
                "When default_project is not set, table_id must be a "
                "fully-qualified table ID in standard SQL format. "
                'e.g. "project.dataset_id.table_id", got {}'.format(table_id)
            )
        output_dataset_id, output_table_id = parts
    elif len(parts) == 3:
        output_project_id, output_dataset_id, output_table_id = parts
    if len(parts) > 3:
        raise ValueError(
            "Too many parts in table_id. Must be a fully-qualified table "
            'ID in standard SQL format. e.g. "project.dataset.table", '
            "got {}".format(table_id)
        )

    return cls(
        DatasetReference(output_project_id, output_dataset_id), output_table_id
    )

def test__row_from_mapping_w_schema(self):
    from google.cloud.bigquery.table import Table, SchemaField

    MAPPING = {
        'full_name': 'Phred Phlyntstone',
        'age': 32,
        'colors': ['red', 'green'],
        'extra': 'IGNORED',
    }
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
    age = SchemaField('age', 'INTEGER', mode='REQUIRED')
    colors = SchemaField('colors', 'DATETIME', mode='REPEATED')
    joined = SchemaField('joined', 'STRING', mode='NULLABLE')
    table = Table(table_ref, schema=[full_name, age, colors, joined])

    self.assertEqual(
        self._call_fut(MAPPING, table.schema),
        ('Phred Phlyntstone', 32, ['red', 'green'], None))

def test_from_api_repr(self):
    from google.cloud.bigquery.dataset import DatasetReference

    expected = self._make_one('project_1', 'dataset_1')
    got = DatasetReference.from_api_repr({
        'projectId': 'project_1',
        'datasetId': 'dataset_1',
    })
    self.assertEqual(expected, got)

def copy_table(
        self,
        source_table_path,  # type: str
        destination_table_name,  # type: str
        destination_dataset=None,  # type: Optional[str]
        destination_project=None,  # type: Optional[str]
        replace_existing_table=False  # type: bool
):
    # type: (...) -> None
    """Copies the table at source_table_path to the location
    destination_project.destination_dataset.destination_table_name. If the
    destination project or dataset aren't set, the class defaults are used.

    Args:
        source_table_path: The path of the table to copy.
        destination_table_name: The name of the table to copy to.
        destination_dataset: The name of the destination dataset. If unset,
            the client default dataset will be used.
        destination_project: The name of the destination project. If unset,
            the client default project will be used.
        replace_existing_table: If True and the destination table already
            exists, it will be deleted and the source table copied in its
            place.

    Raises:
        RuntimeError if the destination table already exists and
            replace_existing_table is False, or if the destination dataset
            does not exist.
    """
    destination_dataset = destination_dataset or self.default_dataset_id
    destination_project = destination_project or self.project_id

    dataset_ref = DatasetReference(destination_project, destination_dataset)
    if not self.dataset_exists(dataset_ref):
        raise RuntimeError(
            'The dataset {} does not exist in project {}.'.format(
                destination_dataset, destination_project))

    dest_table_ref = TableReference(dataset_ref, destination_table_name)
    if self.table_exists(dest_table_ref):
        if replace_existing_table:
            self.delete_table(dest_table_ref)
        else:
            raise RuntimeError(
                'The table {} already exists in dataset {}.'.format(
                    destination_table_name, destination_dataset))

    dest_table_path = self.path(destination_table_name,
                                destination_dataset,
                                destination_project)

    self.create_table_from_query(
        'SELECT * FROM `{}`'.format(source_table_path), dest_table_path)

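# A minimal usage sketch for copy_table above. The client instance and table
# names are hypothetical; destination dataset and project fall back to the
# client defaults when omitted.
bq_client.copy_table(
    source_table_path="my-project.staging.events",
    destination_table_name="events_backup",
    replace_existing_table=True,  # overwrite if events_backup already exists
)
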
def test__row_from_mapping_w_invalid_schema(self):
    from google.cloud.bigquery.table import Table, SchemaField

    MAPPING = {
        'full_name': 'Phred Phlyntstone',
        'age': 32,
        'colors': ['red', 'green'],
        'bogus': 'WHATEVER',
    }
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField('full_name', 'STRING', mode='REQUIRED')
    age = SchemaField('age', 'INTEGER', mode='REQUIRED')
    colors = SchemaField('colors', 'DATETIME', mode='REPEATED')
    bogus = SchemaField('joined', 'STRING', mode='BOGUS')
    table = Table(table_ref, schema=[full_name, age, colors, bogus])

    with self.assertRaises(ValueError) as exc:
        self._call_fut(MAPPING, table.schema)

    self.assertIn('Unknown field mode: BOGUS', str(exc.exception))

def test_num_rows_getter(self):
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    table = self._make_one(table_ref)

    # Check with no value set.
    self.assertIsNone(table.num_rows)

    num_rows = 42
    # Check with integer value set.
    table._properties = {'numRows': num_rows}
    self.assertEqual(table.num_rows, num_rows)

    # Check with a string value set.
    table._properties = {'numRows': str(num_rows)}
    self.assertEqual(table.num_rows, num_rows)

    # Check with invalid int value.
    table._properties = {'numRows': 'x'}
    with self.assertRaises(ValueError):
        getattr(table, 'num_rows')

def dataset_exists_with_name(self, dataset_name):
    # type: (str) -> bool
    """Determines whether a dataset exists with the given name.

    Args:
        dataset_name: The name of the dataset to check.

    Returns:
        True if the dataset exists in this client's project, False otherwise.
    """
    return self.dataset_exists(
        DatasetReference(self.project_id, dataset_name))

def table_reference(self, table_name: str,
                    dataset_name: str = None) -> TableReference:
    if dataset_name is None or dataset_name == self._dataset.dataset_id:
        dataset = self._dataset
    elif dataset_name in self._additional_datasets:
        dataset = self._additional_datasets[dataset_name]
    else:
        raise DatasetDoesNotExistError(
            'The dataset "' + str(dataset_name) + '" does not exist')
    return DatasetReference(
        self._configuration.databases.bigquery.project,
        dataset.dataset_id
    ).table(table_name)

def test_to_api_repr(self):
    from google.cloud.bigquery.dataset import DatasetReference

    dataset_ref = DatasetReference('project_1', 'dataset_1')
    table_ref = self._make_one(dataset_ref, 'table_1')
    resource = table_ref.to_api_repr()
    self.assertEqual(
        resource,
        {
            'projectId': 'project_1',
            'datasetId': 'dataset_1',
            'tableId': 'table_1',
        })

def test_reload_w_bound_client(self):
    from google.cloud.bigquery.dataset import DatasetReference

    PATH = "/projects/%s/jobs/%s" % (self.PROJECT, self.JOB_ID)
    RESOURCE = self._make_resource()
    conn = _make_connection(RESOURCE)
    client = _make_client(project=self.PROJECT, connection=conn)
    source_dataset = DatasetReference(self.PROJECT, self.DS_ID)
    source = source_dataset.table(self.SOURCE_TABLE)
    job = self._make_one(self.JOB_ID, source, [self.DESTINATION_URI], client)
    with mock.patch(
        "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
    ) as final_attributes:
        job.reload()

    final_attributes.assert_called_with({"path": PATH}, client, job)

    conn.api_request.assert_called_once_with(
        method="GET", path=PATH, query_params={}, timeout=None)
    self._verifyResourceProperties(job, RESOURCE)

def test_from_api_repr(self):
    from google.cloud.bigquery.dataset import DatasetReference
    from google.cloud.bigquery.table import TableReference

    dataset_ref = DatasetReference('project_1', 'dataset_1')
    expected = self._make_one(dataset_ref, 'table_1')
    got = TableReference.from_api_repr({
        'projectId': 'project_1',
        'datasetId': 'dataset_1',
        'tableId': 'table_1',
    })
    self.assertEqual(expected, got)

def tables(self, dataset_id):
    # type: (str) -> List[str]
    """Returns a list of table names in a given dataset.

    Args:
        dataset_id: The name of the dataset to query.

    Returns:
        A list of table names (strings).
    """
    dataset_ref = DatasetReference(self.project_id, dataset_id)
    tables = self.gclient.list_tables(dataset_ref, retry=self.default_retry)
    return [t.table_id for t in tables]

def get_table_reference_from_path(self, table_path):
    # type: (str) -> TableReference
    """Returns a TableReference for a given path to a BigQuery table.

    Args:
        table_path: A BigQuery table path in the form project.dataset.table.

    Returns:
        A TableReference for the table specified by the path.
    """
    _, dataset, table = self.parse_table_path(table_path)
    dataset_ref = DatasetReference(self.project_id, dataset)
    return TableReference(dataset_ref, table)
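
# A minimal usage sketch for get_table_reference_from_path above; the client
# instance name is hypothetical, and parse_table_path is assumed to split
# "project.dataset.table" into its three components. Note the project part
# of the path is discarded: the reference uses the client's project_id.
table_ref = bq_client.get_table_reference_from_path(
    "my-project.analytics.users")
# table_ref.dataset_id == "analytics"; table_ref.table_id == "users"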