def test_handle_bq_export_task_county(self, mock_export, mock_project_id):
    """Tests that the export is called for a given table and module when
    the /export_manager/export endpoint is hit for a table in the COUNTY
    module."""
    mock_export.return_value = True
    mock_project_id.return_value = 'test-project'
    self.mock_client.dataset_ref_for_id.return_value = DatasetReference(
        mock_project_id.return_value, self.mock_dataset_name)

    table = 'fake_table'
    module = 'JAILS'
    route = '/export'
    data = {"table_name": table, "schema_type": module}

    response = self.mock_flask_client.post(
        route,
        data=json.dumps(data),
        content_type='application/json',
        headers={'X-Appengine-Inbound-Appid': 'test-project'})

    assert response.status_code == HTTPStatus.OK
    mock_export.assert_called_with(
        self.mock_client, table,
        DatasetReference.from_string(self.mock_dataset_name,
                                     mock_project_id.return_value),
        SchemaType.JAILS)
def test_handle_bq_export_task_state(self, mock_export, mock_project_id):
    """Tests that the export is called for a given table and module when
    the /export_manager/export endpoint is hit for a table in the STATE
    module."""
    mock_export.return_value = True
    self.mock_export_config.STATE_BASE_TABLES_BQ_DATASET.return_value = 'state'
    mock_project_id.return_value = 'test-project'
    self.mock_client.dataset_ref_for_id.return_value = \
        DatasetReference.from_string('dataset', 'test-project')

    table = 'fake_table'
    module = 'STATE'
    route = '/export'
    data = {"table_name": table, "schema_type": module}

    response = self.mock_flask_client.post(
        route,
        data=json.dumps(data),
        content_type='application/json',
        headers={'X-Appengine-Inbound-Appid': 'test-project'})

    assert response.status_code == HTTPStatus.OK
    mock_export.assert_called_with(
        self.mock_client, table,
        DatasetReference.from_string('dataset', 'test-project'),
        SchemaType.STATE)
def test_table_lookup(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    table_ref1 = TableReference(dataset_ref1, 'table1')
    table1 = Table(table_ref1, _TEST_SCHEMA)

    # Trying to get the same dataset/table in another project doesn't work.
    with self.assertRaisesRegexp(NotFound, 'other_project'):
        self.bq_client.get_table(
            TableReference(DatasetReference('other_project', 'dataset1'), 'table1'))

    # Trying to get the table before the dataset exists doesn't work.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_dataset(Dataset(dataset_ref1))

    # Trying to get the table before the table exists doesn't work.
    with self.assertRaises(NotFound):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_table(table1)

    # Assert the created table has the expected properties.
    table_found = self.bq_client.get_table(table_ref1)
    self.assertEqual(table1.project, "my_project")
    self.assertEqual(table1.dataset_id, "dataset1")
    self.assertEqual(table1.table_id, "table1")
    six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
def test_update_tables_for_state(self, mock_region_config_fn) -> None:
    mock_region_config = FakeDirectIngestRegionRawFileConfig(
        self.test_region.region_code)
    mock_region_config_fn.return_value = mock_region_config
    self.mock_raw_file_configs = mock_region_config.raw_file_configs
    self.update_controller = DirectIngestRawDataTableLatestViewUpdater(
        state_code=self.test_region.region_code,
        project_id=self.project_id,
        bq_client=self.mock_big_query_client)

    with local_project_id_override(self.project_id):
        self.update_controller.update_views_for_state()

    self.assertEqual(
        self.mock_big_query_client.create_or_update_view.call_count, 2)

    raw_data_dataset = DatasetReference(self.project_id, 'us_xx_raw_data')
    self.mock_big_query_client.table_exists.assert_has_calls([
        mock.call(raw_data_dataset, 'tagA'),
        mock.call(raw_data_dataset, 'tagB'),
        mock.call(raw_data_dataset, 'tagC'),
        mock.call(raw_data_dataset, 'tagWeDoNotIngest')
    ])

    mock_views = [
        DirectIngestRawDataTableLatestView(
            region_code=self.test_region.region_code,
            raw_file_config=self.mock_raw_file_configs['tagA']),
        DirectIngestRawDataTableLatestView(
            region_code=self.test_region.region_code,
            raw_file_config=self.mock_raw_file_configs['tagC'])
    ]
    views_dataset = DatasetReference(self.project_id,
                                     'us_xx_raw_data_up_to_date_views')
    self.mock_big_query_client.create_or_update_view.assert_has_calls(
        [mock.call(views_dataset, x) for x in mock_views])

    self.mock_big_query_client.create_dataset_if_necessary.assert_called_once()
    self.mock_big_query_client.create_dataset_if_necessary.assert_has_calls(
        [mock.call(views_dataset)])
def test_dataset_lookup(self):
    # type: () -> None
    with self.assertRaisesRegexp(NotFound, 'some_other_project'):
        self.bq_client.get_dataset(
            DatasetReference('some_other_project', 'dataset1'))
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1'))

    dataset1 = Dataset(DatasetReference('my_project', 'dataset1'))
    self.bq_client.create_dataset(dataset1)

    self.assertDatasetReferenceEqual(
        self.bq_client.get_dataset(
            DatasetReference('my_project', 'dataset1')).reference,
        dataset1.reference)
    self.assertDatasetReferenceEqual(
        self.bq_client.dataset('dataset1'), dataset1.reference)
    self.assertDatasetReferenceEqual(
        self.bq_client.dataset('dataset1', 'my_project'), dataset1.reference)
def __create_job_config(self,
                        ems_query_job_config: EmsQueryJobConfig) -> QueryJobConfig:
    job_config = QueryJobConfig()
    job_config.priority = ems_query_job_config.priority.value
    job_config.use_legacy_sql = False
    job_config.use_query_cache = ems_query_job_config.use_query_cache
    job_config.labels = ems_query_job_config.labels

    if ems_query_job_config.destination_table is not None:
        job_config.time_partitioning = TimePartitioning("DAY")
        table_reference = TableReference(
            DatasetReference(
                ems_query_job_config.destination_project_id or self.__project_id,
                ems_query_job_config.destination_dataset),
            ems_query_job_config.destination_table)
        job_config.destination = table_reference
        job_config.write_disposition = ems_query_job_config.write_disposition.value
        job_config.create_disposition = ems_query_job_config.create_disposition.value

    partitioning = ems_query_job_config.time_partitioning
    if partitioning is not None:
        job_config.time_partitioning = TimePartitioning(
            partitioning.type.value,
            partitioning.field,
            partitioning.expiration_ms,
            partitioning.require_partition_filter)

    if ems_query_job_config.table_definitions is not None:
        job_config.table_definitions = ems_query_job_config.table_definitions

    return job_config
def _table_ref(self, table_id):
    """Return a BigQuery client library table reference"""
    from google.cloud.bigquery import DatasetReference
    from google.cloud.bigquery import TableReference

    return TableReference(
        DatasetReference(self.project_id, self.dataset_id), table_id)
def _load_table_from_cloud_storage_async(
        self,
        source_uri: str,
        destination_dataset_ref: bigquery.DatasetReference,
        destination_table_id: str,
        destination_table_schema: List[bigquery.SchemaField],
        write_disposition: bigquery.WriteDisposition) -> bigquery.job.LoadJob:
    self.create_dataset_if_necessary(destination_dataset_ref)

    destination_table_ref = destination_dataset_ref.table(destination_table_id)

    job_config = bigquery.LoadJobConfig()
    job_config.schema = destination_table_schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.allow_quoted_newlines = True
    job_config.write_disposition = write_disposition

    load_job = self.client.load_table_from_uri(
        source_uri, destination_table_ref, job_config=job_config)

    logging.info("Started load job [%s] for table [%s.%s.%s]",
                 load_job.job_id,
                 destination_table_ref.project,
                 destination_table_ref.dataset_id,
                 destination_table_ref.table_id)

    return load_job
def make_table(project, dataset_id, table_id, friendly_name=None, description=None,
               expires=None, partitioning_type=None, view_use_legacy_sql=None,
               view_query=None, schema=None, labels=None):
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
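# A hedged usage sketch (not from the original source): the project, dataset,
# table names and labels below are made up, and `make_table` is the helper
# defined above. The Table is only constructed locally; nothing is created
# in BigQuery.
example_view = make_table(
    'example-project', 'example_dataset', 'example_view',
    friendly_name='Example view',
    description='Defined over a standard-SQL query.',
    view_use_legacy_sql=False,
    view_query='SELECT 1 AS one',
    labels={'env': 'test'})
assert example_view.table_id == 'example_view'
assert example_view.view_query == 'SELECT 1 AS one'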
def test_listing_datasets(self):
    # type: () -> None
    self.assertFalse(self.bq_client.list_datasets())
    self.assertFalse(self.bq_client.list_datasets('my_project'))

    self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset1')))
    self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset2')))
    self.bq_client.create_dataset(Dataset(DatasetReference('other_project', 'dataset3')))

    six.assertCountEqual(
        self,
        [dataset.dataset_id for dataset in self.bq_client.list_datasets()],
        ['dataset1', 'dataset2'])
    six.assertCountEqual(
        self,
        [dataset.dataset_id for dataset in self.bq_client.list_datasets('my_project')],
        ['dataset1', 'dataset2'])
    six.assertCountEqual(
        self,
        [dataset.dataset_id for dataset in self.bq_client.list_datasets('other_project')],
        ['dataset3'])
def setUp(self):
    # Input parameters expected by the class
    self.project_id = 'bar_project'
    self.dataset_id = 'foo_dataset'
    self.description = 'fake_description'
    self.existing_labels_or_tags = {'label': 'value', 'tag': ''}
    self.new_labels_or_tags = {'label': 'new_value', 'new_tag': ''}
    self.updated = {'tag': '', 'label': 'new_value', 'new_tag': ''}
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
def test_listing_tables_with_max(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    for i in range(10):
        self.bq_client.create_table(
            Table(TableReference(dataset_ref1, 'table{}'.format(i)), _TEST_SCHEMA))

    self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
def test_listing_tables(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))

    six.assertCountEqual(
        self,
        [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
        ['table1', 'table2'])
def test_delete_dataset_if_exists_WhenItIsNotEmpty(self):
    dataset_id = self.__generate_test_name("dataset")
    table_name = self.__generate_test_name("table")
    self.client.create_dataset_if_not_exists(dataset_id)
    self.__create_test_table(
        table_name,
        Dataset(DatasetReference(GCP_PROJECT_ID, dataset_id)).reference)

    self.client.delete_dataset_if_exists(dataset_id, delete_contents=True)

    self.assertFalse(self.client.dataset_exists(dataset_id))
def table_exists(self, dataset_ref: bigquery.DatasetReference, table_id: str) -> bool:
    table_ref = dataset_ref.table(table_id)

    try:
        self.client.get_table(table_ref)
        return True
    except exceptions.NotFound:
        logging.warning("Table [%s] does not exist in dataset [%s]",
                        table_id, str(dataset_ref))
        return False
def run_async_load_job(self, job_id_prefix: str, config: EmsLoadJobConfig) -> str:
    return self.__bigquery_client.load_table_from_uri(
        source_uris=config.source_uri_template,
        destination=TableReference(
            DatasetReference(config.destination_project_id,
                             config.destination_dataset),
            config.destination_table),
        job_id_prefix=job_id_prefix,
        location=self.__location,
        job_config=self.__create_load_job_config(config)).job_id
def dataset(self, dataset_id, project=None):
    # type: (str, Optional[str]) -> DatasetReference
    """Constructs a reference to a dataset.

    Args:
        dataset_id: Dataset to look up.
        project: If specified, the project to find the dataset in,
            otherwise the client's default project.

    Returns:
        Reference to the dataset found.
    """
    project = project or self.project
    return DatasetReference(project, dataset_id)
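# A hedged usage sketch (not from the original source): the helper above is
# equivalent to building a DatasetReference directly, falling back to the
# client's default project when none is given. The project and dataset ids
# here are illustrative only.
from google.cloud.bigquery import DatasetReference

default_project = 'my_project'  # stands in for self.project
ref = DatasetReference(default_project, 'dataset1')
override_ref = DatasetReference('other_project', 'dataset1')
assert ref.project == 'my_project' and ref.dataset_id == 'dataset1'
assert override_ref.project == 'other_project'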
def test_dataset_delete(self):
    # type: () -> None
    dataset1 = Dataset(DatasetReference('my_project', 'dataset1'))

    # Can't delete dataset, doesn't exist yet.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.delete_dataset(dataset1.reference)

    self.bq_client.create_dataset(dataset1)
    self.bq_client.create_table(
        Table(TableReference(dataset1.reference, 'table1'), _TEST_SCHEMA))

    # Can't delete dataset, not empty.
    with self.assertRaises(BadRequest):
        self.bq_client.delete_dataset(dataset1.reference)

    # Okay to delete, specifically requesting to delete contents.
    self.bq_client.delete_dataset(dataset1.reference, delete_contents=True)

    # And now dataset is gone again.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_dataset(DatasetReference('my_project', 'dataset1'))
def get_raw_details(self, dataset_id, view_id):
    if self.did_full_update:
        return self.raw_details[dataset_id][view_id]

    if dataset_id not in self.raw_details:
        self.raw_details[dataset_id] = dict()
    if view_id not in self.raw_details[dataset_id]:
        try:
            self.raw_details[dataset_id][view_id] = self.bq_client._client.get_table(
                TableReference(
                    DatasetReference(self.bq_conductor_conf.GOOGLE_CLOUD_PROJECT,
                                     dataset_id.split('.')[-1]),
                    view_id)).to_api_repr()
        except:
            self.raw_details[dataset_id][view_id] = None

    return self.raw_details[dataset_id][view_id]
def setUp(self):
    project_id = 'fake_project_id'
    dataset_id = 'fake_dataset_id'
    bucket_name = 'fake_bucket'
    dataset_ref = DatasetReference(project_id, dataset_id)
    self.dst_dataset = Dataset(dataset_ref)
    self.bq_client = mock.MagicMock()
    self.gcs_client = mock.MagicMock()
    self.bucket_name = bucket_name
    self.all_blobs = [
        Blob(f'{table}.csv', self.bucket_name)
        for table in common.VOCABULARY_TABLES
    ]
def setUp(self):
    self.project_id = 'fake_project'
    self.dataset_id = 'fake_dataset'
    self.dataset = DatasetReference(self.project_id, self.dataset_id)
    self.hpo_ids = ['fake_hpo1', 'fake_hpo2']
    self.hpo_cdm_tables = [
        self._table_list_item(hpo_id, cdm_table)
        for hpo_id in self.hpo_ids
        for cdm_table in CDM_TABLES
    ]
    self.hpo_cdm_and_achilles_tables = [
        self._table_list_item(hpo_id, cdm_table)
        for hpo_id in self.hpo_ids
        for cdm_table in CDM_TABLES + ['achilles', 'achilles_results']
    ]
def setUp(self):
    super(ClientWriteFromQueryTest, self).setUp()
    dataset_ref = DatasetReference(self.bq_client.project, 'my_dataset')
    schema = [
        SchemaField(name="a", field_type='INT64'),
        SchemaField(name="b", field_type='FLOAT64'),
    ]
    self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
    self.destination_table = Table(TableReference(dataset_ref, 'destination_table'), schema)
    self.bq_client.create_dataset(Dataset(dataset_ref))
    self.bq_client.create_table(self.source_table)
    # We don't create the destination table here; some tests do not want it created.

    # Stick two rows into source_table
    self.assertFalse(
        self.bq_client.insert_rows(self.source_table,
                                   [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
def get_mapping_table_ids(self, client):
    """
    Returns all the mapping table ids found in the mapping dataset.

    :param client: BigQuery client used to list tables in the mapping dataset
        (`self._mapping_dataset_id` in `self.project_id`)
    :return: a list of mapping table ids
    """
    dataset_ref = DatasetReference(self.project_id, self._mapping_dataset_id)
    table_objs = client.list_tables(dataset_ref)
    mapping_table_ids = [
        table_obj.table_id
        for table_obj in table_objs
        if table_obj.table_id in MAPPING_TABLES
    ]
    return mapping_table_ids
def to_dataset(project, model):
    access_entries = model.access_entries
    if access_entries:
        access_entries = tuple(
            BigQueryAccessEntry.to_access_entry(a) for a in access_entries)
    else:
        access_entries = ()

    dataset_ref = DatasetReference(project, model.dataset_id)
    dataset = Dataset(dataset_ref)
    dataset.friendly_name = model.friendly_name
    dataset.description = model.description
    dataset.default_table_expiration_ms = model.default_table_expiration_ms
    dataset.location = model.location
    dataset.access_entries = access_entries
    dataset.labels = model.labels if model.labels is not None else dict()
    return dataset
def setUp(self):
    self.project_id = os.environ.get(PROJECT_ID)
    self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)

    self.schema = [
        SchemaField("person_id", "INT64"),
        SchemaField("first_name", "STRING"),
        SchemaField("last_name", "STRING"),
        SchemaField("algorithm", "STRING")
    ]

    self.ps_api_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable')
    ]

    self.id_match_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable'),
        dict(name='algorithm', type='string', mode='nullable')
    ]

    self.hpo_id = 'fake_site'
    self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
    self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

    # Create and populate the ps_values site table
    schema = bq.get_table_schema(PS_API_VALUES)
    tablename = self.ps_values_table_id

    table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                  schema=schema)
    table.time_partitioning = TimePartitioning(type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table)

    populate_query = POPULATE_PS_VALUES.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        ps_values_table_id=self.ps_values_table_id)
    job = self.client.query(populate_query)
    job.result()
def ensure_dataset(project_id, dataset_id, location):
    from google.cloud.bigquery import DatasetReference

    client = bigquery.Client(project=project_id, location=location)
    dataset_ref = DatasetReference(project_id, dataset_id)
    try:
        client.create_dataset(dataset_ref)
    except exceptions.GoogleAPICallError as e:
        if e.response.status_code == 409:  # dataset exists
            pass
        else:
            logger.critical(
                f"unable to create dataset {dataset_id} in project {project_id}; Exception {e}"
            )
            return 2  # sys.exit(2)

    return client, Dataset(dataset_ref)
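# A hedged usage sketch (not from the original source): the project, dataset
# and location are made up, and running it would require real credentials.
# Note the helper above returns the error code 2 on failure and a
# (client, Dataset) tuple on success, so callers should check which shape
# they received before unpacking.
result = ensure_dataset('example-project', 'example_dataset', 'US')
if result == 2:
    raise RuntimeError('could not create dataset example_dataset')
client, dataset = result
print(dataset.dataset_id)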
def send_to_bq(dataset, table, row):
    bigquery_client = bigquery.Client(project='theo-home')
    table_ref = TableReference(
        dataset_ref=DatasetReference(dataset_id=dataset, project='theo-home'),
        table_id=table,
    )
    schema = [
        SchemaField(name=field, field_type=bq_types[type(data)])
        for field, data in row.items()
    ]
    table = bigquery_client.create_table(Table(table_ref, schema=schema),
                                         exists_ok=True)

    errors = bigquery_client.insert_rows(table, [row])
    if errors:
        print(errors, file=sys.stderr)
def make_dataset(project, dataset_id, friendly_name=None, description=None,
                 default_table_expiration_ms=None, location=None, labels=None,
                 access_entries=None):
    dataset_ref = DatasetReference(project, dataset_id)
    dataset = Dataset(dataset_ref)
    dataset.friendly_name = friendly_name
    dataset.description = description
    dataset.default_table_expiration_ms = default_table_expiration_ms
    dataset.location = location
    if labels is not None:
        dataset.labels = labels
    if access_entries is not None:
        dataset.access_entries = access_entries
    return dataset
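# A hedged usage sketch (not from the original source): all identifiers and
# labels are made up, and the Dataset is only built locally; passing it to a
# real Client.create_dataset call is left to the caller.
example_dataset = make_dataset(
    'example-project', 'example_dataset',
    friendly_name='Example dataset',
    description='Built with the make_dataset helper above.',
    default_table_expiration_ms=7 * 24 * 60 * 60 * 1000,  # tables expire after 7 days
    location='US',
    labels={'env': 'test'})
assert example_dataset.location == 'US'
assert example_dataset.labels == {'env': 'test'}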
def test_load_table_from_file(self, column_to_null):
    # type: (int) -> None
    dataset_ref = DatasetReference('my_project', 'my_dataset')
    dataset = Dataset(dataset_ref)
    table = Table(TableReference(dataset_ref, 'table1'), _TEST_SCHEMA)
    self.bq_client.create_dataset(dataset)
    self.bq_client.create_table(table)

    output = cStringIO()
    csv_out = csv.writer(output)
    input_row = list(self.INPUT_ROW)
    expected_row = list(self.EXPECTED_ROW)
    if column_to_null is not None:
        input_row[column_to_null] = 'NULL'
        expected_row[column_to_null] = None
    csv_out.writerow(input_row)

    self.bq_client.load_table_from_file(output, table.reference,
                                        job_config=None, rewind=True)
    self.assertRowsExpected(
        self.bq_client.query('SELECT * FROM `my_project.my_dataset.table1`',
                             QueryJobConfig()),
        [expected_row])
def copy_view(self,
              view: BigQueryView,
              destination_client: BigQueryClient,
              destination_dataset_ref: bigquery.DatasetReference) -> bigquery.Table:
    if destination_client.table_exists(destination_dataset_ref, view.view_id):
        raise ValueError(
            f"Table [{view.view_id}] already exists in dataset!")

    # Create the destination dataset if it doesn't yet exist
    destination_client.create_dataset_if_necessary(destination_dataset_ref)

    new_view_ref = destination_dataset_ref.table(view.view_id)
    new_view = bigquery.Table(new_view_ref)
    new_view.view_query = view.view_query.format(
        destination_client.project_id,
        destination_dataset_ref.dataset_id,
        view.view_id)
    table = destination_client.create_table(new_view)
    logging.info("Created %s", new_view_ref)
    return table