def test_create_dataset(self):
    dataset = bq.create_dataset(self.project_id, self.dataset_id,
                                self.description, self.label_or_tag)
    self.assertEqual(dataset.dataset_id, self.dataset_id)

    # Try to create the same dataset, which now already exists
    self.assertRaises(RuntimeError, bq.create_dataset, self.project_id,
                      self.dataset_id, self.description, self.label_or_tag)

    dataset = bq.create_dataset(self.project_id,
                                self.dataset_id,
                                self.description,
                                self.label_or_tag,
                                overwrite_existing=True)
    self.assertEqual(dataset.dataset_id, self.dataset_id)
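# A minimal usage sketch (not part of the original source) of the
# create-or-replace behavior exercised by the test above: bq.create_dataset
# raises RuntimeError for a pre-existing dataset unless
# overwrite_existing=True is passed. The helper name is illustrative.
def create_or_replace_dataset(project_id, dataset_id, description,
                              label_or_tag):
    """Create the dataset, replacing it if it already exists."""
    try:
        return bq.create_dataset(project_id, dataset_id, description,
                                 label_or_tag)
    except RuntimeError:
        # Dataset already exists; recreate it in place
        return bq.create_dataset(project_id,
                                 dataset_id,
                                 description,
                                 label_or_tag,
                                 overwrite_existing=True)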
def main(project_id: str, bucket_name: str, dst_dataset_id: str):
    """
    Load and transform vocabulary files in GCS to a BigQuery dataset

    :param project_id: identifies the project to load the vocabulary into
    :param bucket_name: refers to the bucket containing vocabulary files
    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    """
    bq_client = bq.get_client(project_id)
    gcs_client = storage.Client(project_id)
    sandbox_dataset_id = get_sandbox_dataset_id(dst_dataset_id)
    sandbox_dataset = bq.create_dataset(
        project_id,
        sandbox_dataset_id,
        f'Vocabulary loaded from gs://{bucket_name}',
        label_or_tag={'type': 'vocabulary'},
        overwrite_existing=True)
    stage_jobs = load_stage(sandbox_dataset, bq_client, bucket_name,
                            gcs_client)
    wait_jobs(stage_jobs)
    load_jobs = load(project_id,
                     bq_client,
                     sandbox_dataset_id,
                     dst_dataset_id,
                     overwrite_ok=True)
    wait_jobs(load_jobs)
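# Hypothetical command-line entry point for main() above; the actual script's
# argument parsing is not shown in this excerpt, so flag names here are a
# sketch only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Load vocabulary files from a GCS bucket into BigQuery')
    parser.add_argument('--project_id', required=True)
    parser.add_argument('--bucket_name', required=True)
    parser.add_argument('--dst_dataset_id', required=True)
    args = parser.parse_args()
    main(args.project_id, args.bucket_name, args.dst_dataset_id)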
def create_sandbox_dataset(project_id, dataset_id):
    """
    A helper function to create a sandbox dataset if one does not already exist

    :param project_id: project_id
    :param dataset_id: any dataset_id
    :return: the sandbox dataset_id
    """
    sandbox_dataset_id = get_sandbox_dataset_id(dataset_id)
    friendly_name = f'Sandbox for {dataset_id}'
    description = (f'Sandbox created for storing records affected by the '
                   f'cleaning rules applied to {dataset_id}')
    label_or_tag = {'label': '', 'tag': ''}
    create_dataset(project_id=project_id,
                   dataset_id=sandbox_dataset_id,
                   friendly_name=friendly_name,
                   description=description,
                   label_or_tag=label_or_tag,
                   overwrite_existing=False)
    return sandbox_dataset_id
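# Hedged usage sketch (not from the original module): because
# create_sandbox_dataset passes overwrite_existing=False, create_dataset
# raises RuntimeError when the sandbox already exists (see the test above).
# Since get_sandbox_dataset_id is deterministic for a given dataset_id,
# callers that may run twice can fall back to the existing sandbox.
def get_or_create_sandbox_dataset(project_id, dataset_id):
    try:
        return create_sandbox_dataset(project_id, dataset_id)
    except RuntimeError:
        # Sandbox already exists; return its deterministic id
        return get_sandbox_dataset_id(dataset_id)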
def test_query_sheet_linked_bq_table(self):
    dataset = bq.create_dataset(self.project_id, self.dataset_id,
                                self.description, self.label_or_tag)

    # add Google Drive scope
    external_data_scopes = [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/cloud-platform"
    ]
    client = bq.get_client(self.project_id, external_data_scopes)

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

    # Grant viewing access to the test sheet to BigQuery test accounts
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1JI-KyigmwZU9I2J6TZqVTPNoEAWVqiFeF8Y549-dvzM/edit#gid=0")
    schema = [
        bigquery.SchemaField("site_name", "STRING"),
        bigquery.SchemaField("hpo_id", "STRING"),
        bigquery.SchemaField("site_point_of_contact", "STRING"),
    ]
    external_config.source_uris = [sheet_url]
    external_config.schema = schema
    external_config.options.range = (
        "contacts!A1:C5"  # limit scope so that other items can be added to sheet
    )
    external_config.options.skip_leading_rows = 1  # Optionally skip header row.

    table_id = consts.HPO_ID_CONTACT_LIST_TABLE_ID
    table = bigquery.Table(dataset.table(table_id), schema=schema)
    table.external_data_configuration = external_config
    table = client.create_table(table)

    table_content_query = f'SELECT * FROM `{dataset.dataset_id}.{table.table_id}`'

    actual_df = bq.query_sheet_linked_bq_table(self.project_id,
                                               table_content_query,
                                               external_data_scopes)

    expected_dict = [{
        'site_name': 'Fake Site Name 1',
        'hpo_id': 'fake_1',
        'site_point_of_contact':
            'fake.email.1@site_1.fakedomain; fake.email.2@site_1.fakedomain'
    }, {
        'site_name': 'Fake Site Name 2',
        'hpo_id': 'fake_2',
        'site_point_of_contact': 'no data steward'
    }, {
        'site_name': 'Fake Site Name 3',
        'hpo_id': 'fake_3',
        'site_point_of_contact':
            'Fake.Email.1@site_3.fake_domain; Fake.Email.2@site_3.fake_domain'
    }, {
        'site_name': 'Fake Site Name 4',
        'hpo_id': 'fake_4',
        'site_point_of_contact': '[email protected]; [email protected]'
    }]
    expected_df = pd.DataFrame(
        expected_dict,
        columns=["site_name", "hpo_id", "site_point_of_contact"])

    pd.testing.assert_frame_equal(actual_df, expected_df)