def createBigQuery():
    bigquery_name = 'quotedataset'
    bigquery_description = 'this is a quotedataset test'
    client = bigquery.Client()
    dataset_ref = client.dataset(bigquery_name)
    dataset = Dataset(dataset_ref)
    dataset.description = bigquery_description
    dataset = client.create_dataset(dataset)  # API request
    print('Dataset {} created.'.format(dataset.dataset_id))

    table_id = 'quotes_table'
    table_ref = dataset_ref.table(table_id)
    table = bigquery.Table(table_ref)
    table.schema = (
        bigquery.SchemaField('Quote', 'STRING', 'REPEATED'),
        bigquery.SchemaField('Author', 'STRING', 'REPEATED'),
        bigquery.SchemaField('Tags', 'STRING', 'REPEATED'),
    )
    table = client.create_table(table)
    print('Created table {} in dataset {}.'.format(table_id, bigquery_name))
def upload_tweets():
    big_query_client = bigquery.Client.from_service_account_json('my-beam-project-b2834963a4ae.json')
    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    dataset = big_query_client.create_dataset(dataset)

    SCHEMA = [
        SchemaField('Tweets', 'STRING', mode='NULLABLE'),
    ]
    table_ref = big_query_client.dataset('Tweets').table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0
    load_config.schema = SCHEMA
    load_config.allow_quoted_newlines = True
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200

    with open('tweets.csv', 'rb') as readable:
        job = big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)
    job.result()  # wait for the load job to complete before reporting success
    print('tweets file uploaded to big query')
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :return: List of BQ query jobs
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)

    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))
    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema), exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
def test_dataflow_event_big_query_success(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client and then check if Google BigQuery
    origin receives them using wiretap, including a 'query successfully read' event.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)
    wiretap = pipeline_builder.add_wiretap()
    events_wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    google_bigquery >= events_wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT), \
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT

        # We expect exactly one event record; check that it is a big-query-success event
        assert len(events_wiretap.output_records) == 1, \
            f'Expected 1 record, received {len(events_wiretap.output_records)}'
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'big-query-success', \
            f'Received {event_type} as event type (expected big-query-success)'
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Create data using Google BigQuery destination using different batch sizes
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    number_records = 1000
    batch_size = number_records // number_batches
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[
                                          {"type": "INTEGER", "field": "age"},
                                          {"type": "STRING", "field": "full_name"}
                                      ])

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_data_generator >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(number_records)
        sdc_executor.stop_pipeline(pipeline)

        # The generated data is random, so only record counts can be compared.
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter('pipeline.batchInputRecords.counter').count
        assert len(data_from_bigquery) == records
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data, expected_data):
    """
    Create data using Google BigQuery client and then check if Google BigQuery
    origin receives them using wiretap, with one data format each.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)
    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == 1, \
            f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']
        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def createDataset(datasetname):
    dataset_ref = bigquery_client.dataset(datasetname)
    dataset = Dataset(dataset_ref)
    dataset.location = 'US'
    dataset = bigquery_client.create_dataset(dataset)
    return
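# A minimal usage sketch for createDataset above, assuming the module-level
# client the function references; the project id and dataset name below are
# hypothetical placeholders, not values from the original code.
from google.cloud import bigquery
from google.cloud.bigquery import Dataset

bigquery_client = bigquery.Client(project='my-project')  # hypothetical project id
createDataset('analytics_demo')                          # hypothetical dataset name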
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3,
    verifying that each batch is of size 3 and that 8 total records are captured.
    The small numbers are used because of the limitations described in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]
    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True,
                                                 batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is being respected in each batch (including the last).
        # For 8 records, we'd expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == (
                MAX_BATCH_SIZE if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                else TOTAL_RECORDS % MAX_BATCH_SIZE)

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and confirm that the
    Google BigQuery destination successfully receives them using the Google
    BigQuery client. This is achieved by using a deduplicator which assures
    that there is only one ingest to Google BigQuery.

    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
                               record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Create data using Google BigQuery destination with different column names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row)
                                              for row in data_to_insert]
    data_to_expect = [(1, 'data')]

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client and then check if Google BigQuery
    origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)

        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]
        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def create_bq_dataset(dataset_name='price_data'):
    '''Create dataset if not exists'''
    client = Client()
    datasets = [
        client.project + "." + i.dataset_id
        for i in list(client.list_datasets())
    ]
    if client.project + "." + dataset_name not in datasets:
        # Dataset() requires a fully-qualified id of the form 'project.dataset_id'.
        dataset = Dataset(client.project + "." + dataset_name)
        dataset.location = "US"
        client.create_dataset(dataset)
    else:
        print("Dataset already exists")
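# An equivalent one-call sketch for create_bq_dataset, assuming default
# credentials; create_dataset(..., exists_ok=True) replaces the manual
# existence check. The dataset name mirrors the illustrative default above.
from google.cloud.bigquery import Client, Dataset

client = Client()
dataset = Dataset(f"{client.project}.price_data")  # fully-qualified id required
dataset.location = "US"
client.create_dataset(dataset, exists_ok=True)     # no error if it already exists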
def __create_dataset(self, data_set, data_set_description):
    """
    This method creates a DataSet in Google's Big Query.

    :param data_set: Name of the dataSet
    :type data_set: String
    :param data_set_description: Short description about the dataSet.
    :type data_set_description: String
    """
    dataset_ref = self.client.dataset(data_set)
    dataset = Dataset(dataset_ref)
    dataset.description = data_set_description
    dataset = self.client.create_dataset(dataset)  # API request
def setUp(self):
    project_id = 'fake_project_id'
    dataset_id = 'fake_dataset_id'
    bucket_name = 'fake_bucket'
    dataset_ref = DatasetReference(project_id, dataset_id)
    self.dst_dataset = Dataset(dataset_ref)
    self.bq_client = mock.MagicMock()
    self.gcs_client = mock.MagicMock()
    self.bucket_name = bucket_name
    self.all_blobs = [
        Blob(f'{table}.csv', self.bucket_name)
        for table in common.VOCABULARY_TABLES
    ]
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using Google BigQuery destination with different dataset names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    table_name = get_random_string(string.ascii_letters, 5)
    # If tests fail for any reason, leftovers with equal names might lead to more errors
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name):
    """
    Create data using Google BigQuery client with specific dataset names and
    then check if Google BigQuery origin receives them using wiretap.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    table_name = get_random_string(string.ascii_letters, 5)
    dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}'
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f"SELECT * FROM `{dataset_name}`.`{table_name}` ORDER BY id"
    google_bigquery.set_attributes(query=query_str)
    wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)
    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT), \
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def create_external_table_hive_partitioning(
        bq_client: bigquery.Client,
        dataset: bigquery.Dataset,
        table_id: str,
        gcs_directory_path: str) -> bigquery.Table:
    """
    Creates an external table with AUTO hive partitioning in GCS

    :param bq_client: Client object to bigquery
    :param dataset: dataset object. Check 'get_or_create_dataset' method
    :param table_id: Table to be created
    :param gcs_directory_path: Directory of GCS with the data.
        For example, if you have a structure like this:
            "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
        You should pass: "******"
    :return: the created table
    """
    table = bigquery.Table(dataset.table(table_id))

    external_config = bigquery.ExternalConfig(bigquery.SourceFormat.PARQUET)
    external_config.source_uris = [f"{gcs_directory_path}/*"]

    hive_part_opt = HivePartitioningOptions()
    hive_part_opt.mode = "AUTO"
    hive_part_opt.source_uri_prefix = gcs_directory_path

    external_config.hive_partitioning = hive_part_opt
    table.external_data_configuration = external_config
    table = bq_client.create_table(table, exists_ok=True)
    return table
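# A usage sketch for create_external_table_hive_partitioning, assuming the
# dataset already exists; the project, dataset, table id and GCS prefix below
# are hypothetical placeholders.
from google.cloud import bigquery

client = bigquery.Client()
dataset = client.get_dataset('my-project.images')  # hypothetical dataset
table = create_external_table_hive_partitioning(
    client, dataset, 'images_metadata',            # hypothetical table id
    'gs://bucket/images_metadata')                 # hive-partitioned prefix
print(f'Created external table {table.full_table_id}')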
def _create_brand_new_datasets(self, product_id):
    """
    Creates a new Google BigQuery dataset

    Args:
        product_id (str): A string representing the desired product_id
    """
    dataset = Dataset(self.client.dataset(dataset_id=product_id))
    dataset.location = _DATASET_GEOLOCATION
    self.datasets[product_id]['datasets'] = (
        self.client.create_dataset(dataset))
    self._create_brand_new_tables(product_id=product_id)
    return
def prepare_table(self, dataset: bigquery.Dataset, table_name: str,
                  columns_schema: list, incremental: bool) -> bigquery.TableReference:
    table_reference = dataset.table(table_name)
    table = bigquery.Table(table_reference, columns_schema)
    try:
        bq_table = self.bigquery_client.get_table(table_reference)
        table_exist = True
        if incremental:
            schema_mapper.is_table_definition_in_match_with_bigquery(columns_schema, bq_table)
        else:
            self.bigquery_client.delete_table(table_reference)
            table_exist = False
    except bq_exceptions.NotFound:
        table_exist = False
    except bq_exceptions.BadRequest as err:
        message = 'Cannot create table %s: %s' % (table_reference, str(err))
        raise UserException(message)

    if not table_exist:
        try:
            self.bigquery_client.create_table(table)
        except bq_exceptions.BadRequest as err:
            message = 'Cannot create table %s: %s' % (table_name, str(err))
            raise UserException(message)
    return table_reference
def test_table_lookup(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    table_ref1 = TableReference(dataset_ref1, 'table1')
    table1 = Table(table_ref1, _TEST_SCHEMA)

    # Trying to get the same dataset/table in another project doesn't work.
    with self.assertRaisesRegexp(NotFound, 'other_project'):
        self.bq_client.get_table(
            TableReference(DatasetReference('other_project', 'dataset1'), 'table1'))

    # Trying to get the table before the dataset exists doesn't work
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_dataset(Dataset(dataset_ref1))

    # Trying to get the table before the table exists doesn't work
    with self.assertRaises(NotFound):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_table(table1)

    # Assert the created table has the expected properties.
    table_found = self.bq_client.get_table(table_ref1)
    self.assertEqual(table1.project, "my_project")
    self.assertEqual(table1.dataset_id, "dataset1")
    self.assertEqual(table1.table_id, "table1")
    six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
def load_views(bq: bigquery.Client, dataset: bigquery.Dataset, views: Dict[str, str]):
    """Load views for a test."""
    for table, view_query in views.items():
        view = bigquery.Table(dataset.table(table))
        view.view_query = view_query.format(project=dataset.project,
                                            dataset=dataset.dataset_id)
        bq.create_table(view)
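# A usage sketch for load_views, assuming the dataset already exists; the view
# name and query are illustrative. The {project} and {dataset} placeholders in
# each query are filled in from the dataset object by the function itself.
from google.cloud import bigquery

bq = bigquery.Client()
dataset = bq.get_dataset('my-project.test_dataset')  # hypothetical dataset
load_views(bq, dataset, {
    'active_users': 'SELECT * FROM `{project}.{dataset}.users` WHERE active',
})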
def test_listing_datasets(self):
    # type: () -> None
    self.assertFalse(self.bq_client.list_datasets())
    self.assertFalse(self.bq_client.list_datasets('my_project'))

    self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset1')))
    self.bq_client.create_dataset(Dataset(DatasetReference('my_project', 'dataset2')))
    self.bq_client.create_dataset(Dataset(DatasetReference('other_project', 'dataset3')))

    six.assertCountEqual(
        self,
        [dataset.dataset_id for dataset in self.bq_client.list_datasets()],
        ['dataset1', 'dataset2'])
    six.assertCountEqual(
        self,
        [dataset.dataset_id for dataset in self.bq_client.list_datasets('my_project')],
        ['dataset1', 'dataset2'])
    six.assertCountEqual(
        self,
        [dataset.dataset_id for dataset in self.bq_client.list_datasets('other_project')],
        ['dataset3'])
def check_and_create_staging_dataset(dst_dataset_id, bucket_name, bq_client):
    """
    :param dst_dataset_id: final destination to load the vocabulary in BigQuery
    :param bucket_name: the location in GCS containing the vocabulary files
    :param bq_client: google bigquery client
    :return: staging dataset object
    """
    staging_dataset_id = f'{dst_dataset_id}_staging'
    staging_dataset = Dataset(f'{bq_client.project}.{staging_dataset_id}')
    try:
        bq_client.get_dataset(staging_dataset)
    except NotFound:
        staging_dataset.description = f'Vocabulary loaded from gs://{bucket_name}'
        staging_dataset.labels = {'type': 'vocabulary', 'phase': 'staging'}
        staging_dataset.location = "US"
        staging_dataset = bq_client.create_dataset(staging_dataset)
        LOGGER.info(f'Successfully created dataset {staging_dataset_id}')
    return staging_dataset
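# A usage sketch for check_and_create_staging_dataset; the destination dataset
# id and bucket name are hypothetical placeholders.
from google.cloud import bigquery

client = bigquery.Client()
staging = check_and_create_staging_dataset('vocabulary20240101',  # hypothetical
                                           'vocab-bucket',        # hypothetical
                                           client)
print(staging.dataset_id)  # 'vocabulary20240101_staging'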
def create_data_set(self, data_set_name):
    """
    :param data_set_name: str - The name of the dataset to be created
    :return: 0 indicates success
    """
    data_set_ref = self.client.dataset(data_set_name)
    data_set = Dataset(data_set_ref)
    data_set.description = ''
    data_set.location = 'EU'
    try:
        self.client.create_dataset(data_set)  # API request
        logging.info('Data set - ' + data_set_name + ' successfully created')
    except Conflict:
        logging.info('Data set - ' + data_set_name + ' already exists')
    return 0
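# A usage sketch for create_data_set, assuming an instance of the enclosing
# class with an authenticated self.client; the class name and dataset name are
# hypothetical placeholders.
warehouse = BigQueryHelper()               # hypothetical enclosing class
warehouse.create_data_set('reporting_eu')  # logs creation or 'already exists'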
def test_listing_tables(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
    six.assertCountEqual(
        self,
        [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
        ['table1', 'table2'])
def test_listing_tables_with_max(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    for i in range(10):
        self.bq_client.create_table(
            Table(TableReference(dataset_ref1, 'table{}'.format(i)), _TEST_SCHEMA))
    self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
def test_delete_dataset_if_exists_WhenItIsNotEmpty(self):
    dataset_id = self.__generate_test_name("dataset")
    table_name = self.__generate_test_name("table")
    self.client.create_dataset_if_not_exists(dataset_id)
    self.__create_test_table(
        table_name,
        Dataset(DatasetReference(GCP_PROJECT_ID, dataset_id)).reference)

    self.client.delete_dataset_if_exists(dataset_id, delete_contents=True)
    self.assertFalse(self.client.dataset_exists(dataset_id))
def ensure_dataset(project_id, dataset_id, location):
    client = bigquery.Client(project=project_id, location=location)
    dataset_ref = client.dataset(dataset_id)
    try:
        client.create_dataset(dataset_ref, exists_ok=True)
    except Exception:
        # attempt to run even if creation fails due to permissions etc.
        pass
    return client, Dataset(dataset_ref)
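# A usage sketch for ensure_dataset; the project id, dataset id and location
# are hypothetical placeholders. The returned Dataset can hand out table refs.
client, dataset = ensure_dataset('my-project', 'metrics', 'US')
table_ref = dataset.table('daily_counts')  # hypothetical table id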
def create_scratch_dataset(context: AbstractComputeExecutionContext) -> HcaScratchDatasetName:
    """
    Creates a staging dataset that will house records for update/insertion
    into the final TDR dataset.

    :return: Name of the staging dataset
    """
    scratch_bq_project = context.resources.scratch_config.scratch_bq_project
    scratch_dataset_prefix = context.resources.scratch_config.scratch_dataset_prefix
    load_tag = context.resources.load_tag

    dataset_name = f"{scratch_bq_project}.{scratch_dataset_prefix}_{load_tag}"
    dataset = Dataset(dataset_name)
    dataset.default_table_expiration_ms = context.resources.scratch_config.scratch_table_expiration_ms

    bq_client = context.resources.bigquery_client
    bq_client.create_dataset(dataset)
    context.log.info(f"Created scratch dataset {dataset_name}")

    return HcaScratchDatasetName(dataset_name)
def create(self, dataset_id):
    """
    Create a dataset in Google BigQuery

    Parameters
    ----------
    dataset_id : str
        Name of the dataset to be created
    """
    from google.cloud.bigquery import Dataset

    if self.exists(dataset_id):
        raise DatasetCreationError("Dataset {0} already "
                                   "exists".format(dataset_id))

    dataset = Dataset(self.client.dataset(dataset_id))
    if self.location is not None:
        dataset.location = self.location

    try:
        self.client.create_dataset(dataset)
    except self.http_error as ex:
        self.process_http_error(ex)