def make_table(project, dataset_id, table_id, friendly_name=None, description=None,
               expires=None, partitioning_type=None, view_use_legacy_sql=None,
               view_query=None, schema=None, labels=None):
    dataset_ref = DatasetReference(project, dataset_id)
    table_ref = TableReference(dataset_ref, table_id)
    table = Table(table_ref)
    table.friendly_name = friendly_name
    table.description = description
    table.expires = expires
    table.partitioning_type = partitioning_type
    if view_use_legacy_sql is not None:
        table.view_use_legacy_sql = view_use_legacy_sql
    if view_query is not None:
        table.view_query = view_query
    table.schema = schema
    if labels is not None:
        table.labels = labels
    return table
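
# Usage sketch (illustrative, not from the original sources): build a Table
# with make_table() above and create it via a client. The project, dataset,
# table, and schema values here are hypothetical.
from google.cloud.bigquery import Client, SchemaField

def example_make_table_usage():
    client = Client()
    table = make_table(
        'my-project', 'my_dataset', 'my_table',
        friendly_name='My table',
        description='Illustrative table',
        schema=[SchemaField('id', 'INT64'), SchemaField('name', 'STRING')],
        labels={'env': 'dev'})
    return client.create_table(table)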
def create_view_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = Client()

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))
    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name
def create_view_task():
    client = Client()

    dest_table_name = '{task}'.format(task=task)
    dest_table_ref = client.dataset(
        dataset_name,
        project=destination_dataset_project_id).table(dest_table_name)
    table = Table(dest_table_ref)

    sql_path = os.path.join(
        dags_folder, 'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path, environment)
    table.view_query = sql

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)

    logging.info('Creating view: ' + json.dumps(table.to_api_repr()))
    try:
        table = client.create_table(table)
    except Conflict:
        # https://cloud.google.com/bigquery/docs/managing-views
        table = client.update_table(table, ['view_query'])
    assert table.table_id == dest_table_name
def create_table(client, dataset_ref, table_name, is_partitioned=False):
    table_ref = dataset_ref.table(table_name)
    table_obj = Table(table_ref, schema=TABLE_SCHEMA)
    if is_partitioned:
        time_partitioning = TimePartitioning()
        time_partitioning.field = 'partition_value'
        table_obj.time_partitioning = time_partitioning
    return client.create_table(table_obj)
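
# Usage sketch (illustrative): create_table() above relies on a module-level
# TABLE_SCHEMA that must include the 'partition_value' field for the
# partitioned case. The project and dataset names below are hypothetical.
from google.cloud.bigquery import Client, DatasetReference

def example_create_partitioned_table():
    client = Client()
    dataset_ref = DatasetReference(client.project, 'my_dataset')
    return create_table(client, dataset_ref, 'events', is_partitioned=True)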
def test_listing_tables(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table1'), _TEST_SCHEMA))
    self.bq_client.create_table(Table(TableReference(dataset_ref1, 'table2'), []))
    six.assertCountEqual(
        self,
        [table_ref.table_id for table_ref in self.bq_client.list_tables(dataset_ref1)],
        ['table1', 'table2'])
def __create_test_table(self, table_name, dataset_id):
    table_schema = [
        SchemaField("int_data", "INT64"),
        SchemaField("str_data", "STRING")
    ]
    table_reference = TableReference(dataset_id, table_name)
    test_table = Table(table_reference, table_schema)
    test_table.time_partitioning = TimePartitioning("DAY")
    self.__delete_if_exists(test_table)
    self.GCP_BIGQUERY_CLIENT.create_table(test_table)
    return test_table
def store_digital_health_status_data(project_id,
                                     json_data,
                                     destination_table,
                                     schema=None):
    """
    Stores the fetched digital_health_sharing_status data in a BigQuery dataset.

    If the table doesn't exist, it will create that table. If the table does
    exist, it will create a partition in the designated table or append to the
    same partition. This is necessary for storing data that has "RECORD" type
    fields, which do not conform to a dataframe. The data is stored using a
    JSON file object since it is one of the ways BigQuery expects it.

    :param project_id: identifies the project
    :param json_data: list of json objects retrieved from
        process_digital_health_data_to_json
    :param destination_table: fully qualified destination table name as
        'project.dataset.table'
    :param schema: a list of SchemaField objects corresponding to the
        destination table
    :return: returns the bq job_id for the loading of digital health data
    """
    # Parameter check
    if not isinstance(project_id, str):
        raise RuntimeError('Please specify the project in which to create the table')

    client = get_client(project_id)
    if not schema:
        schema = get_table_schema(DIGITAL_HEALTH_SHARING_STATUS)

    try:
        table = client.get_table(destination_table)
    except NotFound:
        table = Table(destination_table, schema=schema)
        table.time_partitioning = TimePartitioning(type_=TimePartitioningType.DAY)
        table = client.create_table(table)

    file_obj = StringIO()
    for json_obj in json_data:
        json.dump(json_obj, file_obj)
        file_obj.write('\n')
    job_config = LoadJobConfig(
        source_format=SourceFormat.NEWLINE_DELIMITED_JSON, schema=schema)
    job = client.load_table_from_file(file_obj,
                                      table,
                                      rewind=True,
                                      job_config=job_config,
                                      job_id_prefix='ps_digital_health_load_')
    job.result()

    return job.job_id
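
# Illustrative call (hypothetical values): json_data would normally come from
# process_digital_health_data_to_json, and the destination table must match
# the DIGITAL_HEALTH_SHARING_STATUS schema.
def example_store_digital_health_status():
    json_data = [{'person_id': 1, 'status': 'granted'}]
    return store_digital_health_status_data(
        'my-project', json_data,
        'my-project.my_dataset.digital_health_sharing_status')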
def setUp(self):
    super(ClientWriteFromQueryTest, self).setUp()
    dataset_ref = DatasetReference(self.bq_client.project, 'my_dataset')
    schema = [
        SchemaField(name="a", field_type='INT64'),
        SchemaField(name="b", field_type='FLOAT64'),
    ]
    self.source_table = Table(TableReference(dataset_ref, 'source_table'), schema)
    self.destination_table = Table(TableReference(dataset_ref, 'destination_table'), schema)
    self.bq_client.create_dataset(Dataset(dataset_ref))
    self.bq_client.create_table(self.source_table)
    # We don't create the destination table here; some tests do not want it created.

    # Stick two rows into source_table
    self.assertFalse(
        self.bq_client.insert_rows(self.source_table,
                                   [{'a': 1, 'b': 2.5}, {'a': 3, 'b': 4.25}]))
def setUp(self):
    self.project_id = app_identity.get_application_id()
    self.dataset_id = os.environ.get('UNIONED_DATASET_ID')
    self.fq_person = f'{self.project_id}.{self.dataset_id}.person'
    self.fq_person_ext = f'{self.project_id}.{self.dataset_id}.person_ext'
    self.table_ids = [self.fq_person, self.fq_person_ext]
    self.person = Table.from_string(self.fq_person)
    self.person_ext = Table.from_string(self.fq_person_ext)
    self.client = bq.get_client(self.project_id)
    for table in self.table_ids:
        self.client.delete_table(table, not_found_ok=True)
    bq.create_tables(self.client, self.project_id, self.table_ids)
    self.populate_tables()
def load(project_id, bq_client, src_dataset_id, dst_dataset_id):
    """
    Transform safely loaded tables and store results in target dataset.

    :param project_id: Identifies the BQ project
    :param bq_client: a BigQuery client object
    :param src_dataset_id: reference to source dataset object
    :param dst_dataset_id: reference to destination dataset object
    :return: List of BQ job_ids
    """
    dst_dataset = Dataset(f'{bq_client.project}.{dst_dataset_id}')
    dst_dataset.description = f'Vocabulary cleaned and loaded from {src_dataset_id}'
    dst_dataset.labels = {'type': 'vocabulary'}
    dst_dataset.location = "US"
    bq_client.create_dataset(dst_dataset, exists_ok=True)
    src_tables = list(bq_client.list_tables(dataset=src_dataset_id))

    job_config = QueryJobConfig()
    query_jobs = []
    for src_table in src_tables:
        schema = bq.get_table_schema(src_table.table_id)
        destination = f'{project_id}.{dst_dataset_id}.{src_table.table_id}'
        table = bq_client.create_table(Table(destination, schema=schema),
                                       exists_ok=True)
        job_config.destination = table
        query = SELECT_TPL.render(project_id=project_id,
                                  dataset_id=src_dataset_id,
                                  table=src_table.table_id,
                                  fields=schema)
        query_job = bq_client.query(query, job_config=job_config)
        LOGGER.info(f'table:{destination} job_id:{query_job.job_id}')
        query_jobs.append(query_job)
        query_job.result()
    return query_jobs
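
# Illustrative invocation (hypothetical project and dataset IDs): copy cleaned
# vocabulary tables from a staging dataset into a destination dataset and
# collect the resulting query jobs.
def example_load_vocabulary():
    client = bq.get_client('my-project')
    return load('my-project', client, 'vocabulary_staging', 'vocabulary')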
def insert_rows_from_uri(client,
                         job_location,
                         dataset,
                         table_name,
                         table_schema,
                         table_uri,
                         load_job_id,
                         timeout=10):
    """
    Insert rows from a remote GCS blob.

    :param client: BQ Client
    :param job_location: BQ job location
    :param dataset: BQ Dataset
    :param table_name: The table string name
    :param table_schema: The table schema
    :param table_uri: The GCS blob uri to insert
    :param load_job_id: Load job ID
    :param timeout: optional timeout, defaults to 10
    :return: The created Table
    """
    table_ref = dataset.table(table_name)
    table = Table(table_ref, table_schema)
    table_load_job_conf = bigquery.LoadJobConfig(
        schema=table_schema,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON)
    load_job = client.load_table_from_uri(table_uri,
                                          table,
                                          job_id=load_job_id,
                                          location=job_location,
                                          job_config=table_load_job_conf)
    try:
        load_job.result(timeout=timeout)
        rows = client.list_rows(table, timeout=timeout)
        print_rows(rows)
    except Exception as error:
        logging.error(error)
    else:
        return table
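
# Usage sketch (hypothetical bucket, dataset, and schema): load a
# newline-delimited JSON blob from GCS into a new table.
from google.cloud import bigquery

def example_insert_rows_from_uri():
    client = bigquery.Client()
    dataset = bigquery.DatasetReference(client.project, 'my_dataset')
    schema = [bigquery.SchemaField('id', 'INT64'),
              bigquery.SchemaField('name', 'STRING')]
    return insert_rows_from_uri(client, 'US', dataset, 'people', schema,
                                'gs://my-bucket/people.json', 'load_people_001')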
def copy_fitbit_tables_from_views(client, from_dataset, to_dataset, table_prefix):
    """
    Copies tables from views with prefix

    :param client: bq client
    :param from_dataset: dataset containing views
    :param to_dataset: dataset to create tables in
    :param table_prefix: prefix added to table_ids
    :return:
    """
    for table in FITBIT_TABLES:
        schema_list = bq.get_table_schema(table)
        fq_dest_table = f'{client.project}.{to_dataset}.{table}'
        dest_table = Table(fq_dest_table, schema=schema_list)
        dest_table = client.create_table(dest_table)
        LOGGER.info(f'Created empty table {fq_dest_table}')

        fields_name_str = ',\n'.join([item.name for item in schema_list])
        fields_casted_str = ',\n'.join([
            cast_to_schema_type(item.name, item.field_type)
            for item in schema_list
        ])
        content_query = INSERT_QUERY.render(fq_dest_table=fq_dest_table,
                                            fields=fields_name_str,
                                            fields_casted=fields_casted_str,
                                            client=client,
                                            from_dataset=from_dataset,
                                            table_prefix=table_prefix,
                                            table=table)
        job = client.query(content_query)
        job.result()

    LOGGER.info(f'Copied fitbit tables from `{from_dataset}` to `{to_dataset}`')
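
# Illustrative call (hypothetical dataset names): materialize the FITBIT_TABLES
# views from a view dataset into concrete tables in another dataset.
def example_copy_fitbit_tables():
    client = bq.get_client('my-project')
    copy_fitbit_tables_from_views(client,
                                  from_dataset='fitbit_views',
                                  to_dataset='fitbit_tables',
                                  table_prefix='v_')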
def test_should_insert_records_to_non_partitioned_table(self):
    # given
    table_id = f'{self.dataset_manager.project_id}.{self.dataset_manager.dataset_name}.example_test_table'
    table = Table(table_id,
                  schema=[
                      {
                          "mode": "NULLABLE",
                          "name": "example_field",
                          "type": "STRING"
                      },
                  ])
    self.dataset_manager.create_table_from_schema('example_test_table', table=table)

    self.dataset_manager.insert('example_test_table',
                                [{"example_field": "example_field_value"}],
                                partitioned=False)

    # then
    expected_result = [{"example_field": "example_field_value"}]
    actual_result = self.dataset_manager.collect('''
        SELECT * FROM `{example_test_table}`
        ''').to_dict(orient='records')
    self.assertEqual(expected_result, actual_result)
def test_table_lookup(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    table_ref1 = TableReference(dataset_ref1, 'table1')
    table1 = Table(table_ref1, _TEST_SCHEMA)

    # Trying to get the same dataset/table in another project doesn't work.
    with self.assertRaisesRegexp(NotFound, 'other_project'):
        self.bq_client.get_table(
            TableReference(DatasetReference('other_project', 'dataset1'), 'table1'))

    # Trying to get the table before the dataset exists doesn't work.
    with self.assertRaisesRegexp(NotFound, 'dataset1'):
        self.bq_client.get_table(table_ref1)
    self.bq_client.create_dataset(Dataset(dataset_ref1))

    # Trying to get the table before the table exists doesn't work.
    with self.assertRaises(NotFound):
        self.bq_client.get_table(table_ref1)

    self.bq_client.create_table(table1)

    # Assert the created table has the expected properties.
    table_found = self.bq_client.get_table(table_ref1)
    self.assertEqual(table_found.project, "my_project")
    self.assertEqual(table_found.dataset_id, "dataset1")
    self.assertEqual(table_found.table_id, "table1")
    six.assertCountEqual(self, table_found.schema, _TEST_SCHEMA)
def setUp(self):
    self.project_id = os.environ.get(PROJECT_ID)
    self.dataset_id = os.environ.get('COMBINED_DATASET_ID')
    self.dataset_ref = DatasetReference(self.project_id, self.dataset_id)
    self.client = bq.get_client(self.project_id)

    self.schema = [
        SchemaField("person_id", "INT64"),
        SchemaField("first_name", "STRING"),
        SchemaField("last_name", "STRING"),
        SchemaField("algorithm", "STRING")
    ]

    self.ps_api_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable')
    ]

    self.id_match_fields = [
        dict(name='person_id', type='integer', mode='nullable'),
        dict(name='first_name', type='string', mode='nullable'),
        dict(name='last_name', type='string', mode='nullable'),
        dict(name='algorithm', type='string', mode='nullable')
    ]

    self.hpo_id = 'fake_site'
    self.id_match_table_id = f'{IDENTITY_MATCH_TABLE}_{self.hpo_id}'
    self.ps_values_table_id = f'ps_api_values_{self.hpo_id}'

    # Create and populate the ps_values site table
    schema = bq.get_table_schema(PS_API_VALUES)
    tablename = self.ps_values_table_id

    table = Table(f'{self.project_id}.{self.dataset_id}.{tablename}',
                  schema=schema)
    table.time_partitioning = TimePartitioning(type_=TimePartitioningType.HOUR)
    table = self.client.create_table(table)

    populate_query = POPULATE_PS_VALUES.render(
        project_id=self.project_id,
        drc_dataset_id=self.dataset_id,
        ps_values_table_id=self.ps_values_table_id)
    job = self.client.query(populate_query)
    job.result()
def test_dataflow_event_big_query_success(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client and then check if Google BigQuery
    origin receives them using wiretap, including a query-successfully-read event.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    events_wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    google_bigquery >= events_wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT), \
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT

        # We have exactly one output record; check that it is a big-query-success event.
        assert len(events_wiretap.output_records) == 1, \
            f'Expected 1 record, received {len(events_wiretap.output_records)}'
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'big-query-success', \
            f'Received {event_type} as event type (expected big-query-success)'
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def create(self, table_id, schema):
    """
    Create a table in Google BigQuery given a table and schema

    Parameters
    ----------
    table_id : str
        Name of table to be written
    schema : dict
        Use generate_bq_schema to generate your table schema from a dataframe.
    """
    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import Table

    if self.exists(table_id):
        raise TableCreationError("Table {0} already exists".format(table_id))

    if not _Dataset(self.project_id, credentials=self.credentials).exists(self.dataset_id):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = self.client.dataset(self.dataset_id).table(table_id)
    table = Table(table_ref)

    # Manually create the schema objects, adding NULLABLE mode
    # as a workaround for
    # https://github.com/GoogleCloudPlatform/google-cloud-python/issues/4456
    for field in schema["fields"]:
        if "mode" not in field:
            field["mode"] = "NULLABLE"

    table.schema = [
        SchemaField.from_api_repr(field) for field in schema["fields"]
    ]

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
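
# Usage sketch (hypothetical connector instance): create() accepts a
# pandas-gbq style schema dict with a 'fields' list of API-repr field dicts;
# fields without an explicit mode default to NULLABLE.
def example_create_from_schema_dict(connector):
    connector.create('people', {
        'fields': [
            {'name': 'id', 'type': 'INTEGER'},
            {'name': 'name', 'type': 'STRING'},
        ]
    })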
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Create data using Google BigQuery destination using different batch sizes
    and then check if it has been correctly created using the BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    number_records = 1000
    batch_size = number_records // number_batches
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[
                                          {"type": "INTEGER", "field": "age"},
                                          {"type": "STRING", "field": "full_name"}
                                      ])

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_data_generator >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(number_records)
        sdc_executor.stop_pipeline(pipeline)

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        # The pipeline may produce more than number_records before stopping, so
        # compare against the pipeline's own input-record counter.
        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter('pipeline.batchInputRecords.counter').count
        assert len(data_from_bigquery) == records
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3,
    verifying that each batch is of size 3 and that 8 total records are
    captured. The small numbers are used because of the limitations described
    in SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]
    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))

    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(
            pipeline,
            start_pipeline=True,
            batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is being respected in each batch (including the last).
        # For 8 records, we'd expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_size = (MAX_BATCH_SIZE if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                             else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_size

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
def _create_brand_new_tables(self, product_id):
    """
    Creates new tables in the specified BigQuery dataset

    Args:
        product_id (str): A string representing the desired product_id
    """
    for table_name in self.tables:
        table_ref = self.datasets[product_id]['datasets'].table(table_name)
        table = Table(table_ref, schema=self.schemas[table_name])
        table.partitioning_type = _TABLE_PARTITION_TIME
        self.client.create_table(table)  # API request
        self.datasets[product_id]['tables'][table_name] = (
            self.client.get_table(table_ref))
    return
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data, expected_data):
    """
    Create data using Google BigQuery client and then check if Google BigQuery
    origin receives them using wiretap, with one data format each.

    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name}'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('id', 'INTEGER', mode='NULLABLE'),
        SchemaField('data', data_type, mode='NULLABLE')
    ]
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, [(None, data)])
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == 1, \
            f'Expected 1 record, received {len(wiretap.output_records)}'
        received_data = wiretap.output_records[0].field['data']
        assert received_data == expected_data
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def create_table(schema_file_name):
    # Creates a table named bq_table_inventory to store metadata
    credentials2 = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_KEY_FILE_PATH2)
    ct_bigquery_client = client.Client(project=PROJECT_ID2, credentials=credentials2)
    table1 = Table.from_string(PROJECT_ID2 + "." + DATASET_ID + "." + TABLE_ID)
    table1.schema = prepare_schema(schema_file_name)
    table1.partitioning_type = 'DAY'
    ct_bigquery_client.create_table(table1, exists_ok=True)
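
# Illustrative call (hypothetical file name): prepare_schema() is assumed to
# turn the JSON schema file into the list of SchemaField objects assigned to
# the partitioned inventory table above.
def example_create_inventory_table():
    create_table('bq_table_inventory_schema.json')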
def test_listing_tables_with_max(self):
    # type: () -> None
    dataset_ref1 = DatasetReference('my_project', 'dataset1')
    self.bq_client.create_dataset(Dataset(dataset_ref1))
    for i in range(10):
        self.bq_client.create_table(
            Table(TableReference(dataset_ref1, 'table{}'.format(i)), _TEST_SCHEMA))
    self.assertEqual(5, len(self.bq_client.list_tables(dataset_ref1, max_results=5)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1, max_results=20)))
    self.assertEqual(10, len(self.bq_client.list_tables(dataset_ref1)))
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and confirm that the
    Google BigQuery destination successfully receives them, using the Google
    BigQuery client. This is achieved by using a deduplicator, which assures
    that there is only one ingest to Google BigQuery.

    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
        record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_should_create_table_from_table_object(self):
    # given
    table_id = f'{self.dataset_manager.project_id}.{self.dataset_manager.dataset_name}.example_test_table'
    table = Table(table_id,
                  schema=[
                      {
                          "mode": "NULLABLE",
                          "name": "example_field",
                          "type": "STRING"
                      },
                  ])
    table.time_partitioning = TimePartitioning()

    # when
    self.dataset_manager.create_table_from_schema('example_test_table', schema=None, table=table)

    # then
    self.table_should_exists()
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name):
    """
    Create data using Google BigQuery destination with different column names
    and then check if it has been correctly created using BigQuery client.

    The pipeline looks like:
        dev_raw_data_source >> google_bigquery
    """
    data_to_insert = [(1, 'data')]
    rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row)
                                              for row in data_to_insert]
    data_to_expect = [(1, 'data')]

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       stop_after_first_batch=True,
                                       raw_data='\n'.join(rows_to_insert))

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_raw_data_source >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField(column_name, 'STRING', mode='NULLABLE')]
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        assert data_to_expect == data_from_bigquery
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table and schema

    Parameters
    ----------
    table_id : str
        Name of table to be written
    schema : dict
        Use generate_bq_schema to generate your table schema from a dataframe.
    """
    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import Table

    if self.exists(table_id):
        raise TableCreationError("Table {0} already exists".format(table_id))

    if not _Dataset(self.project_id, credentials=self.credentials).exists(self.dataset_id):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = self.client.dataset(self.dataset_id).table(table_id)
    table = Table(table_ref)
    schema = pandas_gbq.schema.add_default_nullable_mode(schema)
    table.schema = [
        SchemaField.from_api_repr(field) for field in schema["fields"]
    ]

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client and then check if Google BigQuery
    origin receives them using snapshot.

    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)

    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [
        SchemaField('full_name', 'STRING', mode='required'),
        SchemaField('age', 'INTEGER', mode='required')
    ]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)

        rows_from_snapshot = [(record.value['value'][0]['value'],
                               int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]
        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def create(self, table_id, schema):
    """
    Create a table in Google BigQuery given a table and schema

    Parameters
    ----------
    table_id : str
        Name of table to be written
    schema : dict
        Use the generate_bq_schema to generate your table schema from a dataframe.
    """
    from google.cloud.bigquery import SchemaField
    from google.cloud.bigquery import Table

    if self.exists(table_id):
        raise TableCreationError("Table {0} already exists".format(table_id))

    if not _Dataset(self.project_id,
                    private_key=self.private_key).exists(self.dataset_id):
        _Dataset(self.project_id,
                 private_key=self.private_key).create(self.dataset_id)

    table_ref = self.client.dataset(self.dataset_id).table(table_id)
    table = Table(table_ref)

    # Default any fields without an explicit mode to NULLABLE.
    for field in schema['fields']:
        if 'mode' not in field:
            field['mode'] = 'NULLABLE'

    table.schema = [
        SchemaField.from_api_repr(field) for field in schema['fields']
    ]

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)
def create(self, table_id, schema):
    """Create a table in Google BigQuery given a table and schema

    Parameters
    ----------
    table_id : str
        Name of table to be written
    schema : dict
        Use generate_bq_schema to generate your table schema from a dataframe.
    """
    from google.cloud.bigquery import DatasetReference
    from google.cloud.bigquery import Table
    from google.cloud.bigquery import TableReference

    if self.exists(table_id):
        raise TableCreationError("Table {0} already exists".format(table_id))

    if not _Dataset(self.project_id, credentials=self.credentials).exists(self.dataset_id):
        _Dataset(
            self.project_id,
            credentials=self.credentials,
            location=self.location,
        ).create(self.dataset_id)

    table_ref = TableReference(
        DatasetReference(self.project_id, self.dataset_id), table_id)
    table = Table(table_ref)
    table.schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)

    try:
        self.client.create_table(table)
    except self.http_error as ex:
        self.process_http_error(ex)