def create_external_table_hive_partitioning( bq_client: bigquery.Client, dataset: bigquery.Dataset, table_id: str, gcs_directory_path: str) -> bigquery.Table: """ Creates an external table with AUTO hive partitioning in GCS :param bq_client: Client object to bigquery :param dataset: dataset object. Check 'get_or_create_dataset' method :param table_id: Table to be created :param gcs_directory_path: Directory of GCS with the data. For example: If you have a structure like this: "gs://bucket/images_metadata/source_id=abc/date=2018-02-20" You should pass: "******" :return: """ table = bigquery.Table(dataset.table(table_id)) external_config = bigquery.ExternalConfig( bigquery.SourceFormat.PARQUET) external_config.source_uris = [f"{gcs_directory_path}/*"] hive_part_opt = HivePartitioningOptions() hive_part_opt.mode = "AUTO" hive_part_opt.source_uri_prefix = gcs_directory_path external_config.hive_partitioning = hive_part_opt table.external_data_configuration = external_config table = bq_client.create_table(table, exists_ok=True) return table
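# Hypothetical usage sketch for the helper above (the client, dataset id, table id and GCS path
# below are illustrative placeholders, not values from this codebase): create a hive-partitioned
# external table over a Parquet directory so that partition keys such as source_id and date
# become queryable columns.
from google.cloud import bigquery


def example_create_hive_partitioned_table() -> bigquery.Table:
    bq_client = bigquery.Client()
    dataset = bq_client.get_dataset('my_dataset')  # assumed existing dataset
    table = create_external_table_hive_partitioning(
        bq_client,
        dataset,
        'images_metadata',
        'gs://my-bucket/images_metadata')
    return table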
def prepare_table(self, dataset: bigquery.Dataset, table_name: str, columns_schema: list, incremental: bool) -> bigquery.TableReference: table_reference = dataset.table(table_name) table = bigquery.Table(table_reference, columns_schema) try: bq_table = self.bigquery_client.get_table(table_reference) table_exist = True if incremental: schema_mapper.is_table_definition_in_match_with_bigquery( columns_schema, bq_table) else: self.bigquery_client.delete_table(table_reference) table_exist = False except bq_exceptions.NotFound: table_exist = False except bq_exceptions.BadRequest as err: message = 'Cannot create table %s: %s' % (table_reference, str(err)) raise UserException(message) if not table_exist: try: self.bigquery_client.create_table(table) except bq_exceptions.BadRequest as err: message = 'Cannot create table %s: %s' % (table_name, str(err)) raise UserException(message) return table_reference
def test_dataflow_event_big_query_success(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client and then check if Google BigQuery origin receives them using wiretap,
    including a query successfully read event.
    The pipeline looks like:
        google_bigquery >> wiretap
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id'
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    events_wiretap = pipeline_builder.add_wiretap()

    google_bigquery >> wiretap.destination
    google_bigquery >= events_wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\
            f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}'
        rows_from_wiretap = get_rows_from_wiretap(wiretap)
        assert rows_from_wiretap == ROWS_TO_INSERT

        # We have exactly one output record, check that it is a big-query-success event
        assert len(events_wiretap.output_records) == 1,\
            f'Expected 1 record, received {len(events_wiretap.output_records)}'
        event_record = events_wiretap.output_records[0]
        event_type = event_record.header.values['sdc.event.type']
        assert event_type == 'big-query-success', 'Received %s as event type (expected big-query-success)' % event_type
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def load_views(bq: bigquery.Client, dataset: bigquery.Dataset, views: Dict[str, str]): """Load views for a test.""" for table, view_query in views.items(): view = bigquery.Table(dataset.table(table)) view.view_query = view_query.format(project=dataset.project, dataset=dataset.dataset_id) bq.create_table(view)
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Create data using Google BigQuery destination using different batch sizes
    and then check if it has been correctly created using BigQuery client.
    The pipeline looks like:
        dev_data_generator >> google_bigquery
    """
    number_records = 1000
    batch_size = number_records // number_batches
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    dev_data_generator.set_attributes(batch_size=batch_size,
                                      fields_to_generate=[
                                          {"type": "INTEGER", "field": "age"},
                                          {"type": "STRING", "field": "full_name"}
                                      ])

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    dev_data_generator >> google_bigquery

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA))

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_output_records_count(number_records)
        sdc_executor.stop_pipeline(pipeline)

        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()

        history = sdc_executor.get_pipeline_history(pipeline)
        records = history.latest.metrics.counter('pipeline.batchInputRecords.counter').count
        assert len(data_from_bigquery) == records
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_data_type(sdc_builder, sdc_executor, gcp, data_type, data, expected_data): """ Create data using Google BigQuery client and then check if Google BigQuery origin receives them using wiretap with one data format each. The pipeline looks like: google_bigquery >> wiretap """ pipeline_builder = sdc_builder.get_pipeline_builder() dataset_name = get_random_string(string.ascii_letters, 5) table_name = get_random_string(string.ascii_letters, 5) google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin') query_str = f'SELECT * FROM {dataset_name}.{table_name}' google_bigquery.set_attributes(query=query_str) wiretap = pipeline_builder.add_wiretap() google_bigquery >> wiretap.destination pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client schema = [ SchemaField('id', 'INTEGER', mode='NULLABLE'), SchemaField('data', data_type, mode='NULLABLE') ] try: dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table( Table(dataset_ref.table(table_name), schema=schema)) errors = bigquery_client.insert_rows(table, [(None, data)]) assert not errors, 'Errors found when creating data using bigquery client' # Start pipeline and verify correct rows are received. logger.info('Starting pipeline') sdc_executor.start_pipeline(pipeline).wait_for_finished() assert len( wiretap.output_records ) == 1, f'Expected 1 record, received {len(wiretap.output_records)}' received_data = wiretap.output_records[0].field['data'] assert received_data == expected_data finally: if dataset_ref: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp):
    """Verify proper batch handling by the BigQuery origin.

    In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of size 3
    and that 8 total records are captured. The small numbers are used because of the limitations described in
    SDC-8765.
    """
    MAX_BATCH_SIZE = 3
    TOTAL_RECORDS = 8

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number'

    pipeline_builder = sdc_builder.get_pipeline_builder()
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('number', 'STRING', mode='required'),
              SchemaField('value', 'STRING', mode='required')]
    data = [dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS)]
    dataset = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table.
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset)
        table = bigquery_client.create_table(Table(dataset.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, data)
        assert not errors

        # Ceiling division is needed to capture all the complete batches along with the last partial one.
        snapshot = sdc_executor.capture_snapshot(pipeline,
                                                 start_pipeline=True,
                                                 batches=math.ceil(TOTAL_RECORDS / MAX_BATCH_SIZE)).snapshot

        # Assert that the batch size is being respected in each batch (including the last). For 8 records,
        # we'd expect batch sizes of 3, 3, and 2.
        for i, batch in enumerate(snapshot.snapshot_batches, start=1):
            expected_batch_size = (MAX_BATCH_SIZE
                                   if i * MAX_BATCH_SIZE <= TOTAL_RECORDS
                                   else TOTAL_RECORDS % MAX_BATCH_SIZE)
            assert len(batch.stage_outputs[google_bigquery.instance_name].output) == expected_batch_size

        all_records = [record.field
                       for batch in snapshot.snapshot_batches
                       for record in batch.stage_outputs[google_bigquery.instance_name].output]
        assert all_records == data
    finally:
        bigquery_client.delete_dataset(dataset, delete_contents=True)
def test_google_bigquery_destination(sdc_builder, sdc_executor, gcp):
    """
    Send data to Google BigQuery from Dev Raw Data Source and confirm that Google BigQuery destination
    successfully receives them using Google BigQuery client.

    This is achieved by using a deduplicator which assures that there is only one ingest to Google BigQuery.
    The pipeline looks like:
        dev_raw_data_source >> record_deduplicator >> google_bigquery
        record_deduplicator >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='DELIMITED',
                                       header_line='WITH_HEADER',
                                       raw_data='\n'.join(CSV_DATA_TO_INSERT))

    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination')
    google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name)

    record_deduplicator = pipeline_builder.add_stage('Record Deduplicator')
    trash = pipeline_builder.add_stage('Trash')

    dev_raw_data_source >> record_deduplicator >> google_bigquery
    record_deduplicator >> trash

    pipeline = pipeline_builder.build(title='Google BigQuery Destination').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))

        logger.info('Starting BigQuery Destination pipeline and waiting for it to produce records ...')
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)

        logger.info('Stopping BigQuery Destination pipeline and getting the count of records produced in total ...')
        sdc_executor.stop_pipeline(pipeline)

        # Verify by reading records using Google BigQuery client
        data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)]
        data_from_bigquery.sort()
        logger.debug('read_data = {}'.format(data_from_bigquery))
        assert ROWS_EXPECTED == data_from_bigquery
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_object_names_columns(sdc_builder, sdc_executor, gcp, column_name): """ Create data using Google BigQuery destination with different column names and then check if it has been correctly created using BigQuery client. The pipeline looks like: dev_raw_data_source >> google_bigquery """ data_to_insert = [(1, 'data')] rows_to_insert = [f'id,{column_name}'] + [','.join(str(element) for element in row) for row in data_to_insert] data_to_expect = [(1, 'data')] pipeline_builder = sdc_builder.get_pipeline_builder() dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source') dev_raw_data_source.set_attributes(data_format='DELIMITED', header_line='WITH_HEADER', stop_after_first_batch=True, raw_data='\n'.join(rows_to_insert)) dataset_name = get_random_string(string.ascii_letters, 5) table_name = get_random_string(string.ascii_letters, 5) google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination') google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name) dev_raw_data_source >> google_bigquery pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'), SchemaField(column_name, 'STRING', mode='NULLABLE')] try: dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema)) # Start pipeline and verify correct rows are received. logger.info('Starting pipeline') sdc_executor.start_pipeline(pipeline).wait_for_finished() data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)] data_from_bigquery.sort() assert data_to_expect == data_from_bigquery finally: if dataset_ref: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp):
    """
    Create data using Google BigQuery client and then check if Google BigQuery origin receives them using snapshot.
    The pipeline looks like:
        google_bigquery >> trash
    """
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dataset_name = get_random_string(ascii_letters, 5)
    table_name = get_random_string(ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY full_name'
    google_bigquery.set_attributes(query=query_str)
    trash = pipeline_builder.add_stage('Trash')
    google_bigquery >> trash
    pipeline = pipeline_builder.build(title='Google BigQuery').configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('full_name', 'STRING', mode='required'),
              SchemaField('age', 'INTEGER', mode='required')]
    dataset_ref = Dataset(bigquery_client.dataset(dataset_name))
    try:
        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        dataset = bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT)
        assert errors == []

        # Start pipeline and verify correct rows are received using snapshot.
        logger.info('Starting pipeline and snapshot')
        snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot
        if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)

        rows_from_snapshot = [(record.value['value'][0]['value'], int(record.value['value'][1]['value']))
                              for record in snapshot[google_bigquery].output]

        assert rows_from_snapshot == ROWS_TO_INSERT
    finally:
        bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name): """ Create data using Google BigQuery destination with different dataset names and then check if it has been correctly created using BigQuery client. The pipeline looks like: dev_raw_data_source >> google_bigquery """ pipeline_builder = sdc_builder.get_pipeline_builder() dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source') dev_raw_data_source.set_attributes(data_format='DELIMITED', header_line='WITH_HEADER', stop_after_first_batch=True, raw_data='\n'.join(CSV_DATA_TO_INSERT)) table_name = get_random_string(string.ascii_letters, 5) # If tests fail for any reason, leftovers with equal names might lead to more errors dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}' google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination') google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name) dev_raw_data_source >> google_bigquery pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client try: dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=SCHEMA)) # Start pipeline and verify correct rows are received. logger.info('Starting pipeline') sdc_executor.start_pipeline(pipeline).wait_for_finished() data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)] data_from_bigquery.sort() assert ROWS_EXPECTED == data_from_bigquery finally: if dataset_ref: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_object_names_datasets(sdc_builder, sdc_executor, gcp, dataset_name): """ Create data using Google BigQuery client with specific table names and then check if Google BigQuery origin receives them using wiretap. The pipeline looks like: google_bigquery >> wiretap """ pipeline_builder = sdc_builder.get_pipeline_builder() table_name = get_random_string(string.ascii_letters, 5) dataset_name = f'{dataset_name}_{get_random_string(string.ascii_letters, 5)}' google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin') query_str = f"SELECT * FROM `{dataset_name}`.`{table_name}` ORDER BY id" google_bigquery.set_attributes(query=query_str) wiretap = pipeline_builder.add_wiretap() google_bigquery >> wiretap.destination pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client try: dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table( Table(dataset_ref.table(table_name), schema=SCHEMA)) errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT) assert not errors, 'Errors found when creating data using bigquery client' # Start pipeline and verify correct rows are received. logger.info('Starting pipeline') sdc_executor.start_pipeline(pipeline).wait_for_finished() assert len(wiretap.output_records) == len(ROWS_TO_INSERT),\ f'Expected {len(ROWS_TO_INSERT)} records, received {len(wiretap.output_records)}' rows_from_wiretap = get_rows_from_wiretap(wiretap) assert rows_from_wiretap == ROWS_TO_INSERT finally: if dataset_ref: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str, gcs_client: storage.Client) -> List[LoadJob]: """ Stage files from a bucket to a dataset :param dst_dataset: reference to destination dataset object :param bq_client: a BigQuery client object :param bucket_name: the location in GCS containing the vocabulary files :param gcs_client: a Cloud Storage client object :return: list of completed load jobs """ blobs = list(gcs_client.list_blobs(bucket_name)) table_blobs = [_filename_to_table_name(blob.name) for blob in blobs] missing_blobs = [ table for table in VOCABULARY_TABLES if table not in table_blobs ] if missing_blobs: raise RuntimeError( f'Bucket {bucket_name} is missing files for tables {missing_blobs}' ) load_jobs = [] for blob in blobs: table_name = _filename_to_table_name(blob.name) # ignore any non-vocabulary files if table_name not in VOCABULARY_TABLES: continue destination = dst_dataset.table(table_name) safe_schema = safe_schema_for(table_name) job_config = LoadJobConfig() job_config.schema = safe_schema job_config.skip_leading_rows = 1 job_config.field_delimiter = FIELD_DELIMITER job_config.max_bad_records = MAX_BAD_RECORDS job_config.source_format = 'CSV' job_config.quote_character = '' source_uri = f'gs://{bucket_name}/{blob.name}' load_job = bq_client.load_table_from_uri(source_uri, destination, job_config=job_config) LOGGER.info(f'table:{destination} job_id:{load_job.job_id}') load_jobs.append(load_job) load_job.result() return load_jobs
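# A minimal, hypothetical invocation of load_stage (the project id, dataset id and bucket name
# below are placeholders): it stages every vocabulary CSV found in the bucket into the
# destination dataset and returns the completed load jobs.
from google.cloud import bigquery, storage


def example_load_vocabulary(project_id: str = 'my-project') -> list:
    bq_client = bigquery.Client(project=project_id)
    gcs_client = storage.Client(project=project_id)
    dst_dataset = bigquery.Dataset(bq_client.dataset('vocabulary'))  # assumed dataset id
    jobs = load_stage(dst_dataset, bq_client, 'my-vocabulary-bucket', gcs_client)
    return [job.job_id for job in jobs]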
def load_tables(bq: bigquery.Client, dataset: bigquery.Dataset, tables: Iterable[Table]): """Load tables for a test.""" for table in tables: destination = dataset.table(table.name) job_config = bigquery.LoadJobConfig( source_format=table.source_format, write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE, ) if table.schema is None: # autodetect schema if not provided job_config.autodetect = True else: job_config.schema = table.schema # look for time_partitioning_field in provided schema for field in job_config.schema: if field.description == "time_partitioning_field": job_config.time_partitioning = bigquery.TimePartitioning( field=field.name) break # stop because there can only be one time partitioning field if isinstance(table.source_path, str): with open(table.source_path, "rb") as file_obj: job = bq.load_table_from_file(file_obj, destination, job_config=job_config) else: file_obj = BytesIO() for row in load(*table.source_path): file_obj.write( json.dumps(row, default=default_encoding).encode() + b"\n") file_obj.seek(0) job = bq.load_table_from_file(file_obj, destination, job_config=job_config) try: job.result() except BadRequest: # print the first 5 rows for debugging errors for row in job.errors[:5]: print(row) raise
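# Illustrative sketch of the schema convention load_tables relies on: setting a field's
# description to "time_partitioning_field" makes the loader partition the destination table on
# that column. The table and field names here are invented for the example.
from google.cloud import bigquery

EXAMPLE_PARTITIONED_SCHEMA = [
    bigquery.SchemaField("id", "INTEGER"),
    bigquery.SchemaField("submission_date", "DATE",
                         description="time_partitioning_field"),
    bigquery.SchemaField("payload", "STRING"),
]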
def create_bq_table( client: bigquery.Client, dataset: bigquery.Dataset, table_id: str, table_schema: List[bigquery.SchemaField], table_description: str = None, ) -> bigquery.Table: """ Create empty table. """ # TODO: validate 'table_id'. # note: it is not intuitive the dual instantiation of a 'Table' object. table = bigquery.Table(dataset.table(table_id), schema=table_schema) # type: bigquery.Table table.description = table_description # API request return client.create_table(table) # type: bigquery.Table
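# Hypothetical call to create_bq_table (the dataset, table id, schema and description are
# placeholders): build an empty table with an explicit schema and a human-readable description.
from google.cloud import bigquery


def example_create_events_table(client: bigquery.Client, dataset: bigquery.Dataset) -> bigquery.Table:
    schema = [
        bigquery.SchemaField('event_id', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('occurred_at', 'TIMESTAMP', mode='NULLABLE'),
    ]
    return create_bq_table(client, dataset, 'events', schema, table_description='Raw event log')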
def persist_lines_stream( project_id, dataset_id, lines=None, validate_records=True, key_path=None, ): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} if key_path: credentials = service_account.Credentials.from_service_account_file( key_path, scopes=['https://www.googleapis.com/auth/cloud-platform'], ) bigquery_client = bigquery.Client(credentials=credentials, project=project_id) else: bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset( Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error('Unable to parse:\n{}'.format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( 'A record for stream {} was encountered before a corresponding schema' .format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) errors[msg.stream] = bigquery_client.insert_rows_json( tables[msg.stream], [msg.record]) rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception('Unrecognized message {}'.format(msg)) for table in errors.keys(): if not errors[table]: logging.info('Loaded {} row(s) into {}:{}'.format( rows[table], dataset_id, table, tables[table].path)) emit_state(state) else: logging.error(errors[table]) return state
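# For context, a hedged sketch of the kind of Singer messages the function above consumes (the
# stream and field names are invented): a SCHEMA line must precede the RECORD lines for its
# stream, and STATE lines are echoed back via emit_state once rows have been loaded.
EXAMPLE_SINGER_LINES = [
    '{"type": "SCHEMA", "stream": "users", "key_properties": ["id"], '
    '"schema": {"type": "object", "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}}}',
    '{"type": "RECORD", "stream": "users", "record": {"id": 1, "name": "alice"}}',
    '{"type": "STATE", "value": {"bookmarks": {"users": {"id": 1}}}}',
]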
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp): """ Create data using Google BigQuery client and then check if Google BigQuery origin receives the data. The pipeline looks like: google_bigquery >> wiretap """ pipeline_builder = sdc_builder.get_pipeline_builder() dataset_name = get_random_string(ascii_letters, 5) table_name = get_random_string(ascii_letters, 5) google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin') query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id' google_bigquery.set_attributes(query=query_str) wiretap = pipeline_builder.add_wiretap() google_bigquery >> wiretap.destination pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client schema = [ SchemaField('id', 'INTEGER', mode='NULLABLE'), SchemaField('name', 'STRING', mode='NULLABLE'), SchemaField('floatVal', 'FLOAT', mode='NULLABLE'), SchemaField('numericVal', 'NUMERIC', mode='NULLABLE'), SchemaField('booleanVal', 'BOOLEAN', mode='NULLABLE'), SchemaField('dateVal', 'DATE', mode='NULLABLE'), SchemaField('datetimeVal', 'DATETIME', mode='NULLABLE'), SchemaField('timestampVal', 'TIMESTAMP', mode='NULLABLE'), SchemaField('timeVal', 'TIME', mode='NULLABLE'), SchemaField('bytesVal', 'BYTES', mode='NULLABLE') ] dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) try: # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table( Table(dataset_ref.table(table_name), schema=schema)) errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT) assert not errors # Start pipeline and verify correct rows are received. logger.info('Starting pipeline') sdc_executor.start_pipeline(pipeline).wait_for_finished() get_value_by_index = lambda x: list(x[0].field.values())[x[1]].value rows_from_wiretap = [ (int(get_value_by_index((record, 0))) if get_value_by_index( (record, 0)) is not None else None, get_value_by_index( (record, 1)), float(get_value_by_index( (record, 2))) if get_value_by_index( (record, 2)) is not None else None, str(get_value_by_index((record, 3))) if get_value_by_index( (record, 3)) is not None else None, get_value_by_index((record, 4)), get_value_by_index( (record, 5)).strftime("%Y-%m-%d") if get_value_by_index( (record, 5)) is not None else None, get_value_by_index( (record, 6)).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] if get_value_by_index( (record, 6)) is not None else None, '{} UTC'.format( get_value_by_index( (record, 7)).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]) if get_value_by_index((record, 7)) is not None else None, get_value_by_index( (record, 8)).strftime('%H:%M:%S.%f')[:-3] if get_value_by_index( (record, 8)) is not None else None, get_value_by_index((record, 9)) if get_value_by_index( (record, 9)) is not None else None) for record in wiretap.output_records ] assert rows_from_wiretap == ROWS_TO_INSERT finally: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def persist_lines_stream(project_id, dataset_id, ensure_ascii, lines=None, validate_records=True, array_nodes=[],
                         force_to_string_fields=[]):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = collections.defaultdict(list)
    data_holder = []
    lines_read = False
    stream = None

    if flags.no_records:
        no_records = int(flags.no_records)
    else:
        logger.info('Number of records not specified. Setting to maximum: {}'.format(MAX_NO_RECORDS))
        no_records = MAX_NO_RECORDS

    if flags.data_location:
        bigquery_client = bigquery.Client(project=project_id, location=flags.data_location)
    else:
        bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    payload_size = 0
    for line in lines:
        lines_read = True

        # skip SCHEMA messages (except for the initial one)
        if '{"anyOf": [{' in line:
            continue

        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]

            if is_record_deleted(msg.record, SDC_DELETED_AT):
                continue

            if validate_records:
                validate(msg.record, schema)

            modified_record = handle_decimal_values(msg.record)
            modified_record = handle_empty_arrays(array_nodes, modified_record)
            modified_record = force_fields_to_string(force_to_string_fields, modified_record, ensure_ascii)

            item_size = getsize(modified_record)
            if payload_size + item_size >= MAX_PAYLOAD_SIZE:
                logger.info('Near max request size. Sending: {} records, payload size: {}.'.format(
                    len(data_holder), payload_size))
                upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                if upload_res:
                    logger.error('Upload error: {}'.format(upload_res))
                else:
                    rows[msg.stream] += len(data_holder)
                data_holder = []
                payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size
            else:
                if len(data_holder) >= no_records:
                    logger.info(
                        "Max request size not reached, max #records reached. Sending: {} records, payload size: {} bytes.".format(
                            len(data_holder), item_size + payload_size))
                    upload_res = bigquery_client.insert_rows_json(tables[msg.stream], data_holder)
                    if upload_res:
                        logger.error('Upload error: {}'.format(upload_res))
                    else:
                        rows[msg.stream] += len(data_holder)
                    data_holder = []
                    payload_size = 0
                data_holder.append(modified_record)
                payload_size += item_size

            stream = msg.stream
            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table]))
            rows[table] = 0
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    if len(data_holder) > 0 and lines_read and stream:
        logger.info("Remaining records. Sending: {} records, payload size: {} bytes.".format(
            len(data_holder), payload_size))
        upload_res = bigquery_client.insert_rows_json(tables[stream], data_holder)
        if upload_res:
            logger.error('Upload error: {}'.format(upload_res))
        else:
            rows[stream] += len(data_holder)

    for table in errors.keys():
        if not errors[table]:
            logging.info('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logging.error('Errors: {}'.format(errors[table]))

    return state
def test_google_bigquery_destination_multiple_types(sdc_builder, sdc_executor, gcp): """Simple big query destination test with INSERT operation. The pipeline inserts 1000 records of multiple types. A type converter is included to transform decimal to float. The pipeline should look like: dev_data_generator >> field_type_converter >> [google_bigquery, wiretap.destination] """ pipeline_builder = sdc_builder.get_pipeline_builder() dev_data_generator = pipeline_builder.add_stage('Dev Data Generator') dev_data_generator.fields_to_generate = [{ 'field': 'field1', 'type': 'STRING' }, { 'field': 'field2', 'type': 'DATETIME' }, { 'field': 'field3', 'type': 'INTEGER' }, { 'field': 'field4', 'precision': 10, 'scale': 2, 'type': 'DECIMAL' }, { 'field': 'field5', 'type': 'DOUBLE' }] batch_size = 1000 dev_data_generator.set_attributes(delay_between_batches=0, batch_size=batch_size) dataset_name = get_random_string(ascii_letters, 5) table_name = get_random_string(ascii_letters, 5) google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='destination') google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name, stage_on_record_error='TO_ERROR') # Create Field Type Converter conversions = [{'fields': ['/field4'], 'targetType': 'FLOAT'}] field_type_converter = pipeline_builder.add_stage('Field Type Converter') field_type_converter.set_attributes( conversion_method='BY_FIELD', field_type_converter_configs=conversions) wiretap = pipeline_builder.add_wiretap() dev_data_generator >> field_type_converter >> [ google_bigquery, wiretap.destination ] pipeline = pipeline_builder.build() sdc_executor.add_pipeline(pipeline.configure_for_environment(gcp)) # FLOAT64 is used because there is a bug with NUMERIC, in bigquery Client bigquery_client = gcp.bigquery_client schema = [ SchemaField('field1', 'STRING', mode='required'), SchemaField('field2', 'DATETIME', mode='required'), SchemaField('field3', 'INTEGER', mode='required'), SchemaField('field4', 'FLOAT64', mode='required'), SchemaField('field5', 'FLOAT', mode='required') ] dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) try: logger.info('Creating dataset %s using Google BigQuery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table( Table(dataset_ref.table(table_name), schema=schema)) logger.info( 'Starting BigQuery Destination pipeline and waiting for it to produce records ...' ) sdc_executor.start_pipeline(pipeline) sdc_executor.wait_for_pipeline_metric(pipeline, 'input_record_count', 1_000, timeout_sec=60) sdc_executor.stop_pipeline(pipeline) # Verify by reading records using Google BigQuery client data_from_bigquery = [row for row in bigquery_client.list_rows(table)] assert len(data_from_bigquery) > batch_size assert len(wiretap.error_records) == 0 assert len(data_from_bigquery) == len(wiretap.output_records) assert [ element in data_from_bigquery for element in wiretap.output_records ] finally: logger.info('Dropping table %s in Google Big Query database ...', table_name) bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_data_type(sdc_builder, sdc_executor, gcp, origin_data_type, gcp_data_type, origin_data, gcp_expected_data): """ Create data using Google BigQuery destination with different data types and then check if it has been correctly created using BigQuery client. The pipeline looks like: dev_raw_data_source >> field_type_converter >> google_bigquery """ data_to_insert = [(1, origin_data)] data_to_expect = [(1, gcp_expected_data)] rows_to_insert = ['id,data'] + [','.join(str(element) for element in row) for row in data_to_insert] pipeline_builder = sdc_builder.get_pipeline_builder() dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source') dev_raw_data_source.set_attributes(data_format='DELIMITED', header_line='WITH_HEADER', stop_after_first_batch=True, raw_data='\n'.join(rows_to_insert)) field_type_converter = pipeline_builder.add_stage('Field Type Converter') field_type_converter.conversion_method = 'BY_FIELD' field_type_converter.field_type_converter_configs = [{ 'fields': ['/data'], 'targetType': origin_data_type, 'dataLocale': 'en,US', 'dateFormat': 'YYYY_MM_DD_HH_MM_SS', 'zonedDateTimeFormat': 'ISO_OFFSET_DATE_TIME', 'scale': 38 }] dataset_name = get_random_string(string.ascii_letters, 5) table_name = get_random_string(string.ascii_letters, 5) google_bigquery = pipeline_builder.add_stage(name=DESTINATION_STAGE_NAME, type='destination') google_bigquery.set_attributes(dataset=dataset_name, table_name=table_name) dev_raw_data_source >> field_type_converter >> google_bigquery pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client # We create data type for gcp, and insert it against that type to check if it works schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'), SchemaField('data', gcp_data_type, mode='NULLABLE')] try: dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema)) # Start pipeline and verify correct rows are received. logger.info('Starting pipeline') sdc_executor.start_pipeline(pipeline).wait_for_finished() data_from_bigquery = [tuple(row.values()) for row in bigquery_client.list_rows(table)] data_from_bigquery.sort() assert data_to_expect == data_from_bigquery finally: if dataset_ref: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_multiple_batch(sdc_builder, sdc_executor, gcp, number_batches):
    """
    Create data using Google BigQuery client and then check if Google BigQuery origin receives them
    across multiple batches using wiretap.
    The pipeline looks like:
        google_bigquery >> wiretap
    """
    number_records = 100
    batch_size = number_records // number_batches
    pipeline_builder = sdc_builder.get_pipeline_builder()

    dataset_name = get_random_string(string.ascii_letters, 5)
    table_name = get_random_string(string.ascii_letters, 5)
    google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin')
    google_bigquery.set_attributes(max_batch_size_in_records=batch_size)
    query_str = f"SELECT * FROM {dataset_name}.{table_name} ORDER BY id"
    google_bigquery.set_attributes(query=query_str)

    wiretap = pipeline_builder.add_wiretap()
    google_bigquery >> wiretap.destination

    pipeline = pipeline_builder.build().configure_for_environment(gcp)
    sdc_executor.add_pipeline(pipeline)

    bigquery_client = gcp.bigquery_client
    schema = [SchemaField('id', 'INTEGER', mode='NULLABLE'),
              SchemaField('data', 'STRING', mode='NULLABLE')]
    try:
        dataset_ref = Dataset(bigquery_client.dataset(dataset_name))

        # Using Google bigquery client, create dataset, table and data inside table
        logger.info('Creating dataset %s using Google bigquery client ...', dataset_name)
        bigquery_client.create_dataset(dataset_ref)
        table = bigquery_client.create_table(Table(dataset_ref.table(table_name), schema=schema))
        input_values = []
        for i in range(0, number_records):
            value = (i, get_random_string(string.ascii_letters, 5))
            input_values.append(value)
            errors = bigquery_client.insert_rows(table, [value])
            assert not errors, 'Errors found when creating data using bigquery client'

        # Start pipeline and verify correct rows are received.
        logger.info('Starting pipeline')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()

        assert len(wiretap.output_records) == number_records,\
            f'Expected {number_records} records, received {len(wiretap.output_records)}'
        records = [record.field for record in wiretap.output_records]
        assert len(records) >= number_batches

        batches = wiretap.batches
        len_records_in_batches = 0
        for bat in batches:
            len_records_in_batches = len_records_in_batches + len(bat)
        assert len(records) == len_records_in_batches
    finally:
        if dataset_ref:
            bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def test_google_bigquery_origin(sdc_builder, sdc_executor, gcp): """ Create data using Google BigQuery client and then check if Google BigQuery origin receives them using snapshot. The pipeline looks like: google_bigquery >> trash """ pipeline_builder = sdc_builder.get_pipeline_builder() dataset_name = get_random_string(ascii_letters, 5) table_name = get_random_string(ascii_letters, 5) google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin') query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY id' google_bigquery.set_attributes(query=query_str) trash = pipeline_builder.add_stage('Trash') google_bigquery >> trash pipeline = pipeline_builder.build( title='Google BigQuery').configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client schema = [ SchemaField('id', 'INTEGER', mode='NULLABLE'), SchemaField('name', 'STRING', mode='NULLABLE'), SchemaField('floatVal', 'FLOAT', mode='NULLABLE'), SchemaField('numericVal', 'NUMERIC', mode='NULLABLE'), SchemaField('booleanVal', 'BOOLEAN', mode='NULLABLE'), SchemaField('dateVal', 'DATE', mode='NULLABLE'), SchemaField('datetimeVal', 'DATETIME', mode='NULLABLE'), SchemaField('timestampVal', 'TIMESTAMP', mode='NULLABLE'), SchemaField('timeVal', 'TIME', mode='NULLABLE'), SchemaField('bytesVal', 'BYTES', mode='NULLABLE') ] dataset_ref = Dataset(bigquery_client.dataset(dataset_name)) try: # Using Google bigquery client, create dataset, table and data inside table logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) dataset = bigquery_client.create_dataset(dataset_ref) table = bigquery_client.create_table( Table(dataset_ref.table(table_name), schema=schema)) errors = bigquery_client.insert_rows(table, ROWS_TO_INSERT) assert not errors # Start pipeline and verify correct rows are received using snaphot. logger.info('Starting pipeline and snapshot') snapshot = sdc_executor.capture_snapshot(pipeline, start_pipeline=True).snapshot if sdc_executor.get_pipeline_status(pipeline) == 'RUNNING': sdc_executor.stop_pipeline(pipeline) rows_from_snapshot = [ (int(record.value['value'][0]['value']) if record.value['value'][0]['value'] is not None else None, record.value['value'][1]['value'], float(record.value['value'][2]['value']) if record.value['value'][2]['value'] is not None else None, record.value['value'][3]['value'], record.value['value'][4]['value'], time.strftime( '%Y-%m-%d', time.localtime(record.value['value'][5]['value'] / 1000)) if record.value['value'][5]['value'] is not None else None, datetime.fromtimestamp(record.value['value'][6]['value'] / 1000).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] if record.value['value'][6]['value'] is not None else None, '{} UTC'.format( datetime.utcfromtimestamp( record.value['value'][7]['value'] / 1000).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]) if record.value['value'][7]['value'] is not None else None, datetime.fromtimestamp(record.value['value'][8]['value'] / 1000).strftime('%H:%M:%S.%f')[:-3] if record.value['value'][8]['value'] is not None else None, base64.b64decode(record.value['value'][9]['value']) if record.value['value'][9]['value'] is not None else None) for record in snapshot[google_bigquery].output ] assert rows_from_snapshot == ROWS_TO_INSERT finally: bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
def persist_lines_stream(config, lines=None, validate_records=True): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} # bigquery_client = bigquery.Client(project=project_id) service = service_account.Credentials.from_service_account_file( config['key_file_location']) bigquery_client = bigquery.Client(project=config['project_id'], credentials=service) dataset_ref = bigquery_client.dataset(config['dataset_id']) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset( Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: js = json.loads(line) # msg = singer.parse_message(line) if js['type'] == 'RECORD': msg = singer.messages.RecordMessage(stream=js.get('stream'), record=js.get('record'), version=js.get('version'), time_extracted=None) else: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema" .format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) errors[msg.stream] = bigquery_client.insert_rows_json( tables[msg.stream], [msg.record]) rows[msg.stream] += 1 elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for table in errors.keys(): if not errors[table]: logger.info('Loaded {} row(s) into {}:{}'.format( rows[table], config['dataset_id'], table, tables[table].path)) emit_state(state) else: logger.error('Errors:', errors[table]) return state
def persist_lines_stream(project_id, dataset_id, credentials=None, lines=None, validate_records=True,
                         collision_suffix=None, current_batch=None):
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id, credentials=credentials)
    buffered_singer_stream = BufferedSingerStream(bigquery_client)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        pass

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(msg.stream))

            schema = schemas[msg.stream]
            current_time = calendar.timegm(time.gmtime())

            if validate_records:
                try:
                    validate(msg.record, schema)
                except:
                    logger.error("unable to validate {}".format(msg.record))

            # logger.info("Got the following schema - {}".format(schema))
            # logger.info("got the following metadata - {}".format(key_properties[msg.stream]))
            j = simplejson.dumps(msg.record)
            jparsed = simplejson.loads(j, use_decimal=False)
            jparsedFormated = formatRecord(jparsed)

            if msg.version:
                jparsedFormated[SINGER_TABLE_VERSION] = str(msg.version)
            if msg.time_extracted and jparsedFormated.get(SINGER_RECEIVED_AT) is None:
                jparsedFormated[SINGER_RECEIVED_AT] = str(msg.time_extracted)
            # if self.use_uuid_pk and record.get(singer.PK) is None:
            jparsedFormated[SINGER_PK] = buildPrimaryKey(jparsedFormated, key_properties[msg.stream])
            # logger.info("got the following primary key - {}".format(jparsedFormated[SINGER_PK]))
            jparsedFormated[SINGER_BATCHED_AT] = current_batch
            jparsedFormated[SINGER_SEQUENCE] = current_time

            # logger.info("Streaming for {}".format(jparsedFormated))
            buffered_singer_stream.add_record_message(tables[msg.stream], jparsedFormated)
            # err = bigquery_client.insert_rows_json(tables[msg.stream], [jparsedFormated],
            #                                        ignore_unknown_values=True, skip_invalid_rows=False)
            # if len(err):
            #     logger.error("Error syncing object {} with formatted payload {} got the following errors {}".format(
            #         msg.stream, jparsedFormated, err))
            #     errors[msg.stream] = err
            rows[msg.stream] += 1
            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug('Setting state to {}'.format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            # Some schemas come without properties, just ignore them
            if not 'properties' in msg.schema:
                continue
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            logger.info("Dealing with {}".format(table))
            logger.info("Table Schema Info - {}".format(dataset.table(table)))
            logger.info("Raw Schema Info - {}".format(schemas[table]))
            logger.info("Schema Info - {}".format(build_schema(schemas[table])))
            tables[table] = bigquery.Table(dataset.table(formatName(table, "table")),
                                           schema=build_schema(schemas[table], initial=True))
            rows[table] = 0
            errors[table] = None
            try:
                logger.info("creating table {}".format(tables[table]))
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    buffered_singer_stream.flush_buffer()

    for table in errors.keys():
        if not errors[table]:
            logging.info("Loaded {} row(s) into {}:{}".format(rows[table], dataset_id, table, tables[table].path))

            tableName = "{}.{}".format(dataset_id, table)
            sql = """
                MERGE {} t
                USING (
                    SELECT row[OFFSET(0)].* FROM (
                        SELECT ARRAY_AGG(t ORDER BY t._wly_batched_at DESC LIMIT 1) row
                        FROM {} t
                        GROUP BY t._wly_primary_key
                    )
                ) s
                ON t._wly_primary_key = s._wly_primary_key and t._wly_batched_at = s._wly_batched_at
                WHEN NOT MATCHED BY SOURCE THEN DELETE
                WHEN NOT MATCHED BY TARGET THEN INSERT ROW
            """.format(tableName, tableName)
            logging.info('Executing the final transformation into {}'.format(tableName))
            try:
                query_job = bigquery_client.query(sql)
                results = query_job.result()
                if results:
                    logger.info("Cleaning table {} with sql query {} got the following results {}".format(
                        tableName, sql, results))
            except:
                logger.error("Failed Cleaning the Table {}".format(tableName))

            emit_state(state)
        else:
            logging.error('Errors: {}'.format(errors[table]))

    return state
def persist_lines_stream(project_id, dataset_id, lines=None): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception("A record for stream {} was encountered before a corresponding schema".format(msg.stream)) schema = schemas[msg.stream] validate(msg.record, schema) errors[msg.stream] = bigquery_client.create_rows(tables[msg.stream], [msg.record]) rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass else: raise Exception("Unrecognized message {}".format(msg)) for table in errors.keys(): if not errors[table]: print('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table), tables[table].path) else: print('Errors:', errors[table], sep=" ") return state
def test_google_bigquery_origin_batch_handling(sdc_builder, sdc_executor, gcp): """Verify proper batch handling by the BigQuery origin. In this test, we write 8 records to BigQuery with a batch size of 3, verifying that each batch is of size 3 and that 8 total records are captured. The small numbers are used because of the limitations described in SDC-8765. """ MAX_BATCH_SIZE = 3 TOTAL_RECORDS = 8 dataset_name = get_random_string(ascii_letters, 5) table_name = get_random_string(ascii_letters, 5) query_str = f'SELECT * FROM {dataset_name}.{table_name} ORDER BY number' pipeline_builder = sdc_builder.get_pipeline_builder() google_bigquery = pipeline_builder.add_stage('Google BigQuery', type='origin') google_bigquery.set_attributes(query=query_str, max_batch_size_in_records=MAX_BATCH_SIZE) wiretap = pipeline_builder.add_wiretap() google_bigquery >> wiretap.destination pipeline = pipeline_builder.build().configure_for_environment(gcp) sdc_executor.add_pipeline(pipeline) bigquery_client = gcp.bigquery_client schema = [ SchemaField('number', 'STRING', mode='required'), SchemaField('value', 'STRING', mode='required') ] data = [ dict(number=str(i), value=get_random_string()) for i in range(TOTAL_RECORDS) ] dataset = Dataset(bigquery_client.dataset(dataset_name)) try: # Using Google bigquery client, create dataset, table and data inside table. logger.info('Creating dataset %s using Google bigquery client ...', dataset_name) bigquery_client.create_dataset(dataset) table = bigquery_client.create_table( Table(dataset.table(table_name), schema=schema)) errors = bigquery_client.insert_rows(table, data) assert not errors sdc_executor.start_pipeline(pipeline).wait_for_finished() history = sdc_executor.get_pipeline_history(pipeline) # Assert that the batch size is being respected in each batch (including the last). In this case, # we'd expect batch sizes of 3, 3, and 2. assert history.latest.metrics.counter( 'pipeline.batchCount.counter').count == 3 assert history.latest.metrics.histogram( 'stage.GoogleBigQuery_01.outputRecords.histogramM5' )._data['count'] == 3 assert history.latest.metrics.histogram( 'stage.GoogleBigQuery_01.outputRecords.histogramM5' )._data['max'] == 3 assert history.latest.metrics.histogram( 'stage.GoogleBigQuery_01.outputRecords.histogramM5' )._data['min'] == 2 records = [record.field for record in wiretap.output_records] assert len(records) == len(data) assert records == data finally: bigquery_client.delete_dataset(dataset, delete_contents=True)
class LoadVocabTest(unittest.TestCase): @classmethod def setUpClass(cls): print('**************************************************************') print(cls.__name__) print('**************************************************************') def setUp(self): project_id = 'fake_project_id' dataset_id = 'fake_dataset_id' bucket_name = 'fake_bucket' dataset_ref = DatasetReference(project_id, dataset_id) self.dst_dataset = Dataset(dataset_ref) self.bq_client = mock.MagicMock() self.gcs_client = mock.MagicMock() self.bucket_name = bucket_name self.all_blobs = [ Blob(f'{table}.csv', self.bucket_name) for table in common.VOCABULARY_TABLES ] def test_get_release_date(self): release_date = datetime.date(2021, 2, 10) expected = '20210210' actual = load_vocab.get_release_date(release_date) self.assertEqual(expected, actual) def test_load_stage(self): # TODO check that extra files are skipped # the expected calls to load_table_from_uri # are made when all vocabulary files are present all_blobs = [ Blob(f'{table}.csv', self.bucket_name) for table in common.VOCABULARY_TABLES ] self.gcs_client.list_blobs.return_value = all_blobs load_vocab.load_stage(self.dst_dataset, self.bq_client, self.bucket_name, self.gcs_client) mock_ltfu = self.bq_client.load_table_from_uri expected_calls = [(f'gs://{self.bucket_name}/{table}.csv', self.dst_dataset.table(table)) for table in common.VOCABULARY_TABLES] actual_calls = [(source_uri, destination) for (source_uri, destination), _ in mock_ltfu.call_args_list] self.assertListEqual(expected_calls, actual_calls) # error is thrown when vocabulary files are missing expected_missing = [common.DOMAIN, common.CONCEPT_SYNONYM] incomplete_blobs = [ Blob(f'{table}.csv', self.bucket_name) for table in common.VOCABULARY_TABLES if table not in expected_missing ] self.gcs_client.list_blobs.return_value = incomplete_blobs expected_msg = f'Bucket {self.bucket_name} is missing files for tables {expected_missing}' with self.assertRaises(RuntimeError) as c: load_vocab.load_stage(self.dst_dataset, self.bq_client, self.bucket_name, self.gcs_client) self.assertIsInstance(c.exception, RuntimeError) self.assertEqual(str(c.exception), expected_msg)
def persist_lines_stream(project_id, dataset_id, lines=None, validate_records=True): state = None schemas = {} key_properties = {} tables = {} rows = {} errors = {} bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset(Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema".format(msg.stream)) schema = schemas[msg.stream] if validate_records: validate(msg.record, schema) msg.record = apply_string_conversions(msg.record, tables[msg.stream].schema, msg.stream) errors[msg.stream] = bigquery_client.insert_rows_json(tables[msg.stream], [msg.record], ignore_unknown_values=True) rows[msg.stream] += 1 state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): table = msg.stream schemas[table] = msg.schema key_properties[table] = msg.key_properties tables[table] = bigquery.Table(dataset.table(table), schema=build_schema(schemas[table])) rows[table] = 0 errors[table] = None try: tables[table] = bigquery_client.create_table(tables[table]) except exceptions.Conflict: pass elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) for table in errors.keys(): if not errors[table]: logging.info('Loaded {} row(s) into {}:{}'.format(rows[table], dataset_id, table, tables[table].path)) emit_state(state) else: logging.error('Errors:', errors[table]) return state
def persist_lines(project_id, dataset_id, table_id, lines): state = None schemas = {} key_properties = {} rows = [] for line in lines: try: msg = singer.parse_message(line) except json.decoder.JSONDecodeError: logger.error("Unable to parse:\n{}".format(line)) raise if isinstance(msg, singer.RecordMessage): if msg.stream not in schemas: raise Exception( "A record for stream {} was encountered before a corresponding schema" .format(msg.stream)) schema = schemas[msg.stream] validate(msg.record, schema) bigquery_client = bigquery.Client(project=project_id) dataset_ref = bigquery_client.dataset(dataset_id) dataset = Dataset(dataset_ref) try: dataset = bigquery_client.create_dataset( Dataset(dataset_ref)) or Dataset(dataset_ref) except exceptions.Conflict: pass table_ref = dataset.table(table_id) table_schema = build_schema(schema) table = bigquery.Table(table_ref, schema=table_schema) try: table = bigquery_client.create_table(table) except exceptions.Conflict: pass rows.append(msg.record) state = None elif isinstance(msg, singer.StateMessage): logger.debug('Setting state to {}'.format(msg.value)) state = msg.value elif isinstance(msg, singer.SchemaMessage): schemas[msg.stream] = msg.schema key_properties[msg.stream] = msg.key_properties elif isinstance(msg, singer.ActivateVersionMessage): # This is experimental and won't be used yet pass else: raise Exception("Unrecognized message {}".format(msg)) errors = bigquery_client.create_rows(table, rows) if not errors: print('Loaded {} row(s) into {}:{}'.format(len(rows), dataset_id, table_id)) else: print('Errors:') pprint(errors) return state