def test_value_provider_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  schema = {'fields': [
      {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
      {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

  additional_bq_parameters = {
      'timePartitioning': {'type': 'DAY'},
      'clustering': {'fields': ['language']}}

  table_ref = bigquery_tools.parse_table_reference(output_table_1)
  table_ref2 = bigquery_tools.parse_table_reference(output_table_2)

  pipeline_verifiers = [
      BigQueryTableMatcher(
          project=self.project,
          dataset=table_ref.datasetId,
          table=table_ref.tableId,
          expected_properties=additional_bq_parameters),
      BigQueryTableMatcher(
          project=self.project,
          dataset=table_ref2.datasetId,
          table=table_ref2.tableId,
          expected_properties=additional_bq_parameters),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_1,
          data=[(d['name'], d['language'])
                for d in _ELEMENTS if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_2,
          data=[(d['name'], d['language'])
                for d in _ELEMENTS if 'language' in d])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

    _ = (
        input
        | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=value_provider.StaticValueProvider(
                str, '%s:%s' % (self.project, output_table_1)),
            schema=value_provider.StaticValueProvider(dict, schema),
            additional_bq_parameters=additional_bq_parameters,
            method='STREAMING_INSERTS'))

    _ = (
        input
        | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=value_provider.StaticValueProvider(
                str, '%s:%s' % (self.project, output_table_2)),
            schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
            additional_bq_parameters=lambda _: additional_bq_parameters,
            method='FILE_LOADS'))
def test_value_provider_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  schema = {
      'fields': [{
          'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'
      }]
  }

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_1,
          data=[(d['name'], d['language'])
                for d in _ELEMENTS if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_2,
          data=[(d['name'], d['language'])
                for d in _ELEMENTS if 'language' in d])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create(
        [row for row in _ELEMENTS if 'language' in row])

    _ = (
        input
        | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=value_provider.StaticValueProvider(str, output_table_1),
            schema=value_provider.StaticValueProvider(dict, schema),
            method='STREAMING_INSERTS'))

    _ = (
        input
        | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=value_provider.StaticValueProvider(str, output_table_2),
            method='FILE_LOADS'))
def _value_provider_or_static_val(elm):
  if isinstance(elm, value_provider.ValueProvider):
    return elm
  else:
    # The type argument is a NoOp, because we assume the argument already has
    # the proper formatting.
    return value_provider.StaticValueProvider(lambda x: x, value=elm)
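# A minimal sketch (not part of the source) illustrating the helper above:
# plain values are wrapped via an identity "type" callable so get() returns
# them unchanged, while existing ValueProviders pass through untouched. The
# table string used here is made up purely for illustration.
from apache_beam.options import value_provider

wrapped = _value_provider_or_static_val('my-project:my_dataset.my_table')
assert wrapped.get() == 'my-project:my_dataset.my_table'

already_wrapped = value_provider.StaticValueProvider(
    str, 'my-project:my_dataset.my_table')
assert _value_provider_or_static_val(already_wrapped) is already_wrapped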
def __init__(
    self,
    destination,
    schema=None,
    custom_gcs_temp_location=None,
    create_disposition=None,
    write_disposition=None,
    triggering_frequency=None,
    with_auto_sharding=False,
    temp_file_format=None,
    max_file_size=None,
    max_files_per_bundle=None,
    max_partition_size=None,
    max_files_per_partition=None,
    additional_bq_parameters=None,
    table_side_inputs=None,
    schema_side_inputs=None,
    test_client=None,
    validate=True,
    is_streaming_pipeline=False,
    load_job_project_id=None):
  self.destination = destination
  self.create_disposition = create_disposition
  self.write_disposition = write_disposition
  self.triggering_frequency = triggering_frequency
  self.with_auto_sharding = with_auto_sharding
  self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
  self.max_files_per_bundle = (
      max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
  self.max_partition_size = max_partition_size or _MAXIMUM_LOAD_SIZE
  self.max_files_per_partition = (
      max_files_per_partition or _MAXIMUM_SOURCE_URIS)
  if (isinstance(custom_gcs_temp_location, str) or
      custom_gcs_temp_location is None):
    self._custom_gcs_temp_location = vp.StaticValueProvider(
        str, custom_gcs_temp_location or '')
  elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
    self._custom_gcs_temp_location = custom_gcs_temp_location
  else:
    raise ValueError('custom_gcs_temp_location must be str or ValueProvider')

  self.test_client = test_client
  self.schema = schema
  self._temp_file_format = temp_file_format or bigquery_tools.FileFormat.JSON

  # If we have multiple destinations, then we will have multiple load jobs,
  # thus we will need temporary tables for atomicity.
  self.dynamic_destinations = bool(callable(destination))

  self.additional_bq_parameters = additional_bq_parameters or {}
  self.table_side_inputs = table_side_inputs or ()
  self.schema_side_inputs = schema_side_inputs or ()
  self.is_streaming_pipeline = is_streaming_pipeline
  self.load_job_project_id = load_job_project_id
  self._validate = validate
  if self._validate:
    self.verify()
def __init__(self,
             destination,
             schema=None,
             custom_gcs_temp_location=None,
             create_disposition=None,
             write_disposition=None,
             triggering_frequency=None,
             coder=None,
             max_file_size=None,
             max_files_per_bundle=None,
             additional_bq_parameters=None,
             table_side_inputs=None,
             schema_side_inputs=None,
             test_client=None,
             validate=True,
             is_streaming_pipeline=False):
  self.destination = destination
  self.create_disposition = create_disposition
  self.write_disposition = write_disposition
  self.triggering_frequency = triggering_frequency
  self.max_file_size = max_file_size or _DEFAULT_MAX_FILE_SIZE
  self.max_files_per_bundle = (
      max_files_per_bundle or _DEFAULT_MAX_WRITERS_PER_BUNDLE)
  if (isinstance(custom_gcs_temp_location, str) or
      custom_gcs_temp_location is None):
    self._custom_gcs_temp_location = vp.StaticValueProvider(
        str, custom_gcs_temp_location or '')
  elif isinstance(custom_gcs_temp_location, vp.ValueProvider):
    self._custom_gcs_temp_location = custom_gcs_temp_location
  else:
    raise ValueError(
        'custom_gcs_temp_location must be str or ValueProvider')

  self.test_client = test_client
  self.schema = schema
  self.coder = coder or bigquery_tools.RowAsDictJsonCoder()

  # If we have multiple destinations, then we will have multiple load jobs,
  # thus we will need temporary tables for atomicity.
  # If the destination is a single one, we assume that we will have only one
  # job to run - and thus we avoid using temporary tables.
  self.temp_tables = True if callable(destination) else False

  self.additional_bq_parameters = additional_bq_parameters or {}
  self.table_side_inputs = table_side_inputs or ()
  self.schema_side_inputs = schema_side_inputs or ()
  self.is_streaming_pipeline = is_streaming_pipeline
  self._validate = validate
  if self._validate:
    self.verify()
def test_to_from_runner_api(self): """Tests that serialization of WriteToBigQuery is correct. This is not intended to be a change-detector test. As such, this only tests the more complicated serialization logic of parameters: ValueProviders, callables, and side inputs. """ FULL_OUTPUT_TABLE = 'test_project:output_table' p = TestPipeline() # Used for testing side input parameters. table_record_pcv = beam.pvalue.AsDict( p | "MakeTable" >> beam.Create([('table', FULL_OUTPUT_TABLE)])) # Used for testing value provider parameters. schema = value_provider.StaticValueProvider(str, '"a:str"') original = WriteToBigQuery( table=lambda _, side_input: side_input['table'], table_side_inputs=(table_record_pcv, ), schema=schema) # pylint: disable=expression-not-assigned p | 'MyWriteToBigQuery' >> original # Run the pipeline through to generate a pipeline proto from an empty # context. This ensures that the serialization code ran. pipeline_proto, context = TestPipeline.from_runner_api( p.to_runner_api(), p.runner, p.get_pipeline_options()).to_runner_api( return_context=True) # Find the transform from the context. write_to_bq_id = [ k for k, v in pipeline_proto.components.transforms.items() if v.unique_name == 'MyWriteToBigQuery' ][0] deserialized_node = context.transforms.get_by_id(write_to_bq_id) deserialized = deserialized_node.transform self.assertIsInstance(deserialized, WriteToBigQuery) # Test that the serialization of a value provider is correct. self.assertEqual(original.schema, deserialized.schema) # Test that the serialization of a callable is correct. self.assertEqual( deserialized._table(None, {'table': FULL_OUTPUT_TABLE}), FULL_OUTPUT_TABLE) # Test that the serialization of a side input is correct. self.assertEqual( len(original.table_side_inputs), len(deserialized.table_side_inputs)) original_side_input_data = original.table_side_inputs[0]._side_input_data() deserialized_side_input_data = deserialized.table_side_inputs[ 0]._side_input_data() self.assertEqual( original_side_input_data.access_pattern, deserialized_side_input_data.access_pattern) self.assertEqual( original_side_input_data.window_mapping_fn, deserialized_side_input_data.window_mapping_fn) self.assertEqual( original_side_input_data.view_fn, deserialized_side_input_data.view_fn)