def test_multiple_destinations_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  output_table_3 = '%s%s' % (self.output_table, 3)
  output_table_4 = '%s%s' % (self.output_table, 4)
  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_1,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_2,
          data=[(d['name'], d['foundation']) for d in _ELEMENTS
                if 'foundation' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_3,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_4,
          data=[(d['name'], d['foundation']) for d in _ELEMENTS
                if 'foundation' in d])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create(_ELEMENTS)

    # Get all input in same machine
    input = (input
             | beam.Map(lambda x: (None, x))
             | beam.GroupByKey()
             | beam.FlatMap(lambda elm: elm[1]))

    _ = (input
         | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
             table=lambda x:
             (output_table_1 if 'language' in x else output_table_2),
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    _ = (input
         | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
             table=lambda x:
             (output_table_3 if 'language' in x else output_table_4),
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
             max_file_size=20,
             max_files_per_bundle=-1))
def test_value_provider_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  schema = {'fields': [
      {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
      {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

  additional_bq_parameters = {
      'timePartitioning': {'type': 'DAY'},
      'clustering': {'fields': ['language']}}

  table_ref = bigquery_tools.parse_table_reference(output_table_1)
  table_ref2 = bigquery_tools.parse_table_reference(output_table_2)

  pipeline_verifiers = [
      BigQueryTableMatcher(
          project=self.project,
          dataset=table_ref.datasetId,
          table=table_ref.tableId,
          expected_properties=additional_bq_parameters),
      BigQueryTableMatcher(
          project=self.project,
          dataset=table_ref2.datasetId,
          table=table_ref2.tableId,
          expected_properties=additional_bq_parameters),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_1,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_2,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

    _ = (input
         | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
             table=value_provider.StaticValueProvider(
                 str, '%s:%s' % (self.project, output_table_1)),
             schema=value_provider.StaticValueProvider(dict, schema),
             additional_bq_parameters=additional_bq_parameters,
             method='STREAMING_INSERTS'))
    _ = (input
         | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
             table=value_provider.StaticValueProvider(
                 str, '%s:%s' % (self.project, output_table_2)),
             schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
             additional_bq_parameters=lambda _: additional_bq_parameters,
             method='FILE_LOADS'))
def test_one_job_fails_all_jobs_fail(self):
  # If one of the import jobs fails, then other jobs must not be performed.
  # This is to avoid reinsertion of some records when a pipeline fails and
  # is rerun.
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)

  self.bigquery_client.get_or_create_table(
      self.project, self.dataset_id, output_table_1.split('.')[1],
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
      None, None)
  self.bigquery_client.get_or_create_table(
      self.project, self.dataset_id, output_table_2.split('.')[1],
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
      None, None)

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_1,
          data=[]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, foundation FROM %s" % output_table_2,
          data=[])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      experiments='use_beam_bq_sink')

  with self.assertRaises(Exception):
    # The pipeline below fails because neither a schema nor
    # SCHEMA_AUTODETECT is specified.
    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)
      input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

      input = (input, input2) | beam.Flatten()

      _ = (input
           | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x:
               (output_table_1 if 'language' in x else output_table_2),
               create_disposition=(
                   beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
               write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
               temp_file_format=bigquery_tools.FileFormat.JSON))

  hamcrest_assert(p, all_of(*pipeline_verifiers))
def test_multiple_destinations_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)

  full_output_table_1 = '%s:%s' % (self.project, output_table_1)
  full_output_table_2 = '%s:%s' % (self.project, output_table_2)

  schema1 = {'fields': [
      {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
      {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}
  schema2 = {'fields': [
      {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
      {'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'}]}

  bad_record = {'language': 1, 'manguage': 2}

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_1,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_2,
          data=[(d['name'], d['foundation']) for d in _ELEMENTS
                if 'foundation' in d])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create(_ELEMENTS)

    input2 = p | "Broken record" >> beam.Create([bad_record])

    input = (input, input2) | beam.Flatten()

    r = (input
         | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
             table=lambda x:
             (full_output_table_1
              if 'language' in x else full_output_table_2),
             schema=lambda dest:
             (schema1 if dest == full_output_table_1 else schema2),
             method='STREAMING_INSERTS'))

    assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                equal_to([(full_output_table_1, bad_record)]))
def test_value_provider_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  schema = {
      'fields': [{
          'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'
      }]
  }

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_1,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_2,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

    _ = (
        input
        | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=value_provider.StaticValueProvider(str, output_table_1),
            schema=value_provider.StaticValueProvider(dict, schema),
            method='STREAMING_INSERTS'))
    _ = (
        input
        | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
            table=value_provider.StaticValueProvider(str, output_table_2),
            method='FILE_LOADS'))
def test_big_query_write_new_types(self):
  table_name = 'python_new_types_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  row_data = {
      'float': 0.33,
      'numeric': Decimal('10'),
      'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
      'date': '3000-12-31',
      'time': '23:59:59',
      'datetime': '2018-12-31T12:44:31',
      'timestamp': '2018-12-31 12:44:31.744957 UTC',
      'geo': 'POINT(30 10)'
  }

  input_data = [row_data]
  # add rows with only one key value pair and None values for all other keys
  for key, value in iteritems(row_data):
    input_data.append({key: value})

  table_schema = {"fields": [
      {"name": "float", "type": "FLOAT"},
      {"name": "numeric", "type": "NUMERIC"},
      {"name": "bytes", "type": "BYTES"},
      {"name": "date", "type": "DATE"},
      {"name": "time", "type": "TIME"},
      {"name": "datetime", "type": "DATETIME"},
      {"name": "timestamp", "type": "TIMESTAMP"},
      {"name": "geo", "type": "GEOGRAPHY"}
  ]}

  expected_row = (0.33, Decimal('10'), b'\xab\xac',
                  datetime.date(3000, 12, 31),
                  datetime.time(23, 59, 59),
                  datetime.datetime(2018, 12, 31, 12, 44, 31),
                  datetime.datetime(2018, 12, 31, 12, 44, 31, 744957,
                                    tzinfo=pytz.utc),
                  'POINT(30 10)',
                  )

  expected_data = [expected_row]

  # add rows with only one key value pair and None values for all other keys
  for i, value in enumerate(expected_row):
    row = [None] * len(expected_row)
    row[i] = value
    expected_data.append(tuple(row))

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query='SELECT float, numeric, bytes, date, time, datetime,'
                'timestamp, geo FROM %s' % table_id,
          data=expected_data)]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
def test_big_query_write_schema_autodetect(self):
  if self.runner_name == 'TestDataflowRunner':
    self.skipTest('DataflowRunner does not support schema autodetection')

  table_name = 'python_write_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [
      {'number': 1, 'str': 'abc'},
      {'number': 2, 'str': 'def'},
  ]

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT number, str FROM %s" % table_id,
          data=[(1, 'abc',), (2, 'def',)])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
         schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
def test_big_query_write(self):
  table_name = 'python_write_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [
      {'number': 1, 'str': 'abc'},
      {'number': 2, 'str': 'def'},
      {'number': 3, 'str': u'你好'},
      {'number': 4, 'str': u'привет'},
  ]
  table_schema = {"fields": [
      {"name": "number", "type": "INTEGER"},
      {"name": "str", "type": "STRING"}]}

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT number, str FROM %s" % table_id,
          data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
def test_big_query_write_without_schema(self):
  table_name = 'python_no_schema_table'
  self.create_table(table_name)
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [{
      'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'
  }, {
      'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'
  }, {
      'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
      'date': '3000-12-31',
      'time': '23:59:59'
  }, {
      'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'
  }]
  # bigquery io expects bytes to be base64 encoded values
  for row in input_data:
    row['bytes'] = base64.b64encode(row['bytes'])

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT bytes, date, time FROM %s" % table_id,
          data=[(
              b'xyw',
              datetime.date(2011, 1, 1),
              datetime.time(23, 59, 59, 999999),
          ), (
              b'abc',
              datetime.date(2000, 1, 1),
              datetime.time(0, 0, 0),
          ), (
              b'\xe4\xbd\xa0\xe5\xa5\xbd',
              datetime.date(3000, 12, 31),
              datetime.time(23, 59, 59),
          ), (
              b'\xab\xac\xad',
              datetime.date(2000, 1, 1),
              datetime.time(0, 0, 0),
          )])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (
        p
        | 'create' >> beam.Create(input_data)
        | 'write' >> beam.io.WriteToBigQuery(
            table_id,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            temp_file_format=FileFormat.JSON))
def test_big_query_write_new_types(self):
  table_name = 'python_new_types_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [
      {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
      {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
      {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
       'date': '3000-12-31',
       'time': '23:59:59'},
      {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
  ]
  # bigquery io expects bytes to be base64 encoded values
  for row in input_data:
    row['bytes'] = base64.b64encode(row['bytes'])

  table_schema = {"fields": [
      {"name": "bytes", "type": "BYTES"},
      {"name": "date", "type": "DATE"},
      {"name": "time", "type": "TIME"}]}

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT bytes, date, time FROM %s" % table_id,
          data=[(b'xyw', datetime.date(2011, 1, 1),
                 datetime.time(23, 59, 59, 999999), ),
                (b'abc', datetime.date(2000, 1, 1),
                 datetime.time(0, 0, 0), ),
                (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                 datetime.time(23, 59, 59), ),
                (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                 datetime.time(0, 0, 0), )])]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))
def test_big_query_write_temp_table_append_schema_update(self):
  """
  Test that schema update options are respected when appending to an
  existing table via temporary tables.

  _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
  load jobs and usage of temporary tables.
  """
  table_name = 'python_append_schema_update'
  self.create_table(table_name)
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [{"int64": 1, "bool": True}, {"int64": 2, "bool": False}]
  table_schema = {
      "fields": [{
          "name": "int64", "type": "INT64"
      }, {
          "name": "bool", "type": "BOOL"
      }]
  }

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT bytes, date, time, int64, bool FROM %s" % table_id,
          data=[(None, None, None, 1, True), (None, None, None, 2, False)]))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         max_file_size=1,  # bytes
         method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
         additional_bq_parameters={
             'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION']
         }))
def test_multiple_destinations_transform(self):
  streaming = self.test_pipeline.options.view_as(StandardOptions).streaming
  if streaming and isinstance(self.test_pipeline.runner, TestDataflowRunner):
    self.skipTest("TestStream is not supported on TestDataflowRunner")

  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)

  full_output_table_1 = '%s:%s' % (self.project, output_table_1)
  full_output_table_2 = '%s:%s' % (self.project, output_table_2)

  schema1 = {
      'fields': [{
          'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'
      }]
  }
  schema2 = {
      'fields': [{
          'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'
      }, {
          'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'
      }]
  }

  bad_record = {'language': 1, 'manguage': 2}

  if streaming:
    pipeline_verifiers = [
        PipelineStateMatcher(PipelineState.RUNNING),
        BigqueryFullResultStreamingMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language']) for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultStreamingMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[(d['name'], d['foundation']) for d in _ELEMENTS
                  if 'foundation' in d])
    ]
  else:
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language']) for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[(d['name'], d['foundation']) for d in _ELEMENTS
                  if 'foundation' in d])
    ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    if streaming:
      _SIZE = len(_ELEMENTS)
      test_stream = (
          TestStream().advance_watermark_to(0).add_elements(
              _ELEMENTS[:_SIZE // 2]).advance_watermark_to(100).add_elements(
                  _ELEMENTS[_SIZE // 2:]).advance_watermark_to_infinity())
      input = p | test_stream
    else:
      input = p | beam.Create(_ELEMENTS)

    schema_table_pcv = beam.pvalue.AsDict(
        p | "MakeSchemas" >> beam.Create([(full_output_table_1, schema1),
                                          (full_output_table_2, schema2)]))

    table_record_pcv = beam.pvalue.AsDict(
        p | "MakeTables" >> beam.Create([('table1', full_output_table_1),
                                         ('table2', full_output_table_2)]))

    input2 = p | "Broken record" >> beam.Create([bad_record])

    input = (input, input2) | beam.Flatten()

    r = (input
         | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
             table=lambda x, tables:
             (tables['table1'] if 'language' in x else tables['table2']),
             table_side_inputs=(table_record_pcv, ),
             schema=lambda dest, table_map: table_map.get(dest, None),
             schema_side_inputs=(schema_table_pcv, ),
             insert_retry_strategy=RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
             method='STREAMING_INSERTS'))

    assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                equal_to([(full_output_table_1, bad_record)]))
def test_multiple_destinations_transform(self):
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)
  output_table_3 = '%s%s' % (self.output_table, 3)
  output_table_4 = '%s%s' % (self.output_table, 4)
  schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
  schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2))

  schema_kv_pairs = [(output_table_1, schema1), (output_table_2, schema2),
                     (output_table_3, schema1), (output_table_4, schema2)]
  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_1,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, foundation FROM %s" % output_table_2,
          data=[(d['name'], d['foundation']) for d in _ELEMENTS
                if 'foundation' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_3,
          data=[(d['name'], d['language']) for d in _ELEMENTS
                if 'language' in d]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, foundation FROM %s" % output_table_4,
          data=[(d['name'], d['foundation']) for d in _ELEMENTS
                if 'foundation' in d])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=all_of(*pipeline_verifiers),
      experiments='use_beam_bq_sink')

  with beam.Pipeline(argv=args) as p:
    input = p | beam.Create(_ELEMENTS)

    schema_map_pcv = beam.pvalue.AsDict(
        p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

    table_record_pcv = beam.pvalue.AsDict(
        p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                         ('table2', output_table_2)]))

    # Get all input in same machine
    input = (input
             | beam.Map(lambda x: (None, x))
             | beam.GroupByKey()
             | beam.FlatMap(lambda elm: elm[1]))

    _ = (input
         | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
             table=lambda x, tables:
             (tables['table1'] if 'language' in x else tables['table2']),
             table_side_inputs=(table_record_pcv, ),
             schema=lambda dest, schema_map: schema_map.get(dest, None),
             schema_side_inputs=(schema_map_pcv, ),
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    _ = (input
         | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
             table=lambda x:
             (output_table_3 if 'language' in x else output_table_4),
             schema=lambda dest, schema_map: schema_map.get(dest, None),
             schema_side_inputs=(schema_map_pcv, ),
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
             max_file_size=20,
             max_files_per_bundle=-1))
def test_big_query_write_temp_table_append_schema_update(self, file_format):
  """
  Test that nested schema update options and schema relaxation
  are respected when appending to an existing table via temporary tables.

  _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
  load jobs and usage of temporary tables.
  """
  table_name = 'python_append_schema_update'
  self.create_table(table_name)
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  # bytes, date, time fields are optional and omitted in the test
  # only required and new columns are specified
  table_schema = {
      "fields": [{
          "name": "int64",
          "type": "INT64",
          "mode": "NULLABLE",
      }, {
          "name": "bool",
          "type": "BOOL",
      }, {
          "name": "nested_field",
          "type": "RECORD",
          "mode": "REPEATED",
          "fields": [
              {
                  "name": "fruit", "type": "STRING", "mode": "NULLABLE"
              },
          ]
      }]
  }
  input_data = [{
      "int64": 1, "bool": True, "nested_field": [{"fruit": "Apple"}]
  }, {
      "bool": False, "nested_field": [{"fruit": "Mango"}]
  }, {
      "int64": None, "bool": True, "nested_field": [{"fruit": "Banana"}]
  }]
  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=BigqueryFullResultMatcher(
          project=self.project,
          query="""
          SELECT bytes, date, time, int64, bool, fruit
          FROM {},
          UNNEST(nested_field) as nested_field
          ORDER BY fruit
          """.format(table_id),
          data=[(None, None, None, 1, True, "Apple"),
                (None, None, None, None, True, "Banana"),
                (None, None, None, None, False, "Mango")]))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    (p | 'create' >> beam.Create(input_data)
     | 'write' >> beam.io.WriteToBigQuery(
         table_id,
         schema=table_schema,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         max_file_size=1,  # bytes
         method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
         additional_bq_parameters={
             'schemaUpdateOptions': [
                 'ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'
             ]
         },
         temp_file_format=file_format))
def test_big_query_write_insert_errors_reporting(self):
  """
  Test that errors returned by beam.io.WriteToBigQuery contain both the
  failed rows and the reason for the failure.
  """
  table_name = 'python_write_table'
  table_id = '{}.{}'.format(self.dataset_id, table_name)

  input_data = [{
      'number': 1,
      'str': 'some_string',
  }, {
      'number': 2
  }, {
      'number': 3,
      'str': 'some_string',
      'additional_field_str': 'some_string',
  }]

  table_schema = {
      "fields": [{
          "name": "number", "type": "INTEGER", 'mode': 'REQUIRED'
      }, {
          "name": "str", "type": "STRING", 'mode': 'REQUIRED'
      }]
  }

  bq_result_errors = [(
      {"number": 2},
      [{
          "reason": "invalid",
          "location": "",
          "debugInfo": "",
          "message":
              "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str."
      }],
  ), ({
      "number": 3,
      "str": "some_string",
      "additional_field_str": "some_string"
  }, [{
      "reason": "invalid",
      "location": "additional_field_str",
      "debugInfo": "",
      "message": "no such field: additional_field_str."
  }])]

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT number, str FROM %s" % table_id,
          data=[(1, 'some_string')]),
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers))

  with beam.Pipeline(argv=args) as p:
    # pylint: disable=expression-not-assigned
    errors = (
        p | 'create' >> beam.Create(input_data)
        | 'write' >> beam.io.WriteToBigQuery(
            table_id,
            schema=table_schema,
            method='STREAMING_INSERTS',
            insert_retry_strategy='RETRY_NEVER',
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    assert_that(
        errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
        | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])),
        equal_to(bq_result_errors))
def test_avro_file_load(self):
  # Construct elements such that they can be written via Avro but not via
  # JSON. See BEAM-8841.
  from apache_beam.io.gcp import bigquery_file_loads
  old_max_files = bigquery_file_loads._MAXIMUM_SOURCE_URIS
  old_max_file_size = bigquery_file_loads._DEFAULT_MAX_FILE_SIZE
  bigquery_file_loads._MAXIMUM_SOURCE_URIS = 1
  bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = 100

  elements = [
      {
          'name': u'Negative infinity',
          'value': -float('inf'),
          'timestamp': datetime.datetime(1970, 1, 1, tzinfo=pytz.utc),
      },
      {
          'name': u'Not a number',
          'value': float('nan'),
          'timestamp': datetime.datetime(2930, 12, 9, tzinfo=pytz.utc),
      },
  ]

  schema = beam.io.gcp.bigquery.WriteToBigQuery.get_dict_table_schema(
      bigquery.TableSchema(
          fields=[
              bigquery.TableFieldSchema(
                  name='name', type='STRING', mode='REQUIRED'),
              bigquery.TableFieldSchema(
                  name='value', type='FLOAT', mode='REQUIRED'),
              bigquery.TableFieldSchema(
                  name='timestamp', type='TIMESTAMP', mode='REQUIRED'),
          ]))

  pipeline_verifiers = [
      # Some gymnastics here to avoid comparing NaN since NaN is not equal to
      # anything, including itself.
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, value, timestamp FROM {} WHERE value<0".format(
              self.output_table),
          data=[(d['name'], d['value'], d['timestamp'])
                for d in elements[:1]],
      ),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, timestamp FROM {}".format(self.output_table),
          data=[(d['name'], d['timestamp']) for d in elements],
      ),
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*pipeline_verifiers),
  )

  with beam.Pipeline(argv=args) as p:
    input = p | 'CreateInput' >> beam.Create(elements)
    schema_pc = p | 'CreateSchema' >> beam.Create([schema])

    _ = (
        input
        | 'WriteToBigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
            table='%s:%s' % (self.project, self.output_table),
            schema=lambda _, schema: schema,
            schema_side_inputs=(beam.pvalue.AsSingleton(schema_pc), ),
            method='FILE_LOADS',
            temp_file_format=bigquery_tools.FileFormat.AVRO,
        ))

  bigquery_file_loads._MAXIMUM_SOURCE_URIS = old_max_files
  bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = old_max_file_size