def test_one_job_fails_all_jobs_fail(self):
  # If one of the import jobs fails, then other jobs must not be performed.
  # This is to avoid reinsertion of some records when a pipeline fails and
  # is rerun.
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)

  self.bigquery_client.get_or_create_table(
      self.project,
      self.dataset_id,
      output_table_1.split('.')[1],
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
      None,
      None)
  self.bigquery_client.get_or_create_table(
      self.project,
      self.dataset_id,
      output_table_2.split('.')[1],
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
      None,
      None)

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, language FROM %s" % output_table_1,
          data=[]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT name, foundation FROM %s" % output_table_2,
          data=[])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      experiments='use_beam_bq_sink')

  with self.assertRaises(Exception):
    # The pipeline below fails because neither a schema nor SCHEMA_AUTODETECT
    # is specified.
    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)
      input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

      input = (input, input2) | beam.Flatten()

      _ = (
          input
          | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
              table=lambda x:
              (output_table_1 if 'language' in x else output_table_2),
              create_disposition=(
                  beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
              temp_file_format=bigquery_tools.FileFormat.JSON))

  hamcrest_assert(p, all_of(*pipeline_verifiers))
def has_expected_values(actual):
  from hamcrest.core import assert_that as hamcrest_assert
  from hamcrest.library.collection import contains
  from hamcrest.library.collection import only_contains
  ordered = sorted(actual)
  # Early firings.
  hamcrest_assert(ordered[:4], contains(1, 3, 6, 10))
  # Different runners emit a different number of 15s, but there should
  # be at least one 15.
  hamcrest_assert(ordered[4:], only_contains(15))
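# For context: a callable like has_expected_values is what
# apache_beam.testing.util.assert_that accepts as a matcher; it is handed the
# full list of elements in the PCollection. A minimal sketch of that wiring,
# using the function defined above and a hypothetical hard-coded input in
# place of the real early-firing pipeline:
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that

with TestPipeline() as p:
  result = p | beam.Create([1, 3, 6, 10, 15, 15])  # hypothetical input
  assert_that(result, has_expected_values)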
def check_many_files(output_pcs):
  dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]
  spilled_records_pc = output_pcs[
      bqfl.WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  spilled_records_count = (
      spilled_records_pc | beam.combiners.Count.Globally())
  assert_that(spilled_records_count, equal_to([3]), label='spilled count')

  files_per_dest = (
      dest_file_pc
      | beam.Map(lambda x: x).with_output_types(
          beam.typehints.KV[str, str])
      | beam.combiners.Count.PerKey())
  files_per_dest = (
      files_per_dest
      | "GetDests" >> beam.Map(
          lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1])))

  # Only table1 and table3 get files. table2 records get spilled.
  assert_that(
      files_per_dest,
      equal_to([('project1:dataset1.table1', 1),
                ('project1:dataset1.table3', 1)]),
      label='file count')

  # Check that the files exist
  _ = dest_file_pc | beam.Map(lambda x: x[1]) | beam.Map(
      lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
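# For context: the `output_pcs` argument above is the DoOutputsTuple returned
# by applying a multi-output DoFn with .with_outputs(), indexable by output
# tag. A minimal, self-contained sketch of that pattern; the DoFn, tags, and
# input values are hypothetical stand-ins, not the real
# bqfl.WriteRecordsToFile.
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class _SplitDoFn(beam.DoFn):
  WRITTEN_TAG = 'written'  # hypothetical analogue of WRITTEN_FILE_TAG
  UNWRITTEN_TAG = 'unwritten'  # hypothetical analogue of UNWRITTEN_RECORD_TAG

  def process(self, element):
    if element % 2 == 0:
      yield beam.pvalue.TaggedOutput(self.UNWRITTEN_TAG, element)
    else:
      yield element  # goes to the main ('written') output


with TestPipeline() as p:
  output_pcs = (
      p
      | beam.Create([1, 2, 3, 4])
      | beam.ParDo(_SplitDoFn()).with_outputs(
          _SplitDoFn.UNWRITTEN_TAG, main=_SplitDoFn.WRITTEN_TAG))
  assert_that(
      output_pcs[_SplitDoFn.WRITTEN_TAG], equal_to([1, 3]), label='main')
  assert_that(
      output_pcs[_SplitDoFn.UNWRITTEN_TAG], equal_to([2, 4]), label='tagged')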
def test_records_traverse_transform_with_mocks(self):
  destination = 'project1:dataset1.table1'

  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      coder=CustomRowCoder())

  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> beam.combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())

    # All files exist
    _ = (
        files
        | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # One file per destination
    assert_that(
        files | beam.combiners.Count.Globally(),
        equal_to([1]),
        label='CountFiles')

    assert_that(destinations, equal_to([destination]),
                label='CheckDestinations')

    assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
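# For context: mock.Mock() creates attributes lazily on first access, which
# is what lets these tests stub the generated BigQuery API surface with plain
# attribute assignments (bq_client.jobs.Get, bq_client.jobs.Insert). A
# minimal illustration; the argument names below are hypothetical:
from unittest import mock

bq_client = mock.Mock()
bq_client.jobs.Get.return_value = 'job-status'  # stub the nested call
assert bq_client.jobs.Get(projectId='p', jobId='j') == 'job-status'
bq_client.jobs.Get.assert_called_once_with(projectId='p', jobId='j')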
def test_one_job_fails_all_jobs_fail(self):
  # If one of the import jobs fails, then other jobs must not be performed.
  # This is to avoid reinsertion of some records when a pipeline fails and
  # is rerun.
  output_table_1 = '%s%s' % (self.output_table, 1)
  output_table_2 = '%s%s' % (self.output_table, 2)

  self.bigquery_client.get_or_create_table(
      self.project, self.dataset_id, output_table_1.split('.')[1],
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
      None, None)
  self.bigquery_client.get_or_create_table(
      self.project, self.dataset_id, output_table_2.split('.')[1],
      bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
      None, None)

  pipeline_verifiers = [
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_1,
          data=[]),
      BigqueryFullResultMatcher(
          project=self.project,
          query="SELECT * FROM %s" % output_table_2,
          data=[])]

  args = self.test_pipeline.get_full_options_as_args(
      experiments='use_beam_bq_sink')

  with self.assertRaises(Exception):
    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)
      input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

      input = (input, input2) | beam.Flatten()

      _ = (
          input
          | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
              table=lambda x:
              (output_table_1 if 'language' in x else output_table_2),
              create_disposition=(
                  beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  hamcrest_assert(p, all_of(*pipeline_verifiers))
def test_records_traverse_transform_with_mocks(self):
  destination = 'project1:dataset1.table1'

  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False)

  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline('DirectRunner') as p:
    outputs = p | beam.Create(_ELEMENTS) | transform
    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1])
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> beam.combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())

    # All files exist
    _ = (
        files
        | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # One file per destination
    assert_that(
        files | beam.combiners.Count.Globally(),
        equal_to([1]),
        label='CountFiles')

    assert_that(destinations, equal_to([destination]),
                label='CheckDestinations')

    assert_that(jobs, equal_to([job_reference]), label='CheckJobs')
def check_files_created(output_pc):
  files = output_pc | "GetFiles" >> beam.Map(lambda x: x[1])
  file_count = files | "CountFiles" >> beam.combiners.Count.Globally()

  _ = files | "FilesExist" >> beam.Map(
      lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
  assert_that(file_count, equal_to([3]), label='check file count')

  destinations = output_pc | "GetDests" >> beam.Map(lambda x: x[0])
  assert_that(
      destinations,
      equal_to(list(_DISTINCT_DESTINATIONS)),
      label='check destinations')
def check_many_files(output_pcs):
  dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

  files_per_dest = (
      dest_file_pc
      | beam.Map(lambda x: x).with_output_types(
          beam.typehints.KV[str, str])
      | beam.combiners.Count.PerKey())
  assert_that(
      files_per_dest,
      equal_to([('project1:dataset1.table1', 4),
                ('project1:dataset1.table2', 2),
                ('project1:dataset1.table3', 1)]))

  # Check that the files exist
  _ = dest_file_pc | beam.Map(lambda x: x[1]) | beam.Map(
      lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
def check_files_created(output_pcs):
  dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

  files = dest_file_pc | "GetFiles" >> beam.Map(lambda x: x[1])
  file_count = files | "CountFiles" >> beam.combiners.Count.Globally()

  _ = files | "FilesExist" >> beam.Map(
      lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
  assert_that(file_count, equal_to([3]), label='check file count')

  destinations = (
      dest_file_pc
      | "GetDests" >> beam.Map(
          lambda x: bigquery_tools.get_hashable_destination(x[0])))
  assert_that(
      destinations,
      equal_to(list(_DISTINCT_DESTINATIONS)),
      label='check destinations')
def check_files_created(output_pcs):
  dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

  files = dest_file_pc | "GetFiles" >> beam.Map(lambda x: x[1][0])
  file_count = files | "CountFiles" >> combiners.Count.Globally()

  _ = files | "FilesExist" >> beam.Map(
      lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
  assert_that(file_count, equal_to([3]), label='check file count')

  destinations = (
      dest_file_pc
      | "GetDests" >> beam.Map(
          lambda x: bigquery_tools.get_hashable_destination(x[0])))
  assert_that(
      destinations,
      equal_to(list(_DISTINCT_DESTINATIONS)),
      label='check destinations')
def check_many_files(output_pcs):
  dest_file_pc = output_pcs[bqfl.WriteRecordsToFile.WRITTEN_FILE_TAG]

  files_per_dest = (
      dest_file_pc
      | beam.Map(lambda x: x).with_output_types(
          beam.typehints.KV[str, Tuple[str, int]])
      | combiners.Count.PerKey())
  files_per_dest = (
      files_per_dest
      | "GetDests" >> beam.Map(
          lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1])))
  assert_that(
      files_per_dest,
      equal_to([('project1:dataset1.table1', 4),
                ('project1:dataset1.table2', 2),
                ('project1:dataset1.table3', 1)]))

  # Check that the files exist
  _ = dest_file_pc | beam.Map(lambda x: x[1][0]) | beam.Map(
      lambda x: hamcrest_assert(os.path.exists(x), is_(True)))
def _matches(actual):
  from hamcrest.core import assert_that as hamcrest_assert
  from hamcrest.library.collection import contains_inanyorder

  expected_list = list(expected)
  hamcrest_assert(actual, contains_inanyorder(*expected_list))
def test_multiple_partition_files(self):
  destination = 'project1:dataset1.table1'

  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = mock.Mock()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job
  bq_client.tables.Delete.return_value = None

  with TestPipeline('DirectRunner') as p:
    outputs = (
        p
        | beam.Create(_ELEMENTS)
        | bqfl.BigQueryBatchFileLoads(
            destination,
            custom_gcs_temp_location=self._new_tempdir(),
            test_client=bq_client,
            validate=False,
            coder=CustomRowCoder(),
            max_file_size=45,
            max_partition_size=80,
            max_files_per_partition=2))

    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_load_jobs = outputs[
        bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
    dest_copy_jobs = outputs[
        bqfl.BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]

    load_jobs = dest_load_jobs | "GetLoadJobs" >> beam.Map(lambda x: x[1])
    copy_jobs = dest_copy_jobs | "GetCopyJobs" >> beam.Map(lambda x: x[1])

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())

    # All files exist
    _ = (
        files
        | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # Expect six files for the single destination, split up by the
    # max_file_size and max_partition_size limits.
    assert_that(
        files | "CountFiles" >> combiners.Count.Globally(),
        equal_to([6]),
        label='CheckFileCount')

    assert_that(destinations, equal_to([destination]),
                label='CheckDestinations')

    assert_that(
        load_jobs | "CountLoadJobs" >> combiners.Count.Globally(),
        equal_to([6]),
        label='CheckLoadJobCount')
    assert_that(
        copy_jobs | "CountCopyJobs" >> combiners.Count.Globally(),
        equal_to([6]),
        label='CheckCopyJobCount')
def _matches(actual):
  expected_list = list(expected)
  hamcrest_assert(actual, contains_inanyorder(*expected_list))
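# The _matches closures above capture an `expected` sequence from an
# enclosing scope. A self-contained sketch of the kind of factory that
# produces them; the factory name `in_any_order` is a hypothetical stand-in:
from hamcrest.core import assert_that as hamcrest_assert
from hamcrest.library.collection import contains_inanyorder


def in_any_order(expected):
  def _matches(actual):
    expected_list = list(expected)
    hamcrest_assert(actual, contains_inanyorder(*expected_list))

  return _matches


# Usage: the returned callable passes for any permutation of `expected`.
in_any_order([1, 2, 3])([3, 1, 2])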
def test_triggering_frequency(self, is_streaming, with_auto_sharding):
  destination = 'project1:dataset1.table1'

  job_reference = bigquery_api.JobReference()
  job_reference.projectId = 'project1'
  job_reference.jobId = 'job_name1'
  result_job = bigquery_api.Job()
  result_job.jobReference = job_reference

  mock_job = mock.Mock()
  mock_job.status.state = 'DONE'
  mock_job.status.errorResult = None
  mock_job.jobReference = job_reference

  bq_client = mock.Mock()
  bq_client.jobs.Get.return_value = mock_job
  bq_client.jobs.Insert.return_value = result_job

  # Insert a fake clock to work with auto-sharding, which needs a
  # processing-time timer.
  class _FakeClock(object):
    def __init__(self, now=time.time()):
      self._now = now

    def __call__(self):
      return self._now

  start_time = timestamp.Timestamp(0)
  bq_client.test_clock = _FakeClock(now=start_time)

  triggering_frequency = 20 if is_streaming else None
  transform = bqfl.BigQueryBatchFileLoads(
      destination,
      custom_gcs_temp_location=self._new_tempdir(),
      test_client=bq_client,
      validate=False,
      temp_file_format=bigquery_tools.FileFormat.JSON,
      is_streaming_pipeline=is_streaming,
      triggering_frequency=triggering_frequency,
      with_auto_sharding=with_auto_sharding)

  # Need to test this with the DirectRunner to avoid serializing mocks
  with TestPipeline(
      runner='BundleBasedDirectRunner',
      options=StandardOptions(streaming=is_streaming)) as p:
    if is_streaming:
      _SIZE = len(_ELEMENTS)
      first_batch = [
          TimestampedValue(value, start_time + i + 1)
          for i, value in enumerate(_ELEMENTS[:_SIZE // 2])
      ]
      second_batch = [
          TimestampedValue(value, start_time + _SIZE // 2 + i + 1)
          for i, value in enumerate(_ELEMENTS[_SIZE // 2:])
      ]
      # Advance processing time between batches of input elements to fire the
      # user triggers. Intentionally advance the processing time twice for
      # the auto-sharding case since we need to first fire the timer and then
      # fire the trigger.
      test_stream = (
          TestStream()
          .advance_watermark_to(start_time)
          .add_elements(first_batch)
          .advance_processing_time(30)
          .advance_processing_time(30)
          .add_elements(second_batch)
          .advance_processing_time(30)
          .advance_processing_time(30)
          .advance_watermark_to_infinity())
      input = p | test_stream
    else:
      input = p | beam.Create(_ELEMENTS)
    outputs = input | transform

    dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
    dest_job = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]

    files = dest_files | "GetFiles" >> beam.Map(lambda x: x[1][0])
    destinations = (
        dest_files
        | "GetDests" >> beam.Map(
            lambda x: (bigquery_tools.get_hashable_destination(x[0]), x[1]))
        | "GetUniques" >> combiners.Count.PerKey()
        | "GetFinalDests" >> beam.Keys())
    jobs = dest_job | "GetJobs" >> beam.Map(lambda x: x[1])

    # Check that all files exist.
    _ = (
        files
        | beam.Map(lambda x: hamcrest_assert(os.path.exists(x), is_(True))))

    # Expect two load jobs to be generated in the streaming case due to the
    # triggering frequency. Grouping is per trigger, so we expect two entries
    # in the output as opposed to one.
    file_count = files | combiners.Count.Globally().without_defaults()
    expected_file_count = [1, 1] if is_streaming else [1]
    expected_destinations = (
        [destination, destination] if is_streaming else [destination])
    expected_jobs = (
        [job_reference, job_reference] if is_streaming else [job_reference])
    assert_that(
        file_count, equal_to(expected_file_count), label='CountFiles')
    assert_that(
        destinations,
        equal_to(expected_destinations),
        label='CheckDestinations')
    assert_that(jobs, equal_to(expected_jobs), label='CheckJobs')
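# test_triggering_frequency takes (is_streaming, with_auto_sharding) in
# addition to self, so it is meant to be driven by a parameterized test
# decorator. A self-contained sketch of that pattern; the decorator package
# and the case matrix below are assumptions, not necessarily what the real
# suite uses.
import unittest

from parameterized import parameterized


class _TriggeringFrequencyParams(unittest.TestCase):
  @parameterized.expand([
      (False, False),  # batch pipeline
      (True, False),  # streaming, runner-decided sharding
      (True, True),  # streaming with auto-sharding
  ])
  def test_triggering_frequency(self, is_streaming, with_auto_sharding):
    # Stand-in body; the real test is defined above.
    self.assertIsInstance(is_streaming, bool)
    self.assertIsInstance(with_auto_sharding, bool)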