class CombineTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  class _GetElement(beam.DoFn):
    def process(self, element):
      yield element

  def testCombineGlobally(self):
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
def _run_wordcount_it(self, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'results'
  ])
  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'output': output,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  extra_opts.update(opts)

  # Register clean up before pipeline execution
  self.addCleanup(delete_files, [output + '*'])

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
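A note on the pattern above: get_full_options_as_args merges the options parsed from the --test-pipeline-options command-line argument with the keyword arguments the test supplies, and returns a flat list of argv-style strings that the example's run() function parses. A minimal sketch, assuming --test-pipeline-options carried --project and --runner (all concrete values below are placeholders):

# Hypothetical values; the success matcher is carried along as just another
# option in the returned argument list.
args = test_pipeline.get_full_options_as_args(
    output='gs://my-bucket/wordcount/results',
    on_success_matcher=PipelineStateMatcher())
# args now resembles ['--project=my-project', '--runner=TestDataflowRunner',
#                     '--output=gs://my-bucket/wordcount/results', ...]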
def test_datastore_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  kind = self.DATASTORE_WORDCOUNT_KIND
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'datastore_wordcount_results'
  ])
  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'kind': kind,
      'output': output,
      # Comment this out to regenerate input data on Datastore (delete
      # existing data first using the bulk delete Dataflow template).
      'read_only': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  datastore_wordcount.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
def test_datastore_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  dataset = test_pipeline.get_option('project')
  kind = self.DATASTORE_WORDCOUNT_KIND
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'datastore_wordcount_results'
  ])
  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'dataset': dataset,
      'kind': kind,
      'output': output,
      'read_only': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  datastore_wordcount.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join(
        [self.test_pipeline.get_option('output'), self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    arg_sleep_secs = self.test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    file_verifier = FileChecksumMatcher(
        self.output + '/*-of-*', self.DEFAULT_EXPECTED_CHECKSUM, sleep_secs)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': all_of(state_verifier, file_verifier)
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
def _run_wordcount_it(self, run_wordcount, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)
  extra_opts = {}

  # Set extra options to the pipeline for test purpose
  test_output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'results'
  ])
  extra_opts['output'] = test_output

  test_input = test_pipeline.get_option('input')
  if test_input:
    extra_opts['input'] = test_input

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  expect_checksum = (
      test_pipeline.get_option('expect_checksum') or self.DEFAULT_CHECKSUM)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(test_output + '*-of-*', expect_checksum, sleep_secs)
  ]
  extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
  extra_opts.update(opts)

  # Register clean up before pipeline execution
  self.addCleanup(delete_files, [test_output + '*'])

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  run_wordcount(
      test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)
class GroupByKeyTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None
    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if check:
      schema = [{'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}]
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
          schema_map=schema
      )
    else:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')

  def testGroupByKey(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time: Start' >> beam.ParDo(
           MeasureTime(self.metrics_namespace))
       | 'GroupByKey' >> beam.GroupByKey()
       | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]])
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
class CloudDLPIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

  @attr("IT")
  def test_deidentification(self):
    with TestPipeline(is_integration_test=True) as p:
      output = (
          p
          | beam.Create(["*****@*****.**"])
          | MaskDetectedDetails(
              project=self.project,
              deidentification_config=DEIDENTIFY_CONFIG,
              inspection_config=INSPECT_CONFIG))
      assert_that(output, equal_to(['####################']))

  @attr("IT")
  def test_inspection(self):
    with TestPipeline(is_integration_test=True) as p:
      output = (
          p
          | beam.Create(["*****@*****.**"])
          | InspectForDetails(
              project=self.project, inspection_config=INSPECT_CONFIG)
          | beam.ParDo(extract_inspection_results).with_outputs(
              'quote', 'info_type'))
      assert_that(output.info_type, equal_to(['EMAIL_ADDRESS']), 'Type matches')
def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  project = test_pipeline.get_option('project')
  dataset = 'BigQueryTornadoesIT'
  table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT month, tornado_count FROM [%s]' % output_table

  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
class StreamingWordCountIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('IT')
  def test_streaming_wordcount_it(self):
    # Build expected dataset.
    expected_msg = [('%d: 1' % num) for num in range(DEFAULT_INPUT_NUMBERS)]

    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project, OUTPUT_SUB + self.uuid, expected_msg, timeout=400)
    extra_opts = {
        'input_subscription': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join(
        [self.test_pipeline.get_option('output'), self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    file_verifier = FileChecksumMatcher(
        self.output + '*-of-*', self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': all_of(state_verifier, file_verifier)
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  project = test_pipeline.get_option('project')
  dataset = 'BigQueryTornadoesIT'
  table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT month, tornado_count FROM `%s`' % output_table

  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
def test_filters_output_bigquery_matcher(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  project = test_pipeline.get_option('project')
  dataset = 'FiltersTestIT'
  table = 'cold_days_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  filters.run(test_pipeline.get_full_options_as_args(**extra_opts))
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if not self.publish_to_big_query or self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if not self.publish_to_big_query or self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
def test_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time())),
      'results'
  ])
  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'output': output,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
class StreamingWordCountIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(
        project=self.test_pipeline.get_option('project'))
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC)
    self.input_sub = self.input_topic.subscription(INPUT_SUB)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB)

    self._cleanup_pubsub()

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('developing_test')
  def test_streaming_wordcount_it(self):
    # Set extra options to the pipeline for test purpose
    pipeline_verifiers = [PipelineStateMatcher(PipelineState.RUNNING)]
    extra_opts = {
        'input_sub': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')

    self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    self.metrics_monitor = MetricsReader(
        publish_to_bq=(
            self.pipeline.get_option('publish_to_big_query') == 'true'),
        project_name=self.project_id,
        bq_table=self.metrics_namespace,
        bq_dataset=self.metrics_dataset,
        # Apply filter to prevent system metrics from being published
        filters=MetricsFilter().with_namespace(self.metrics_namespace)
    )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()
    self.metrics_monitor.publish_metrics(result)

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
    option = self.pipeline.get_option(opt_name)
    try:
      return int(option)
    except TypeError:
      return default
    except ValueError as exc:
      self.fail(str(exc))
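A quick illustration of get_option_or_default (the option name below is hypothetical): when a flag is absent, get_option returns None, int(None) raises TypeError, and the default is returned; a value that is present but not an integer fails the test via self.fail.

# Inside a LoadTest method; a command line carrying '--fanout=4' yields 4
# here, while an absent flag falls back to the given default.
fanout = self.get_option_or_default('fanout', default=1)
iterations = self.get_option_or_default('number_of_counter_operations')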
class HourlyTeamScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  # SHA-1 hash generated from sorted rows reading from BigQuery table
  DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
  OUTPUT_DATASET = 'hourly_team_score_it_dataset'
  OUTPUT_TABLE = 'leader_board'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')

    # Set up BigQuery environment
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_hourly_team_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    query = ('SELECT COUNT(*) FROM [%s:%s.%s]' %
             (self.project, self.dataset.name, self.OUTPUT_TABLE))
    bigquery_verifier = BigqueryMatcher(
        self.project, query, self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'dataset': self.dataset.name,
        'window_duration': 1,
        'on_success_matcher': all_of(state_verifier, bigquery_verifier)
    }

    # Register clean up before pipeline execution
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_run_example_with_setup_file(self):
  pipeline = TestPipeline(is_integration_test=True)
  coordinate_output = FileSystems.join(
      pipeline.get_option('output'),
      'juliaset-{}'.format(str(uuid.uuid4())),
      'coordinates.txt')
  extra_args = {
      'coordinate_output': coordinate_output,
      'grid_size': self.GRID_SIZE,
      'setup_file': os.path.normpath(
          os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
      'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
  }
  args = pipeline.get_full_options_as_args(**extra_args)

  juliaset.run(args)
def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purpose
  output_table = ('BigQueryTornadoesIT'
                  '.monthly_tornadoes_%s' % int(round(time.time() * 1000)))
  query = 'SELECT month, tornado_count FROM [%s]' % output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=test_pipeline.get_option('project'),
          query=query,
          checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))
class HourlyTeamScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  # SHA-1 hash generated from sorted rows reading from BigQuery table
  DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
  OUTPUT_DATASET = 'hourly_team_score_it_dataset'
  OUTPUT_TABLE = 'leader_board'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')

    # Set up BigQuery environment
    self.dataset_ref = utils.create_bq_dataset(
        self.project, self.OUTPUT_DATASET)

  @pytest.mark.it_postcommit
  def test_hourly_team_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    query = (
        'SELECT COUNT(*) FROM `%s.%s.%s`' %
        (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))
    bigquery_verifier = BigqueryMatcher(
        self.project, query, self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'dataset': self.dataset_ref.dataset_id,
        'window_duration': 1,
        'on_success_matcher': all_of(state_verifier, bigquery_verifier)
    }

    # Register clean up before pipeline execution
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
class GroupByKeyTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

  def testGroupByKey(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time' >> beam.ParDo(MeasureTime())
       | 'GroupByKey' >> beam.GroupByKey()
       | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]]))

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()
      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
class CombineTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

  class _GetElement(beam.DoFn):
    def process(self, element):
      yield element

  def testCombineGlobally(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time' >> beam.ParDo(MeasureTime())
       | 'Combine with Top' >> beam.CombineGlobally(
           beam.combiners.TopCombineFn(1000))
       | 'Consume' >> beam.ParDo(self._GetElement())
      )

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()
      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (
        BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for '
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'kms_key': KMS_KEY
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(
      sys.version_info[0] == 3 and
      os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
      'This test still needs to be fixed on Python 3. TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
class SideInputTest(unittest.TestCase):
  def _parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0)
    }

  def _getSideInput(self):
    side_input = self._parseTestPipelineOptions()
    side_input['numRecords'] = side_input['numRecords']
    side_input['keySizeBytes'] = side_input['keySizeBytes']
    side_input['valueSizeBytes'] = side_input['valueSizeBytes']
    return side_input

  def _getPerElementDelaySec(self):
    return self.syntheticStepOptions.get('per_element_delay_sec', 0)

  def _getPerBundleDelaySec(self):
    return self.syntheticStepOptions.get('per_bundle_delay_sec', 0)

  def _getOutputRecordsPerInputRecords(self):
    return self.syntheticStepOptions.get('output_records_per_input_records', 0)

  def setUp(self):
    self.pipeline = TestPipeline()
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    if self.iterations is None:
      self.iterations = 1
    self.iterations = int(self.iterations)

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  def testSideInput(self):
    def join_fn(element, side_input, iterations):
      # Renamed from 'list' to avoid shadowing the builtin.
      joined = []
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            joined.append({key: element[1] + value})
      yield joined

    with self.pipeline as p:
      main_input = (
          p
          | "Read pcoll 1" >> beam.io.Read(
              synthetic_pipeline.SyntheticSource(
                  self._parseTestPipelineOptions()))
          | 'Measure time: Start pcoll 1' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
      )

      side_input = (
          p
          | "Read pcoll 2" >> beam.io.Read(
              synthetic_pipeline.SyntheticSource(self._getSideInput()))
          | 'Measure time: Start pcoll 2' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
      )
      # pylint: disable=expression-not-assigned
      (main_input
       | "Merge" >> beam.ParDo(join_fn, AsIter(side_input), self.iterations)
       | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()
class BigQueryFileLoadsIT(unittest.TestCase):

  BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
  BIG_QUERY_SCHEMA = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "language","type": "STRING"}]}'
  )
  BIG_QUERY_SCHEMA_2 = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "foundation","type": "STRING"}]}'
  )

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (
        self.BIG_QUERY_DATASET_ID, str(int(time.time())),
        random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      # Get all input in same machine
      input = (input
               | beam.Map(lambda x: (None, x))
               | beam.GroupByKey()
               | beam.FlatMap(lambda elm: elm[1]))

      _ = (input
           | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_1
                                if 'language' in x else output_table_2),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

      _ = (input
           | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_3
                                if 'language' in x else output_table_4),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
               max_file_size=20,
               max_files_per_bundle=-1))

  @attr('IT')
  def test_one_job_fails_all_jobs_fail(self):
    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input
             | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))

  def tearDown(self):
    request = bigquery_api.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)
class ParDoTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.input_options.get(
                'bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None
    if metrics_project_id and self.metrics_namespace is not None:
      measured_values = [
          {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
          {'name': COUNTER_LABEL, 'type': 'INTEGER', 'mode': 'REQUIRED'}
      ]
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
          schema_map=measured_values
      )
    else:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')

  def testParDo(self):
    class _GetElement(beam.DoFn):
      from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

      @count_bytes(COUNTER_LABEL)
      def process(self, element, namespace, is_returning):
        if is_returning:
          yield element

    if self.iterations is None:
      num_runs = 1
    else:
      num_runs = int(self.iterations)

    with self.pipeline as p:
      pc = (p
            | 'Read synthetic' >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self.parseTestPipelineOptions()))
            | 'Measure time: Start' >> beam.ParDo(
                MeasureTime(self.metrics_namespace))
            )

      for i in range(num_runs):
        is_returning = (i == (num_runs - 1))
        pc = (pc
              | 'Step: %d' % i >> beam.ParDo(
                  _GetElement(), self.metrics_namespace, is_returning)
              )

      if self.output is not None:
        pc = (pc
              | "Write" >> beam.io.WriteToText(self.output)
              )

      # pylint: disable=expression-not-assigned
      (pc
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
def test_get_option(self):
  name, value = ('job', 'mockJob')
  test_pipeline = TestPipeline()
  test_pipeline.options_list = ['--%s=%s' % (name, value)]
  self.assertEqual(test_pipeline.get_option(name), value)
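The test above also documents the lookup contract relied on throughout these snippets: get_option scans the pipeline's stored argument list for '--<name>=<value>' and returns the value as a string, or None when the flag is absent, which is why callers guard with 'is not None' before converting. A minimal sketch under those assumptions:

# options_list is the same attribute the test above populates directly.
pipeline = TestPipeline()
pipeline.options_list = ['--job=mockJob']
assert pipeline.get_option('job') == 'mockJob'
assert pipeline.get_option('missing_flag') is None  # unset flags return None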
class CombineTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  class _GetElement(beam.DoFn):
    def process(self, element):
      yield element

  def testCombineGlobally(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time: Start' >> beam.ParDo(
           MeasureTime(self.metrics_namespace))
       | 'Combine with Top' >> beam.CombineGlobally(
           beam.combiners.TopCombineFn(1000))
       | 'Consume' >> beam.ParDo(self._GetElement())
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
class BigQueryFileLoadsIT(unittest.TestCase):

  BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
  BIG_QUERY_SCHEMA = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "language","type": "STRING"}]}')
  BIG_QUERY_SCHEMA_2 = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "foundation","type": "STRING"}]}')

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (
        self.BIG_QUERY_DATASET_ID, str(int(time.time())),
        random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
    schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2))

    schema_kv_pairs = [
        (output_table_1, schema1), (output_table_2, schema2),
        (output_table_3, schema1), (output_table_4, schema2)
    ]
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      schema_map_pcv = beam.pvalue.AsDict(
          p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

      table_record_pcv = beam.pvalue.AsDict(
          p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                           ('table2', output_table_2)]))

      # Get all input in same machine
      input = (input
               | beam.Map(lambda x: (None, x))
               | beam.GroupByKey()
               | beam.FlatMap(lambda elm: elm[1]))

      _ = (input
           | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
               table=lambda x, tables: (tables['table1']
                                        if 'language' in x
                                        else tables['table2']),
               table_side_inputs=(table_record_pcv, ),
               schema=lambda dest, schema_map: schema_map.get(dest, None),
               schema_side_inputs=(schema_map_pcv, ),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

      _ = (input
           | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_3
                                if 'language' in x else output_table_4),
               schema=lambda dest, schema_map: schema_map.get(dest, None),
               schema_side_inputs=(schema_map_pcv, ),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
               max_file_size=20,
               max_files_per_bundle=-1))

  @attr('IT')
  def test_one_job_fails_all_jobs_fail(self):
    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, language FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, foundation FROM %s" % output_table_2,
            data=[])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input
             | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))

  def tearDown(self):
    request = bigquery_api.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)
class ExerciseStreamingMetricsPipelineTest(unittest.TestCase): def setUp(self): """Creates all required topics and subs.""" self.test_pipeline = TestPipeline(is_integration_test=True) self.project = self.test_pipeline.get_option('project') self.uuid = str(uuid.uuid4()) # Set up PubSub environment. from google.cloud import pubsub self.pub_client = pubsub.PublisherClient() self.input_topic_name = INPUT_TOPIC + self.uuid self.input_topic = self.pub_client.create_topic( self.pub_client.topic_path(self.project, self.input_topic_name)) self.output_topic_name = OUTPUT_TOPIC + self.uuid self.output_topic = self.pub_client.create_topic( self.pub_client.topic_path(self.project, self.output_topic_name)) self.sub_client = pubsub.SubscriberClient() self.input_sub_name = INPUT_SUB + self.uuid self.input_sub = self.sub_client.create_subscription( self.sub_client.subscription_path(self.project, self.input_sub_name), self.input_topic.name) self.output_sub_name = OUTPUT_SUB + self.uuid self.output_sub = self.sub_client.create_subscription( self.sub_client.subscription_path(self.project, self.output_sub_name), self.output_topic.name, ack_deadline_seconds=60) def _inject_words(self, topic, messages): """Inject messages as test data to PubSub.""" _LOGGER.debug('Injecting messages to topic %s', topic.name) for msg in messages: self.pub_client.publish(self.input_topic.name, msg.encode('utf-8')) _LOGGER.debug('Done. Injecting messages to topic %s', topic.name) def tearDown(self): """Delete all created topics and subs.""" test_utils.cleanup_subscriptions( self.sub_client, [self.input_sub, self.output_sub]) test_utils.cleanup_topics( self.pub_client, [self.input_topic, self.output_topic]) def run_pipeline(self): # Waits for messages to appear in output topic. expected_msg = [msg.encode('utf-8') for msg in MESSAGES_TO_PUBLISH] pubsub_msg_verifier = PubSubMessageMatcher( self.project, self.output_sub.name, expected_msg, timeout=600) # Checks that pipeline initializes to RUNNING state. state_verifier = PipelineStateMatcher(PipelineState.RUNNING) extra_opts = { 'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION, 'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier), 'experiment': 'beam_fn_api', 'input_subscription': self.input_sub.name, 'output_topic': self.output_topic.name, } argv = self.test_pipeline.get_full_options_as_args(**extra_opts) return dataflow_exercise_streaming_metrics_pipeline.run(argv) @attr('IT', 'ValidatesRunner') def test_streaming_pipeline_returns_expected_user_metrics_fnapi_it(self): """ Runs streaming Dataflow job and verifies that user metrics are reported correctly. """ self._inject_words(self.input_topic, MESSAGES_TO_PUBLISH) result = self.run_pipeline() METRIC_NAMESPACE = \ ('apache_beam.runners.dataflow.' 'dataflow_exercise_streaming_metrics_pipeline.StreamingUserMetricsDoFn') matchers = [ # System metrics MetricResultMatcher( name='ElementCount', labels={ "output_user_name": "generate_metrics-out0", "original_name": "generate_metrics-out0-ElementCount" }, attempted=len(MESSAGES_TO_PUBLISH), committed=len(MESSAGES_TO_PUBLISH), ), # User Counter Metrics. 
MetricResultMatcher( name='double_msg_counter_name', namespace=METRIC_NAMESPACE, step='generate_metrics', attempted=len(MESSAGES_TO_PUBLISH) * 2, committed=len(MESSAGES_TO_PUBLISH) * 2), MetricResultMatcher( name='msg_len_dist_metric_name', namespace=METRIC_NAMESPACE, step='generate_metrics', attempted=DistributionMatcher( sum_value=len(''.join(MESSAGES_TO_PUBLISH)), count_value=len(MESSAGES_TO_PUBLISH), min_value=len(MESSAGES_TO_PUBLISH[0]), max_value=len(MESSAGES_TO_PUBLISH[1])), committed=DistributionMatcher( sum_value=len(''.join(MESSAGES_TO_PUBLISH)), count_value=len(MESSAGES_TO_PUBLISH), min_value=len(MESSAGES_TO_PUBLISH[0]), max_value=len(MESSAGES_TO_PUBLISH[1]))), ] metrics = result.metrics().all_metrics() errors = metric_result_matchers.verify_all(metrics, matchers) self.assertFalse(errors, str(errors))
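# --- Illustrative sketch (not part of the test suite). The DistributionMatcher
# expectations above are derived from MESSAGES_TO_PUBLISH: each published
# message contributes its length once to msg_len_dist_metric_name. This helper
# shows the same derivation for an arbitrary message list (the test hard-codes
# min/max as the first/second message lengths, which assumes that ordering).
def _expected_length_distribution(messages):
  lengths = [len(msg) for msg in messages]
  return {
      'sum_value': sum(lengths),  # equals len(''.join(messages))
      'count_value': len(lengths),
      'min_value': min(lengths),
      'max_value': max(lengths),
  }


assert _expected_length_distribution(['ab', 'cdef']) == {
    'sum_value': 6, 'count_value': 2, 'min_value': 2, 'max_value': 4}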
class BigQueryWriteIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_write_to_table_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) _LOGGER.info("Created dataset %s in project %s", self.dataset_id, self.project) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project) def create_table(self, table_name): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table( tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=table_name), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) @attr('IT') def test_big_query_write(self): table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'number': 1, 'str': 'abc'}, {'number': 2, 'str': 'def'}, {'number': 3, 'str': u'你好'}, {'number': 4, 'str': u'привет'}, ] table_schema = {"fields": [ {"name": "number", "type": "INTEGER"}, {"name": "str", "type": "STRING"}]} pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_schema_autodetect(self): if self.runner_name == 'TestDataflowRunner': self.skipTest('DataflowRunner does not support schema autodetection') table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'number': 1, 'str': 'abc'}, {'number': 2, 'str': 'def'}, ] pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT number, str FROM %s" % table_id, data=[(1, 'abc',), (2, 'def',)])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, 
method=beam.io.WriteToBigQuery.Method.FILE_LOADS, schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_new_types(self): table_name = 'python_new_types_table' table_id = '{}.{}'.format(self.dataset_id, table_name) row_data = { 'float': 0.33, 'numeric': Decimal('10'), 'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31', 'time': '23:59:59', 'datetime': '2018-12-31T12:44:31', 'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)' } input_data = [row_data] # add rows with only one key value pair and None values for all other keys for key, value in iteritems(row_data): input_data.append({key: value}) table_schema = {"fields": [ {"name": "float", "type": "FLOAT"}, {"name": "numeric", "type": "NUMERIC"}, {"name": "bytes", "type": "BYTES"}, {"name": "date", "type": "DATE"}, {"name": "time", "type": "TIME"}, {"name": "datetime", "type": "DATETIME"}, {"name": "timestamp", "type": "TIMESTAMP"}, {"name": "geo", "type": "GEOGRAPHY"} ]} expected_row = (0.33, Decimal('10'), b'\xab\xac', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), datetime.datetime(2018, 12, 31, 12, 44, 31), datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc), 'POINT(30 10)', ) expected_data = [expected_row] # add rows with only one key value pair and None values for all other keys for i, value in enumerate(expected_row): row = [None]*len(expected_row) row[i] = value expected_data.append(tuple(row)) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query='SELECT float, numeric, bytes, date, time, datetime,' 'timestamp, geo FROM %s' % table_id, data=expected_data)] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @attr('IT') def test_big_query_write_without_schema(self): table_name = 'python_no_schema_table' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'}, {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'}, {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31', 'time': '23:59:59'}, {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'} ] # bigquery io expects bytes to be base64 encoded values for row in input_data: row['bytes'] = base64.b64encode(row['bytes']) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT bytes, date, time FROM %s" % table_id, data=[(b'xyw', datetime.date(2011, 1, 1), datetime.time(23, 59, 59, 999999), ), (b'abc', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), ), (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), ), (b'\xab\xac\xad', datetime.date(2000, 1, 1), datetime.time(0, 0, 0), )])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, 
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
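# --- Illustrative sketch (not part of the test suite). As the inline comment
# in test_big_query_write_without_schema notes, the BigQuery sink expects
# BYTES values to be base64-encoded on the way in, while query results come
# back as raw bytes; this is the round trip the test relies on.
import base64 as _b64_example

_raw = b'\xab\xac\xad'
_encoded = _b64_example.b64encode(_raw)  # what the pipeline writes: b'q6yt'
assert _b64_example.b64decode(_encoded) == _raw  # what the matcher compares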
class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_bq_streaming_inserts_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client = bigquery_tools.BigQueryWrapper() self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = "%s.output_table" % (self.dataset_id) logging.info("Created dataset %s in project %s", self.dataset_id, self.project) @attr('IT') def test_value_provider_transform(self): output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) schema = {'fields': [ {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]} pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT * FROM %s" % output_table_1, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT * FROM %s" % output_table_2, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: input = p | beam.Create([row for row in _ELEMENTS if 'language' in row]) _ = (input | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery( table=value_provider.StaticValueProvider( str, '%s:%s' % (self.project, output_table_1)), schema=value_provider.StaticValueProvider(dict, schema), method='STREAMING_INSERTS')) _ = (input | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery( table=value_provider.StaticValueProvider( str, '%s:%s' % (self.project, output_table_2)), method='FILE_LOADS')) @attr('IT') def test_multiple_destinations_transform(self): output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) full_output_table_1 = '%s:%s' % (self.project, output_table_1) full_output_table_2 = '%s:%s' % (self.project, output_table_2) schema1 = {'fields': [ {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]} schema2 = {'fields': [ {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'}, {'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'}]} bad_record = {'language': 1, 'manguage': 2} pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT * FROM %s" % output_table_1, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT * FROM %s" % output_table_2, data=[(d['name'], d['foundation']) for d in _ELEMENTS if 'foundation' in d])] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: input = p | beam.Create(_ELEMENTS) input2 = p | "Broken record" >> beam.Create([bad_record]) input = (input, input2) | beam.Flatten() r = (input | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery( table=lambda x: (full_output_table_1 if 'language' in x else full_output_table_2), schema=lambda dest: (schema1 if dest == full_output_table_1 else schema2), 
method='STREAMING_INSERTS')) assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS], equal_to([(full_output_table_1, bad_record)])) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: logging.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: logging.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project)
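# --- Illustrative sketch (not part of the test suite). Rows rejected by the
# streaming-insert path are emitted on the FAILED_ROWS tagged output as
# (destination_table, row) tuples, which is why the assert_that above expects
# [(full_output_table_1, bad_record)]. A pipeline that wants to log failed
# rows instead of asserting on them could attach a sink like this:
def _log_failed_rows(failed_rows_pcoll):
  import logging
  import apache_beam as beam
  return failed_rows_pcoll | 'LogFailedRows' >> beam.Map(
      lambda dest_and_row: logging.warning(
          'BigQuery rejected row %r destined for table %s',
          dest_and_row[1], dest_and_row[0]))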
class PubSubIntegrationTest(unittest.TestCase): ID_LABEL = 'id' TIMESTAMP_ATTRIBUTE = 'timestamp' INPUT_MESSAGES = [ # Use ID_LABEL attribute to deduplicate messages with the same ID. PubsubMessage('data001', {ID_LABEL: 'foo'}), PubsubMessage('data001', {ID_LABEL: 'foo'}), PubsubMessage('data001', {ID_LABEL: 'foo'}), # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the IT # pipeline writes back the timestamp of each element (as reported by # Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute. PubsubMessage('data002', { TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z', }), ] EXPECTED_OUTPUT_MESSAGES = [ PubsubMessage('data001-seen', {'processed': 'IT'}), PubsubMessage('data002-seen', { TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z', 'processed': 'IT', }), ] def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.project = self.test_pipeline.get_option('project') self.uuid = str(uuid.uuid4()) # Set up PubSub environment. from google.cloud import pubsub self.pubsub_client = pubsub.Client(project=self.project) self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid) self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid) self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid) self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid) self.input_topic.create() self.output_topic.create() test_utils.wait_for_topics_created([self.input_topic, self.output_topic]) self.input_sub.create() self.output_sub.create() def tearDown(self): test_utils.cleanup_subscriptions([self.input_sub, self.output_sub]) test_utils.cleanup_topics([self.input_topic, self.output_topic]) def _test_streaming(self, with_attributes): """Runs IT pipeline with message verifier. Args: with_attributes: False - Reads and writes message data only. True - Reads and writes message data and attributes. Also verifies id_label and timestamp_attribute features. """ # Build expected dataset. # Set extra options to the pipeline for test purpose state_verifier = PipelineStateMatcher(PipelineState.RUNNING) expected_messages = self.EXPECTED_OUTPUT_MESSAGES if not with_attributes: expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages] pubsub_msg_verifier = PubSubMessageMatcher( self.project, OUTPUT_SUB + self.uuid, expected_messages, timeout=MESSAGE_MATCHER_TIMEOUT_S, with_attributes=with_attributes, strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]) extra_opts = {'input_subscription': self.input_sub.full_name, 'output_topic': self.output_topic.full_name, 'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS, 'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)} # Generate input data and inject to PubSub. test_utils.wait_for_subscriptions_created([self.input_sub]) for msg in self.INPUT_MESSAGES: self.input_topic.publish(msg.data, **msg.attributes) # Get pipeline options from command argument: --test-pipeline-options, # and start pipeline job by calling pipeline main function. pubsub_it_pipeline.run_pipeline( argv=self.test_pipeline.get_full_options_as_args(**extra_opts), with_attributes=with_attributes, id_label=self.ID_LABEL, timestamp_attribute=self.TIMESTAMP_ATTRIBUTE) @attr('IT') def test_streaming_data_only(self): self._test_streaming(with_attributes=False) @attr('IT') def test_streaming_with_attributes(self): self._test_streaming(with_attributes=True)
class FastavroIT(unittest.TestCase):

  SCHEMA = Parse('''
    {"namespace": "example.avro",
     "type": "record",
     "name": "User",
     "fields": [
         {"name": "label", "type": "string"},
         {"name": "number", "type": ["int", "null"]},
         {"name": "number_str", "type": ["string", "null"]},
         {"name": "color", "type": ["string", "null"]}
     ]
    }
    ''')

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join([
        self.test_pipeline.get_option('output'),
        self.uuid
    ])

  @attr('IT')
  def test_avro_it(self):
    num_records = self.test_pipeline.get_option('records')
    num_records = int(num_records) if num_records else 1000000

    # Seed a `PCollection` with indices that will each be FlatMap'd into
    # `batch_size` records, to avoid having a too-large list in memory at
    # the outset
    batch_size = self.test_pipeline.get_option('batch-size')
    batch_size = int(batch_size) if batch_size else 10000

    # pylint: disable=range-builtin-not-iterating
    batches = range(int(num_records / batch_size))

    def batch_indices(start):
      # pylint: disable=range-builtin-not-iterating
      return range(start * batch_size, (start + 1) * batch_size)

    # A `PCollection` with `num_records` avro records
    records_pcoll = \
        self.test_pipeline \
        | 'create-batches' >> Create(batches) \
        | 'expand-batches' >> FlatMap(batch_indices) \
        | 'create-records' >> Map(record)

    fastavro_output = '/'.join([self.output, 'fastavro'])
    avro_output = '/'.join([self.output, 'avro'])

    self.addCleanup(delete_files, [self.output + '*'])

    # pylint: disable=expression-not-assigned
    records_pcoll \
    | 'write_fastavro' >> WriteToAvro(
        fastavro_output,
        self.SCHEMA,
        use_fastavro=True
    )

    # pylint: disable=expression-not-assigned
    records_pcoll \
    | 'write_avro' >> WriteToAvro(
        avro_output,
        self.SCHEMA,
        use_fastavro=False
    )

    result = self.test_pipeline.run()
    result.wait_until_finish()
    assert result.state == PipelineState.DONE

    fastavro_read_pipeline = TestPipeline(is_integration_test=True)

    fastavro_records = \
        fastavro_read_pipeline \
        | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
        | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
        | Map(lambda rec: (rec['number'], rec))

    avro_records = \
        fastavro_read_pipeline \
        | 'create-avro' >> Create(['%s*' % avro_output]) \
        | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
        | Map(lambda rec: (rec['number'], rec))

    def check(elem):
      v = elem[1]

      def assertEqual(l, r):
        if l != r:
          raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

      # Compare sorted key lists so the check also passes on Python 3, where
      # dict views never compare equal to lists.
      assertEqual(sorted(v.keys()), ['avro', 'fastavro'])
      avro_values = v['avro']
      fastavro_values = v['fastavro']
      assertEqual(avro_values, fastavro_values)
      assertEqual(len(avro_values), 1)

    # pylint: disable=expression-not-assigned
    {
        'avro': avro_records,
        'fastavro': fastavro_records
    } \
    | CoGroupByKey() \
    | Map(check)

    read_result = fastavro_read_pipeline.run()
    read_result.wait_until_finish()
    # Assert on the state of the read pipeline, not the (already verified)
    # write pipeline.
    assert read_result.state == PipelineState.DONE
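# --- Illustrative sketch (not part of the test suite). WriteToAvro with
# use_fastavro=True delegates to the fastavro library; this is roughly the
# equivalent direct round trip for a dict schema, handy when debugging records
# that fail the pipeline comparison above (file path and schema are assumed).
def _fastavro_round_trip(records, schema_dict, path='/tmp/example.avro'):
  import fastavro
  parsed_schema = fastavro.parse_schema(schema_dict)
  with open(path, 'wb') as out:
    fastavro.writer(out, parsed_schema, records)
  with open(path, 'rb') as avro_file:
    return list(fastavro.reader(avro_file))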
class GcsIOIntegrationTest(unittest.TestCase): INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt' # Larger than 1MB to test maxBytesRewrittenPerCall. INPUT_FILE_LARGE = ( 'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json') def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ if self.runner_name != 'TestDataflowRunner': # This test doesn't run a pipeline, so it doesn't make sense to try it on # different runners. Running with TestDataflowRunner makes sense since # it uses GoogleCloudOptions such as 'project'. raise unittest.SkipTest( 'This test only runs with TestDataflowRunner.') self.project = self.test_pipeline.get_option('project') self.gcs_tempdir = (self.test_pipeline.get_option('temp_location') + '/gcs_it-' + str(uuid.uuid4())) self.kms_key_name = self.test_pipeline.get_option('kms_key_name') self.gcsio = gcsio.GcsIO() def tearDown(self): FileSystems.delete([self.gcs_tempdir + '/']) def _verify_copy(self, src, dst, dst_kms_key_name=None): self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src) self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst) src_checksum = self.gcsio.checksum(src) dst_checksum = self.gcsio.checksum(dst) self.assertEqual(src_checksum, dst_checksum) actual_dst_kms_key = self.gcsio.kms_key(dst) if actual_dst_kms_key is None: self.assertEqual(actual_dst_kms_key, dst_kms_key_name) else: self.assertTrue(actual_dst_kms_key.startswith(dst_kms_key_name), "got: %s, wanted startswith: %s" % (actual_dst_kms_key, dst_kms_key_name)) def _test_copy(self, name, kms_key_name=None, max_bytes_rewritten_per_call=None, src=None): src = src or self.INPUT_FILE dst = self.gcs_tempdir + '/%s' % name extra_kwargs = {} if max_bytes_rewritten_per_call is not None: extra_kwargs['max_bytes_rewritten_per_call'] = ( max_bytes_rewritten_per_call) self.gcsio.copy(src, dst, kms_key_name, **extra_kwargs) self._verify_copy(src, dst, kms_key_name) @attr('IT') def test_copy(self): self._test_copy("test_copy") @attr('IT') def test_copy_kms(self): if self.kms_key_name is None: raise unittest.SkipTest('--kms_key_name not specified') self._test_copy("test_copy_kms", self.kms_key_name) @attr('IT') def test_copy_rewrite_token(self): # Tests a multi-part copy (rewrite) operation. This is triggered by a # combination of 3 conditions: # - a large enough src # - setting max_bytes_rewritten_per_call # - setting kms_key_name if self.kms_key_name is None: raise unittest.SkipTest('--kms_key_name not specified') rewrite_responses = [] self.gcsio._set_rewrite_response_callback( lambda response: rewrite_responses.append(response)) self._test_copy("test_copy_rewrite_token", kms_key_name=self.kms_key_name, max_bytes_rewritten_per_call=50 * 1024 * 1024, src=self.INPUT_FILE_LARGE) # Verify that there was a multi-part rewrite. 
self.assertTrue(any([not r.done for r in rewrite_responses])) def _test_copy_batch(self, name, kms_key_name=None, max_bytes_rewritten_per_call=None, src=None): num_copies = 10 srcs = [src or self.INPUT_FILE] * num_copies dsts = [self.gcs_tempdir + '/%s_%d' % (name, i) for i in range(num_copies)] src_dst_pairs = list(zip(srcs, dsts)) extra_kwargs = {} if max_bytes_rewritten_per_call is not None: extra_kwargs['max_bytes_rewritten_per_call'] = ( max_bytes_rewritten_per_call) result_statuses = self.gcsio.copy_batch( src_dst_pairs, kms_key_name, **extra_kwargs) for status in result_statuses: self.assertIsNone(status[2], status) for _src, _dst in src_dst_pairs: self._verify_copy(_src, _dst, kms_key_name) @attr('IT') def test_copy_batch(self): self._test_copy_batch("test_copy_batch") @attr('IT') def test_copy_batch_kms(self): if self.kms_key_name is None: raise unittest.SkipTest('--kms_key_name not specified') self._test_copy_batch("test_copy_batch_kms", self.kms_key_name) @attr('IT') def test_copy_batch_rewrite_token(self): # Tests a multi-part copy (rewrite) operation. This is triggered by a # combination of 3 conditions: # - a large enough src # - setting max_bytes_rewritten_per_call # - setting kms_key_name if self.kms_key_name is None: raise unittest.SkipTest('--kms_key_name not specified') rewrite_responses = [] self.gcsio._set_rewrite_response_callback( lambda response: rewrite_responses.append(response)) self._test_copy_batch( "test_copy_batch_rewrite_token", kms_key_name=self.kms_key_name, max_bytes_rewritten_per_call=50 * 1024 * 1024, src=self.INPUT_FILE_LARGE) # Verify that there was a multi-part rewrite. self.assertTrue(any([not r.done for r in rewrite_responses]))
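# --- Illustrative sketch (not part of the test suite). A GCS rewrite call
# copies at most max_bytes_rewritten_per_call bytes and, until the object is
# fully copied, returns done=False together with a token to resume from; the
# tests above assert that at least one such intermediate response was seen.
# The control flow is roughly (issue_rewrite_call stands in for the API):
def _rewrite_until_done(issue_rewrite_call):
  responses = []
  token = None
  while True:
    response = issue_rewrite_call(token)
    responses.append(response)  # mirrors _set_rewrite_response_callback
    if response.done:
      return responses
    token = response.rewrite_token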
class LeaderBoardIT(unittest.TestCase):

  # Input event containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'leader_board_it_input_topic'
  INPUT_SUB = 'leader_board_it_input_subscription'

  # SHA-1 hash generated from the sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = 'de00231fe6730b972c0ff60a99988438911cda53'
  OUTPUT_DATASET = 'leader_board_it_dataset'
  OUTPUT_TABLE_USERS = 'leader_board_users'
  OUTPUT_TABLE_TEAMS = 'leader_board_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 10 * 60 * 1000  # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    unique_topic_name = self.INPUT_TOPIC + _unique_id
    unique_subscription_name = self.INPUT_SUB + _unique_id
    self.input_topic = self.pubsub_client.topic(unique_topic_name)
    self.input_sub = self.input_topic.subscription(unique_subscription_name)

    self.input_topic.create()
    test_utils.wait_for_topics_created([self.input_topic])
    self.input_sub.create()

    # Set up BigQuery environment.
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""
    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.full_name)
    for _ in range(message_count):
      topic.publish(self.INPUT_EVENT % self._test_timestamp)

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub])
    test_utils.cleanup_topics([self.input_topic])

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_leader_board_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    success_condition = 'total_score=5000 LIMIT 1'
    users_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_USERS,
                                 success_condition))
    bq_users_verifier = BigqueryMatcher(self.project,
                                        users_query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    teams_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_TEAMS,
                                 success_condition))
    bq_teams_verifier = BigqueryMatcher(self.project,
                                        teams_query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'subscription': self.input_sub.full_name,
                  'dataset': self.dataset.name,
                  'topic': self.input_topic.full_name,
                  'team_window_duration': 1,
                  'wait_until_finish_duration':
                      self.WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               bq_users_verifier,
                                               bq_teams_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_USERS)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_TEAMS)

    # Generate input data and inject to PubSub.
test_utils.wait_for_subscriptions_created([self.input_topic, self.input_sub]) self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT) # Get pipeline options from command argument: --test-pipeline-options, # and start pipeline job by calling pipeline main function. leader_board.run( self.test_pipeline.get_full_options_as_args(**extra_opts))
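# --- Illustrative sketch (not part of the test suite). Every injected event is
# INPUT_EVENT with the test start time (epoch milliseconds) spliced into the
# processing-time field, so all DEFAULT_INPUT_COUNT messages are identical:
import time as _time_example

_event_template = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
_example_event = _event_template % int(_time_example.time() * 1000)
# e.g. 'user1,teamA,10,1546300800000,2015-11-02 09:09:28.224'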
class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_bq_streaming_inserts_' # Prevent nose from finding and running tests that were not # specified in the Gradle file. # See "More tests may be found" in: # https://nose.readthedocs.io/en/latest/doc_tests/test_multiprocess # /multiprocess.html#other-differences-in-test-running _multiprocess_can_split_ = True def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client = bigquery_tools.BigQueryWrapper() self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = "%s.output_table" % (self.dataset_id) _LOGGER.info("Created dataset %s in project %s", self.dataset_id, self.project) @attr('IT') def test_value_provider_transform(self): output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) schema = { 'fields': [{ 'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE' }] } additional_bq_parameters = { 'timePartitioning': { 'type': 'DAY' }, 'clustering': { 'fields': ['language'] } } table_ref = bigquery_tools.parse_table_reference(output_table_1) table_ref2 = bigquery_tools.parse_table_reference(output_table_2) pipeline_verifiers = [ BigQueryTableMatcher(project=self.project, dataset=table_ref.datasetId, table=table_ref.tableId, expected_properties=additional_bq_parameters), BigQueryTableMatcher(project=self.project, dataset=table_ref2.datasetId, table=table_ref2.tableId, expected_properties=additional_bq_parameters), BigqueryFullResultMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_1, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_2, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: input = p | beam.Create( [row for row in _ELEMENTS if 'language' in row]) _ = (input | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery( table=value_provider.StaticValueProvider( str, '%s:%s' % (self.project, output_table_1)), schema=value_provider.StaticValueProvider(dict, schema), additional_bq_parameters=additional_bq_parameters, method='STREAMING_INSERTS')) _ = (input | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery( table=value_provider.StaticValueProvider( str, '%s:%s' % (self.project, output_table_2)), schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT, additional_bq_parameters=lambda _: additional_bq_parameters, method='FILE_LOADS')) @attr('IT') def test_multiple_destinations_transform(self): streaming = self.test_pipeline.options.view_as( StandardOptions).streaming if streaming and isinstance(self.test_pipeline.runner, TestDataflowRunner): self.skipTest("TestStream is not supported on TestDataflowRunner") output_table_1 = '%s%s' % (self.output_table, 1) output_table_2 = '%s%s' % (self.output_table, 2) full_output_table_1 = '%s:%s' % (self.project, output_table_1) full_output_table_2 = '%s:%s' % (self.project, output_table_2) schema1 = { 
'fields': [{ 'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE' }] } schema2 = { 'fields': [{ 'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE' }, { 'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE' }] } bad_record = {'language': 1, 'manguage': 2} if streaming: pipeline_verifiers = [ PipelineStateMatcher(PipelineState.RUNNING), BigqueryFullResultStreamingMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_1, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultStreamingMatcher( project=self.project, query="SELECT name, foundation FROM %s" % output_table_2, data=[(d['name'], d['foundation']) for d in _ELEMENTS if 'foundation' in d]) ] else: pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query="SELECT name, language FROM %s" % output_table_1, data=[(d['name'], d['language']) for d in _ELEMENTS if 'language' in d]), BigqueryFullResultMatcher( project=self.project, query="SELECT name, foundation FROM %s" % output_table_2, data=[(d['name'], d['foundation']) for d in _ELEMENTS if 'foundation' in d]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), experiments='use_beam_bq_sink') with beam.Pipeline(argv=args) as p: if streaming: _SIZE = len(_ELEMENTS) test_stream = ( TestStream().advance_watermark_to(0).add_elements( _ELEMENTS[:_SIZE // 2]).advance_watermark_to( 100).add_elements( _ELEMENTS[_SIZE // 2:]).advance_watermark_to_infinity()) input = p | test_stream else: input = p | beam.Create(_ELEMENTS) schema_table_pcv = beam.pvalue.AsDict( p | "MakeSchemas" >> beam.Create([(full_output_table_1, schema1), (full_output_table_2, schema2)])) table_record_pcv = beam.pvalue.AsDict( p | "MakeTables" >> beam.Create([('table1', full_output_table_1), ('table2', full_output_table_2)])) input2 = p | "Broken record" >> beam.Create([bad_record]) input = (input, input2) | beam.Flatten() r = (input | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery( table=lambda x, tables: (tables['table1'] if 'language' in x else tables['table2']), table_side_inputs=(table_record_pcv, ), schema=lambda dest, table_map: table_map.get(dest, None), schema_side_inputs=(schema_table_pcv, ), insert_retry_strategy=RetryStrategy. RETRY_ON_TRANSIENT_ERROR, method='STREAMING_INSERTS')) assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS], equal_to([(full_output_table_1, bad_record)])) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project)
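# --- Illustrative sketch (not part of the test suite). The TestStream above
# replays _ELEMENTS in two watermark-separated batches; the same pattern
# reduced to a minimal, runnable DirectRunner pipeline looks like this
# (TestStream requires streaming mode, which the direct runner supports):
def _minimal_test_stream_example():
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.options.pipeline_options import StandardOptions
  from apache_beam.testing.test_stream import TestStream
  from apache_beam.testing.util import assert_that, equal_to

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=options) as p:
    elements = (
        p
        | TestStream().advance_watermark_to(0)
        .add_elements(['a', 'b'])
        .advance_watermark_to(100)
        .add_elements(['c'])
        .advance_watermark_to_infinity())
    assert_that(elements, equal_to(['a', 'b', 'c']))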
class BigQueryFileLoadsIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_bq_file_loads_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.dataset_id = '%s%s%s' % ( self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client = bigquery_tools.BigQueryWrapper() self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) self.output_table = '%s.output_table' % (self.dataset_id) self.table_ref = bigquery_tools.parse_table_reference(self.output_table) _LOGGER.info( 'Created dataset %s in project %s', self.dataset_id, self.project) @attr('IT') def test_avro_file_load(self): # Construct elements such that they can be written via Avro but not via # JSON. See BEAM-8841. from apache_beam.io.gcp import bigquery_file_loads old_max_files = bigquery_file_loads._MAXIMUM_SOURCE_URIS old_max_file_size = bigquery_file_loads._DEFAULT_MAX_FILE_SIZE bigquery_file_loads._MAXIMUM_SOURCE_URIS = 1 bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = 100 elements = [ { 'name': u'Negative infinity', 'value': -float('inf'), 'timestamp': datetime.datetime(1970, 1, 1, tzinfo=pytz.utc), }, { 'name': u'Not a number', 'value': float('nan'), 'timestamp': datetime.datetime(2930, 12, 9, tzinfo=pytz.utc), }, ] schema = beam.io.gcp.bigquery.WriteToBigQuery.get_dict_table_schema( bigquery.TableSchema( fields=[ bigquery.TableFieldSchema( name='name', type='STRING', mode='REQUIRED'), bigquery.TableFieldSchema( name='value', type='FLOAT', mode='REQUIRED'), bigquery.TableFieldSchema( name='timestamp', type='TIMESTAMP', mode='REQUIRED'), ])) pipeline_verifiers = [ # Some gymnastics here to avoid comparing NaN since NaN is not equal to # anything, including itself. BigqueryFullResultMatcher( project=self.project, query="SELECT name, value, timestamp FROM {} WHERE value<0".format( self.output_table), data=[(d['name'], d['value'], d['timestamp']) for d in elements[:1]], ), BigqueryFullResultMatcher( project=self.project, query="SELECT name, timestamp FROM {}".format(self.output_table), data=[(d['name'], d['timestamp']) for d in elements], ), ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers), ) with beam.Pipeline(argv=args) as p: input = p | 'CreateInput' >> beam.Create(elements) schema_pc = p | 'CreateSchema' >> beam.Create([schema]) _ = ( input | 'WriteToBigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery( table='%s:%s' % (self.project, self.output_table), schema=lambda _, schema: schema, schema_side_inputs=(beam.pvalue.AsSingleton(schema_pc), ), method='FILE_LOADS', temp_file_format=bigquery_tools.FileFormat.AVRO, )) bigquery_file_loads._MAXIMUM_SOURCE_URIS = old_max_files bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = old_max_file_size def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: _LOGGER.info( "Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug( 'Failed to clean up dataset %s in project %s', self.dataset_id, self.project)
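# --- Illustrative sketch (not part of the test suite). test_avro_file_load
# above restores _MAXIMUM_SOURCE_URIS and _DEFAULT_MAX_FILE_SIZE only when the
# pipeline succeeds; registering the restore with addCleanup would also undo
# the patch if the pipeline raises, since unittest runs cleanups regardless:
def _patch_module_constant(test_case, module, name, value):
  original = getattr(module, name)
  setattr(module, name, value)
  test_case.addCleanup(setattr, module, name, original)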
class GameStatsIT(unittest.TestCase): # Input events containing user, team, score, processing time, window start. INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224' INPUT_TOPIC = 'game_stats_it_input_topic' INPUT_SUB = 'game_stats_it_input_subscription' # SHA-1 hash generated from sorted rows reading from BigQuery table DEFAULT_EXPECTED_CHECKSUM = '5288ccaab77d347c8460d77c15a0db234ef5eb4f' OUTPUT_DATASET = 'game_stats_it_dataset' OUTPUT_TABLE_SESSIONS = 'game_stats_sessions' OUTPUT_TABLE_TEAMS = 'game_stats_teams' DEFAULT_INPUT_COUNT = 500 WAIT_UNTIL_FINISH_DURATION = 12 * 60 * 1000 # in milliseconds def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.project = self.test_pipeline.get_option('project') _unique_id = str(uuid.uuid4()) # Set up PubSub environment. from google.cloud import pubsub self.pub_client = pubsub.PublisherClient() self.input_topic = self.pub_client.create_topic( self.pub_client.topic_path(self.project, self.INPUT_TOPIC + _unique_id)) self.sub_client = pubsub.SubscriberClient() self.input_sub = self.sub_client.create_subscription( self.sub_client.subscription_path(self.project, self.INPUT_SUB + _unique_id), self.input_topic.name) # Set up BigQuery environment self.dataset_ref = utils.create_bq_dataset(self.project, self.OUTPUT_DATASET) self._test_timestamp = int(time.time() * 1000) def _inject_pubsub_game_events(self, topic, message_count): """Inject game events as test data to PubSub.""" logging.debug('Injecting %d game events to topic %s', message_count, topic.name) for _ in range(message_count): self.pub_client.publish(topic.name, (self.INPUT_EVENT % self._test_timestamp ).encode('utf-8')) def _cleanup_pubsub(self): test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub]) test_utils.cleanup_topics(self.pub_client, [self.input_topic]) @attr('IT') def test_game_stats_it(self): state_verifier = PipelineStateMatcher(PipelineState.RUNNING) success_condition = 'mean_duration=300 LIMIT 1' sessions_query = ('SELECT mean_duration FROM `%s.%s.%s` ' 'WHERE %s' % (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE_SESSIONS, success_condition)) bq_sessions_verifier = BigqueryMatcher(self.project, sessions_query, self.DEFAULT_EXPECTED_CHECKSUM) # TODO(mariagh): Add teams table verifier once game_stats.py is fixed. extra_opts = {'subscription': self.input_sub.name, 'dataset': self.dataset_ref.dataset_id, 'topic': self.input_topic.name, 'fixed_window_duration': 1, 'user_activity_window_duration': 1, 'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION, 'on_success_matcher': all_of(state_verifier, bq_sessions_verifier)} # Register cleanup before pipeline execution. # Note that actual execution happens in reverse order. self.addCleanup(self._cleanup_pubsub) self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref) # Generate input data and inject to PubSub. self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT) # Get pipeline options from command argument: --test-pipeline-options, # and start pipeline job by calling pipeline main function. game_stats.run( self.test_pipeline.get_full_options_as_args(**extra_opts))
class BigQueryWriteIntegrationTests(unittest.TestCase): BIG_QUERY_DATASET_ID = 'python_write_to_table_' def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.runner_name = type(self.test_pipeline.runner).__name__ self.project = self.test_pipeline.get_option('project') self.bigquery_client = BigQueryWrapper() self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID, str(int(time.time())), random.randint(0, 10000)) self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id) _LOGGER.info("Created dataset %s in project %s", self.dataset_id, self.project) def tearDown(self): request = bigquery.BigqueryDatasetsDeleteRequest( projectId=self.project, datasetId=self.dataset_id, deleteContents=True) try: _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id, self.project) self.bigquery_client.client.datasets.Delete(request) except HttpError: _LOGGER.debug('Failed to clean up dataset %s in project %s', self.dataset_id, self.project) def create_table(self, table_name): table_schema = bigquery.TableSchema() table_field = bigquery.TableFieldSchema() table_field.name = 'int64' table_field.type = 'INT64' table_field.mode = 'REQUIRED' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'bytes' table_field.type = 'BYTES' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'date' table_field.type = 'DATE' table_schema.fields.append(table_field) table_field = bigquery.TableFieldSchema() table_field.name = 'time' table_field.type = 'TIME' table_schema.fields.append(table_field) table = bigquery.Table(tableReference=bigquery.TableReference( projectId=self.project, datasetId=self.dataset_id, tableId=table_name), schema=table_schema) request = bigquery.BigqueryTablesInsertRequest( projectId=self.project, datasetId=self.dataset_id, table=table) self.bigquery_client.client.tables.Insert(request) @pytest.mark.it_postcommit def test_big_query_write(self): table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ { 'number': 1, 'str': 'abc' }, { 'number': 2, 'str': 'def' }, { 'number': 3, 'str': u'你好' }, { 'number': 4, 'str': u'привет' }, ] table_schema = { "fields": [{ "name": "number", "type": "INTEGER" }, { "name": "str", "type": "STRING" }] } pipeline_verifiers = [ BigqueryFullResultMatcher(project=self.project, query="SELECT number, str FROM %s" % table_id, data=[( 1, 'abc', ), ( 2, 'def', ), ( 3, u'你好', ), ( 4, u'привет', )]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition. 
CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY)) @pytest.mark.it_postcommit def test_big_query_write_schema_autodetect(self): if self.runner_name == 'TestDataflowRunner': self.skipTest( 'DataflowRunner does not support schema autodetection') table_name = 'python_write_table' table_id = '{}.{}'.format(self.dataset_id, table_name) input_data = [ { 'number': 1, 'str': 'abc' }, { 'number': 2, 'str': 'def' }, ] pipeline_verifiers = [ BigqueryFullResultMatcher(project=self.project, query="SELECT number, str FROM %s" % table_id, data=[( 1, 'abc', ), ( 2, 'def', )]) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, method=beam.io.WriteToBigQuery.Method.FILE_LOADS, schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT, create_disposition=beam.io.BigQueryDisposition. CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY, temp_file_format=FileFormat.JSON)) @pytest.mark.it_postcommit def test_big_query_write_new_types(self): table_name = 'python_new_types_table' table_id = '{}.{}'.format(self.dataset_id, table_name) row_data = { 'float': 0.33, 'numeric': Decimal('10'), 'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31', 'time': '23:59:59', 'datetime': '2018-12-31T12:44:31', 'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)' } input_data = [row_data] # add rows with only one key value pair and None values for all other keys for key, value in row_data.items(): input_data.append({key: value}) table_schema = { "fields": [{ "name": "float", "type": "FLOAT" }, { "name": "numeric", "type": "NUMERIC" }, { "name": "bytes", "type": "BYTES" }, { "name": "date", "type": "DATE" }, { "name": "time", "type": "TIME" }, { "name": "datetime", "type": "DATETIME" }, { "name": "timestamp", "type": "TIMESTAMP" }, { "name": "geo", "type": "GEOGRAPHY" }] } expected_row = ( 0.33, Decimal('10'), b'\xab\xac', datetime.date(3000, 12, 31), datetime.time(23, 59, 59), datetime.datetime(2018, 12, 31, 12, 44, 31), datetime.datetime(2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc), 'POINT(30 10)', ) expected_data = [expected_row] # add rows with only one key value pair and None values for all other keys for i, value in enumerate(expected_row): row = [None] * len(expected_row) row[i] = value expected_data.append(tuple(row)) pipeline_verifiers = [ BigqueryFullResultMatcher( project=self.project, query='SELECT float, numeric, bytes, date, time, datetime,' 'timestamp, geo FROM %s' % table_id, data=expected_data) ] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=hc.all_of(*pipeline_verifiers)) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, create_disposition=beam.io.BigQueryDisposition. 
CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @pytest.mark.it_postcommit
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [{
        'int64': 1,
        'bytes': b'xyw',
        'date': '2011-01-01',
        'time': '23:59:59.999999'
    }, {
        'int64': 2,
        'bytes': b'abc',
        'date': '2000-01-01',
        'time': '00:00:00'
    }, {
        'int64': 3,
        'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
        'date': '3000-12-31',
        'time': '23:59:59'
    }, {
        'int64': 4,
        'bytes': b'\xab\xac\xad',
        'date': '2000-01-01',
        'time': '00:00:00'
    }]

    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT int64, bytes, date, time FROM %s" % table_id,
            data=[(
                1,
                b'xyw',
                datetime.date(2011, 1, 1),
                datetime.time(23, 59, 59, 999999),
            ), (
                2,
                b'abc',
                datetime.date(2000, 1, 1),
                datetime.time(0, 0, 0),
            ), (
                3,
                b'\xe4\xbd\xa0\xe5\xa5\xbd',
                datetime.date(3000, 12, 31),
                datetime.time(23, 59, 59),
            ), (
                4,
                b'\xab\xac\xad',
                datetime.date(2000, 1, 1),
                datetime.time(0, 0, 0),
            )])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
           temp_file_format=FileFormat.JSON))

  @pytest.mark.it_postcommit
  def test_big_query_write_insert_errors_reporting(self):
    """
    Test that errors returned by beam.io.WriteToBigQuery contain both the
    failed rows and the reason they failed.
    """
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [{
        'number': 1,
        'str': 'some_string',
    }, {
        'number': 2
    }, {
        'number': 3,
        'str': 'some_string',
        'additional_field_str': 'some_string',
    }]

    table_schema = {
        "fields": [{
            "name": "number",
            "type": "INTEGER",
            'mode': 'REQUIRED'
        }, {
            "name": "str",
            "type": "STRING",
            'mode': 'REQUIRED'
        }]
    }

    bq_result_errors = [(
        {
            "number": 2
        },
        [{
            "reason": "invalid",
            "location": "",
            "debugInfo": "",
            "message": "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str."
        }],
    ), ({
        "number": 3,
        "str": "some_string",
        "additional_field_str": "some_string"
    }, [{
        "reason": "invalid",
        "location": "additional_field_str",
        "debugInfo": "",
        "message": "no such field: additional_field_str."
    }])]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(project=self.project,
                                  query="SELECT number, str FROM %s" %
                                  table_id,
                                  data=[(1, 'some_string')]),
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      errors = (
          p | 'create' >> beam.Create(input_data)
          | 'write' >> beam.io.WriteToBigQuery(
              table_id,
              schema=table_schema,
              method='STREAMING_INSERTS',
              insert_retry_strategy='RETRY_NEVER',
              create_disposition=beam.io.BigQueryDisposition.
CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) ) assert_that( errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS] | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])), equal_to(bq_result_errors)) @pytest.mark.it_postcommit @parameterized.expand([ param(file_format=FileFormat.AVRO), param(file_format=FileFormat.JSON), param(file_format=None), ]) @mock.patch("apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS", new=1) def test_big_query_write_temp_table_append_schema_update( self, file_format): """ Test that nested schema update options and schema relaxation are respected when appending to an existing table via temporary tables. _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple load jobs and usage of temporary tables. """ table_name = 'python_append_schema_update' self.create_table(table_name) table_id = '{}.{}'.format(self.dataset_id, table_name) # bytes, date, time fields are optional and omitted in the test # only required and new columns are specified table_schema = { "fields": [{ "name": "int64", "type": "INT64", "mode": "NULLABLE", }, { "name": "bool", "type": "BOOL", }, { "name": "nested_field", "type": "RECORD", "mode": "REPEATED", "fields": [ { "name": "fruit", "type": "STRING", "mode": "NULLABLE" }, ] }] } input_data = [{ "int64": 1, "bool": True, "nested_field": [{ "fruit": "Apple" }] }, { "bool": False, "nested_field": [{ "fruit": "Mango" }] }, { "int64": None, "bool": True, "nested_field": [{ "fruit": "Banana" }] }] args = self.test_pipeline.get_full_options_as_args( on_success_matcher=BigqueryFullResultMatcher( project=self.project, query=""" SELECT bytes, date, time, int64, bool, fruit FROM {}, UNNEST(nested_field) as nested_field ORDER BY fruit """.format(table_id), data=[(None, None, None, 1, True, "Apple"), ( None, None, None, None, True, "Banana"), (None, None, None, None, False, "Mango")])) with beam.Pipeline(argv=args) as p: # pylint: disable=expression-not-assigned (p | 'create' >> beam.Create(input_data) | 'write' >> beam.io.WriteToBigQuery( table_id, schema=table_schema, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, max_file_size=1, # bytes method=beam.io.WriteToBigQuery.Method.FILE_LOADS, additional_bq_parameters={ 'schemaUpdateOptions': ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'] }, temp_file_format=file_format))
class CoGroupByKeyTest(unittest.TestCase): def parseTestPipelineOptions(self, options): return { 'numRecords': options.get('num_records'), 'keySizeBytes': options.get('key_size'), 'valueSizeBytes': options.get('value_size'), 'bundleSizeDistribution': { 'type': options.get( 'bundle_size_distribution_type', 'const' ), 'param': options.get('bundle_size_distribution_param', 0) }, 'forceNumInitialBundles': options.get( 'force_initial_num_bundles', 0 ) } def setUp(self): self.pipeline = TestPipeline(is_integration_test=True) self.inputOptions = json.loads(self.pipeline.get_option('input_options')) self.coInputOptions = json.loads( self.pipeline.get_option('co_input_options')) class _Ungroup(beam.DoFn): def process(self, element): values = element[1] inputs = values.get(INPUT_TAG) co_inputs = values.get(CO_INPUT_TAG) for i in inputs: yield i for i in co_inputs: yield i def testCoGroupByKey(self): with self.pipeline as p: pc1 = (p | 'Read ' + INPUT_TAG >> beam.io.Read( synthetic_pipeline.SyntheticSource( self.parseTestPipelineOptions(self.inputOptions))) | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x)) ) pc2 = (p | 'Read ' + CO_INPUT_TAG >> beam.io.Read( synthetic_pipeline.SyntheticSource( self.parseTestPipelineOptions(self.coInputOptions))) | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map( lambda x: (x, x)) ) # pylint: disable=expression-not-assigned ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2} | 'CoGroupByKey: ' >> beam.CoGroupByKey() | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup()) | 'Measure time' >> beam.ParDo(MeasureTime()) ) result = p.run() result.wait_until_finish() metrics = result.metrics().query() for dist in metrics['distributions']: logging.info("Distribution: %s", dist)
class FlightDelaysTest(unittest.TestCase): EXPECTED = { '2012-12-23': [ ('AA', 20.082559339525282, 12.825593395252838), ('EV', 10.01901901901902, 4.431431431431432), ('HA', -1.0829015544041452, 0.010362694300518135), ('UA', 19.142555438225976, 11.07180570221753), ('MQ', 8.902255639097744, 3.6676691729323307), ('OO', 31.148883374689827, 31.90818858560794), ('US', 3.092541436464088, -2.350828729281768), ('WN', 12.074298711144806, 6.717968157695224), ('AS', 5.0456273764258555, 1.0722433460076046), ('B6', 20.646569646569645, 16.405405405405407), ('DL', 5.2559923298178335, -3.214765100671141), ('F9', 23.823529411764707, 25.455882352941178), ('FL', 4.492877492877493, -0.8005698005698005), ('VX', 62.755102040816325, 62.61224489795919), ('YV', 16.155844155844157, 13.376623376623376), ], '2012-12-24': [ ('AS', 0.5917602996254682, -2.2659176029962547), ('B6', 8.070993914807302, 2.73630831643002), ('DL', 3.7171824973319105, -2.2358591248665953), ('F9', 14.111940298507463, 15.888059701492537), ('FL', 2.4210526315789473, 2.242690058479532), ('VX', 3.841666666666667, -2.4166666666666665), ('YV', 0.32, 0.78), ('MQ', 15.869642857142857, 9.992857142857142), ('OO', 11.048517520215633, 10.138814016172507), ('US', 1.369281045751634, -1.4101307189542485), ('WN', 7.515952597994531, 0.7028258887876025), ('AA', 7.049086757990867, -1.5970319634703196), ('EV', 7.297101449275362, 2.2693236714975846), ('HA', -2.6785714285714284, -2.4744897959183674), ('UA', 10.935406698564593, -1.3337320574162679), ], '2012-12-25': [ ('AS', 3.4816326530612245, 0.27346938775510204), ('B6', 9.10590631364562, 3.989816700610998), ('DL', 2.3022170361726952, -3.6709451575262544), ('F9', 19.38255033557047, 21.845637583892618), ('FL', 1.3982300884955752, 0.9380530973451328), ('VX', 23.62878787878788, 23.636363636363637), ('YV', 11.256302521008404, 11.659663865546218), ('MQ', 32.6, 44.28666666666667), ('OO', 16.2275960170697, 17.11948790896159), ('US', 2.7953216374269005, 0.2236842105263158), ('WN', 14.405783582089553, 10.111940298507463), ('AA', 23.551581843191197, 35.62585969738652), ('EV', 17.368638239339752, 16.43191196698762), ('HA', -4.725806451612903, -3.9946236559139785), ('UA', 16.663145539906104, 10.772300469483568), ], } def setUp(self): self.test_pipeline = TestPipeline(is_integration_test=True) self.outdir = (self.test_pipeline.get_option('temp_location') + '/flight_delays_it-' + str(uuid.uuid4())) self.output_path = os.path.join(self.outdir, 'output.csv') def tearDown(self): FileSystems.delete([self.outdir + '/']) @pytest.mark.it_postcommit def test_flight_delays(self): flight_delays.run_flight_delay_pipeline(self.test_pipeline, start_date='2012-12-23', end_date='2012-12-25', output=self.output_path) def read_csv(path): with FileSystems.open(path) as fp: return pd.read_csv(fp) # Parse result file and compare. for date, expectation in self.EXPECTED.items(): result_df = pd.concat( read_csv(metadata.path) for metadata in FileSystems.match( [f'{self.output_path}-{date}*'])[0].metadata_list) result_df = result_df.sort_values('airline').reset_index(drop=True) expected_df = pd.DataFrame( expectation, columns=['airline', 'departure_delay', 'arrival_delay']) expected_df = expected_df.sort_values('airline').reset_index( drop=True) try: pd.testing.assert_frame_equal(result_df, expected_df) except AssertionError as e: raise AssertionError(f"date={date!r} result DataFrame:\n\n" f"{result_df}\n\n" "Differs from Expectation:\n\n" f"{expected_df}") from e
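# --- Illustrative sketch (not part of the test suite). Shard concatenation
# does not guarantee row order, so test_flight_delays normalizes both frames
# before assert_frame_equal; in isolation the comparison pattern is:
def _frame_comparison_example():
  import pandas as pd
  left = pd.DataFrame({'airline': ['EV', 'AA'], 'arrival_delay': [4.4, 12.8]})
  right = pd.DataFrame({'airline': ['AA', 'EV'], 'arrival_delay': [12.8, 4.4]})
  left = left.sort_values('airline').reset_index(drop=True)
  right = right.sort_values('airline').reset_index(drop=True)
  pd.testing.assert_frame_equal(left, right)  # passes only after normalizing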
class BigQueryQueryToTableIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY %
(self.dataset_id, NEW_TYPES_INPUT_TABLE), 'output': self.output_table, 'output_schema': NEW_TYPES_OUTPUT_SCHEMA, 'use_standard_sql': False, 'on_success_matcher': all_of(*pipeline_verifiers)} options = self.test_pipeline.get_full_options_as_args(**extra_opts) big_query_query_to_table_pipeline.run_bq_pipeline(options)
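# The query and schema constants used above (LEGACY_QUERY, STANDARD_QUERY,
# NEW_TYPES_QUERY, DIALECT_OUTPUT_VERIFY_QUERY, and the *_SCHEMA /
# *_EXPECTED values) are module-level definitions that live elsewhere in
# this file. A minimal sketch of their likely shape, assuming the dialect
# tests select a single 'fruit' column -- these values are illustrative
# placeholders, not the real definitions:
#
#   LEGACY_QUERY = 'SELECT * FROM (SELECT "apple" AS fruit)'
#   STANDARD_QUERY = ('SELECT * FROM (SELECT "apple" AS fruit) '
#                     'UNION ALL (SELECT "orange" AS fruit)')
#   NEW_TYPES_QUERY = 'SELECT bytes, date, time FROM [%s.%s]'
#   DIALECT_OUTPUT_VERIFY_QUERY = 'SELECT fruit FROM `%s`'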
class GroupByKeyTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.input_options.get(
                'bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(
        self.pipeline.get_option('input_options'))

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    # Publish metrics only when all three BigQuery options are provided.
    check = (metrics_project_id is not None
             and self.metrics_namespace is not None
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more parameters for collecting metrics '
                       'are empty.')

  def testGroupByKey(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time: Start' >> beam.ParDo(
           MeasureTime(self.metrics_namespace))
       | 'GroupByKey' >> beam.GroupByKey()
       | 'Ungroup' >> beam.FlatMap(
           lambda elm: [(elm[0], v) for v in elm[1]])
       | 'Measure time: End' >> beam.ParDo(
           MeasureTime(self.metrics_namespace)))

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
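# The synthetic-source options parsed above arrive as a JSON string on the
# command line. An illustrative invocation (the key names mirror the
# lookups in parseTestPipelineOptions; the values are arbitrary examples):
#
#   --test-pipeline-options="...
#       --input_options='{
#           \"num_records\": 300,
#           \"key_size\": 5,
#           \"value_size\": 15,
#           \"bundle_size_distribution_type\": \"const\",
#           \"bundle_size_distribution_param\": 1,
#           \"force_initial_num_bundles\": 0}'"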
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = {
      # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
      # label_ids, nor writing timestamp attributes. Once these features
      # exist, TestDirectRunner and TestDataflowRunner should behave
      # identically.
      'TestDirectRunner': [
          PubsubMessage('data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute,
          # the IT pipeline writes back the timestamp of each element (as
          # reported by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
      'TestDataflowRunner': [
          # Use ID_LABEL attribute to deduplicate messages with the same ID.
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute,
          # the IT pipeline writes back the timestamp of each element (as
          # reported by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
  }
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
      'TestDataflowRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
  }

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
    self.output_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          INPUT_SUB + self.uuid),
        self.input_topic.name)
    self.output_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          OUTPUT_SUB + self.uuid),
        self.output_topic.name)

  def tearDown(self):
    test_utils.cleanup_subscriptions(self.sub_client,
                                     [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(self.pub_client,
                              [self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output.
    # These verifications run on a (remote) worker.
    # Expect the state to be RUNNING, since a streaming pipeline is usually
    # never DONE. The test runner will cancel the pipeline after
    # verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      expected_messages = [pubsub_msg.data
                           for pubsub_msg in expected_messages]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {'input_subscription': self.input_sub.name,
                  'output_topic': self.output_topic.name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    for msg in self.INPUT_MESSAGES[self.runner_name]:
      self.pub_client.publish(self.input_topic.name, msg.data,
                              **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
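# INPUT_TOPIC, OUTPUT_TOPIC, INPUT_SUB, OUTPUT_SUB, MESSAGE_MATCHER_TIMEOUT_S
# and TEST_PIPELINE_DURATION_MS are module-level constants defined elsewhere
# in this file. Plausible placeholder values, shown only to make the excerpt
# self-contained (assumed, not verbatim):
#
#   INPUT_TOPIC = 'psit_topic_input'
#   OUTPUT_TOPIC = 'psit_topic_output'
#   INPUT_SUB = 'psit_subscription_input'
#   OUTPUT_SUB = 'psit_subscription_output'
#   MESSAGE_MATCHER_TIMEOUT_S = 5 * 60         # seconds
#   TEST_PIPELINE_DURATION_MS = 3 * 60 * 1000  # milliseconds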
class PubSubBigQueryIT(unittest.TestCase):

  INPUT_TOPIC = 'psit_topic_output'
  INPUT_SUB = 'psit_subscription_input'

  BIG_QUERY_DATASET_ID = 'python_pubsub_bq_'
  SCHEMA = {'fields': [{'name': 'number',
                        'type': 'INTEGER',
                        'mode': 'NULLABLE'}]}
  _SIZE = 4

  WAIT_UNTIL_FINISH_DURATION = 15 * 60 * 1000

  def setUp(self):
    # Set up PubSub.
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project,
                                   self.INPUT_TOPIC + self.uuid))
    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          self.INPUT_SUB + self.uuid),
        self.input_topic.name)

    # Set up BQ.
    self.dataset_ref = utils.create_bq_dataset(self.project,
                                               self.BIG_QUERY_DATASET_ID)
    self.output_table = '%s.output_table' % self.dataset_ref.dataset_id

  def tearDown(self):
    # Tear down PubSub.
    test_utils.cleanup_topics(self.pub_client, [self.input_topic])
    test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
    # Tear down BigQuery.
    utils.delete_bq_dataset(self.project, self.dataset_ref)

  def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
    numbers = list(range(self._SIZE))
    matchers = [
        PipelineStateMatcher(PipelineState.RUNNING),
        BigqueryFullResultStreamingMatcher(
            project=self.project,
            query='SELECT number FROM %s' % self.output_table,
            data=[(i,) for i in numbers])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*matchers),
        wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
        experiments='use_beam_bq_sink',
        streaming=True)

    def add_schema_info(element):
      yield {'number': element}

    messages = [str(i).encode('utf-8') for i in numbers]
    for message in messages:
      self.pub_client.publish(self.input_topic.name, message)

    with beam.Pipeline(argv=args) as p:
      rows = (p
              | ReadFromPubSub(subscription=self.input_sub.name)
              | beam.ParDo(add_schema_info))
      _ = rows | WriteToBigQuery(
          self.output_table,
          schema=self.SCHEMA,
          method=method,
          triggering_frequency=triggering_frequency)

  @attr('IT')
  def test_streaming_inserts(self):
    self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.STREAMING_INSERTS)

  @attr('IT')
  def test_file_loads(self):
    if isinstance(self.test_pipeline.runner, TestDataflowRunner):
      self.skipTest('https://issuetracker.google.com/issues/118375066')
    self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.FILE_LOADS,
                                 triggering_frequency=20)
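# With an unbounded (streaming) input, WriteToBigQuery's FILE_LOADS method
# needs a triggering_frequency (in seconds) to decide how often to kick off
# load jobs, which is why test_file_loads passes triggering_frequency=20;
# STREAMING_INSERTS writes rows as they arrive and takes no such parameter.
# A hypothetical sketch of the same call outside this test (the table name
# and frequency are illustrative):
#
#   rows | WriteToBigQuery('my_dataset.my_table',
#                          schema=SCHEMA,
#                          method=WriteToBigQuery.Method.FILE_LOADS,
#                          triggering_frequency=60)  # load every 60 seconds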
class CoGroupByKeyTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options):
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(
        self.pipeline.get_option('input_options'))
    self.co_input_options = json.loads(
        self.pipeline.get_option('co_input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None
    # Publish metrics only when all three BigQuery options are provided.
    check = (metrics_project_id is not None
             and self.metrics_namespace is not None
             and metrics_dataset is not None)
    if check:
      measured_values = [{'name': RUNTIME_LABEL,
                          'type': 'FLOAT',
                          'mode': 'REQUIRED'}]
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
          schema_map=measured_values
      )
    else:
      logging.error('One or more parameters for collecting metrics '
                    'are empty. Metrics will not be collected.')

  class _Ungroup(beam.DoFn):
    def process(self, element):
      values = element[1]
      inputs = values.get(INPUT_TAG)
      co_inputs = values.get(CO_INPUT_TAG)
      for i in inputs:
        yield i
      for i in co_inputs:
        yield i

  def testCoGroupByKey(self):
    with self.pipeline as p:
      pc1 = (p
             | 'Read ' + INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.input_options)))
             | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(
                 lambda x: (x, x))
             | 'Measure time: Start pc1' >> beam.ParDo(
                 MeasureTime(self.metrics_namespace)))

      pc2 = (p
             | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.co_input_options)))
             | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
                 lambda x: (x, x))
             | 'Measure time: Start pc2' >> beam.ParDo(
                 MeasureTime(self.metrics_namespace)))

      # pylint: disable=expression-not-assigned
      ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
       | 'CoGroupByKey' >> beam.CoGroupByKey()
       | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
       | 'Measure time: End' >> beam.ParDo(
           MeasureTime(self.metrics_namespace)))

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
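# INPUT_TAG, CO_INPUT_TAG and RUNTIME_LABEL are module-level constants
# defined elsewhere in this file; placeholder values are shown here only
# for readability (assumed, not verbatim):
#
#   INPUT_TAG = 'pc1'
#   CO_INPUT_TAG = 'pc2'
#   RUNTIME_LABEL = 'runtime'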
class GameStatsIT(unittest.TestCase):

  # Input events containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'game_stats_it_input_topic'
  INPUT_SUB = 'game_stats_it_input_subscription'

  # SHA-1 hash generated from sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = '5288ccaab77d347c8460d77c15a0db234ef5eb4f'
  OUTPUT_DATASET = 'game_stats_it_dataset'
  OUTPUT_TABLE_SESSIONS = 'game_stats_sessions'
  OUTPUT_TABLE_TEAMS = 'game_stats_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 12 * 60 * 1000  # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project,
                                   self.INPUT_TOPIC + _unique_id))
    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          self.INPUT_SUB + _unique_id),
        self.input_topic.name)

    # Set up BigQuery environment.
    self.dataset_ref = utils.create_bq_dataset(self.project,
                                               self.OUTPUT_DATASET)
    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""
    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.name)
    for _ in range(message_count):
      self.pub_client.publish(
          topic.name,
          (self.INPUT_EVENT % self._test_timestamp).encode('utf-8'))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
    test_utils.cleanup_topics(self.pub_client, [self.input_topic])

  @pytest.mark.it_postcommit
  def test_game_stats_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    success_condition = 'mean_duration=300 LIMIT 1'
    sessions_query = ('SELECT mean_duration FROM `%s.%s.%s` '
                      'WHERE %s' % (self.project,
                                    self.dataset_ref.dataset_id,
                                    self.OUTPUT_TABLE_SESSIONS,
                                    success_condition))
    bq_sessions_verifier = BigqueryMatcher(self.project,
                                           sessions_query,
                                           self.DEFAULT_EXPECTED_CHECKSUM)
    # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

    extra_opts = {
        'subscription': self.input_sub.name,
        'dataset': self.dataset_ref.dataset_id,
        'topic': self.input_topic.name,
        'fixed_window_duration': 1,
        'user_activity_window_duration': 1,
        'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, bq_sessions_verifier)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Generate input data and inject to PubSub.
    self._inject_pubsub_game_events(self.input_topic,
                                    self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    game_stats.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
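# Every injected message is the same CSV row with the wall-clock time in
# milliseconds substituted for %d. For example, with
# self._test_timestamp == 1541152168000, INPUT_EVENT renders to:
#
#   'user1,teamA,10,1541152168000,2015-11-02 09:09:28.224'
#
# Because all DEFAULT_INPUT_COUNT events share one user and one processing
# timestamp, the session statistics are presumably deterministic, which is
# why the verifier can assert a single fixed mean_duration value.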
class ParDoTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'),
            'param': self.inputOptions.get(
                'bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option(
        'number_of_counter_operations')
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

  class _MeasureTime(beam.DoFn):
    def __init__(self):
      self.runtime_start = Metrics.distribution('pardo', 'runtime.start')
      self.runtime_end = Metrics.distribution('pardo', 'runtime.end')

    def start_bundle(self):
      self.runtime_start.update(time.time())

    def finish_bundle(self):
      self.runtime_end.update(time.time())

    def process(self, element):
      yield element

  class _GetElement(beam.DoFn):
    def __init__(self):
      self.counter = Metrics.counter('pardo', 'total_bytes.count')

    def process(self, element):
      _, value = element
      # Count each byte of the value; one increment per byte keeps the
      # per-element counter work proportional to the value size.
      for _ in range(len(value)):
        self.counter.inc(1)
      yield element

  def testParDo(self):
    if self.iterations is None:
      num_runs = 1
    else:
      num_runs = int(self.iterations)

    with self.pipeline as p:
      pc = (p
            | 'Read synthetic' >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self.parseTestPipelineOptions()))
            | 'Measure time' >> beam.ParDo(self._MeasureTime()))

      for i in range(num_runs):
        label = 'Step: %d' % i
        pc = (pc | label >> beam.ParDo(self._GetElement()))

      if self.output is not None:
        # pylint: disable=expression-not-assigned
        (pc | 'Write' >> beam.io.WriteToText(self.output))

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()

      for counter in metrics['counters']:
        logging.info('Counter: %s', counter)

      for dist in metrics['distributions']:
        logging.info('Distribution: %s', dist)
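# The unfiltered query() above returns every metric in the job. To inspect
# just the byte counter defined in _GetElement, the query can be narrowed
# with a MetricsFilter -- a small sketch of the same pattern:
#
#   from apache_beam.metrics.metric import MetricsFilter
#
#   byte_counts = result.metrics().query(
#       MetricsFilter().with_namespace('pardo')
#                      .with_name('total_bytes.count'))
#   for counter in byte_counts['counters']:
#     logging.info('Total bytes seen: %d', counter.committed)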