def test_leader_board_it(self):
    """Streaming leader_board IT: publish game events to Pub/Sub and check
    the users and teams score tables in BigQuery against a known checksum.
    """
    # A streaming job never finishes on its own; RUNNING is the success state.
    running_matcher = PipelineStateMatcher(PipelineState.RUNNING)
    success_condition = 'total_score=5000 LIMIT 1'

    def score_query(table_name):
        # Legacy-SQL table reference syntax: [project:dataset.table].
        return ('SELECT total_score FROM [%s:%s.%s] '
                'WHERE %s' % (
                    self.project,
                    self.dataset.name,
                    table_name,
                    success_condition))

    users_matcher = BigqueryMatcher(
        self.project,
        score_query(self.OUTPUT_TABLE_USERS),
        self.DEFAULT_EXPECTED_CHECKSUM)
    teams_matcher = BigqueryMatcher(
        self.project,
        score_query(self.OUTPUT_TABLE_TEAMS),
        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'subscription': self.input_sub.full_name,
        'dataset': self.dataset.name,
        'topic': self.input_topic.full_name,
        'team_window_duration': 1,
        'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(
            running_matcher, users_matcher, teams_matcher)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(
        utils.delete_bq_table,
        self.project,
        self.dataset.name,
        self.OUTPUT_TABLE_USERS)
    self.addCleanup(
        utils.delete_bq_table,
        self.project,
        self.dataset.name,
        self.OUTPUT_TABLE_TEAMS)

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created(
        [self.input_topic, self.input_sub])
    self._inject_pubsub_game_events(
        self.input_topic, self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    leader_board.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_hourly_team_score_it(self):
    """Batch hourly_team_score IT: run the pipeline over the default input
    and checksum the resulting BigQuery table."""
    # Batch job must reach DONE.
    done_matcher = PipelineStateMatcher(PipelineState.DONE)
    # Standard-SQL row count over the output table.
    row_count_query = (
        'SELECT COUNT(*) FROM `%s.%s.%s`' % (
            self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))
    bq_matcher = BigqueryMatcher(
        self.project, row_count_query, self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'dataset': self.dataset_ref.dataset_id,
        'window_duration': 1,
        'on_success_matcher': all_of(done_matcher, bq_matcher)
    }

    # Cleanup is registered before execution so it runs even on failure.
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_big_query_standard_sql_kms_key_native(self):
    """Standard-SQL query-to-table via the native BQ sink with a
    customer-managed KMS key; verifies the output table is encrypted."""
    if isinstance(self.test_pipeline.runner, TestDirectRunner):
        self.skipTest("This test doesn't work on DirectRunner.")

    checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum)
    ]

    kms_key = self.test_pipeline.get_option('kms_key_name')
    self.assertTrue(kms_key)

    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*verifiers),
        'kms_key': kms_key,
        'native': True,
        'experiments': 'use_legacy_bq_sink',
    }
    big_query_query_to_table_pipeline.run_bq_pipeline(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

    # Confirm the written table carries the expected KMS key.
    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertIsNotNone(
        table.encryptionConfiguration,
        'No encryption configuration found: %s' % table)
    self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)
def test_big_query_new_types_native(self):
    """Native BQ sink with JSON exports, exercising newer BigQuery types."""
    checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum,
            timeout_secs=30,
        )
    ]

    # Create and populate the input table used by the query below.
    self._setup_new_types_env()

    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'native': True,
        'use_json_exports': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*verifiers),
        'experiments': 'use_legacy_bq_sink',
    }
    big_query_query_to_table_pipeline.run_bq_pipeline(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_bigquery_tornadoes_it(self):
    """Run the bigquery_tornadoes example and checksum its output table."""
    test_pipeline = TestPipeline(is_integration_test=True)
    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')
    dataset = 'BigQueryTornadoesIT'
    # Timestamped table name keeps concurrent runs from colliding.
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM [%s]' % output_table

    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*verifiers)
    }

    # Register cleanup before pipeline execution.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
def test_filters_output_bigquery_matcher(self):
    """Run the filters example and checksum its BigQuery output table."""
    test_pipeline = TestPipeline(is_integration_test=True)
    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')
    dataset = 'FiltersTestIT'
    # Timestamped table name keeps concurrent runs from colliding.
    table = 'cold_days_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*verifiers)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    filters.run(test_pipeline.get_full_options_as_args(**extra_opts))
def test_hourly_team_score_output_checksum_on_small_input(self):
    """hourly_team_score over a small fixed input; checksum the output."""
    # Small dataset to prevent Out of Memory when running in local runners
    INPUT_FILE = 'gs://apache-beam-samples/game/small/gaming_data.csv'
    EXPECTED_CHECKSUM = '91143e81622aa391eb62eaa3f3a5123401edb07d'

    done_matcher = PipelineStateMatcher(PipelineState.DONE)
    row_count_query = (
        'SELECT COUNT(*) FROM `%s.%s.%s`' % (
            self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))
    bq_matcher = BigqueryMatcher(
        self.project, row_count_query, EXPECTED_CHECKSUM)

    extra_opts = {
        'input': INPUT_FILE,
        'dataset': self.dataset_ref.dataset_id,
        'window_duration': 1,
        'on_success_matcher': all_of(done_matcher, bq_matcher)
    }

    # Cleanup is registered before execution so it runs even on failure.
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_big_query_standard_sql(self):
    """Standard-SQL query-to-table pipeline verified by output checksum."""
    checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*verifiers)
    }
    big_query_query_to_table_pipeline.run_bq_pipeline(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_big_query_new_types(self):
    """Legacy-sink query-to-table pipeline over newer BigQuery types."""
    checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum)
    ]

    # Create and populate the input table used by the query below.
    self._setup_new_types_env()

    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*verifiers)
    }
    big_query_query_to_table_pipeline.run_bq_pipeline(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_big_query_legacy_sql(self):
    """Legacy-SQL query-to-table pipeline verified by output checksum."""
    checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum)
    ]
    extra_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*verifiers)
    }
    big_query_query_to_table_pipeline.run_bq_pipeline(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_bigquery_tornadoes_it(self):
    """Run the bigquery_tornadoes example and checksum its output table.

    Fix: this variant created a timestamped output table but never
    registered a cleanup, leaking a BigQuery table per run. Cleanup is now
    registered before pipeline execution, matching the sibling tornadoes
    test that uses utils.delete_bq_table.
    """
    test_pipeline = TestPipeline(is_integration_test=True)
    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')
    dataset = 'BigQueryTornadoesIT'
    # Timestamped table name keeps concurrent runs from colliding.
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM [%s]' % output_table

    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Register cleanup before pipeline execution so the table is deleted
    # even if the pipeline fails.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
def test_big_query_standard_sql_kms_key(self):
    """Standard-SQL pipeline writing to a table encrypted with a KMS key.

    Fix: assert the table actually has an encryptionConfiguration before
    dereferencing kmsKeyName — previously a missing configuration raised
    AttributeError instead of a clear assertion failure. This matches the
    guard used by the native KMS variant of this test.
    """
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'kms_key': KMS_KEY
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    # Confirm the written table carries the expected KMS key.
    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertIsNotNone(
        table.encryptionConfiguration,
        'No encryption configuration found: %s' % table)
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)
def test_big_query_legacy_sql(self):
    """Legacy-SQL query-to-table pipeline using an explicit BQ temp
    location for exports."""
    checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=DIALECT_OUTPUT_VERIFY_QUERY % self.output_table,
            checksum=checksum)
    ]
    # GCS staging path scoped to this test's output table.
    gs_location = 'gs://temp-storage-for-upload-tests/%s' % self.output_table
    extra_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'bq_temp_location': gs_location,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*verifiers)
    }
    big_query_query_to_table_pipeline.run_bq_pipeline(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_game_stats_it(self):
    """Streaming game_stats IT: inject game events via Pub/Sub and verify
    the sessions table in BigQuery against a known checksum."""
    # A streaming job never finishes on its own; RUNNING is the success state.
    running_matcher = PipelineStateMatcher(PipelineState.RUNNING)
    success_condition = 'mean_duration=300 LIMIT 1'
    sessions_query = ('SELECT mean_duration FROM [%s:%s.%s] '
                      'WHERE %s' % (
                          self.project,
                          self.dataset.name,
                          self.OUTPUT_TABLE_SESSIONS,
                          success_condition))
    sessions_matcher = BigqueryMatcher(
        self.project, sessions_query, self.DEFAULT_EXPECTED_CHECKSUM)
    # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

    extra_opts = {
        'subscription': self.input_sub.name,
        'dataset': self.dataset.name,
        'topic': self.input_topic.name,
        'fixed_window_duration': 1,
        'user_activity_window_duration': 1,
        'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(running_matcher, sessions_matcher)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(
        utils.delete_bq_table,
        self.project,
        self.dataset.name,
        self.OUTPUT_TABLE_SESSIONS)
    self.addCleanup(
        utils.delete_bq_table,
        self.project,
        self.dataset.name,
        self.OUTPUT_TABLE_TEAMS)

    # Generate input data and inject to PubSub.
    self._inject_pubsub_game_events(
        self.input_topic, self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    game_stats.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_pubsub_pipe_it(self):
    """Streaming Pub/Sub IT: inject numbers, verify pipeline reaches
    RUNNING and the expected message appears on the output subscription.

    Fix: cleanup callbacks were registered AFTER pipeline.run(), so a
    raising run leaked the Pub/Sub resources and BQ dataset; they are now
    registered first, matching the file's other streaming tests.
    """
    # Build expected dataset.
    expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]
    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_msg,
        timeout=60 * 7)  # in seconds

    # SELECT SHA1(text) FROM `<project>.<dataset>.<table>`
    EXPECTED_BQ_CHECKSUM = 'da39a3ee5e6b4b0d3255bfef95601890afd80709'
    validation_query = f'SELECT text FROM `{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`'
    # NOTE(review): this matcher is built but not included in
    # on_success_matcher below — confirm whether BigQuery verification was
    # meant to be enabled for this test.
    bq_sessions_verifier = BigqueryMatcher(
        self.project, validation_query, EXPECTED_BQ_CHECKSUM)

    extra_opts = {
        'bigquery_dataset': self.dataset_ref.dataset_id,
        'bigquery_table': OUTPUT_TABLE,
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Register cleanup before pipeline execution so Pub/Sub resources and
    # the BQ dataset are removed even if run() raises.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Generate input data and inject to PubSub.
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))
def test_pubsub_pipe_it(self):
    """Streaming Pub/Sub IT: inject numbers, verify pipeline state, the
    output Pub/Sub message, and the streamed BigQuery rows.

    Fixes: cleanup callbacks were registered AFTER pipeline.run(), so a
    raising run leaked Pub/Sub resources and the BQ dataset; an unused
    BigqueryMatcher (and its checksum constant) plus a commented-out
    BigqueryFullResultMatcher block were removed — the streaming matcher
    below supersedes both.
    """
    # Build expected dataset.
    expected_msg = ['conall_0 - 1608051184'.encode('utf-8')]
    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_msg,
        timeout=60 * 7)  # in seconds

    validation_query = f'SELECT text FROM `{self.project}.{self.dataset_ref.dataset_id}.{OUTPUT_TABLE}`'
    # Expected rows are tuples (note the trailing comma for 1-tuples).
    expected_bq_msg = [('conall_0 - 1608051184', )]
    # Polls BigQuery until the number of records matches the expected data
    # (or the timeout elapses), then compares contents.
    bigquery_streaming_verifier = BigqueryFullResultStreamingMatcher(
        project=self.project,
        query=validation_query,
        data=expected_bq_msg,
        timeout=60 * 7)

    extra_opts = {
        'bigquery_dataset': self.dataset_ref.dataset_id,
        'bigquery_table': OUTPUT_TABLE,
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(
            bigquery_streaming_verifier, state_verifier, pubsub_msg_verifier)
    }

    # Register cleanup before pipeline execution so Pub/Sub resources and
    # the BQ dataset are removed even if run() raises.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Generate input data and inject to PubSub.
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pipeline.run(self.test_pipeline.get_full_options_as_args(**extra_opts))