def test_run_example_with_setup_file(self):
  pipeline = TestPipeline(is_integration_test=True)
  coordinate_output = FileSystems.join(
      pipeline.get_option('output'),
      'juliaset-{}'.format(str(uuid.uuid4())),
      'coordinates.txt')
  extra_args = {
      'coordinate_output': coordinate_output,
      'grid_size': self.GRID_SIZE,
      'setup_file': os.path.normpath(
          os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
      'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
  }
  args = pipeline.get_full_options_as_args(**extra_args)

  juliaset.run(args)

def test_filters_output_bigquery_matcher(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options on the pipeline for test purposes.
  project = test_pipeline.get_option('project')
  dataset = 'FiltersTestIT'
  table = 'cold_days_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  filters.run(test_pipeline.get_full_options_as_args(**extra_opts))

def test_hourly_team_score_it(self):
  state_verifier = PipelineStateMatcher(PipelineState.DONE)
  query = (
      'SELECT COUNT(*) FROM `%s.%s.%s`' %
      (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))
  bigquery_verifier = BigqueryMatcher(
      self.project, query, self.DEFAULT_EXPECTED_CHECKSUM)

  extra_opts = {
      'input': self.DEFAULT_INPUT_FILE,
      'dataset': self.dataset_ref.dataset_id,
      'window_duration': 1,
      'on_success_matcher': all_of(state_verifier, bigquery_verifier)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  hourly_team_score.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_streaming_wordcount_it(self):
  # Build the expected dataset.
  expected_msg = [('%d: 1' % num).encode('utf-8')
                  for num in range(DEFAULT_INPUT_NUMBERS)]

  # Set extra options on the pipeline for test purposes.
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
  pubsub_msg_verifier = PubSubMessageMatcher(
      self.project, self.output_sub.name, expected_msg, timeout=400)
  extra_opts = {
      'input_subscription': self.input_sub.name,
      'output_topic': self.output_topic.name,
      'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
      'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
  }

  # Generate input data and inject it into PubSub.
  self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  streaming_wordcount.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)

def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options on the pipeline for test purposes.
  project = test_pipeline.get_option('project')
  dataset = 'BigQueryTornadoesIT'
  table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT month, tornado_count FROM [%s]' % output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_big_query_new_types_native(self):
  expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
  verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=self.project,
          query=verify_query,
          checksum=expected_checksum,
          timeout_secs=30)
  ]
  self._setup_new_types_env()
  extra_opts = {
      'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
      'output': self.output_table,
      'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
      'use_standard_sql': False,
      'native': True,
      'use_json_exports': True,
      'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
      'on_success_matcher': all_of(*pipeline_verifiers),
      'experiments': 'use_legacy_bq_sink',
  }
  options = self.test_pipeline.get_full_options_as_args(**extra_opts)
  big_query_query_to_table_pipeline.run_bq_pipeline(options)

def test_hourly_team_score_output_checksum_on_small_input(self):
  # Use a small dataset to prevent out-of-memory failures on local runners.
  INPUT_FILE = 'gs://apache-beam-samples/game/small/gaming_data.csv'
  EXPECTED_CHECKSUM = '91143e81622aa391eb62eaa3f3a5123401edb07d'
  state_verifier = PipelineStateMatcher(PipelineState.DONE)
  query = (
      'SELECT COUNT(*) FROM `%s.%s.%s`' %
      (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))
  bigquery_verifier = BigqueryMatcher(self.project, query, EXPECTED_CHECKSUM)

  extra_opts = {
      'input': INPUT_FILE,
      'dataset': self.dataset_ref.dataset_id,
      'window_duration': 1,
      'on_success_matcher': all_of(state_verifier, bigquery_verifier)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  hourly_team_score.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_datastore_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  dataset = test_pipeline.get_option("project")
  kind = self.DATASTORE_WORDCOUNT_KIND
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'datastore_wordcount_results'
  ])

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.EXPECTED_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'dataset': dataset,
      'kind': kind,
      'output': output,
      'read_only': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  datastore_wordcount.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_big_query_standard_sql_kms_key_native(self):
  if isinstance(self.test_pipeline.runner, TestDirectRunner):
    self.skipTest("This test doesn't work on DirectRunner.")
  verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
  expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=self.project,
          query=verify_query,
          checksum=expected_checksum)
  ]
  kms_key = self.test_pipeline.get_option('kms_key_name')
  self.assertTrue(kms_key)
  extra_opts = {
      'query': STANDARD_QUERY,
      'output': self.output_table,
      'output_schema': DIALECT_OUTPUT_SCHEMA,
      'use_standard_sql': True,
      'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
      'on_success_matcher': all_of(*pipeline_verifiers),
      'kms_key': kms_key,
      'native': True,
      'experiments': 'use_legacy_bq_sink',
  }
  options = self.test_pipeline.get_full_options_as_args(**extra_opts)
  big_query_query_to_table_pipeline.run_bq_pipeline(options)

  table = self.bigquery_client.get_table(
      self.project, self.dataset_id, 'output_table')
  self.assertIsNotNone(
      table.encryptionConfiguration,
      'No encryption configuration found: %s' % table)
  self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)

def _run_wordcount_it(self, run_wordcount, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)
  extra_opts = {}

  # Set extra options on the pipeline for test purposes.
  test_output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'results'
  ])
  extra_opts['output'] = test_output

  test_input = test_pipeline.get_option('input')
  if test_input:
    extra_opts['input'] = test_input

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  expect_checksum = (
      test_pipeline.get_option('expect_checksum') or self.DEFAULT_CHECKSUM)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          test_output + '*-of-*', expect_checksum, sleep_secs)
  ]
  extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
  extra_opts.update(opts)

  # Register cleanup before pipeline execution.
  self.addCleanup(delete_files, [test_output + '*'])

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  run_wordcount(
      test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)

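# Usage sketch (an assumption, not part of the original suite): the helper
# above is parameterized by the pipeline main function, so one scaffold can
# drive several wordcount variants. A concrete test would look roughly like
# this; the method name is illustrative.
def test_wordcount_it(self):
  self._run_wordcount_it(wordcount.run)
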
def test_datastore_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  kind = self.DATASTORE_WORDCOUNT_KIND
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'datastore_wordcount_results'
  ])

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.EXPECTED_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'kind': kind,
      'output': output,
      # Comment this out to regenerate input data on Datastore (delete
      # existing data first using the bulk delete Dataflow template).
      'read_only': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  datastore_wordcount.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_streaming_wordcount_it(self):
  # Build the expected dataset.
  expected_msg = ['%d: 1' % num for num in range(DEFAULT_INPUT_NUMBERS)]

  # Set extra options on the pipeline for test purposes.
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
  pubsub_msg_verifier = PubSubMessageMatcher(
      self.project, OUTPUT_SUB + self.uuid, expected_msg, timeout=400)
  extra_opts = {
      'input_subscription': self.input_sub.full_name,
      'output_topic': self.output_topic.full_name,
      'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
  }

  # Generate input data and inject it into PubSub.
  test_utils.wait_for_subscriptions_created([self.input_sub])
  self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  streaming_wordcount.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
  numbers = list(range(self._SIZE))
  matchers = [
      PipelineStateMatcher(PipelineState.RUNNING),
      BigqueryFullResultStreamingMatcher(
          project=self.project,
          query="SELECT number FROM %s" % self.output_table,
          data=[(i, ) for i in numbers])
  ]

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=hc.all_of(*matchers),
      wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
      experiments='use_beam_bq_sink',
      streaming=True)

  def add_schema_info(element):
    yield {'number': element}

  # Publish the input numbers to PubSub before starting the pipeline.
  messages = [str(i).encode('utf-8') for i in numbers]
  for message in messages:
    self.pub_client.publish(self.input_topic.name, message)

  with beam.Pipeline(argv=args) as p:
    rows = (
        p
        | ReadFromPubSub(subscription=self.input_sub.name)
        | beam.ParDo(add_schema_info))
    _ = rows | WriteToBigQuery(
        self.output_table,
        schema=self.SCHEMA,
        method=method,
        triggering_frequency=triggering_frequency)

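# Usage sketch (an assumption, not from the original file): the helper above
# takes the BigQuery write method, so concrete tests would presumably invoke
# it once per method. The wrapper names and the triggering frequency are
# illustrative.
def test_streaming_inserts(self):
  self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.STREAMING_INSERTS)

def test_file_loads(self):
  self._run_pubsub_bq_pipeline(
      WriteToBigQuery.Method.FILE_LOADS, triggering_frequency=20)
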
def _run_wordcount_it(self, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options on the pipeline for test purposes.
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'results'
  ])
  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'output': output,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  extra_opts.update(opts)

  # Register cleanup before pipeline execution.
  self.addCleanup(delete_files, [output + '*'])

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))

def test_bqfl_streaming(self):
  if isinstance(self.test_pipeline.runner, TestDataflowRunner):
    self.skipTest("TestStream is not supported on TestDataflowRunner")
  output_table = '%s_%s' % (self.output_table, 'ints')
  _SIZE = 100
  schema = self.BIG_QUERY_STREAMING_SCHEMA
  l = [{'Integr': i} for i in range(_SIZE)]

  state_matcher = PipelineStateMatcher(PipelineState.RUNNING)
  bq_matcher = BigqueryFullResultStreamingMatcher(
      project=self.project,
      query="SELECT Integr FROM %s" % output_table,
      data=[(i, ) for i in range(_SIZE)])

  args = self.test_pipeline.get_full_options_as_args(
      on_success_matcher=all_of(state_matcher, bq_matcher),
      streaming=True)

  with beam.Pipeline(argv=args) as p:
    # Emit the elements in four batches, advancing the watermark and the
    # processing time between batches.
    stream_source = (
        TestStream()
        .advance_watermark_to(0)
        .advance_processing_time(100)
        .add_elements(l[:_SIZE // 4])
        .advance_processing_time(100)
        .advance_watermark_to(100)
        .add_elements(l[_SIZE // 4:2 * _SIZE // 4])
        .advance_processing_time(100)
        .advance_watermark_to(200)
        .add_elements(l[2 * _SIZE // 4:3 * _SIZE // 4])
        .advance_processing_time(100)
        .advance_watermark_to(300)
        .add_elements(l[3 * _SIZE // 4:])
        .advance_processing_time(100)
        .advance_watermark_to_infinity())
    _ = (
        p
        | stream_source
        | bigquery.WriteToBigQuery(
            output_table,
            schema=schema,
            method=bigquery.WriteToBigQuery.Method.FILE_LOADS,
            triggering_frequency=100))

def test_wordcount_fnapi_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  wordcount_fnapi.run(
      test_pipeline.get_full_options_as_args(
          experiment='beam_fn_api',
          on_success_matcher=PipelineStateMatcher()))

def run_bigquery_io_read_pipeline(self, input_size):
  test_pipeline = TestPipeline(is_integration_test=True)
  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'input_table': self.DEFAULT_DATASET + "." +
                     self.DEFAULT_TABLE_PREFIX + input_size,
      'num_records': self.NUM_RECORDS[input_size],
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  bigquery_io_read_pipeline.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

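# Usage sketch (an assumption): input_size keys into self.NUM_RECORDS and
# selects a pre-built input table by suffix. The '1M' size and the method
# name here are illustrative.
def test_bigquery_read_1M_python(self):
  self.run_bigquery_io_read_pipeline('1M')
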
def test_train_mode(self):
  """Runs pipeline in train mode, outputting train, test and eval filesets."""
  test_pipeline = TestPipeline()

  # Set extra options on the pipeline for test purposes.
  test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
  self.addCleanup(shutil.rmtree, test_dir)

  # Checks that the pipeline reaches the "Done" state.
  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'project': PROJECT,
      'output_path': test_dir,
      'on_success_matcher': all_of(*pipeline_verifiers),
      'runner': 'DirectRunner',
  }
  res = preprocess.main(
      test_pipeline.get_full_options_as_args(**extra_opts),
      query=self.TEST_QUERY,
      await_completion=True)

  # Check counts coming out of the GetFirstClaim step.
  parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success')
  self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

  # Check counts coming out of the AddFeatures step.
  add_features_cnt = get_pipeline_metric(res, 'create_features_success')
  self.assertEqual(self.TOTAL_RECORDS, add_features_cnt)

  # Check counts coming out of the AddLabel step.
  broad_cnt = get_pipeline_metric(res, 'add_label_broad')
  narrow_cnt = get_pipeline_metric(res, 'add_label_narrow')
  self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt)

  # Check that the records coming out of the train/test split add up.
  splits = ['train_cnt', 'eval_cnt', 'test_cnt']
  train_test_split_cnt = sum(get_pipeline_metric(res, m) for m in splits)
  self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt)

  # Check that the number of protos created matches the output of the
  # train/test split.
  create_proto_success = sum(
      get_pipeline_metric(res, 'create_proto_success', index=i)
      for i in range(3))
  self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

  # Open a tf.Example and check its fields.
  example = read_example_proto(test_dir)
  for feature_name in preprocess.FEATURE_NAMES:
    self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)

  # Make sure the label feature is present.
  labels = ['broad', 'narrow']
  self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels)

def test_leader_board_it(self):
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
  success_condition = 'total_score=5000 LIMIT 1'
  users_query = (
      'SELECT total_score FROM [%s:%s.%s] WHERE %s' %
      (self.project, self.dataset.name, self.OUTPUT_TABLE_USERS,
       success_condition))
  bq_users_verifier = BigqueryMatcher(
      self.project, users_query, self.DEFAULT_EXPECTED_CHECKSUM)

  teams_query = (
      'SELECT total_score FROM [%s:%s.%s] WHERE %s' %
      (self.project, self.dataset.name, self.OUTPUT_TABLE_TEAMS,
       success_condition))
  bq_teams_verifier = BigqueryMatcher(
      self.project, teams_query, self.DEFAULT_EXPECTED_CHECKSUM)

  extra_opts = {
      'subscription': self.input_sub.full_name,
      'dataset': self.dataset.name,
      'topic': self.input_topic.full_name,
      'team_window_duration': 1,
      'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
      'on_success_matcher': all_of(
          state_verifier, bq_users_verifier, bq_teams_verifier)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(self._cleanup_pubsub)
  self.addCleanup(self._cleanup_dataset)
  self.addCleanup(
      utils.delete_bq_table, self.project, self.dataset.name,
      self.OUTPUT_TABLE_USERS)
  self.addCleanup(
      utils.delete_bq_table, self.project, self.dataset.name,
      self.OUTPUT_TABLE_TEAMS)

  # Generate input data and inject it into PubSub.
  test_utils.wait_for_subscriptions_created(
      [self.input_topic, self.input_sub])
  self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  leader_board.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def _test_streaming(self, with_attributes):
  """Runs IT pipeline with message verifier.

  Args:
    with_attributes: False - Reads and writes message data only.
      True - Reads and writes message data and attributes. Also verifies
      id_label and timestamp_attribute features.
  """
  # Set on_success_matcher to verify the pipeline state and the pubsub
  # output. These verifications run on a (remote) worker.
  # Expect the state to be RUNNING, since a streaming pipeline is usually
  # never DONE. The test runner will cancel the pipeline after verification.
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
  expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
  if not with_attributes:
    expected_messages = [
        pubsub_msg.data.decode('utf-8') for pubsub_msg in expected_messages
    ]
  if self.runner_name == 'TestDirectRunner':
    strip_attributes = None
  else:
    strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
  pubsub_msg_verifier = PubSubMessageMatcher(
      self.project,
      self.output_sub.name,
      expected_messages,
      timeout=MESSAGE_MATCHER_TIMEOUT_S,
      with_attributes=with_attributes,
      strip_attributes=strip_attributes)
  extra_opts = {
      'input_subscription': self.input_sub.name,
      'output_topic': self.output_topic.name,
      'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
      'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
  }

  # Generate input data and inject it into PubSub.
  for msg in self.INPUT_MESSAGES[self.runner_name]:
    self.pub_client.publish(self.input_topic.name, msg.data, **msg.attributes)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  pubsub_it_pipeline.run_pipeline(
      argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
      with_attributes=with_attributes,
      id_label=self.ID_LABEL,
      timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

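# Usage sketch (an assumption): _test_streaming is presumably driven by two
# thin wrappers, one per attribute mode; these test names are illustrative.
def test_streaming_data_only(self):
  self._test_streaming(with_attributes=False)

def test_streaming_with_attributes(self):
  self._test_streaming(with_attributes=True)
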
def run_datastore_write(self, limit=None):
  test_pipeline = TestPipeline(is_integration_test=True)
  current_time = datetime.now().strftime("%m%d%H%M%S")
  seed = random.randint(0, 100000)
  kind = 'testkind%s%d' % (current_time, seed)
  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'kind': kind,
      'num_entities': self.NUM_ENTITIES,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  if limit is not None:
    extra_opts['limit'] = limit

  datastore_write_it_pipeline.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

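# Usage sketch (an assumption): one run writes and reads back all entities,
# another caps the read with a limit. The wrapper names and the literal
# limit are illustrative.
def test_datastore_write(self):
  self.run_datastore_write()

def test_datastore_write_limit(self):
  self.run_datastore_write(limit=500)
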
def test_big_query_standard_sql(self):
  verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
  expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=self.project,
          query=verify_query,
          checksum=expected_checksum)
  ]
  extra_opts = {
      'query': STANDARD_QUERY,
      'output': self.output_table,
      'output_schema': DIALECT_OUTPUT_SCHEMA,
      'use_standard_sql': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  options = self.test_pipeline.get_full_options_as_args(**extra_opts)
  big_query_query_to_table_pipeline.run_bq_pipeline(options)

def test_streaming_wordcount_it(self):
  # Set extra options on the pipeline for test purposes.
  pipeline_verifiers = [PipelineStateMatcher(PipelineState.RUNNING)]
  extra_opts = {
      'input_sub': self.input_sub.full_name,
      'output_topic': self.output_topic.full_name,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Generate input data and inject it into PubSub.
  test_utils.wait_for_subscriptions_created([self.input_sub])
  self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  streaming_wordcount.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_big_query_new_types(self):
  expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
  verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=self.project,
          query=verify_query,
          checksum=expected_checksum)
  ]
  self._setup_new_types_env()
  extra_opts = {
      'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
      'output': self.output_table,
      'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
      'use_standard_sql': False,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  options = self.test_pipeline.get_full_options_as_args(**extra_opts)
  big_query_query_to_table_pipeline.run_bq_pipeline(options)

def test_big_query_legacy_sql(self):
  verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
  expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=self.project,
          query=verify_query,
          checksum=expected_checksum)
  ]
  extra_opts = {
      'query': LEGACY_QUERY,
      'output': self.output_table,
      'output_schema': DIALECT_OUTPUT_SCHEMA,
      'use_standard_sql': False,
      'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  options = self.test_pipeline.get_full_options_as_args(**extra_opts)
  big_query_query_to_table_pipeline.run_bq_pipeline(options)

def test_bigquery_side_input_it(self):
  state_verifier = PipelineStateMatcher(PipelineState.DONE)
  NUM_GROUPS = 3

  extra_opts = {
      'output': self.output,
      'num_groups': str(NUM_GROUPS),
      'on_success_matcher': all_of(state_verifier)
  }

  # Register cleanup before pipeline execution.
  self.addCleanup(delete_files, [self.output + '*'])

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  bigquery_side_input.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_user_score_it(self):
  state_verifier = PipelineStateMatcher(PipelineState.DONE)
  file_verifier = FileChecksumMatcher(
      self.output + '*-of-*', self.DEFAULT_EXPECTED_CHECKSUM)

  extra_opts = {
      'input': self.DEFAULT_INPUT_FILE,
      'output': self.output + '/user-score',
      'on_success_matcher': all_of(state_verifier, file_verifier)
  }

  # Register cleanup before pipeline execution.
  self.addCleanup(delete_files, [self.output + '*'])

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  user_score.run(
      self.test_pipeline.get_full_options_as_args(**extra_opts))

def run_pipeline(self):
  # Waits for messages to appear in the output topic.
  expected_msg = [msg.encode('utf-8') for msg in MESSAGES_TO_PUBLISH]
  pubsub_msg_verifier = PubSubMessageMatcher(
      self.project, self.output_sub.name, expected_msg, timeout=600)

  # Checks that the pipeline initializes to the RUNNING state.
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

  extra_opts = {
      'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
      'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier),
      'experiment': 'beam_fn_api',
      'input_subscription': self.input_sub.name,
      'output_topic': self.output_topic.name,
  }

  argv = self.test_pipeline.get_full_options_as_args(**extra_opts)
  return dataflow_exercise_streaming_metrics_pipeline.run(argv)

def _test_streaming(self, with_attributes):
  """Runs IT pipeline with message verifier.

  Args:
    with_attributes: False - Reads and writes message data only.
      True - Reads and writes message data and attributes. Also verifies
      id_label and timestamp_attribute features.
  """
  # Build the expected dataset and set extra options on the pipeline for
  # test purposes.
  state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
  expected_messages = self.EXPECTED_OUTPUT_MESSAGES
  if not with_attributes:
    expected_messages = [
        pubsub_msg.data for pubsub_msg in expected_messages
    ]
  pubsub_msg_verifier = PubSubMessageMatcher(
      self.project,
      OUTPUT_SUB + self.uuid,
      expected_messages,
      timeout=MESSAGE_MATCHER_TIMEOUT_S,
      with_attributes=with_attributes,
      strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
  extra_opts = {
      'input_subscription': self.input_sub.full_name,
      'output_topic': self.output_topic.full_name,
      'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
      'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
  }

  # Generate input data and inject it into PubSub.
  test_utils.wait_for_subscriptions_created([self.input_sub])
  for msg in self.INPUT_MESSAGES:
    self.input_topic.publish(msg.data, **msg.attributes)

  # Get pipeline options from the command argument --test-pipeline-options,
  # and start the pipeline job by calling the pipeline main function.
  pubsub_it_pipeline.run_pipeline(
      argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
      with_attributes=with_attributes,
      id_label=self.ID_LABEL,
      timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

def test_big_query_standard_sql_kms_key(self):
  verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
  expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=self.project,
          query=verify_query,
          checksum=expected_checksum)
  ]
  extra_opts = {
      'query': STANDARD_QUERY,
      'output': self.output_table,
      'output_schema': DIALECT_OUTPUT_SCHEMA,
      'use_standard_sql': True,
      'on_success_matcher': all_of(*pipeline_verifiers),
      'kms_key': KMS_KEY,
  }
  options = self.test_pipeline.get_full_options_as_args(**extra_opts)
  big_query_query_to_table_pipeline.run_bq_pipeline(options)

  table = self.bigquery_client.get_table(
      self.project, self.dataset_id, 'output_table')
  self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)
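
# Common scaffold shared by the tests above, distilled as a minimal sketch.
# This is an illustration, not code from any one suite: `my_pipeline` is a
# hypothetical module, while the matcher and option names are the ones
# already used throughout this file.
def test_my_pipeline_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  extra_opts = {
      # Verify the terminal state on success; streaming tests match RUNNING
      # instead, since they are cancelled rather than run to completion.
      'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
  }
  my_pipeline.run(test_pipeline.get_full_options_as_args(**extra_opts))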