def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purposes.
  project = test_pipeline.get_option('project')
  dataset = 'BigQueryTornadoesIT'
  table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT month, tornado_count FROM `%s`' % output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_torch_run_inference_coco_maskrcnn_resnet50_fpn(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  # Text file containing absolute paths to the COCO validation data on GCS.
  file_of_image_names = 'gs://apache-beam-ml/testing/inputs/it_coco_validation_inputs.txt'  # pylint: disable=line-too-long
  output_file_dir = 'gs://apache-beam-ml/testing/predictions'
  output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt'])
  model_state_dict_path = 'gs://apache-beam-ml/models/torchvision.models.detection.maskrcnn_resnet50_fpn.pth'  # pylint: disable=line-too-long
  images_dir = 'gs://apache-beam-ml/datasets/coco/raw-data/val2017'
  extra_opts = {
      'input': file_of_image_names,
      'output': output_file,
      'model_state_dict_path': model_state_dict_path,
      'images_dir': images_dir,
  }
  pytorch_image_segmentation.run(
      test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)
  self.assertEqual(FileSystems().exists(output_file), True)

  predictions = process_outputs(filepath=output_file)
  actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_coco_maskrcnn_resnet50_fpn_actuals.txt'  # pylint: disable=line-too-long
  actuals = process_outputs(filepath=actuals_file)

  predictions_dict = {}
  for prediction in predictions:
    filename, prediction_labels = prediction.split(';')
    predictions_dict[filename] = prediction_labels

  for actual in actuals:
    filename, actual_labels = actual.split(';')
    prediction_labels = predictions_dict[filename]
    self.assertEqual(actual_labels, prediction_labels)

@classmethod
def setUpClass(cls):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--spanner_instance_id',
      default='beam-test',
      help='Spanner instance id',
  )
  parser.add_argument(
      '--spanner_project_id',
      default='beam-testing',
      help='GCP project with spanner instance',
  )
  parser.add_argument(
      '--use_real_spanner',
      action='store_true',
      default=False,
      help='Whether to use emulator or real spanner instance',
  )
  pipeline = TestPipeline(is_integration_test=True)
  argv = pipeline.get_full_options_as_args()
  known_args, _ = parser.parse_known_args(argv)
  cls.project_id = known_args.spanner_project_id
  cls.instance_id = known_args.spanner_instance_id
  use_spanner_emulator = not known_args.use_real_spanner
  cls.table = 'xlang_beam_spanner'
  cls.spanner_helper = SpannerHelper(
      cls.project_id, cls.instance_id, cls.table, use_spanner_emulator)

  coders.registry.register_coder(SpannerTestRow, coders.RowCoder)
  coders.registry.register_coder(SpannerPartTestRow, coders.RowCoder)
  coders.registry.register_coder(SpannerTestKey, coders.RowCoder)

def test_datastore_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  dataset = test_pipeline.get_option('project')
  kind = self.DATASTORE_WORDCOUNT_KIND
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'datastore_wordcount_results'
  ])

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.EXPECTED_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'dataset': dataset,
      'kind': kind,
      'output': output,
      'read_only': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  datastore_wordcount.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_datastore_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  kind = self.DATASTORE_WORDCOUNT_KIND
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'datastore_wordcount_results'
  ])

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.EXPECTED_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'kind': kind,
      'output': output,
      # Comment this out to regenerate input data on Datastore (delete
      # existing data first using the bulk delete Dataflow template).
      'read_only': True,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  datastore_wordcount.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purposes.
  project = test_pipeline.get_option('project')
  dataset = 'BigQueryTornadoesIT'
  table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT month, tornado_count FROM [%s]' % output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

class BigQuerySideInputIT(unittest.TestCase):

  DEFAULT_OUTPUT_FILE = \
      'gs://temp-storage-for-end-to-end-tests/py-it-cloud/output'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join([self.DEFAULT_OUTPUT_FILE, self.uuid, 'results'])

  @pytest.mark.no_xdist
  @pytest.mark.examples_postcommit
  def test_bigquery_side_input_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    NUM_GROUPS = 3

    extra_opts = {
        'output': self.output,
        'num_groups': str(NUM_GROUPS),
        'on_success_matcher': all_of(state_verifier)
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_side_input.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_torch_run_inference_bert_for_masked_lm(self):
  test_pipeline = TestPipeline(is_integration_test=True)
  # Path to a text file containing some sentences.
  file_of_sentences = 'gs://apache-beam-ml/datasets/custom/sentences.txt'
  output_file_dir = 'gs://apache-beam-ml/testing/predictions'
  output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt'])
  model_state_dict_path = 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth'  # pylint: disable=line-too-long
  extra_opts = {
      'input': file_of_sentences,
      'output': output_file,
      'model_state_dict_path': model_state_dict_path,
  }
  pytorch_language_modeling.run(
      test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)
  self.assertEqual(FileSystems().exists(output_file), True)

  predictions = process_outputs(filepath=output_file)
  actuals_file = 'gs://apache-beam-ml/testing/expected_outputs/test_torch_run_inference_bert_for_masked_lm_actuals.txt'  # pylint: disable=line-too-long
  actuals = process_outputs(filepath=actuals_file)

  predictions_dict = {}
  for prediction in predictions:
    text, predicted_text = prediction.split(';')
    predictions_dict[text] = predicted_text

  for actual in actuals:
    text, actual_predicted_text = actual.split(';')
    predicted_predicted_text = predictions_dict[text]
    self.assertEqual(actual_predicted_text, predicted_predicted_text)

class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join(
        [self.test_pipeline.get_option('output'), self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    arg_sleep_secs = self.test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    file_verifier = FileChecksumMatcher(
        self.output + '/*-of-*', self.DEFAULT_EXPECTED_CHECKSUM, sleep_secs)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': all_of(state_verifier, file_verifier)
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)

class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('IT')
  def test_streaming_wordcount_it(self):
    # Build expected dataset.
    expected_msg = [('%d: 1' % num) for num in range(DEFAULT_INPUT_NUMBERS)]

    # Set extra options to the pipeline for test purposes.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project, OUTPUT_SUB + self.uuid, expected_msg, timeout=400)
    extra_opts = {
        'input_subscription': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

def _run_wordcount_it(self, run_wordcount, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)
  extra_opts = {}

  # Set extra options to the pipeline for test purposes.
  test_output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'results'
  ])
  extra_opts['output'] = test_output
  test_input = test_pipeline.get_option('input')
  if test_input:
    extra_opts['input'] = test_input

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  expect_checksum = (
      test_pipeline.get_option('expect_checksum') or self.DEFAULT_CHECKSUM)
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          test_output + '*-of-*', expect_checksum, sleep_secs)
  ]
  extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
  extra_opts.update(opts)

  # Register clean up before pipeline execution
  self.addCleanup(delete_files, [test_output + '*'])

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  run_wordcount(
      test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)

class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join(
        [self.test_pipeline.get_option('output'), self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    file_verifier = FileChecksumMatcher(
        self.output + '*-of-*', self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': all_of(state_verifier, file_verifier)
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_sklearn_mnist_classification(self):
  test_pipeline = TestPipeline(is_integration_test=False)
  input_file = 'gs://apache-beam-ml/testing/inputs/it_mnist_data.csv'
  output_file_dir = 'gs://temp-storage-for-end-to-end-tests'
  output_file = '/'.join([output_file_dir, str(uuid.uuid4()), 'result.txt'])
  model_path = 'gs://apache-beam-ml/models/mnist_model_svm.pickle'
  extra_opts = {
      'input': input_file,
      'output': output_file,
      'model_path': model_path,
  }
  sklearn_mnist_classification.run(
      test_pipeline.get_full_options_as_args(**extra_opts),
      save_main_session=False)
  self.assertEqual(FileSystems().exists(output_file), True)

  expected_output_filepath = 'gs://apache-beam-ml/testing/expected_outputs/test_sklearn_mnist_classification_actuals.txt'  # pylint: disable=line-too-long
  expected_outputs = process_outputs(expected_output_filepath)
  predicted_outputs = process_outputs(output_file)
  self.assertEqual(len(expected_outputs), len(predicted_outputs))

  predictions_dict = {}
  for i in range(len(predicted_outputs)):
    true_label, prediction = predicted_outputs[i].split(',')
    predictions_dict[true_label] = prediction

  for i in range(len(expected_outputs)):
    true_label, expected_prediction = expected_outputs[i].split(',')
    self.assertEqual(predictions_dict[true_label], expected_prediction)

def test_filters_output_bigquery_matcher(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purposes.
  project = test_pipeline.get_option('project')
  dataset = 'FiltersTestIT'
  table = 'cold_days_%s' % int(round(time.time() * 1000))
  output_table = '.'.join([dataset, table])
  query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Register cleanup before pipeline execution.
  # Note that actual execution happens in reverse order.
  self.addCleanup(utils.delete_bq_table, project, dataset, table)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  filters.run(test_pipeline.get_full_options_as_args(**extra_opts))

def _run_wordcount_it(self, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purposes.
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time() * 1000)),
      'results'
  ])

  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'output': output,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  extra_opts.update(opts)

  # Register clean up before pipeline execution
  self.addCleanup(delete_files, [output + '*'])

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))

def run_pipeline(self, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)
  argv = test_pipeline.get_full_options_as_args(**opts)
  parser = argparse.ArgumentParser()
  unused_known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=pipeline_options)
  return dataflow_exercise_metrics_pipeline.apply_and_run(p)

def run_bigquery_io_read_pipeline(self, input_size):
  test_pipeline = TestPipeline(is_integration_test=True)
  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'input_table': self.DEFAULT_DATASET + "." +
                     self.DEFAULT_TABLE_PREFIX + input_size,
      'num_records': self.NUM_RECORDS[input_size],
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  bigquery_io_read_pipeline.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

def test_wordcount_fnapi_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  wordcount_fnapi.run(
      test_pipeline.get_full_options_as_args(
          experiment='beam_fn_api',
          on_success_matcher=PipelineStateMatcher()))

def test_train_mode(self):
  """Runs pipeline in train mode outputting train, test and eval filesets."""
  test_pipeline = TestPipeline()
  # Set extra options to the pipeline for test purposes.
  test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
  self.addCleanup(shutil.rmtree, test_dir)
  # Checks that pipeline reaches state "Done".
  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'project': PROJECT,
      'output_path': test_dir,
      'on_success_matcher': all_of(*pipeline_verifiers),
      'runner': 'DirectRunner',
  }
  res = preprocess.main(
      test_pipeline.get_full_options_as_args(**extra_opts),
      query=self.TEST_QUERY,
      await_completion=True)

  # Check counts coming out of GetFirstClaim step.
  parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success')
  self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

  # Check counts coming out of AddFeatures step.
  add_features_cnt = get_pipeline_metric(res, 'create_features_success')
  self.assertEqual(self.TOTAL_RECORDS, add_features_cnt)

  # Check counts coming out of AddLabel step.
  broad_cnt = get_pipeline_metric(res, 'add_label_broad')
  narrow_cnt = get_pipeline_metric(res, 'add_label_narrow')
  self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt)

  # Check the number of records coming out of the train/test split step.
  splits = ['train_cnt', 'eval_cnt', 'test_cnt']
  train_test_split_cnt = sum([get_pipeline_metric(res, m) for m in splits])
  self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt)

  # Check that the number of protos created matches the output of the
  # train/test split.
  create_proto_success = sum([
      get_pipeline_metric(res, 'create_proto_success', index=i)
      for i in range(3)
  ])
  self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

  # Open a tf.Example and check fields.
  example = read_example_proto(test_dir)
  for feature_name in preprocess.FEATURE_NAMES:
    self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)
  # Make sure the label feature is present.
  labels = ['broad', 'narrow']
  self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels)

def run_pipeline(self, **opts):
  test_pipeline = TestPipeline(is_integration_test=True)
  argv = test_pipeline.get_full_options_as_args(**opts)
  parser = argparse.ArgumentParser()
  unused_known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  return dataflow_exercise_metrics_pipeline.apply_and_run(p)

def test_estimate_pi_output_file(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  temp_folder = tempfile.mkdtemp()
  extra_opts = {'output': os.path.join(temp_folder, 'result')}
  estimate_pi.run(test_pipeline.get_full_options_as_args(**extra_opts))

  # Load result file and compare.
  with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file:
    [_, _, estimated_pi] = json.loads(result_file.read().strip())
  # Note: Probabilistically speaking this test can fail with a probability
  # that is very small (VERY) given that we run at least 100 thousand
  # trials.
  self.assertTrue(3.125 <= estimated_pi <= 3.155)

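# Back-of-envelope check (not part of the test above) for why that assertion
# window is safe, assuming the example estimates pi as
# 4 * (hits inside the quarter circle / total points) over >= 100,000 trials:
import math

trials = 100000
p = math.pi / 4  # chance a uniform point in the unit square is a "hit"
stddev = 4 * math.sqrt(p * (1 - p) / trials)  # std. dev. of the estimator
print(round(stddev, 4))      # ~0.0052
print(round(3 * stddev, 4))  # ~0.0156; the ~+/-0.015 window is about 3 sigma
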
class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(
        project=self.test_pipeline.get_option('project'))
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC)
    self.input_sub = self.input_topic.subscription(INPUT_SUB)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB)

    self._cleanup_pubsub()

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('developing_test')
  def test_streaming_wordcount_it(self):
    # Set extra options to the pipeline for test purposes.
    pipeline_verifiers = [PipelineStateMatcher(PipelineState.RUNNING)]
    extra_opts = {
        'input_sub': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

class MatchIntegrationTest(unittest.TestCase):

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  KINGLEAR_CHECKSUM = 'f418b25f1507f5a901257026b035ac2857a7ab87'
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-00000000000*.json')

  WIKI_FILES = [
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000001.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000002.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000003.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000004.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000005.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000006.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000007.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000008.json',
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000009.json',
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

  @attr('IT')
  def test_transform_on_gcs(self):
    args = self.test_pipeline.get_full_options_as_args()

    with beam.Pipeline(argv=args) as p:
      matches_pc = (
          p
          | beam.Create([self.INPUT_FILE, self.INPUT_FILE_LARGE])
          | fileio.MatchAll()
          | 'GetPath' >> beam.Map(lambda metadata: metadata.path))

      assert_that(
          matches_pc,
          equal_to([self.INPUT_FILE] + self.WIKI_FILES),
          label='Matched Files')

      checksum_pc = (
          p
          | 'SingleFile' >> beam.Create([self.INPUT_FILE])
          | 'MatchOneAll' >> fileio.MatchAll()
          | fileio.ReadMatches()
          | 'ReadIn' >> beam.Map(lambda x: x.read_utf8().split('\n'))
          | 'Checksums' >> beam.Map(compute_hash))

      assert_that(
          checksum_pc,
          equal_to([self.KINGLEAR_CHECKSUM]),
          label='Assert Checksums')

def run_datastore_write(self, limit=None):
  test_pipeline = TestPipeline(is_integration_test=True)
  current_time = datetime.now().strftime("%m%d%H%M%S")
  seed = random.randint(0, 100000)
  kind = 'testkind%s%d' % (current_time, seed)

  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'kind': kind,
      'num_entities': self.NUM_ENTITIES,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }
  if limit is not None:
    extra_opts['limit'] = limit

  datastore_write_it_pipeline.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

class HourlyTeamScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  # SHA-1 hash generated from sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
  OUTPUT_DATASET = 'hourly_team_score_it_dataset'
  OUTPUT_TABLE = 'leader_board'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')

    # Set up BigQuery environment.
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_hourly_team_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    query = ('SELECT COUNT(*) FROM [%s:%s.%s]' %
             (self.project, self.dataset.name, self.OUTPUT_TABLE))
    bigquery_verifier = BigqueryMatcher(
        self.project, query, self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'dataset': self.dataset.name,
        'window_duration': 1,
        'on_success_matcher': all_of(state_verifier, bigquery_verifier)
    }

    # Register clean up before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_run_example_with_setup_file(self):
  pipeline = TestPipeline(is_integration_test=True)
  coordinate_output = FileSystems.join(
      pipeline.get_option('output'),
      'juliaset-{}'.format(str(uuid.uuid4())),
      'coordinates.txt')
  extra_args = {
      'coordinate_output': coordinate_output,
      'grid_size': self.GRID_SIZE,
      'setup_file': os.path.normpath(
          os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
      'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
  }
  args = pipeline.get_full_options_as_args(**extra_args)

  juliaset.run(args)

def test_top_wikipedia_sessions_output_files_on_small_input(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set up the files with expected content.
  temp_folder = tempfile.mkdtemp()
  self.create_content_input_file(
      os.path.join(temp_folder, 'input.txt'), '\n'.join(self.EDITS))
  extra_opts = {
      'input': '%s/input.txt' % temp_folder,
      'output': os.path.join(temp_folder, 'result'),
      'sampling_threshold': '1.0'
  }
  top_wikipedia_sessions.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

  # Load result file and compare.
  with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file:
    result = result_file.read().strip().splitlines()
    self.assertEqual(
        self.EXPECTED, sorted(result, key=lambda x: x.split()[0]))

def test_autocomplete_output_files_on_small_input(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set up the files with expected content.
  OUTPUT_FILE_DIR = \
      'gs://temp-storage-for-end-to-end-tests/py-it-cloud/output'
  output = '/'.join([OUTPUT_FILE_DIR, str(uuid.uuid4()), 'result'])
  INPUT_FILE_DIR = \
      'gs://temp-storage-for-end-to-end-tests/py-it-cloud/input'
  input = '/'.join([INPUT_FILE_DIR, str(uuid.uuid4()), 'input.txt'])
  create_content_input_file(input, ' '.join(self.WORDS))
  extra_opts = {'input': input, 'output': output}
  autocomplete.run(test_pipeline.get_full_options_as_args(**extra_opts))

  # Load result file and compare.
  result = read_gcs_output_file(output).strip()
  self.assertEqual(
      sorted(self.EXPECTED_PREFIXES), sorted(format_output_file(result)))

def test_bigquery_tornadoes_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purposes.
  output_table = ('BigQueryTornadoesIT'
                  '.monthly_tornadoes_%s' % int(round(time.time() * 1000)))
  query = 'SELECT month, tornado_count FROM [%s]' % output_table
  pipeline_verifiers = [
      PipelineStateMatcher(),
      BigqueryMatcher(
          project=test_pipeline.get_option('project'),
          query=query,
          checksum=self.DEFAULT_CHECKSUM)
  ]
  extra_opts = {
      'output': output_table,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  bigquery_tornadoes.run(
      test_pipeline.get_full_options_as_args(**extra_opts))

class HourlyTeamScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  # SHA-1 hash generated from sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
  OUTPUT_DATASET = 'hourly_team_score_it_dataset'
  OUTPUT_TABLE = 'leader_board'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')

    # Set up BigQuery environment.
    self.dataset_ref = utils.create_bq_dataset(
        self.project, self.OUTPUT_DATASET)

  @pytest.mark.it_postcommit
  def test_hourly_team_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    query = (
        'SELECT COUNT(*) FROM `%s.%s.%s`' %
        (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))
    bigquery_verifier = BigqueryMatcher(
        self.project, query, self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'dataset': self.dataset_ref.dataset_id,
        'window_duration': 1,
        'on_success_matcher': all_of(state_verifier, bigquery_verifier)
    }

    # Register clean up before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)

def test_wordcount_it(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set extra options to the pipeline for test purposes.
  output = '/'.join([
      test_pipeline.get_option('output'),
      str(int(time.time())),
      'results'
  ])
  arg_sleep_secs = test_pipeline.get_option('sleep_secs')
  sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
  pipeline_verifiers = [
      PipelineStateMatcher(),
      FileChecksumMatcher(
          output + '*-of-*', self.DEFAULT_CHECKSUM, sleep_secs)
  ]
  extra_opts = {
      'output': output,
      'on_success_matcher': all_of(*pipeline_verifiers)
  }

  # Get pipeline options from command argument: --test-pipeline-options,
  # and start pipeline job by calling pipeline main function.
  wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))

def test_coders_output_files_on_small_input(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set up the files with expected content.
  temp_folder = tempfile.mkdtemp()
  self.create_content_input_file(
      os.path.join(temp_folder, 'input.txt'),
      '\n'.join(map(json.dumps, self.SAMPLE_RECORDS)))
  extra_opts = {
      'input': '%s/input.txt' % temp_folder,
      'output': os.path.join(temp_folder, 'result')
  }
  coders.run(test_pipeline.get_full_options_as_args(**extra_opts))

  # Load result file and compare.
  with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file:
    result = result_file.read().strip()
    self.assertEqual(
        sorted(self.EXPECTED_RESULT), sorted(self.format_result(result)))

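# The open_shards helper used in several tests here is Beam testing
# machinery; as a rough sketch (an assumption, not Beam's actual
# implementation), it behaves roughly like concatenating every output shard
# matching the glob pattern and exposing the result as one readable stream.
# open_all_shards below is a hypothetical stand-in, for illustration only:
import glob
import io

def open_all_shards(pattern):
  pieces = []
  for path in sorted(glob.glob(pattern)):  # e.g. 'result-*-of-*'
    with open(path) as f:
      pieces.append(f.read())
  return io.StringIO(''.join(pieces))
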
def test_autocomplete_output_files_on_small_input(self):
  test_pipeline = TestPipeline(is_integration_test=True)

  # Set up the files with expected content.
  temp_folder = tempfile.mkdtemp()
  create_content_input_file(
      os.path.join(temp_folder, 'input.txt'), ' '.join(self.WORDS))
  extra_opts = {
      'input': '%s/input.txt' % temp_folder,
      'output': os.path.join(temp_folder, 'result')
  }
  autocomplete.run(test_pipeline.get_full_options_as_args(**extra_opts))

  # Load result file and compare.
  with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file:
    result = result_file.read().strip()
    self.assertEqual(
        sorted(self.EXPECTED_PREFIXES), sorted(format_output_file(result)))

def test_inference_mode(self):
  """Runs a pipeline in inference mode, which should output one fileset."""
  test_pipeline = TestPipeline()
  # Set extra options to the pipeline for test purposes.
  test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
  self.addCleanup(shutil.rmtree, test_dir)
  # Checks that pipeline reaches state "Done".
  pipeline_verifiers = [PipelineStateMatcher()]
  extra_opts = {
      'project': PROJECT,
      'output_path': test_dir,
      'on_success_matcher': all_of(*pipeline_verifiers),
      'runner': 'DirectRunner',
      'pipeline_mode': 'inference',
  }
  res = preprocess.main(
      test_pipeline.get_full_options_as_args(**extra_opts),
      query=self.TEST_QUERY,
      await_completion=True)

  # Check counts coming out of GetFirstClaim step.
  parse_first_claim_cnt = get_pipeline_metric(res, 'parse_firstclaim_success')
  self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

  # Ensure a proto is created for all input records.
  create_proto_success = get_pipeline_metric(res, 'create_proto_success')
  self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

  # Open a tf.Example and check fields.
  example = read_example_proto(test_dir)
  for feature_name in preprocess.FEATURE_NAMES:
    self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)

  # Make sure the label feature is not present since we are in inference mode.
  with self.assertRaises(IndexError):
    get_tf_feature(example, 'label', 'bytes_list')

class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase):

  BIG_QUERY_DATASET_ID = 'python_bq_streaming_inserts_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_value_provider_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    schema = {
        'fields': [
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}
        ]
    }

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

      _ = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_1)),
               schema=value_provider.StaticValueProvider(dict, schema),
               method='STREAMING_INSERTS'))
      _ = (input
           | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_2)),
               method='FILE_LOADS'))

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    full_output_table_1 = '%s:%s' % (self.project, output_table_1)
    full_output_table_2 = '%s:%s' % (self.project, output_table_2)

    schema1 = {
        'fields': [
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}
        ]
    }
    schema2 = {
        'fields': [
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'}
        ]
    }

    bad_record = {'language': 1, 'manguage': 2}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)
      input2 = p | "Broken record" >> beam.Create([bad_record])

      input = (input, input2) | beam.Flatten()

      r = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=lambda x: (full_output_table_1
                                if 'language' in x
                                else full_output_table_2),
               schema=lambda dest: (schema1
                                    if dest == full_output_table_1
                                    else schema2),
               method='STREAMING_INSERTS'))

      assert_that(
          r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
          equal_to([(full_output_table_1, bad_record)]))

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

class BigQueryFileLoadsIT(unittest.TestCase):

  BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
  BIG_QUERY_SCHEMA = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "language","type": "STRING"}]}')
  BIG_QUERY_SCHEMA_2 = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "foundation","type": "STRING"}]}')

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS if 'foundation' in d])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      # Get all input in same machine
      input = (input
               | beam.Map(lambda x: (None, x))
               | beam.GroupByKey()
               | beam.FlatMap(lambda elm: elm[1]))

      _ = (input
           | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_1
                                if 'language' in x
                                else output_table_2),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

      _ = (input
           | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_3
                                if 'language' in x
                                else output_table_4),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
               max_file_size=20,
               max_files_per_bundle=-1))

  @attr('IT')
  def test_one_job_fails_all_jobs_fail(self):
    # If one of the import jobs fails, then the other jobs must not be
    # performed. This is to avoid reinsertion of some records when a
    # pipeline fails and is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input
             | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x
                                  else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))

  def tearDown(self):
    request = bigquery_api.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

class BigQueryQueryToTableIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': LEGACY_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    extra_opts = {
        'query': STANDARD_QUERY,
        'output': self.output_table,
        'output_schema': DIALECT_OUTPUT_SCHEMA,
        'use_standard_sql': True,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum)
    ]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

class LeaderBoardIT(unittest.TestCase):

  # Input event containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'leader_board_it_input_topic'
  INPUT_SUB = 'leader_board_it_input_subscription'

  # SHA-1 hash generated from sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = 'de00231fe6730b972c0ff60a99988438911cda53'
  OUTPUT_DATASET = 'leader_board_it_dataset'
  OUTPUT_TABLE_USERS = 'leader_board_users'
  OUTPUT_TABLE_TEAMS = 'leader_board_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 10 * 60 * 1000  # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    unique_topic_name = self.INPUT_TOPIC + _unique_id
    unique_subscription_name = self.INPUT_SUB + _unique_id

    self.input_topic = self.pubsub_client.topic(unique_topic_name)
    self.input_sub = self.input_topic.subscription(unique_subscription_name)

    self.input_topic.create()
    test_utils.wait_for_topics_created([self.input_topic])
    self.input_sub.create()

    # Set up BigQuery environment.
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""
    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.full_name)
    for _ in range(message_count):
      topic.publish(self.INPUT_EVENT % self._test_timestamp)

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub])
    test_utils.cleanup_topics([self.input_topic])

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_leader_board_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    success_condition = 'total_score=5000 LIMIT 1'
    users_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_USERS,
                                 success_condition))
    bq_users_verifier = BigqueryMatcher(
        self.project, users_query, self.DEFAULT_EXPECTED_CHECKSUM)

    teams_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_TEAMS,
                                 success_condition))
    bq_teams_verifier = BigqueryMatcher(
        self.project, teams_query, self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {
        'subscription': self.input_sub.full_name,
        'dataset': self.dataset.name,
        'topic': self.input_topic.full_name,
        'team_window_duration': 1,
        'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier,
                                     bq_users_verifier,
                                     bq_teams_verifier)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_USERS)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_TEAMS)

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created(
        [self.input_topic, self.input_sub])
    self._inject_pubsub_game_events(self.input_topic,
                                    self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    leader_board.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))

def test_option_args_parsing(self):
  test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
  self.assertListEqual(
      sorted(test_pipeline.get_full_options_as_args()),
      sorted(self.TEST_CASE['expected_list']))

def test_empty_option_args_parsing(self):
  test_pipeline = TestPipeline()
  self.assertListEqual([], test_pipeline.get_full_options_as_args())

def test_create_test_pipeline_options(self):
  test_pipeline = TestPipeline(argv=self.TEST_CASE['options'])
  test_options = PipelineOptions(test_pipeline.get_full_options_as_args())
  self.assertDictContainsSubset(self.TEST_CASE['expected_dict'],
                                test_options.get_all_options())
def test_append_extra_options(self):
  test_pipeline = TestPipeline()
  for case in self.EXTRA_OPT_CASES:
    opt_list = test_pipeline.get_full_options_as_args(**case['options'])
    self.assertListEqual(sorted(opt_list), sorted(case['expected']))
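# Hypothetical fixtures for the four option-parsing tests above (illustrative
# values only; the real TEST_CASE and EXTRA_OPT_CASES constants are defined on
# the test class outside this excerpt):
TEST_CASE = {
    'options': [
        '--test-pipeline-options=--project=test-proj --job_name=test-job'],
    'expected_list': ['--project=test-proj', '--job_name=test-job'],
    'expected_dict': {'project': 'test-proj', 'job_name': 'test-job'},
}
EXTRA_OPT_CASES = [
    {'options': {'name': 'value'}, 'expected': ['--name=value']},
    {'options': {'streaming': True}, 'expected': ['--streaming']},
]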
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = {
      # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
      # label_ids, nor writing timestamp attributes. Once these features
      # exist, TestDirectRunner and TestDataflowRunner should behave
      # identically.
      'TestDirectRunner': [
          PubsubMessage('data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute,
          # the IT pipeline writes back the timestamp of each element (as
          # reported by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
      'TestDataflowRunner': [
          # Use the ID_LABEL attribute to deduplicate messages with the
          # same ID.
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute,
          # the IT pipeline writes back the timestamp of each element (as
          # reported by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
  }
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
      'TestDataflowRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
  }

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
    self.output_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          INPUT_SUB + self.uuid),
        self.input_topic.name)
    self.output_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          OUTPUT_SUB + self.uuid),
        self.output_topic.name)

  def tearDown(self):
    test_utils.cleanup_subscriptions(self.sub_client,
                                     [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(self.pub_client,
                              [self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output.
    # These verifications run on a (remote) worker. Expect the state to be
    # RUNNING, since a streaming pipeline is usually never DONE. The test
    # runner will cancel the pipeline after verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      expected_messages = [pubsub_msg.data
                           for pubsub_msg in expected_messages]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {'input_subscription': self.input_sub.name,
                  'output_topic': self.output_topic.name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    for msg in self.INPUT_MESSAGES[self.runner_name]:
      self.pub_client.publish(self.input_topic.name, msg.data,
                              **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
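# Illustrative sketch of the id_label deduplication that the
# TestDataflowRunner input above relies on (hypothetical helper; the real
# deduplication happens inside the runner, not in test code): messages
# carrying an ID_LABEL value that was already seen are dropped, so the three
# 'data001' copies collapse into a single element.
def dedupe_by_id_label(messages, id_label):
  seen = set()
  unique = []
  for msg in messages:
    msg_id = msg.attributes.get(id_label)
    if msg_id is not None and msg_id in seen:
      continue  # duplicate delivery; drop it
    if msg_id is not None:
      seen.add(msg_id)
    unique.append(msg)
  return unique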
class BigQueryQueryToTableIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for '
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(
      sys.version_info[0] == 3 and
      os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
      'This test still needs to be fixed on Python 3. TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=self.project,
                              query=verify_query,
                              checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
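# A rough sketch of the checksum verification used by BigqueryMatcher in the
# tests above, assuming rows are hashed by their sorted string
# representations into a SHA-1 digest (hypothetical helper; the real
# compute_hash and matcher live in apache_beam.testing):
import hashlib

def checksum_rows(rows):
  # Sort stringified rows so the checksum is independent of row order.
  encoded = sorted(str(row).encode('utf-8') for row in rows)
  digest = hashlib.sha1()
  for row in encoded:
    digest.update(row)
  return digest.hexdigest()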
class GameStatsIT(unittest.TestCase):

  # Input events containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'game_stats_it_input_topic'
  INPUT_SUB = 'game_stats_it_input_subscription'

  # SHA-1 hash generated from sorted rows read from the BigQuery table.
  DEFAULT_EXPECTED_CHECKSUM = '5288ccaab77d347c8460d77c15a0db234ef5eb4f'
  OUTPUT_DATASET = 'game_stats_it_dataset'
  OUTPUT_TABLE_SESSIONS = 'game_stats_sessions'
  OUTPUT_TABLE_TEAMS = 'game_stats_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 12 * 60 * 1000  # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project,
                                   self.INPUT_TOPIC + _unique_id))
    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          self.INPUT_SUB + _unique_id),
        self.input_topic.name)

    # Set up BigQuery environment.
    self.dataset_ref = utils.create_bq_dataset(self.project,
                                               self.OUTPUT_DATASET)

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""
    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.name)
    for _ in range(message_count):
      self.pub_client.publish(
          topic.name,
          (self.INPUT_EVENT % self._test_timestamp).encode('utf-8'))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
    test_utils.cleanup_topics(self.pub_client, [self.input_topic])

  @attr('IT')
  def test_game_stats_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    success_condition = 'mean_duration=300 LIMIT 1'
    sessions_query = ('SELECT mean_duration FROM `%s.%s.%s` '
                      'WHERE %s' % (self.project,
                                    self.dataset_ref.dataset_id,
                                    self.OUTPUT_TABLE_SESSIONS,
                                    success_condition))
    bq_sessions_verifier = BigqueryMatcher(self.project,
                                           sessions_query,
                                           self.DEFAULT_EXPECTED_CHECKSUM)

    # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

    extra_opts = {'subscription': self.input_sub.name,
                  'dataset': self.dataset_ref.dataset_id,
                  'topic': self.input_topic.name,
                  'fixed_window_duration': 1,
                  'user_activity_window_duration': 1,
                  'wait_until_finish_duration':
                      self.WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               bq_sessions_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Generate input data and inject to PubSub.
    self._inject_pubsub_game_events(self.input_topic,
                                    self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    game_stats.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = [
      # Use ID_LABEL attribute to deduplicate messages with the same ID.
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
      # IT pipeline writes back the timestamp of each element (as reported
      # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
      PubsubMessage('data002', {
          TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
      }),
  ]
  EXPECTED_OUTPUT_MESSAGES = [
      PubsubMessage('data001-seen', {'processed': 'IT'}),
      PubsubMessage('data002-seen', {
          TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
          'processed': 'IT',
      }),
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def tearDown(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Build expected dataset.
    # Set extra options to the pipeline for test purpose.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES
    if not with_attributes:
      expected_messages = [pubsub_msg.data
                           for pubsub_msg in expected_messages]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        OUTPUT_SUB + self.uuid,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    for msg in self.INPUT_MESSAGES:
      self.input_topic.publish(msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)