Code example #1
  def test_filters_output_bigquery_matcher(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    project = test_pipeline.get_option('project')

    dataset = 'FiltersTestIT'
    table = 'cold_days_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    filters.run(test_pipeline.get_full_options_as_args(**extra_opts))
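
The utils.delete_bq_table helper registered with addCleanup above is not shown in this listing. A minimal sketch of such a cleanup helper, assuming the google-cloud-bigquery client library; the actual Beam test utility may differ in its error handling:

from google.cloud import bigquery

def delete_bq_table(project, dataset_id, table_id):
    """Deletes a BigQuery table, tolerating a table that was never created."""
    client = bigquery.Client(project=project)
    # not_found_ok avoids failing cleanup when the pipeline never wrote output.
    client.delete_table(
        '%s.%s.%s' % (project, dataset_id, table_id), not_found_ok=True)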
Code example #2
    def test_hourly_team_score_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        query = (
            'SELECT COUNT(*) FROM `%s.%s.%s`' %
            (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))

        bigquery_verifier = BigqueryMatcher(self.project, query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'input': self.DEFAULT_INPUT_FILE,
            'dataset': self.dataset_ref.dataset_id,
            'window_duration': 1,
            'on_success_matcher': all_of(state_verifier, bigquery_verifier)
        }

        # Register cleanup before pipeline execution
        # Note that actual execution happens in reverse order.
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        hourly_team_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
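
utils.delete_bq_dataset is likewise referenced but not shown. A sketch under the same google-cloud-bigquery assumption, removing the dataset together with any tables still in it:

from google.cloud import bigquery

def delete_bq_dataset(project, dataset_ref):
    """Deletes a BigQuery dataset and everything it still contains."""
    client = bigquery.Client(project=project)
    # delete_contents drops any tables the test left behind.
    client.delete_dataset(dataset_ref, delete_contents=True, not_found_ok=True)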
Code example #3
    def test_streaming_wordcount_it(self):
        # Build expected dataset.
        expected_msg = [('%d: 1' % num).encode('utf-8')
                        for num in range(DEFAULT_INPUT_NUMBERS)]

        # Set extra options on the pipeline for test purposes
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        pubsub_msg_verifier = PubSubMessageMatcher(self.project,
                                                   self.output_sub.name,
                                                   expected_msg,
                                                   timeout=400)
        extra_opts = {
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Generate input data and inject to PubSub.
        self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        streaming_wordcount.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)
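
The _inject_numbers helper is defined elsewhere in the test class. A minimal sketch, assuming a google.cloud.pubsub_v1 PublisherClient stored on the fixture as self.pub_client (names here are illustrative):

    def _inject_numbers(self, topic, num_messages):
        """Publishes the numbers 0..num_messages-1 to the given Pub/Sub topic."""
        for n in range(num_messages):
            # The pipeline counts each distinct number once, so the expected
            # output above is '<n>: 1' for every published n.
            self.pub_client.publish(topic.name, str(n).encode('utf-8'))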
Code example #4
  def test_bqfl_streaming(self):
    if isinstance(self.test_pipeline.runner, TestDataflowRunner):
      self.skipTest("TestStream is not supported on TestDataflowRunner")
    output_table = '%s_%s' % (self.output_table, 'ints')
    _SIZE = 100
    schema = self.BIG_QUERY_STREAMING_SCHEMA
    rows = [{'Integr': i} for i in range(_SIZE)]

    state_matcher = PipelineStateMatcher(PipelineState.RUNNING)
    bq_matcher = BigqueryFullResultStreamingMatcher(
        project=self.project,
        query="SELECT Integr FROM %s" % output_table,
        data=[(i, ) for i in range(100)])

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(state_matcher, bq_matcher), streaming=True)
    with beam.Pipeline(argv=args) as p:
      stream_source = (
          TestStream()
          .advance_watermark_to(0)
          .advance_processing_time(100)
          .add_elements(rows[:_SIZE // 4])
          .advance_processing_time(100)
          .advance_watermark_to(100)
          .add_elements(rows[_SIZE // 4:2 * _SIZE // 4])
          .advance_processing_time(100)
          .advance_watermark_to(200)
          .add_elements(rows[2 * _SIZE // 4:3 * _SIZE // 4])
          .advance_processing_time(100)
          .advance_watermark_to(300)
          .add_elements(rows[3 * _SIZE // 4:])
          .advance_processing_time(100)
          .advance_watermark_to_infinity())
      _ = (p
           | stream_source
           | bigquery.WriteToBigQuery(
               output_table,
               schema=schema,
               method=bigquery.WriteToBigQuery.Method.FILE_LOADS,
               triggering_frequency=100))
Code example #5
  def test_big_query_new_types_native(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=self.project,
            query=verify_query,
            checksum=expected_checksum,
            timeout_secs=30,
        )
    ]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'native': True,
        'use_json_exports': True,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
        'on_success_matcher': all_of(*pipeline_verifiers),
        'experiments': 'use_legacy_bq_sink',
    }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Code example #6
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options on the pipeline for test purposes
    project = test_pipeline.get_option('project')

    dataset = 'BigQueryTornadoesIT'
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM [%s]' % output_table

    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=project,
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Register cleanup before pipeline execution.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Code example #7
    def _run_wordcount_it(self, run_wordcount, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)
        extra_opts = {}

        # Set extra options on the pipeline for test purposes
        test_output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'results'
        ])
        extra_opts['output'] = test_output

        test_input = test_pipeline.get_option('input')
        if test_input:
            extra_opts['input'] = test_input

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        expect_checksum = (test_pipeline.get_option('expect_checksum')
                           or self.DEFAULT_CHECKSUM)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(test_output + '*-of-*', expect_checksum,
                                sleep_secs)
        ]
        extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
        extra_opts.update(opts)

        # Register cleanup before pipeline execution
        self.addCleanup(delete_files, [test_output + '*'])

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        run_wordcount(test_pipeline.get_full_options_as_args(**extra_opts),
                      save_main_session=False)
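
delete_files, registered with addCleanup above, removes whatever the pipeline wrote. A sketch built on Beam's FileSystems abstraction, assuming glob patterns such as test_output + '*'; the real test utility may resolve patterns differently:

from apache_beam.io.filesystems import FileSystems

def delete_files(file_patterns):
    """Deletes every file matching the given glob patterns."""
    match_results = FileSystems.match(file_patterns)
    paths = [metadata.path
             for result in match_results
             for metadata in result.metadata_list]
    if paths:
        FileSystems.delete(paths)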
Code example #8
    def test_datastore_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        kind = self.DATASTORE_WORDCOUNT_KIND
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'datastore_wordcount_results'
        ])

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'kind': kind,
            'output': output,
            # Comment this out to regenerate input data on Datastore (delete
            # existing data first using the bulk delete Dataflow template).
            'read_only': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        datastore_wordcount.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Code example #9
    def test_datastore_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        dataset = test_pipeline.get_option("project")
        kind = self.DATASTORE_WORDCOUNT_KIND
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'datastore_wordcount_results'
        ])

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'dataset': dataset,
            'kind': kind,
            'output': output,
            'read_only': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        datastore_wordcount.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Code example #10
    def test_streaming_wordcount_it(self):
        # Build expected dataset.
        expected_msg = [('%d: 1' % num)
                        for num in range(DEFAULT_INPUT_NUMBERS)]

        # Set extra options on the pipeline for test purposes
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        pubsub_msg_verifier = PubSubMessageMatcher(self.project,
                                                   OUTPUT_SUB + self.uuid,
                                                   expected_msg,
                                                   timeout=400)
        extra_opts = {
            'input_subscription': self.input_sub.full_name,
            'output_topic': self.output_topic.full_name,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Generate input data and inject to PubSub.
        test_utils.wait_for_subscriptions_created([self.input_sub])
        self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        streaming_wordcount.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Code example #11
    def test_big_query_standard_sql_kms_key_native(self):
        if isinstance(self.test_pipeline.runner, TestDirectRunner):
            self.skipTest("This test doesn't work on DirectRunner.")
        verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
        expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=self.project,
                            query=verify_query,
                            checksum=expected_checksum)
        ]
        kms_key = self.test_pipeline.get_option('kms_key_name')
        self.assertTrue(kms_key)
        extra_opts = {
            'query': STANDARD_QUERY,
            'output': self.output_table,
            'output_schema': DIALECT_OUTPUT_SCHEMA,
            'use_standard_sql': True,
            'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'kms_key': kms_key,
            'native': True,
            'experiments': 'use_legacy_bq_sink',
        }
        options = self.test_pipeline.get_full_options_as_args(**extra_opts)
        big_query_query_to_table_pipeline.run_bq_pipeline(options)

        table = self.bigquery_client.get_table(self.project, self.dataset_id,
                                               'output_table')
        self.assertIsNotNone(table.encryptionConfiguration,
                             'No encryption configuration found: %s' % table)
        self.assertEqual(kms_key, table.encryptionConfiguration.kmsKeyName)
Code example #12
    def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
        numbers = list(range(self._SIZE))

        matchers = [
            PipelineStateMatcher(PipelineState.RUNNING),
            BigqueryFullResultStreamingMatcher(
                project=self.project,
                query="SELECT number FROM %s" % self.output_table,
                data=[(i, ) for i in numbers])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*matchers),
            wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
            experiments='use_beam_bq_sink',
            streaming=True)

        def add_schema_info(element):
            yield {'number': element}

        messages = [str(i).encode('utf-8') for i in numbers]
        for message in messages:
            self.pub_client.publish(self.input_topic.name, message)

        with beam.Pipeline(argv=args) as p:
            records = (p
                       | ReadFromPubSub(subscription=self.input_sub.name)
                       | beam.ParDo(add_schema_info))
            _ = records | WriteToBigQuery(
                self.output_table,
                schema=self.SCHEMA,
                method=method,
                triggering_frequency=triggering_frequency)
Code example #13
    def _run_wordcount_it(self, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)

        # Set extra options on the pipeline for test purposes
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'results'
        ])
        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'output': output,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        extra_opts.update(opts)

        # Register cleanup before pipeline execution
        self.addCleanup(delete_files, [output + '*'])

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
Code example #14
    def test_hourly_team_score_output_checksum_on_small_input(self):
        # Small dataset to prevent Out of Memory when running in local runners
        INPUT_FILE = 'gs://apache-beam-samples/game/small/gaming_data.csv'
        EXPECTED_CHECKSUM = '91143e81622aa391eb62eaa3f3a5123401edb07d'
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        query = (
            'SELECT COUNT(*) FROM `%s.%s.%s`' %
            (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))

        bigquery_verifier = BigqueryMatcher(self.project, query,
                                            EXPECTED_CHECKSUM)

        extra_opts = {
            'input': INPUT_FILE,
            'dataset': self.dataset_ref.dataset_id,
            'window_duration': 1,
            'on_success_matcher': all_of(state_verifier, bigquery_verifier)
        }

        # Register cleanup before pipeline execution
        # Note that actual execution happens in reverse order.
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        hourly_team_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Code example #15
  def test_wordcount_fnapi_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    wordcount_fnapi.run(
        test_pipeline.get_full_options_as_args(
            experiment='beam_fn_api',
            on_success_matcher=PipelineStateMatcher()))
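
The on_success_matcher option used throughout this listing accepts any PyHamcrest matcher, which TestPipeline applies to the PipelineResult of the started job. A minimal custom matcher, shown only to illustrate that protocol (the tests themselves rely on PipelineStateMatcher and friends):

from hamcrest.core.base_matcher import BaseMatcher

class RunningStateMatcher(BaseMatcher):
    """Matches a PipelineResult whose current state equals expected_state."""

    def __init__(self, expected_state):
        self.expected_state = expected_state

    def _matches(self, pipeline_result):
        return pipeline_result.state == self.expected_state

    def describe_to(self, description):
        description.append_text('pipeline state %s' % self.expected_state)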
Code example #16
  def run_bigquery_io_read_pipeline(self, input_size):
    test_pipeline = TestPipeline(is_integration_test=True)
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {'input_table': self.DEFAULT_DATASET + "." +
                                 self.DEFAULT_TABLE_PREFIX + input_size,
                  'num_records': self.NUM_RECORDS[input_size],
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    bigquery_io_read_pipeline.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Code example #17
    def test_train_mode(self):
        """Runs pipeline in train mode outputting train, test and eval filesets."""
        test_pipeline = TestPipeline()
        # Set extra options on the pipeline for test purposes
        test_dir = os.path.join(self.OUTPUT_DIR, str(int(time.time())))
        self.addCleanup(shutil.rmtree, test_dir)

        # Checks that pipeline reaches state "Done"
        pipeline_verifiers = [PipelineStateMatcher()]
        extra_opts = {
            'project': PROJECT,
            'output_path': test_dir,
            'on_success_matcher': all_of(*pipeline_verifiers),
            'runner': 'DirectRunner',
        }

        res = preprocess.main(
            test_pipeline.get_full_options_as_args(**extra_opts),
            query=self.TEST_QUERY,
            await_completion=True)

        # Check counts coming out of GetFirstClaim step.
        parse_first_claim_cnt = get_pipeline_metric(
            res, 'parse_firstclaim_success')
        self.assertEqual(self.TOTAL_RECORDS, parse_first_claim_cnt)

        # Check counts coming out of AddFeatures step.
        add_features_cnt = get_pipeline_metric(res, 'create_features_success')
        self.assertEqual(self.TOTAL_RECORDS, add_features_cnt)

        # Check counts coming out of AddLabel step.
        broad_cnt = get_pipeline_metric(res, 'add_label_broad')
        narrow_cnt = get_pipeline_metric(res, 'add_label_narrow')
        self.assertEqual(self.TOTAL_RECORDS, broad_cnt + narrow_cnt)

        # Check that the record counts coming out of the Train/Test split
        # step sum to the total.
        splits = ['train_cnt', 'eval_cnt', 'test_cnt']
        train_test_split_cnt = sum(
            [get_pipeline_metric(res, m) for m in splits])
        self.assertEqual(self.TOTAL_RECORDS, train_test_split_cnt)

        # Check if number of protos created matched output of train/test split.
        create_proto_success = sum([
            get_pipeline_metric(res, 'create_proto_success', index=i)
            for i in range(3)
        ])
        self.assertEqual(self.TOTAL_RECORDS, create_proto_success)

        # Open a tf Example and check fields.
        example = read_example_proto(test_dir)
        for feature_name in preprocess.FEATURE_NAMES:
            self.assertGreaterEqual(get_tf_feature(example, feature_name), 0)
        # Make sure label feature is present.
        labels = ['broad', 'narrow']
        self.assertIn(get_tf_feature(example, 'label', 'bytes_list'), labels)
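
get_pipeline_metric above is a local helper rather than a Beam built-in. A sketch using Beam's metrics API, assuming counters with the queried names were incremented inside the pipeline; the index argument picks one of several metric results sharing a name:

from apache_beam.metrics.metric import MetricsFilter

def get_pipeline_metric(result, metric_name, index=0):
    """Returns the committed value of a counter from a PipelineResult."""
    counters = result.metrics().query(
        MetricsFilter().with_name(metric_name))['counters']
    return counters[index].committed if counters else 0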
Code example #18
    def test_leader_board_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

        success_condition = 'total_score=5000 LIMIT 1'
        users_query = ('SELECT total_score FROM [%s:%s.%s] '
                       'WHERE %s' %
                       (self.project, self.dataset.name,
                        self.OUTPUT_TABLE_USERS, success_condition))
        bq_users_verifier = BigqueryMatcher(self.project, users_query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        teams_query = ('SELECT total_score FROM [%s:%s.%s] '
                       'WHERE %s' %
                       (self.project, self.dataset.name,
                        self.OUTPUT_TABLE_TEAMS, success_condition))
        bq_teams_verifier = BigqueryMatcher(self.project, teams_query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'subscription': self.input_sub.full_name,
            'dataset': self.dataset.name,
            'topic': self.input_topic.full_name,
            'team_window_duration': 1,
            'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(
                state_verifier, bq_users_verifier, bq_teams_verifier)
        }

        # Register cleanup before pipeline execution.
        # Note that actual execution happens in reverse order.
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(self._cleanup_dataset)
        self.addCleanup(utils.delete_bq_table, self.project, self.dataset.name,
                        self.OUTPUT_TABLE_USERS)
        self.addCleanup(utils.delete_bq_table, self.project, self.dataset.name,
                        self.OUTPUT_TABLE_TEAMS)

        # Generate input data and inject to PubSub.
        test_utils.wait_for_subscriptions_created(
            [self.input_topic, self.input_sub])
        self._inject_pubsub_game_events(self.input_topic,
                                        self.DEFAULT_INPUT_COUNT)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        leader_board.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Code example #19
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
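
The checksums consumed by BigqueryMatcher and FileChecksumMatcher come from test_utils.compute_hash. The 40-character hex values elsewhere in this listing suggest SHA-1; the following is a sketch along those lines, not the exact Beam utility:

import hashlib

def compute_hash(content, hashing_alg=hashlib.sha1):
    """Hashes the sorted string representations of the given elements."""
    m = hashing_alg()
    for elem in sorted(str(x) for x in content):
        m.update(elem.encode('utf-8'))
    return m.hexdigest()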
Code example #20
  def run_datastore_write(self, limit=None):
    test_pipeline = TestPipeline(is_integration_test=True)
    current_time = datetime.now().strftime("%m%d%H%M%S")
    seed = random.randint(0, 100000)
    kind = 'testkind%s%d' % (current_time, seed)
    pipeline_verifiers = [PipelineStateMatcher()]
    extra_opts = {'kind': kind,
                  'num_entities': self.NUM_ENTITIES,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    if limit is not None:
      extra_opts['limit'] = limit

    datastore_write_it_pipeline.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Code example #21
    def _test_streaming(self, with_attributes):
        """Runs IT pipeline with message verifier.

        Args:
          with_attributes: False - Reads and writes message data only.
            True - Reads and writes message data and attributes. Also verifies
            id_label and timestamp_attribute features.
        """
        # Set on_success_matcher to verify pipeline state and pubsub output. These
        # verifications run on a (remote) worker.

        # Expect the state to be RUNNING since a streaming pipeline is usually
        # never DONE. The test runner will cancel the pipeline after verification.
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
        if not with_attributes:
            expected_messages = [
                pubsub_msg.data.decode('utf-8')
                for pubsub_msg in expected_messages
            ]
        if self.runner_name == 'TestDirectRunner':
            strip_attributes = None
        else:
            strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            self.output_sub.name,
            expected_messages,
            timeout=MESSAGE_MATCHER_TIMEOUT_S,
            with_attributes=with_attributes,
            strip_attributes=strip_attributes)
        extra_opts = {
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Generate input data and inject to PubSub.
        for msg in self.INPUT_MESSAGES[self.runner_name]:
            self.pub_client.publish(self.input_topic.name, msg.data,
                                    **msg.attributes)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pubsub_it_pipeline.run_pipeline(
            argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
            with_attributes=with_attributes,
            id_label=self.ID_LABEL,
            timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)
Code example #22
  def test_streaming_wordcount_it(self):
    # Set extra options on the pipeline for test purposes
    pipeline_verifiers = [PipelineStateMatcher(PipelineState.RUNNING)]
    extra_opts = {'input_sub': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Code example #23
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Code example #24
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION_MS,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Code example #25
  def test_run_example_with_setup_file(self):
    pipeline = TestPipeline(is_integration_test=True)
    coordinate_output = FileSystems.join(
        pipeline.get_option('output'),
        'juliaset-{}'.format(str(uuid.uuid4())),
        'coordinates.txt')
    extra_args = {
        'coordinate_output': coordinate_output,
        'grid_size': self.GRID_SIZE,
        'setup_file': os.path.normpath(
            os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
        'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
    }
    args = pipeline.get_full_options_as_args(**extra_args)

    juliaset.run(args)
Code example #26
    def test_bigquery_side_input_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        NUM_GROUPS = 3

        extra_opts = {
            'output': self.output,
            'num_groups': str(NUM_GROUPS),
            'on_success_matcher': all_of(state_verifier)
        }

        # Register cleanup before pipeline execution
        self.addCleanup(delete_files, [self.output + '*'])

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        bigquery_side_input.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Code example #27
    def test_user_score_it(self):

        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        file_verifier = FileChecksumMatcher(self.output + '*-of-*',
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'input': self.DEFAULT_INPUT_FILE,
            'output': self.output + '/user-score',
            'on_success_matcher': all_of(state_verifier, file_verifier)
        }

        # Register cleanup before pipeline execution
        self.addCleanup(delete_files, [self.output + '*'])

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        user_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Code example #28
  def run_pipeline(self):
    # Waits for messages to appear in output topic.
    expected_msg = [msg.encode('utf-8') for msg in MESSAGES_TO_PUBLISH]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project, self.output_sub.name, expected_msg, timeout=600)

    # Checks that pipeline initializes to RUNNING state.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    extra_opts = {
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier),
        'experiment': 'beam_fn_api',
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
    }

    argv = self.test_pipeline.get_full_options_as_args(**extra_opts)
    return dataflow_exercise_streaming_metrics_pipeline.run(argv)
Code example #29
    def _test_streaming(self, with_attributes):
        """Runs IT pipeline with message verifier.

        Args:
          with_attributes: False - Reads and writes message data only.
            True - Reads and writes message data and attributes. Also verifies
            id_label and timestamp_attribute features.
        """
        # Build expected dataset.
        # Set extra options on the pipeline for test purposes
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        expected_messages = self.EXPECTED_OUTPUT_MESSAGES
        if not with_attributes:
            expected_messages = [
                pubsub_msg.data for pubsub_msg in expected_messages
            ]
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            OUTPUT_SUB + self.uuid,
            expected_messages,
            timeout=MESSAGE_MATCHER_TIMEOUT_S,
            with_attributes=with_attributes,
            strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
        extra_opts = {
            'input_subscription': self.input_sub.full_name,
            'output_topic': self.output_topic.full_name,
            'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
        }

        # Generate input data and inject to PubSub.
        test_utils.wait_for_subscriptions_created([self.input_sub])
        for msg in self.INPUT_MESSAGES:
            self.input_topic.publish(msg.data, **msg.attributes)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pubsub_it_pipeline.run_pipeline(
            argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
            with_attributes=with_attributes,
            id_label=self.ID_LABEL,
            timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)
Code example #30
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY
                 }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)