Example #1
class CombineTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  class _GetElement(beam.DoFn):
    def process(self, element):
      yield element

  def testCombineGlobally(self):
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(self.metrics_namespace))
     | 'Combine with Top' >> beam.CombineGlobally(
         beam.combiners.TopCombineFn(1000))
     | 'Consume' >> beam.ParDo(self._GetElement())
     | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
    )

    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor is not None:
      self.metrics_monitor.send_metrics(result)
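The dictionary above is built from the test's --input_options option, which setUp reads as a JSON string. A hypothetical value for that option, using only the keys this snippet looks up (the numbers are illustrative placeholders, not values from the original test), might look like:

import json

# Hypothetical --input_options payload; keys mirror parseTestPipelineOptions().
input_options = json.dumps({
    'num_records': 1000,
    'key_size': 5,
    'value_size': 15,
    'bundle_size_distribution_type': 'const',
    'bundle_size_distribution_param': 0,
    'force_initial_num_bundles': 0,
})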
Example #2
    def _run_wordcount_it(self, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)

        # Set extra options to the pipeline for test purpose
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'results'
        ])
        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.DEFAULT_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'output': output,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }
        extra_opts.update(opts)

        # Register clean up before pipeline execution
        self.addCleanup(delete_files, [output + '*'])

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
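The comment above refers to the --test-pipeline-options argument that TestPipeline parses. As a rough, hypothetical invocation sketch (runner, project, and bucket names are placeholders, and the exact test-runner wiring depends on the Beam version):

# pytest apache_beam/examples/wordcount_it_test.py \
#   --test-pipeline-options="--runner=TestDataflowRunner --project=my-project \
#       --region=us-central1 --temp_location=gs://my-bucket/tmp \
#       --output=gs://my-bucket/wordcount/output --sleep_secs=20"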
Example #3
    def test_datastore_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        kind = self.DATASTORE_WORDCOUNT_KIND
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'datastore_wordcount_results'
        ])

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'kind': kind,
            'output': output,
            # Comment this out to regenerate input data on Datastore (delete
            # existing data first using the bulk delete Dataflow template).
            'read_only': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        datastore_wordcount.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Example #4
    def test_datastore_wordcount_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)
        dataset = test_pipeline.get_option("project")
        kind = self.DATASTORE_WORDCOUNT_KIND
        output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'datastore_wordcount_results'
        ])

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(output + '*-of-*', self.EXPECTED_CHECKSUM,
                                sleep_secs)
        ]
        extra_opts = {
            'dataset': dataset,
            'kind': kind,
            'output': output,
            'read_only': True,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        datastore_wordcount.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Example #5
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())

    self.output = '/'.join(
        [self.test_pipeline.get_option('output'), self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):

    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    arg_sleep_secs = self.test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    file_verifier = FileChecksumMatcher(
        self.output + '/*-of-*', self.DEFAULT_EXPECTED_CHECKSUM, sleep_secs)

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': all_of(state_verifier, file_verifier)
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
Example #6
    def _run_wordcount_it(self, run_wordcount, **opts):
        test_pipeline = TestPipeline(is_integration_test=True)
        extra_opts = {}

        # Set extra options to the pipeline for test purpose
        test_output = '/'.join([
            test_pipeline.get_option('output'),
            str(int(time.time() * 1000)), 'results'
        ])
        extra_opts['output'] = test_output

        test_input = test_pipeline.get_option('input')
        if test_input:
            extra_opts['input'] = test_input

        arg_sleep_secs = test_pipeline.get_option('sleep_secs')
        sleep_secs = int(
            arg_sleep_secs) if arg_sleep_secs is not None else None
        expect_checksum = (test_pipeline.get_option('expect_checksum')
                           or self.DEFAULT_CHECKSUM)
        pipeline_verifiers = [
            PipelineStateMatcher(),
            FileChecksumMatcher(test_output + '*-of-*', expect_checksum,
                                sleep_secs)
        ]
        extra_opts['on_success_matcher'] = all_of(*pipeline_verifiers)
        extra_opts.update(opts)

        # Register clean up before pipeline execution
        self.addCleanup(delete_files, [test_output + '*'])

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        run_wordcount(test_pipeline.get_full_options_as_args(**extra_opts),
                      save_main_session=False)
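delete_files, registered with addCleanup above, is a helper from Beam's test utilities and is not shown here. As a rough idea of what such a glob-based cleanup does (a sketch, not the actual implementation):

from apache_beam.io.filesystems import FileSystems


def delete_files_sketch(file_patterns):
  # Expand each glob pattern and delete whatever matched.
  paths = []
  for match_result in FileSystems.match(file_patterns):
    paths.extend(metadata.path for metadata in match_result.metadata_list)
  if paths:
    FileSystems.delete(paths)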
Example #7
class GroupByKeyTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None
    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if check:
      schema = [{'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'}]
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
          schema_map=schema
      )
    else:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')

  def testGroupByKey(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time: Start' >> beam.ParDo(
           MeasureTime(self.metrics_namespace))
       | 'GroupByKey' >> beam.GroupByKey()
       | 'Ungroup' >> beam.FlatMap(
           lambda elm: [(elm[0], v) for v in elm[1]])
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
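MeasureTime and RUNTIME_LABEL come from apache_beam.testing.load_tests.load_test_metrics_utils and are not reproduced here. As a hypothetical illustration of the idea only (not the actual implementation), a DoFn that reports bundle start and end times as a distribution metric could be sketched as:

import time

import apache_beam as beam
from apache_beam.metrics import Metrics


class TimingDoFnSketch(beam.DoFn):
  # Hypothetical stand-in for MeasureTime: records wall-clock timestamps
  # at bundle boundaries under the given metrics namespace.
  def __init__(self, namespace):
    self.runtime = Metrics.distribution(namespace, 'runtime')

  def start_bundle(self):
    self.runtime.update(time.time())

  def finish_bundle(self):
    self.runtime.update(time.time())

  def process(self, element):
    yield element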
Example #8
class CloudDLPIT(unittest.TestCase):
    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

    @attr("IT")
    def test_deidentification(self):
        with TestPipeline(is_integration_test=True) as p:
            output = (p | beam.Create(["*****@*****.**"])
                      | MaskDetectedDetails(
                          project=self.project,
                          deidentification_config=DEIDENTIFY_CONFIG,
                          inspection_config=INSPECT_CONFIG))
            assert_that(output, equal_to(['####################']))

    @attr("IT")
    def test_inspection(self):
        with TestPipeline(is_integration_test=True) as p:
            output = (p | beam.Create(["*****@*****.**"])
                      | InspectForDetails(project=self.project,
                                          inspection_config=INSPECT_CONFIG)
                      | beam.ParDo(extract_inspection_results).with_outputs(
                          'quote', 'info_type'))
            assert_that(output.info_type, equal_to(['EMAIL_ADDRESS']),
                        'Type matches')
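INSPECT_CONFIG and DEIDENTIFY_CONFIG are module-level constants that are not part of this snippet. As an assumption about their general shape only (standard Cloud DLP request dictionaries, not the original values), they could look roughly like:

# Hypothetical configs: detect email addresses and mask them with '#'.
INSPECT_CONFIG = {'info_types': [{'name': 'EMAIL_ADDRESS'}]}
DEIDENTIFY_CONFIG = {
    'info_type_transformations': {
        'transformations': [{
            'primitive_transformation': {
                'character_mask_config': {'masking_character': '#'}
            }
        }]
    }
}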
Example #9
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')

    dataset = 'BigQueryTornadoesIT'
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM [%s]' % output_table

    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=project,
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Register cleanup before pipeline execution.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Example #10
class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('IT')
  def test_streaming_wordcount_it(self):
    # Build expected dataset.
    expected_msg = [('%d: 1' % num) for num in range(DEFAULT_INPUT_NUMBERS)]

    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    pubsub_msg_verifier = PubSubMessageMatcher(self.project,
                                               OUTPUT_SUB + self.uuid,
                                               expected_msg,
                                               timeout=400)
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #11
class UserScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())

    self.output = '/'.join([self.test_pipeline.get_option('output'),
                            self.uuid,
                            'results'])

  @attr('IT')
  def test_user_score_it(self):

    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    file_verifier = FileChecksumMatcher(self.output + '*-of-*',
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'input': self.DEFAULT_INPUT_FILE,
                  'output': self.output + '/user-score',
                  'on_success_matcher': all_of(state_verifier,
                                               file_verifier)}

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    user_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #12
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')

    dataset = 'BigQueryTornadoesIT'
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT month, tornado_count FROM `%s`' % output_table

    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=project,
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Example #13
  def test_filters_output_bigquery_matcher(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')

    dataset = 'FiltersTestIT'
    table = 'cold_days_%s' % int(round(time.time() * 1000))
    output_table = '.'.join([dataset, table])
    query = 'SELECT year, month, day, mean_temp FROM `%s`' % output_table

    pipeline_verifiers = [
        PipelineStateMatcher(),
        BigqueryMatcher(
            project=project, query=query, checksum=self.DEFAULT_CHECKSUM)
    ]
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(*pipeline_verifiers)
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    filters.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #14
class LoadTest(unittest.TestCase):
    def parseTestPipelineOptions(self, options=None):
        if not options:
            options = self.input_options

        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'hotKeyFraction': options.get('hot_key_fraction', 0),
            'numHotKeys': options.get('num_hot_keys', 0),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': options.get('force_initial_num_bundles',
                                                  0)
        }

    def setUp(self):
        self.pipeline = TestPipeline()
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))

        self.publish_to_big_query = self.pipeline.get_option(
            'publish_to_big_query')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')

        if not self.publish_to_big_query or self.publish_to_big_query != 'true':
            logging.info('Metrics will not be collected')
            self.metrics_monitor = None
        else:
            self.metrics_monitor = MetricsReader(
                project_name=self.pipeline.get_option('project'),
                bq_table=self.metrics_namespace,
                bq_dataset=self.pipeline.get_option('metrics_dataset'),
            )

    def tearDown(self):
        result = self.pipeline.run()
        result.wait_until_finish()

        if self.metrics_monitor:
            self.metrics_monitor.publish_metrics(result)
Example #15
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if not self.publish_to_big_query or self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
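Because tearDown above runs and waits for the pipeline, a concrete load test derived from this base class only has to assemble the graph. A minimal, hypothetical subclass (mirroring the GroupByKey example elsewhere in this collection) might look like:

class GroupByKeyLoadTestSketch(LoadTest):
  # Hypothetical subclass; the inherited tearDown runs the pipeline and
  # publishes metrics if a MetricsReader was configured in setUp.
  def testGroupByKey(self):
    # pylint: disable=expression-not-assigned
    (self.pipeline
     | beam.io.Read(synthetic_pipeline.SyntheticSource(
         self.parseTestPipelineOptions()))
     | 'GroupByKey' >> beam.GroupByKey()
     | 'Ungroup' >> beam.FlatMap(lambda kv: [(kv[0], v) for v in kv[1]]))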
Example #16
  def test_wordcount_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    output = '/'.join([test_pipeline.get_option('output'),
                       str(int(time.time())),
                       'results'])
    arg_sleep_secs = test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    pipeline_verifiers = [PipelineStateMatcher(),
                          FileChecksumMatcher(output + '*-of-*',
                                              self.DEFAULT_CHECKSUM,
                                              sleep_secs)]
    extra_opts = {'output': output,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts))
Example #17
class StreamingWordCountIT(unittest.TestCase):

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(
        project=self.test_pipeline.get_option('project'))
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC)
    self.input_sub = self.input_topic.subscription(INPUT_SUB)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB)

    self._cleanup_pubsub()

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for n in range(num_messages):
      topic.publish(str(n))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('developing_test')
  def test_streaming_wordcount_it(self):
    # Set extra options to the pipeline for test purpose
    pipeline_verifiers = [PipelineStateMatcher(PipelineState.RUNNING)]
    extra_opts = {'input_sub': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    streaming_wordcount.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #18
  def test_datastore_wordcount_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    dataset = test_pipeline.get_option("project")
    kind = self.DATASTORE_WORDCOUNT_KIND
    output = '/'.join([test_pipeline.get_option('output'),
                       str(int(time.time() * 1000)),
                       'datastore_wordcount_results'])

    arg_sleep_secs = test_pipeline.get_option('sleep_secs')
    sleep_secs = int(arg_sleep_secs) if arg_sleep_secs is not None else None
    pipeline_verifiers = [PipelineStateMatcher(),
                          FileChecksumMatcher(output + '*-of-*',
                                              self.EXPECTED_CHECKSUM,
                                              sleep_secs)]
    extra_opts = {'dataset': dataset,
                  'kind': kind,
                  'output': output,
                  'read_only': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    datastore_wordcount.run(test_pipeline.get_full_options_as_args(
        **extra_opts))
Example #19
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')

    self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    self.metrics_monitor = MetricsReader(
        publish_to_bq=(
            self.pipeline.get_option('publish_to_big_query') == 'true'),
        project_name=self.project_id,
        bq_table=self.metrics_namespace,
        bq_dataset=self.metrics_dataset,
        # Apply filter to prevent system metrics from being published
        filters=MetricsFilter().with_namespace(self.metrics_namespace)
    )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    self.metrics_monitor.publish_metrics(result)

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
    option = self.pipeline.get_option(opt_name)
    try:
      return int(option)
    except TypeError:
      return default
    except ValueError as exc:
      self.fail(str(exc))
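A short, hypothetical usage of get_option_or_default (the option name is borrowed from other examples in this collection, not from this test):

  def testIterationsSketch(self):
    # Hypothetical test method: reads an optional numeric pipeline option,
    # falling back to 1 when it was not supplied.
    iterations = self.get_option_or_default('number_of_counter_operations', 1)
    self.assertGreaterEqual(iterations, 0)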
Example #20
class HourlyTeamScoreIT(unittest.TestCase):

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  # SHA-1 hash generated from sorted rows reading from BigQuery table
  DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
  OUTPUT_DATASET = 'hourly_team_score_it_dataset'
  OUTPUT_TABLE = 'leader_board'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')

    # Set up BigQuery environment
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_hourly_team_score_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.DONE)
    query = ('SELECT COUNT(*) FROM [%s:%s.%s]' % (self.project,
                                                  self.dataset.name,
                                                  self.OUTPUT_TABLE))

    bigquery_verifier = BigqueryMatcher(self.project,
                                        query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'input': self.DEFAULT_INPUT_FILE,
                  'dataset': self.dataset.name,
                  'window_duration': 1,
                  'on_success_matcher': all_of(state_verifier,
                                               bigquery_verifier)}

    # Register clean up before pipeline execution
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    hourly_team_score.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #21
class HourlyTeamScoreIT(unittest.TestCase):

    DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
    # SHA-1 hash generated from sorted rows reading from BigQuery table
    DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
    OUTPUT_DATASET = 'hourly_team_score_it_dataset'
    OUTPUT_TABLE = 'leader_board'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')

        # Set up BigQuery environment
        from google.cloud import bigquery
        client = bigquery.Client()
        unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
        self.dataset = client.dataset(unique_dataset_name,
                                      project=self.project)
        self.dataset.create()

    def _cleanup_dataset(self):
        self.dataset.delete()

    @attr('IT')
    def test_hourly_team_score_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        query = ('SELECT COUNT(*) FROM [%s:%s.%s]' %
                 (self.project, self.dataset.name, self.OUTPUT_TABLE))

        bigquery_verifier = BigqueryMatcher(self.project, query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'input': self.DEFAULT_INPUT_FILE,
            'dataset': self.dataset.name,
            'window_duration': 1,
            'on_success_matcher': all_of(state_verifier, bigquery_verifier)
        }

        # Register clean up before pipeline execution
        # Note that actual execution happens in reverse order.
        self.addCleanup(self._cleanup_dataset)
        self.addCleanup(utils.delete_bq_table, self.project, self.dataset.name,
                        self.OUTPUT_TABLE)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        hourly_team_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #22
  def test_run_example_with_setup_file(self):
    pipeline = TestPipeline(is_integration_test=True)
    coordinate_output = FileSystems.join(
        pipeline.get_option('output'),
        'juliaset-{}'.format(str(uuid.uuid4())),
        'coordinates.txt')
    extra_args = {
        'coordinate_output': coordinate_output,
        'grid_size': self.GRID_SIZE,
        'setup_file': os.path.normpath(
            os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
        'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
    }
    args = pipeline.get_full_options_as_args(**extra_args)

    juliaset.run(args)
Example #23
  def test_bigquery_tornadoes_it(self):
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    output_table = ('BigQueryTornadoesIT'
                    '.monthly_tornadoes_%s' % int(round(time.time() * 1000)))
    query = 'SELECT month, tornado_count FROM [%s]' % output_table
    pipeline_verifiers = [PipelineStateMatcher(),
                          BigqueryMatcher(
                              project=test_pipeline.get_option('project'),
                              query=query,
                              checksum=self.DEFAULT_CHECKSUM)]
    extra_opts = {'output': output_table,
                  'on_success_matcher': all_of(*pipeline_verifiers)}

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    bigquery_tornadoes.run(
        test_pipeline.get_full_options_as_args(**extra_opts))
Example #24
class HourlyTeamScoreIT(unittest.TestCase):

    DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
    # SHA-1 hash generated from sorted rows reading from BigQuery table
    DEFAULT_EXPECTED_CHECKSUM = '4fa761fb5c3341ec573d5d12c6ab75e3b2957a25'
    OUTPUT_DATASET = 'hourly_team_score_it_dataset'
    OUTPUT_TABLE = 'leader_board'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')

        # Set up BigQuery environment
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   self.OUTPUT_DATASET)

    @pytest.mark.it_postcommit
    def test_hourly_team_score_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.DONE)
        query = (
            'SELECT COUNT(*) FROM `%s.%s.%s`' %
            (self.project, self.dataset_ref.dataset_id, self.OUTPUT_TABLE))

        bigquery_verifier = BigqueryMatcher(self.project, query,
                                            self.DEFAULT_EXPECTED_CHECKSUM)

        extra_opts = {
            'input': self.DEFAULT_INPUT_FILE,
            'dataset': self.dataset_ref.dataset_id,
            'window_duration': 1,
            'on_success_matcher': all_of(state_verifier, bigquery_verifier)
        }

        # Register clean up before pipeline execution
        # Note that actual execution happens in reverse order.
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        hourly_team_score.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)
Example #25
class GroupByKeyTest(unittest.TestCase):
    def parseTestPipelineOptions(self):
        return {
            'numRecords':
            self.inputOptions.get('num_records'),
            'keySizeBytes':
            self.inputOptions.get('key_size'),
            'valueSizeBytes':
            self.inputOptions.get('value_size'),
            'bundleSizeDistribution': {
                'type':
                self.inputOptions.get('bundle_size_distribution_type',
                                      'const'),
                'param':
                self.inputOptions.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles':
            self.inputOptions.get('force_initial_num_bundles', 0)
        }

    def setUp(self):
        self.pipeline = TestPipeline(is_integration_test=True)
        self.inputOptions = json.loads(
            self.pipeline.get_option('input_options'))

    def testGroupByKey(self):
        with self.pipeline as p:
            # pylint: disable=expression-not-assigned
            (p
             | beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions()))
             | 'Measure time' >> beam.ParDo(MeasureTime())
             | 'GroupByKey' >> beam.GroupByKey()
             | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v)
                                                      for v in elm[1]]))

            result = p.run()
            result.wait_until_finish()
            metrics = result.metrics().query()
            for dist in metrics['distributions']:
                logging.info("Distribution: %s", dist)
Example #26
class CombineTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

  class _GetElement(beam.DoFn):
    def process(self, element):
      yield element

  def testCombineGlobally(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time' >> beam.ParDo(MeasureTime())
       | 'Combine with Top' >> beam.CombineGlobally(
           beam.combiners.TopCombineFn(1000))
       | 'Consume' >> beam.ParDo(self._GetElement())
      )

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()
      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
Example #27
    def test_bigquery_tornadoes_it(self):
        test_pipeline = TestPipeline(is_integration_test=True)

        # Set extra options to the pipeline for test purpose
        output_table = ('BigQueryTornadoesIT'
                        '.monthly_tornadoes_%s' %
                        int(round(time.time() * 1000)))
        query = 'SELECT month, tornado_count FROM [%s]' % output_table
        pipeline_verifiers = [
            PipelineStateMatcher(),
            BigqueryMatcher(project=test_pipeline.get_option('project'),
                            query=query,
                            checksum=self.DEFAULT_CHECKSUM)
        ]
        extra_opts = {
            'output': output_table,
            'on_success_matcher': all_of(*pipeline_verifiers)
        }

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        bigquery_tornadoes.run(
            test_pipeline.get_full_options_as_args(**extra_opts))
Example #28
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s' % self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes':b'xyw=', 'date':'2011-01-01', 'time':'23:59:59.999999'},
        {'bytes':b'abc=', 'date':'2000-01-01', 'time':'00:00:00'},
        {'bytes':b'dec=', 'date':'3000-12-31', 'time':'23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]

    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  # TODO(BEAM-6660): Enable this test when ready.
  @unittest.skip('This test requires BQ Dataflow native source support for ' +
                 'KMS, which is not available yet.')
  @attr('IT')
  def test_big_query_standard_sql_kms_key(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers),
                  'kms_key': KMS_KEY
                 }
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

    table = self.bigquery_client.get_table(
        self.project, self.dataset_id, 'output_table')
    self.assertEqual(KMS_KEY, table.encryptionConfiguration.kmsKeyName)

  @unittest.skipIf(sys.version_info[0] == 3 and
                   os.environ.get('RUN_SKIPPED_PY3_TESTS') != '1',
                   'This test still needs to be fixed on Python 3. '
                   'TODO: BEAM-6769')
  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
Example #29
class SideInputTest(unittest.TestCase):
  def _parseTestPipelineOptions(self):
    return {
        'numRecords': self.inputOptions.get('num_records'),
        'keySizeBytes': self.inputOptions.get('key_size'),
        'valueSizeBytes': self.inputOptions.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.inputOptions.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.inputOptions.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.inputOptions.get(
            'force_initial_num_bundles', 0
        )
    }

  def _getSideInput(self):
    side_input = self._parseTestPipelineOptions()
    side_input['numRecords'] = side_input['numRecords']
    side_input['keySizeBytes'] = side_input['keySizeBytes']
    side_input['valueSizeBytes'] = side_input['valueSizeBytes']
    return side_input

  def _getPerElementDelaySec(self):
    return self.syntheticStepOptions.get('per_element_delay_sec', 0)

  def _getPerBundleDelaySec(self):
    return self.syntheticStepOptions.get('per_bundle_delay_sec', 0)

  def _getOutputRecordsPerInputRecords(self):
    return self.syntheticStepOptions.get('output_records_per_input_records', 0)

  def setUp(self):
    self.pipeline = TestPipeline()
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    if self.iterations is None:
      self.iterations = 1
    self.iterations = int(self.iterations)

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')

    check = (metrics_project_id and self.metrics_namespace
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  def testSideInput(self):
    def join_fn(element, side_input, iterations):
      result = []
      for i in range(iterations):
        for key, value in side_input:
          if i == iterations - 1:
            result.append({key: element[1] + value})
      yield result

    with self.pipeline as p:
      main_input = (p
                    | "Read pcoll 1" >> beam.io.Read(
                        synthetic_pipeline.SyntheticSource(
                            self._parseTestPipelineOptions()))
                    | 'Measure time: Start pcoll 1' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace))
                   )

      side_input = (p
                    | "Read pcoll 2" >> beam.io.Read(
                        synthetic_pipeline.SyntheticSource(
                            self._getSideInput()))
                    | 'Measure time: Start pcoll 2' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace))
                   )
      # pylint: disable=expression-not-assigned
      (main_input
       | "Merge" >> beam.ParDo(
           join_fn,
           AsIter(side_input),
           self.iterations)
       | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()

      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()
Example #30
class BigQueryFileLoadsIT(unittest.TestCase):

  BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
  BIG_QUERY_SCHEMA = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "language","type": "STRING"}]}'
  )

  BIG_QUERY_SCHEMA_2 = (
      '{"fields": [{"name": "name","type": "STRING"},'
      '{"name": "foundation","type": "STRING"}]}'
  )

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      # Get all input in same machine
      input = (input
               | beam.Map(lambda x: (None, x))
               | beam.GroupByKey()
               | beam.FlatMap(lambda elm: elm[1]))

      _ = (input |
           "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_1
                                if 'language' in x
                                else output_table_2),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

      _ = (input |
           "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
               table=lambda x: (output_table_3
                                if 'language' in x
                                else output_table_4),
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
               max_file_size=20,
               max_files_per_bundle=-1))

  @attr('IT')
  def test_one_job_fails_all_jobs_fail(self):

    # If one of the import jobs fails, then other jobs must not be performed.
    # This is to avoid reinsertion of some records when a pipeline fails and
    # is rerun.
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_1.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
        None, None)
    self.bigquery_client.get_or_create_table(
        self.project, self.dataset_id, output_table_2.split('.')[1],
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA_2),
        None, None)

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[])]

    args = self.test_pipeline.get_full_options_as_args(
        experiments='use_beam_bq_sink')

    with self.assertRaises(Exception):
      with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)
        input2 = p | "Broken record" >> beam.Create(['language_broken_record'])

        input = (input, input2) | beam.Flatten()

        _ = (input |
             "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_1
                                  if 'language' in x
                                  else output_table_2),
                 create_disposition=(
                     beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    hamcrest_assert(p, all_of(*pipeline_verifiers))

  def tearDown(self):
    request = bigquery_api.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)
Example #31
class ParDoTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {'numRecords': self.input_options.get('num_records'),
            'keySizeBytes': self.input_options.get('key_size'),
            'valueSizeBytes': self.input_options.get('value_size'),
            'bundleSizeDistribution': {
                'type': self.input_options.get(
                    'bundle_size_distribution_type', 'const'
                ),
                'param': self.input_options.get(
                    'bundle_size_distribution_param', 0
                )
            },
            'forceNumInitialBundles': self.input_options.get(
                'force_initial_num_bundles', 0
            )
           }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)

    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None
    if metrics_project_id is not None and self.metrics_namespace is not None:
      measured_values = [
          {'name': RUNTIME_LABEL, 'type': 'FLOAT', 'mode': 'REQUIRED'},
          {'name': COUNTER_LABEL, 'type': 'INTEGER', 'mode': 'REQUIRED'}
      ]
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
          schema_map=measured_values
      )
    else:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')

  def testParDo(self):

    class _GetElement(beam.DoFn):
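      # Note: the import is kept inside the DoFn (presumably so the decorator
      # is importable wherever the DoFn is unpickled on remote workers).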
      from apache_beam.testing.load_tests.load_test_metrics_utils import count_bytes

      @count_bytes(COUNTER_LABEL)
      def process(self, element, namespace, is_returning):
        if is_returning:
          yield element

    if self.iterations is None:
      num_runs = 1
    else:
      num_runs = int(self.iterations)

    with self.pipeline as p:
      pc = (p
            | 'Read synthetic' >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self.parseTestPipelineOptions()
                ))
            | 'Measure time: Start' >> beam.ParDo(
                MeasureTime(self.metrics_namespace))
           )

      for i in range(num_runs):
        is_returning = (i == (num_runs-1))
        pc = (pc
              | 'Step: %d' % i >> beam.ParDo(
                  _GetElement(), self.metrics_namespace, is_returning)
             )

      if self.output is not None:
        pc = (pc
              | "Write" >> beam.io.WriteToText(self.output)
             )

      # pylint: disable=expression-not-assigned
      (pc
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()

      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
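
ParDoTest reads its synthetic-source configuration from the --input_options flag as a JSON string. A hypothetical value matching the keys consumed by parseTestPipelineOptions above (sizes are illustrative only):

input_options = {
    'num_records': 1000,
    'key_size': 5,
    'value_size': 15,
    'bundle_size_distribution_type': 'const',
    'bundle_size_distribution_param': 1,
    'force_initial_num_bundles': 0,
}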
Example #32
  def test_get_option(self):
    name, value = ('job', 'mockJob')
    test_pipeline = TestPipeline()
    test_pipeline.options_list = ['--%s=%s' % (name, value)]
    self.assertEqual(test_pipeline.get_option(name), value)
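
The surrounding integration tests (for example the sleep_secs and number_of_counter_operations handling) treat get_option as returning None for flags that were not supplied. A companion check might therefore look like the sketch below; this is an inference from those tests, not an authoritative statement of the TestPipeline API.

test_pipeline = TestPipeline()
test_pipeline.options_list = ['--job=mockJob']
assert test_pipeline.get_option('job') == 'mockJob'
assert test_pipeline.get_option('missing_flag') is None  # assumed behaviour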
Example #33
class CombineTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {
        'numRecords': self.input_options.get('num_records'),
        'keySizeBytes': self.input_options.get('key_size'),
        'valueSizeBytes': self.input_options.get('value_size'),
        'bundleSizeDistribution': {
            'type': self.input_options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': self.input_options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': self.input_options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    check = (metrics_project_id is not None
             and self.metrics_namespace is not None
             and metrics_dataset is not None)
    if not self.metrics_monitor:
      logging.info('Metrics will not be collected')
    elif check:
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
      )
    else:
      raise ValueError('One or more of parameters for collecting metrics '
                       'are empty.')

  class _GetElement(beam.DoFn):
    def process(self, element):
      yield element

  def testCombineGlobally(self):
    with self.pipeline as p:
      # pylint: disable=expression-not-assigned
      (p
       | beam.io.Read(synthetic_pipeline.SyntheticSource(
           self.parseTestPipelineOptions()))
       | 'Measure time: Start' >> beam.ParDo(
           MeasureTime(self.metrics_namespace))
       | 'Combine with Top' >> beam.CombineGlobally(
           beam.combiners.TopCombineFn(1000))
       | 'Consume' >> beam.ParDo(self._GetElement())
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
Example #34
class BigQueryFileLoadsIT(unittest.TestCase):

    BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'
    BIG_QUERY_SCHEMA = ('{"fields": [{"name": "name","type": "STRING"},'
                        '{"name": "language","type": "STRING"}]}')

    BIG_QUERY_SCHEMA_2 = ('{"fields": [{"name": "name","type": "STRING"},'
                          '{"name": "foundation","type": "STRING"}]}')

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client = bigquery_tools.BigQueryWrapper()
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        self.output_table = "%s.output_table" % (self.dataset_id)
        logging.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    @attr('IT')
    def test_multiple_destinations_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        output_table_3 = '%s%s' % (self.output_table, 3)
        output_table_4 = '%s%s' % (self.output_table, 4)
        schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
        schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(
                self.BIG_QUERY_SCHEMA_2))

        schema_kv_pairs = [
            (output_table_1, schema1), (output_table_2, schema2),
            (output_table_3, schema1), (output_table_4, schema2)
        ]
        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, foundation FROM %s" % output_table_2,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_3,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, foundation FROM %s" % output_table_4,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(_ELEMENTS)

            schema_map_pcv = beam.pvalue.AsDict(
                p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

            table_record_pcv = beam.pvalue.AsDict(
                p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                                 ('table2', output_table_2)]))

            # Get all input on the same machine
            input = (input
                     | beam.Map(lambda x: (None, x))
                     | beam.GroupByKey()
                     | beam.FlatMap(lambda elm: elm[1]))

            _ = (
                input
                | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
                    table=lambda x, tables:
                    (tables['table1']
                     if 'language' in x else tables['table2']),
                    table_side_inputs=(table_record_pcv, ),
                    schema=lambda dest, schema_map: schema_map.get(dest, None),
                    schema_side_inputs=(schema_map_pcv, ),
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

            _ = (input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x:
                (output_table_3 if 'language' in x else output_table_4),
                schema=lambda dest, schema_map: schema_map.get(dest, None),
                schema_side_inputs=(schema_map_pcv, ),
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                max_file_size=20,
                max_files_per_bundle=-1))

    @attr('IT')
    def test_one_job_fails_all_jobs_fail(self):

        # If one of the import jobs fails, then other jobs must not be performed.
        # This is to avoid reinsertion of some records when a pipeline fails and
        # is rerun.
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)

        self.bigquery_client.get_or_create_table(
            self.project, self.dataset_id,
            output_table_1.split('.')[1],
            bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA),
            None, None)
        self.bigquery_client.get_or_create_table(
            self.project, self.dataset_id,
            output_table_2.split('.')[1],
            bigquery_tools.parse_table_schema_from_json(
                self.BIG_QUERY_SCHEMA_2), None, None)

        pipeline_verifiers = [
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT name, language FROM %s" %
                                      output_table_1,
                                      data=[]),
            BigqueryFullResultMatcher(project=self.project,
                                      query="SELECT name, foundation FROM %s" %
                                      output_table_2,
                                      data=[])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            experiments='use_beam_bq_sink')

        with self.assertRaises(Exception):
            with beam.Pipeline(argv=args) as p:
                input = p | beam.Create(_ELEMENTS)
                input2 = p | "Broken record" >> beam.Create(
                    ['language_broken_record'])

                input = (input, input2) | beam.Flatten()

                _ = (input
                     | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                         table=lambda x:
                         (output_table_1
                          if 'language' in x else output_table_2),
                         create_disposition=(
                             beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                         write_disposition=beam.io.BigQueryDisposition.
                         WRITE_APPEND))

        hamcrest_assert(p, all_of(*pipeline_verifiers))

    def tearDown(self):
        request = bigquery_api.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            logging.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            logging.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)
Example #35
class ExerciseStreamingMetricsPipelineTest(unittest.TestCase):
  def setUp(self):
    """Creates all required topics and subs."""
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic_name = INPUT_TOPIC + self.uuid
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, self.input_topic_name))

    self.output_topic_name = OUTPUT_TOPIC + self.uuid
    self.output_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, self.output_topic_name))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub_name = INPUT_SUB + self.uuid
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, self.input_sub_name),
        self.input_topic.name)
    self.output_sub_name = OUTPUT_SUB + self.uuid
    self.output_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, self.output_sub_name),
        self.output_topic.name,
        ack_deadline_seconds=60)

  def _inject_words(self, topic, messages):
    """Inject messages as test data to PubSub."""
    _LOGGER.debug('Injecting messages to topic %s', topic.name)
    for msg in messages:
      self.pub_client.publish(topic.name, msg.encode('utf-8'))
    _LOGGER.debug('Done injecting messages to topic %s', topic.name)

  def tearDown(self):
    """Delete all created topics and subs."""
    test_utils.cleanup_subscriptions(
        self.sub_client, [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(
        self.pub_client, [self.input_topic, self.output_topic])

  def run_pipeline(self):
    # Waits for messages to appear in output topic.
    expected_msg = [msg.encode('utf-8') for msg in MESSAGES_TO_PUBLISH]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project, self.output_sub.name, expected_msg, timeout=600)

    # Checks that pipeline initializes to RUNNING state.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    extra_opts = {
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier),
        'experiment': 'beam_fn_api',
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
    }

    argv = self.test_pipeline.get_full_options_as_args(**extra_opts)
    return dataflow_exercise_streaming_metrics_pipeline.run(argv)

  @attr('IT', 'ValidatesRunner')
  def test_streaming_pipeline_returns_expected_user_metrics_fnapi_it(self):
    """
    Runs streaming Dataflow job and verifies that user metrics are reported
    correctly.
    """
    self._inject_words(self.input_topic, MESSAGES_TO_PUBLISH)
    result = self.run_pipeline()

    METRIC_NAMESPACE = \
      ('apache_beam.runners.dataflow.'
       'dataflow_exercise_streaming_metrics_pipeline.StreamingUserMetricsDoFn')
    matchers = [
        # System metrics
        MetricResultMatcher(
            name='ElementCount',
            labels={
                "output_user_name": "generate_metrics-out0",
                "original_name": "generate_metrics-out0-ElementCount"
            },
            attempted=len(MESSAGES_TO_PUBLISH),
            committed=len(MESSAGES_TO_PUBLISH),
        ),
        # User Counter Metrics.
        MetricResultMatcher(
            name='double_msg_counter_name',
            namespace=METRIC_NAMESPACE,
            step='generate_metrics',
            attempted=len(MESSAGES_TO_PUBLISH) * 2,
            committed=len(MESSAGES_TO_PUBLISH) * 2),
        MetricResultMatcher(
            name='msg_len_dist_metric_name',
            namespace=METRIC_NAMESPACE,
            step='generate_metrics',
            attempted=DistributionMatcher(
                sum_value=len(''.join(MESSAGES_TO_PUBLISH)),
                count_value=len(MESSAGES_TO_PUBLISH),
                min_value=len(MESSAGES_TO_PUBLISH[0]),
                max_value=len(MESSAGES_TO_PUBLISH[1])),
            committed=DistributionMatcher(
                sum_value=len(''.join(MESSAGES_TO_PUBLISH)),
                count_value=len(MESSAGES_TO_PUBLISH),
                min_value=len(MESSAGES_TO_PUBLISH[0]),
                max_value=len(MESSAGES_TO_PUBLISH[1]))),
    ]

    metrics = result.metrics().all_metrics()
    errors = metric_result_matchers.verify_all(metrics, matchers)
    self.assertFalse(errors, str(errors))
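
MESSAGES_TO_PUBLISH is defined at module level and not shown here. A hypothetical value consistent with the DistributionMatcher arithmetic above (min taken from the first message, max from the second, sum and count over the whole list) would be:

# Purely illustrative; the real module defines its own messages.
MESSAGES_TO_PUBLISH = ['message a', 'message b b']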
Example #36
class BigQueryWriteIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_write_to_table_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    _LOGGER.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      _LOGGER.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)

  def create_table(self, table_name):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)

  @attr('IT')
  def test_big_query_write(self):
    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
        {'number': 3, 'str': u'你好'},
        {'number': 4, 'str': u'привет'},
    ]
    table_schema = {"fields": [
        {"name": "number", "type": "INTEGER"},
        {"name": "str", "type": "STRING"}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',), (3, u'你好',), (4, u'привет',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_schema_autodetect(self):
    if self.runner_name == 'TestDataflowRunner':
      self.skipTest('DataflowRunner does not support schema autodetection')

    table_name = 'python_write_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'number': 1, 'str': 'abc'},
        {'number': 2, 'str': 'def'},
    ]

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT number, str FROM %s" % table_id,
            data=[(1, 'abc',), (2, 'def',)])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
           schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_new_types(self):
    table_name = 'python_new_types_table'
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    row_data = {
        'float': 0.33, 'numeric': Decimal('10'), 'bytes':
        base64.b64encode(b'\xab\xac').decode('utf-8'), 'date': '3000-12-31',
        'time': '23:59:59', 'datetime': '2018-12-31T12:44:31',
        'timestamp': '2018-12-31 12:44:31.744957 UTC', 'geo': 'POINT(30 10)'
    }

    input_data = [row_data]
    # add rows with only one key value pair and None values for all other keys
    for key, value in iteritems(row_data):
      input_data.append({key: value})

    table_schema = {"fields": [
        {"name": "float", "type": "FLOAT"},
        {"name": "numeric", "type": "NUMERIC"},
        {"name": "bytes", "type": "BYTES"},
        {"name": "date", "type": "DATE"},
        {"name": "time", "type": "TIME"},
        {"name": "datetime", "type": "DATETIME"},
        {"name": "timestamp", "type": "TIMESTAMP"},
        {"name": "geo", "type": "GEOGRAPHY"}
    ]}

    expected_row = (0.33, Decimal('10'), b'\xab\xac',
                    datetime.date(3000, 12, 31), datetime.time(23, 59, 59),
                    datetime.datetime(2018, 12, 31, 12, 44, 31),
                    datetime.datetime(2018, 12, 31, 12, 44, 31, 744957,
                                      tzinfo=pytz.utc), 'POINT(30 10)',
                   )

    expected_data = [expected_row]

    # add rows with only one key value pair and None values for all other keys
    for i, value in enumerate(expected_row):
      row = [None]*len(expected_row)
      row[i] = value
      expected_data.append(tuple(row))

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query='SELECT float, numeric, bytes, date, time, datetime,'
                  'timestamp, geo FROM %s' % table_id,
            data=expected_data)]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

  @attr('IT')
  def test_big_query_write_without_schema(self):
    table_name = 'python_no_schema_table'
    self.create_table(table_name)
    table_id = '{}.{}'.format(self.dataset_id, table_name)

    input_data = [
        {'bytes': b'xyw', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd', 'date': '3000-12-31',
         'time': '23:59:59'},
        {'bytes': b'\xab\xac\xad', 'date': '2000-01-01', 'time': '00:00:00'}
    ]
    # bigquery io expects bytes to be base64 encoded values
    for row in input_data:
      row['bytes'] = base64.b64encode(row['bytes'])

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT bytes, date, time FROM %s" % table_id,
            data=[(b'xyw', datetime.date(2011, 1, 1),
                   datetime.time(23, 59, 59, 999999), ),
                  (b'abc', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), ),
                  (b'\xe4\xbd\xa0\xe5\xa5\xbd', datetime.date(3000, 12, 31),
                   datetime.time(23, 59, 59), ),
                  (b'\xab\xac\xad', datetime.date(2000, 1, 1),
                   datetime.time(0, 0, 0), )])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers))

    with beam.Pipeline(argv=args) as p:
      # pylint: disable=expression-not-assigned
      (p | 'create' >> beam.Create(input_data)
       | 'write' >> beam.io.WriteToBigQuery(
           table_id,
           write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example #37
class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_bq_streaming_inserts_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                  str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)
    logging.info("Created dataset %s in project %s",
                 self.dataset_id, self.project)

  @attr('IT')
  def test_value_provider_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    schema = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create([row for row in _ELEMENTS if 'language' in row])

      _ = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_1)),
               schema=value_provider.StaticValueProvider(dict, schema),
               method='STREAMING_INSERTS'))
      _ = (input
           | "WriteWithMultipleDests2" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=value_provider.StaticValueProvider(
                   str, '%s:%s' % (self.project, output_table_2)),
               method='FILE_LOADS'))

  @attr('IT')
  def test_multiple_destinations_transform(self):
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)

    full_output_table_1 = '%s:%s' % (self.project, output_table_1)
    full_output_table_2 = '%s:%s' % (self.project, output_table_2)

    schema1 = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'language', 'type': 'STRING', 'mode': 'NULLABLE'}]}
    schema2 = {'fields': [
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'foundation', 'type': 'STRING', 'mode': 'NULLABLE'}]}

    bad_record = {'language': 1, 'manguage': 2}

    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
      input = p | beam.Create(_ELEMENTS)

      input2 = p | "Broken record" >> beam.Create([bad_record])

      input = (input, input2) | beam.Flatten()

      r = (input
           | "WriteWithMultipleDests" >> beam.io.gcp.bigquery.WriteToBigQuery(
               table=lambda x: (full_output_table_1
                                if 'language' in x
                                else full_output_table_2),
               schema=lambda dest: (schema1
                                    if dest == full_output_table_1
                                    else schema2),
               method='STREAMING_INSERTS'))

      assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                  equal_to([(full_output_table_1, bad_record)]))

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      logging.info("Deleting dataset %s in project %s",
                   self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s in project %s',
                    self.dataset_id, self.project)
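
In a production pipeline the FAILED_ROWS output exercised above would typically feed a dead-letter sink rather than an assertion. A minimal sketch, continuing from the pipeline above (the output path is a placeholder):

# Each failed element is a (destination_table, row) pair, as the equal_to
# assertion above shows.
failed = r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS]
_ = (failed
     | 'FormatFailedRows' >> beam.Map(lambda kv: '%s: %r' % kv)
     | 'WriteDeadLetter' >> beam.io.WriteToText('gs://my-bucket/failed_rows'))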
Example #38
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = [
      # Use ID_LABEL attribute to deduplicate messages with the same ID.
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the IT
      # pipeline writes back the timestamp of each element (as reported by
      # Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
      PubsubMessage('data002', {
          TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
      }),
  ]
  EXPECTED_OUTPUT_MESSAGES = [
      PubsubMessage('data001-seen', {'processed': 'IT'}),
      PubsubMessage('data002-seen', {
          TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
          'processed': 'IT',
      }),
  ]

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    self.input_topic.create()
    self.output_topic.create()
    test_utils.wait_for_topics_created([self.input_topic, self.output_topic])
    self.input_sub.create()
    self.output_sub.create()

  def tearDown(self):
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Build expected dataset.
    # Set extra options to the pipeline for test purpose
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES
    if not with_attributes:
      expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        OUTPUT_SUB + self.uuid,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
    extra_opts = {'input_subscription': self.input_sub.full_name,
                  'output_topic': self.output_topic.full_name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    for msg in self.INPUT_MESSAGES:
      self.input_topic.publish(msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
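
The pipeline module under test (pubsub_it_pipeline) is not shown here. Based on the id_label and timestamp_attribute arguments passed above, its read step presumably looks something like this sketch (variable names are assumptions):

messages = (
    p
    | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
        subscription=input_subscription,
        with_attributes=with_attributes,
        id_label=id_label,
        timestamp_attribute=timestamp_attribute))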
Example #39
class FastavroIT(unittest.TestCase):

  SCHEMA = Parse('''
    {"namespace": "example.avro",
     "type": "record",
     "name": "User",
     "fields": [
         {"name": "label", "type": "string"},
         {"name": "number",  "type": ["int", "null"]},
         {"name": "number_str", "type": ["string", "null"]},
         {"name": "color", "type": ["string", "null"]}
     ]
    }
    ''')

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())
    self.output = '/'.join([
        self.test_pipeline.get_option('output'),
        self.uuid
    ])

  @attr('IT')
  def test_avro_it(self):
    num_records = self.test_pipeline.get_option('records')
    num_records = int(num_records) if num_records else 1000000

    # Seed a `PCollection` with indices that will each be FlatMap'd into
    # `batch_size` records, to avoid having a too-large list in memory at
    # the outset
    batch_size = self.test_pipeline.get_option('batch-size')
    batch_size = int(batch_size) if batch_size else 10000

    # pylint: disable=range-builtin-not-iterating
    batches = range(int(num_records / batch_size))

    def batch_indices(start):
      # pylint: disable=range-builtin-not-iterating
      return range(start * batch_size, (start + 1) * batch_size)

    # A `PCollection` with `num_records` avro records
    records_pcoll = \
        self.test_pipeline \
        | 'create-batches' >> Create(batches) \
        | 'expand-batches' >> FlatMap(batch_indices) \
        | 'create-records' >> Map(record)

    fastavro_output = '/'.join([self.output, 'fastavro'])
    avro_output = '/'.join([self.output, 'avro'])

    self.addCleanup(delete_files, [self.output + '*'])

    # pylint: disable=expression-not-assigned
    records_pcoll \
    | 'write_fastavro' >> WriteToAvro(
        fastavro_output,
        self.SCHEMA,
        use_fastavro=True
    )

    # pylint: disable=expression-not-assigned
    records_pcoll \
    | 'write_avro' >> WriteToAvro(
        avro_output,
        self.SCHEMA,
        use_fastavro=False
    )

    result = self.test_pipeline.run()
    result.wait_until_finish()
    assert result.state == PipelineState.DONE

    fastavro_read_pipeline = TestPipeline(is_integration_test=True)

    fastavro_records = \
        fastavro_read_pipeline \
        | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
        | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
        | Map(lambda rec: (rec['number'], rec))

    avro_records = \
        fastavro_read_pipeline \
        | 'create-avro' >> Create(['%s*' % avro_output]) \
        | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
        | Map(lambda rec: (rec['number'], rec))

    def check(elem):
      v = elem[1]

      def assertEqual(l, r):
        if l != r:
          raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

      assertEqual(sorted(v.keys()), ['avro', 'fastavro'])
      avro_values = v['avro']
      fastavro_values = v['fastavro']
      assertEqual(avro_values, fastavro_values)
      assertEqual(len(avro_values), 1)

    # pylint: disable=expression-not-assigned
    {
        'avro': avro_records,
        'fastavro': fastavro_records
    } \
    | CoGroupByKey() \
    | Map(check)

    fastavro_read_pipeline.run().wait_until_finish()
    assert result.state == PipelineState.DONE
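
The record helper mapped over the indices above is defined elsewhere in the module. A plausible shape consistent with SCHEMA (all four fields derived from the index) might be:

# Hypothetical record() helper; the real test defines its own field values.
COLORS = ['red', 'green', 'blue']

def record(i):
  return {
      'label': 'label%d' % i,
      'number': i,
      'number_str': str(i),
      'color': COLORS[i % len(COLORS)],
  }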
Example #40
class GcsIOIntegrationTest(unittest.TestCase):

  INPUT_FILE = 'gs://dataflow-samples/shakespeare/kinglear.txt'
  # Larger than 1MB to test maxBytesRewrittenPerCall.
  INPUT_FILE_LARGE = (
      'gs://dataflow-samples/wikipedia_edits/wiki_data-000000000000.json')

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    if self.runner_name != 'TestDataflowRunner':
      # This test doesn't run a pipeline, so it doesn't make sense to try it on
      # different runners. Running with TestDataflowRunner makes sense since
      # it uses GoogleCloudOptions such as 'project'.
      raise unittest.SkipTest(
          'This test only runs with TestDataflowRunner.')
    self.project = self.test_pipeline.get_option('project')
    self.gcs_tempdir = (self.test_pipeline.get_option('temp_location') +
                        '/gcs_it-' + str(uuid.uuid4()))
    self.kms_key_name = self.test_pipeline.get_option('kms_key_name')
    self.gcsio = gcsio.GcsIO()

  def tearDown(self):
    FileSystems.delete([self.gcs_tempdir + '/'])

  def _verify_copy(self, src, dst, dst_kms_key_name=None):
    self.assertTrue(FileSystems.exists(src), 'src does not exist: %s' % src)
    self.assertTrue(FileSystems.exists(dst), 'dst does not exist: %s' % dst)
    src_checksum = self.gcsio.checksum(src)
    dst_checksum = self.gcsio.checksum(dst)
    self.assertEqual(src_checksum, dst_checksum)
    actual_dst_kms_key = self.gcsio.kms_key(dst)
    if actual_dst_kms_key is None:
      self.assertEqual(actual_dst_kms_key, dst_kms_key_name)
    else:
      self.assertTrue(actual_dst_kms_key.startswith(dst_kms_key_name),
                      "got: %s, wanted startswith: %s" % (actual_dst_kms_key,
                                                          dst_kms_key_name))

  def _test_copy(self, name, kms_key_name=None,
                 max_bytes_rewritten_per_call=None, src=None):
    src = src or self.INPUT_FILE
    dst = self.gcs_tempdir + '/%s' % name
    extra_kwargs = {}
    if max_bytes_rewritten_per_call is not None:
      extra_kwargs['max_bytes_rewritten_per_call'] = (
          max_bytes_rewritten_per_call)

    self.gcsio.copy(src, dst, kms_key_name, **extra_kwargs)
    self._verify_copy(src, dst, kms_key_name)

  @attr('IT')
  def test_copy(self):
    self._test_copy("test_copy")

  @attr('IT')
  def test_copy_kms(self):
    if self.kms_key_name is None:
      raise unittest.SkipTest('--kms_key_name not specified')
    self._test_copy("test_copy_kms", self.kms_key_name)

  @attr('IT')
  def test_copy_rewrite_token(self):
    # Tests a multi-part copy (rewrite) operation. This is triggered by a
    # combination of 3 conditions:
    #  - a large enough src
    #  - setting max_bytes_rewritten_per_call
    #  - setting kms_key_name
    if self.kms_key_name is None:
      raise unittest.SkipTest('--kms_key_name not specified')

    rewrite_responses = []
    self.gcsio._set_rewrite_response_callback(
        lambda response: rewrite_responses.append(response))
    self._test_copy("test_copy_rewrite_token", kms_key_name=self.kms_key_name,
                    max_bytes_rewritten_per_call=50 * 1024 * 1024,
                    src=self.INPUT_FILE_LARGE)
    # Verify that there was a multi-part rewrite.
    self.assertTrue(any([not r.done for r in rewrite_responses]))

  def _test_copy_batch(self, name, kms_key_name=None,
                       max_bytes_rewritten_per_call=None, src=None):
    num_copies = 10
    srcs = [src or self.INPUT_FILE] * num_copies
    dsts = [self.gcs_tempdir + '/%s_%d' % (name, i)
            for i in range(num_copies)]
    src_dst_pairs = list(zip(srcs, dsts))
    extra_kwargs = {}
    if max_bytes_rewritten_per_call is not None:
      extra_kwargs['max_bytes_rewritten_per_call'] = (
          max_bytes_rewritten_per_call)

    result_statuses = self.gcsio.copy_batch(
        src_dst_pairs, kms_key_name, **extra_kwargs)
    for status in result_statuses:
      self.assertIsNone(status[2], status)
    for _src, _dst in src_dst_pairs:
      self._verify_copy(_src, _dst, kms_key_name)

  @attr('IT')
  def test_copy_batch(self):
    self._test_copy_batch("test_copy_batch")

  @attr('IT')
  def test_copy_batch_kms(self):
    if self.kms_key_name is None:
      raise unittest.SkipTest('--kms_key_name not specified')
    self._test_copy_batch("test_copy_batch_kms", self.kms_key_name)

  @attr('IT')
  def test_copy_batch_rewrite_token(self):
    # Tests a multi-part copy (rewrite) operation. This is triggered by a
    # combination of 3 conditions:
    #  - a large enough src
    #  - setting max_bytes_rewritten_per_call
    #  - setting kms_key_name
    if self.kms_key_name is None:
      raise unittest.SkipTest('--kms_key_name not specified')

    rewrite_responses = []
    self.gcsio._set_rewrite_response_callback(
        lambda response: rewrite_responses.append(response))
    self._test_copy_batch(
        "test_copy_batch_rewrite_token", kms_key_name=self.kms_key_name,
        max_bytes_rewritten_per_call=50 * 1024 * 1024,
        src=self.INPUT_FILE_LARGE)
    # Verify that there was a multi-part rewrite.
    self.assertTrue(any([not r.done for r in rewrite_responses]))
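
As the assertions in _test_copy_batch suggest, copy_batch reports one (src, dst, error) tuple per pair, with error set to None on success. A minimal usage sketch under that assumption (bucket paths are placeholders):

from apache_beam.io.gcp import gcsio

pairs = [('gs://my-bucket/a.txt', 'gs://my-bucket/copy_of_a.txt')]
for src, dst, error in gcsio.GcsIO().copy_batch(pairs):
  if error is not None:
    raise RuntimeError('copy %s -> %s failed: %s' % (src, dst, error))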
Example #41
class LeaderBoardIT(unittest.TestCase):

  # Input event containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'leader_board_it_input_topic'
  INPUT_SUB = 'leader_board_it_input_subscription'

  # SHA-1 hash generated from sorted rows reading from BigQuery table
  DEFAULT_EXPECTED_CHECKSUM = 'de00231fe6730b972c0ff60a99988438911cda53'
  OUTPUT_DATASET = 'leader_board_it_dataset'
  OUTPUT_TABLE_USERS = 'leader_board_users'
  OUTPUT_TABLE_TEAMS = 'leader_board_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 10 * 60 * 1000   # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pubsub_client = pubsub.Client(project=self.project)
    unique_topic_name = self.INPUT_TOPIC + _unique_id
    unique_subscription_name = self.INPUT_SUB + _unique_id
    self.input_topic = self.pubsub_client.topic(unique_topic_name)
    self.input_sub = self.input_topic.subscription(unique_subscription_name)

    self.input_topic.create()
    test_utils.wait_for_topics_created([self.input_topic])
    self.input_sub.create()

    # Set up BigQuery environment
    from google.cloud import bigquery
    client = bigquery.Client()
    unique_dataset_name = self.OUTPUT_DATASET + str(int(time.time()))
    self.dataset = client.dataset(unique_dataset_name, project=self.project)
    self.dataset.create()

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""

    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.full_name)

    for _ in range(message_count):
      topic.publish(self.INPUT_EVENT % self._test_timestamp)

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions([self.input_sub])
    test_utils.cleanup_topics([self.input_topic])

  def _cleanup_dataset(self):
    self.dataset.delete()

  @attr('IT')
  def test_leader_board_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    success_condition = 'total_score=5000 LIMIT 1'
    users_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_USERS,
                                 success_condition))
    bq_users_verifier = BigqueryMatcher(self.project,
                                        users_query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    teams_query = ('SELECT total_score FROM [%s:%s.%s] '
                   'WHERE %s' % (self.project,
                                 self.dataset.name,
                                 self.OUTPUT_TABLE_TEAMS,
                                 success_condition))
    bq_teams_verifier = BigqueryMatcher(self.project,
                                        teams_query,
                                        self.DEFAULT_EXPECTED_CHECKSUM)

    extra_opts = {'subscription': self.input_sub.full_name,
                  'dataset': self.dataset.name,
                  'topic': self.input_topic.full_name,
                  'team_window_duration': 1,
                  'wait_until_finish_duration':
                      self.WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               bq_users_verifier,
                                               bq_teams_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(self._cleanup_dataset)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_USERS)
    self.addCleanup(utils.delete_bq_table, self.project,
                    self.dataset.name, self.OUTPUT_TABLE_TEAMS)

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_topic,
                                               self.input_sub])
    self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    leader_board.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
Example #42
class BigQueryStreamingInsertTransformIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_bq_streaming_inserts_'

    # Prevent nose from finding and running tests that were not
    # specified in the Gradle file.
    # See "More tests may be found" in:
    # https://nose.readthedocs.io/en/latest/doc_tests/test_multiprocess
    # /multiprocess.html#other-differences-in-test-running
    _multiprocess_can_split_ = True

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client = bigquery_tools.BigQueryWrapper()
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        self.output_table = "%s.output_table" % (self.dataset_id)
        _LOGGER.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    @attr('IT')
    def test_value_provider_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        schema = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        additional_bq_parameters = {
            'timePartitioning': {
                'type': 'DAY'
            },
            'clustering': {
                'fields': ['language']
            }
        }

        table_ref = bigquery_tools.parse_table_reference(output_table_1)
        table_ref2 = bigquery_tools.parse_table_reference(output_table_2)

        pipeline_verifiers = [
            BigQueryTableMatcher(project=self.project,
                                 dataset=table_ref.datasetId,
                                 table=table_ref.tableId,
                                 expected_properties=additional_bq_parameters),
            BigQueryTableMatcher(project=self.project,
                                 dataset=table_ref2.datasetId,
                                 table=table_ref2.tableId,
                                 expected_properties=additional_bq_parameters),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT name, language FROM %s" % output_table_2,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(
                [row for row in _ELEMENTS if 'language' in row])

            _ = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=value_provider.StaticValueProvider(
                         str, '%s:%s' % (self.project, output_table_1)),
                     schema=value_provider.StaticValueProvider(dict, schema),
                     additional_bq_parameters=additional_bq_parameters,
                     method='STREAMING_INSERTS'))
            _ = (input
                 | "WriteWithMultipleDests2" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=value_provider.StaticValueProvider(
                         str, '%s:%s' % (self.project, output_table_2)),
                     schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
                     additional_bq_parameters=lambda _:
                     additional_bq_parameters,
                     method='FILE_LOADS'))

    @attr('IT')
    def test_multiple_destinations_transform(self):
        streaming = self.test_pipeline.options.view_as(
            StandardOptions).streaming
        if streaming and isinstance(self.test_pipeline.runner,
                                    TestDataflowRunner):
            self.skipTest("TestStream is not supported on TestDataflowRunner")

        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)

        full_output_table_1 = '%s:%s' % (self.project, output_table_1)
        full_output_table_2 = '%s:%s' % (self.project, output_table_2)

        schema1 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'language',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }
        schema2 = {
            'fields': [{
                'name': 'name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'foundation',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }

        bad_record = {'language': 1, 'manguage': 2}

        if streaming:
            pipeline_verifiers = [
                PipelineStateMatcher(PipelineState.RUNNING),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultStreamingMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]
        else:
            pipeline_verifiers = [
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, language FROM %s" % output_table_1,
                    data=[(d['name'], d['language']) for d in _ELEMENTS
                          if 'language' in d]),
                BigqueryFullResultMatcher(
                    project=self.project,
                    query="SELECT name, foundation FROM %s" % output_table_2,
                    data=[(d['name'], d['foundation']) for d in _ELEMENTS
                          if 'foundation' in d])
            ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            if streaming:
                _SIZE = len(_ELEMENTS)
                test_stream = (
                    TestStream().advance_watermark_to(0).add_elements(
                        _ELEMENTS[:_SIZE // 2]).advance_watermark_to(
                            100).add_elements(
                                _ELEMENTS[_SIZE //
                                          2:]).advance_watermark_to_infinity())
                input = p | test_stream
            else:
                input = p | beam.Create(_ELEMENTS)

            schema_table_pcv = beam.pvalue.AsDict(
                p
                | "MakeSchemas" >> beam.Create([(full_output_table_1, schema1),
                                                (full_output_table_2,
                                                 schema2)]))

            table_record_pcv = beam.pvalue.AsDict(
                p | "MakeTables" >> beam.Create([
                    ('table1', full_output_table_1),
                    ('table2', full_output_table_2)]))

            input2 = p | "Broken record" >> beam.Create([bad_record])

            input = (input, input2) | beam.Flatten()

            r = (input
                 | "WriteWithMultipleDests" >>
                 beam.io.gcp.bigquery.WriteToBigQuery(
                     table=lambda x, tables: (
                         tables['table1'] if 'language' in x
                         else tables['table2']),
                     table_side_inputs=(table_record_pcv, ),
                     schema=lambda dest, table_map: table_map.get(dest, None),
                     schema_side_inputs=(schema_table_pcv, ),
                     insert_retry_strategy=(
                         RetryStrategy.RETRY_ON_TRANSIENT_ERROR),
                     method='STREAMING_INSERTS'))

            assert_that(r[beam.io.gcp.bigquery.BigQueryWriteFn.FAILED_ROWS],
                        equal_to([(full_output_table_1, bad_record)]))

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            _LOGGER.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)
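
A minimal sketch of the dynamic-destination pattern exercised above: the table
argument is a callable evaluated per element, and table_side_inputs and
schema_side_inputs supply the lookup maps. Project, dataset and table names
below are placeholders.

import apache_beam as beam


def run_dynamic_destinations(argv=None):
  rows = [
      {'name': 'beam', 'language': 'py'},
      {'name': 'beam', 'foundation': 'apache'},
  ]
  with beam.Pipeline(argv=argv) as p:
    elements = p | 'MakeRows' >> beam.Create(rows)
    # Logical key -> fully qualified table name (placeholder values).
    table_map = p | 'MakeTableMap' >> beam.Create([
        ('with_language', 'my-project:my_dataset.languages'),
        ('with_foundation', 'my-project:my_dataset.foundations'),
    ])
    _ = elements | beam.io.WriteToBigQuery(
        # The callable receives the element followed by each side input.
        table=lambda row, tables: (
            tables['with_language'] if 'language' in row
            else tables['with_foundation']),
        table_side_inputs=(beam.pvalue.AsDict(table_map), ),
        schema='name:STRING,language:STRING,foundation:STRING',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        method='STREAMING_INSERTS')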
Example #43
class BigQueryFileLoadsIntegrationTests(unittest.TestCase):
  BIG_QUERY_DATASET_ID = 'python_bq_file_loads_'

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.dataset_id = '%s%s%s' % (
        self.BIG_QUERY_DATASET_ID,
        str(int(time.time())),
        random.randint(0, 10000))
    self.bigquery_client = bigquery_tools.BigQueryWrapper()
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = '%s.output_table' % (self.dataset_id)
    self.table_ref = bigquery_tools.parse_table_reference(self.output_table)
    _LOGGER.info(
        'Created dataset %s in project %s', self.dataset_id, self.project)

  @attr('IT')
  def test_avro_file_load(self):
    # Construct elements such that they can be written via Avro but not via
    # JSON. See BEAM-8841.
    from apache_beam.io.gcp import bigquery_file_loads
    old_max_files = bigquery_file_loads._MAXIMUM_SOURCE_URIS
    old_max_file_size = bigquery_file_loads._DEFAULT_MAX_FILE_SIZE
    bigquery_file_loads._MAXIMUM_SOURCE_URIS = 1
    bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = 100
    elements = [
        {
            'name': u'Negative infinity',
            'value': -float('inf'),
            'timestamp': datetime.datetime(1970, 1, 1, tzinfo=pytz.utc),
        },
        {
            'name': u'Not a number',
            'value': float('nan'),
            'timestamp': datetime.datetime(2930, 12, 9, tzinfo=pytz.utc),
        },
    ]

    schema = beam.io.gcp.bigquery.WriteToBigQuery.get_dict_table_schema(
        bigquery.TableSchema(
            fields=[
                bigquery.TableFieldSchema(
                    name='name', type='STRING', mode='REQUIRED'),
                bigquery.TableFieldSchema(
                    name='value', type='FLOAT', mode='REQUIRED'),
                bigquery.TableFieldSchema(
                    name='timestamp', type='TIMESTAMP', mode='REQUIRED'),
            ]))

    pipeline_verifiers = [
        # Some gymnastics here to avoid comparing NaN since NaN is not equal to
        # anything, including itself.
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, value, timestamp FROM {} WHERE value<0".format(
                self.output_table),
            data=[(d['name'], d['value'], d['timestamp'])
                  for d in elements[:1]],
        ),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT name, timestamp FROM {}".format(self.output_table),
            data=[(d['name'], d['timestamp']) for d in elements],
        ),
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=hc.all_of(*pipeline_verifiers),
    )

    with beam.Pipeline(argv=args) as p:
      input = p | 'CreateInput' >> beam.Create(elements)
      schema_pc = p | 'CreateSchema' >> beam.Create([schema])

      _ = (
          input
          | 'WriteToBigQuery' >> beam.io.gcp.bigquery.WriteToBigQuery(
              table='%s:%s' % (self.project, self.output_table),
              schema=lambda _, schema: schema,
              schema_side_inputs=(beam.pvalue.AsSingleton(schema_pc), ),
              method='FILE_LOADS',
              temp_file_format=bigquery_tools.FileFormat.AVRO,
          ))
    bigquery_file_loads._MAXIMUM_SOURCE_URIS = old_max_files
    bigquery_file_loads._DEFAULT_MAX_FILE_SIZE = old_max_file_size

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id, deleteContents=True)
    try:
      _LOGGER.info(
          "Deleting dataset %s in project %s", self.dataset_id, self.project)
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      _LOGGER.debug(
          'Failed to clean up dataset %s in project %s',
          self.dataset_id,
          self.project)
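
A reduced sketch of the Avro load-job pattern above: method=FILE_LOADS plus
temp_file_format=AVRO lets values such as NaN and -inf survive the staged
files, which JSON cannot encode. The table argument is a placeholder
'project:dataset.table' string.

import apache_beam as beam
from apache_beam.io.gcp import bigquery_tools


def write_floats_via_avro_loads(rows, table, argv=None):
  # rows are dicts with name/value/timestamp keys.
  with beam.Pipeline(argv=argv) as p:
    _ = (p
         | 'CreateRows' >> beam.Create(rows)
         | 'WriteViaAvro' >> beam.io.WriteToBigQuery(
             table,
             schema='name:STRING,value:FLOAT,timestamp:TIMESTAMP',
             method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
             temp_file_format=bigquery_tools.FileFormat.AVRO))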
Example #44
class GameStatsIT(unittest.TestCase):

  # Input events containing user, team, score, processing time, window start.
  INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
  INPUT_TOPIC = 'game_stats_it_input_topic'
  INPUT_SUB = 'game_stats_it_input_subscription'

  # SHA-1 hash generated from sorted rows reading from BigQuery table
  DEFAULT_EXPECTED_CHECKSUM = '5288ccaab77d347c8460d77c15a0db234ef5eb4f'
  OUTPUT_DATASET = 'game_stats_it_dataset'
  OUTPUT_TABLE_SESSIONS = 'game_stats_sessions'
  OUTPUT_TABLE_TEAMS = 'game_stats_teams'
  DEFAULT_INPUT_COUNT = 500

  WAIT_UNTIL_FINISH_DURATION = 12 * 60 * 1000   # in milliseconds

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    _unique_id = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, self.INPUT_TOPIC + _unique_id))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project,
                                          self.INPUT_SUB + _unique_id),
        self.input_topic.name)

    # Set up BigQuery environment
    self.dataset_ref = utils.create_bq_dataset(self.project,
                                               self.OUTPUT_DATASET)

    self._test_timestamp = int(time.time() * 1000)

  def _inject_pubsub_game_events(self, topic, message_count):
    """Inject game events as test data to PubSub."""

    logging.debug('Injecting %d game events to topic %s',
                  message_count, topic.name)

    for _ in range(message_count):
      self.pub_client.publish(topic.name,
                              (self.INPUT_EVENT % self._test_timestamp
                              ).encode('utf-8'))

  def _cleanup_pubsub(self):
    test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
    test_utils.cleanup_topics(self.pub_client, [self.input_topic])

  @attr('IT')
  def test_game_stats_it(self):
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    success_condition = 'mean_duration=300 LIMIT 1'
    sessions_query = ('SELECT mean_duration FROM `%s.%s.%s` '
                      'WHERE %s' % (self.project,
                                    self.dataset_ref.dataset_id,
                                    self.OUTPUT_TABLE_SESSIONS,
                                    success_condition))
    bq_sessions_verifier = BigqueryMatcher(self.project,
                                           sessions_query,
                                           self.DEFAULT_EXPECTED_CHECKSUM)

    # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

    extra_opts = {'subscription': self.input_sub.name,
                  'dataset': self.dataset_ref.dataset_id,
                  'topic': self.input_topic.name,
                  'fixed_window_duration': 1,
                  'user_activity_window_duration': 1,
                  'wait_until_finish_duration':
                      self.WAIT_UNTIL_FINISH_DURATION,
                  'on_success_matcher': all_of(state_verifier,
                                               bq_sessions_verifier)}

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(self._cleanup_pubsub)
    self.addCleanup(utils.delete_bq_dataset, self.project, self.dataset_ref)

    # Generate input data and inject to PubSub.
    self._inject_pubsub_game_events(self.input_topic, self.DEFAULT_INPUT_COUNT)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    game_stats.run(
        self.test_pipeline.get_full_options_as_args(**extra_opts))
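
The verifier wiring above is the common shape of these integration tests:
build matchers, attach them via on_success_matcher, and pass the resulting
argv to the pipeline's run function. A stripped-down sketch of just that
wiring, assuming the same helper classes:

from hamcrest.core.core.allof import all_of

from apache_beam.runners.runner import PipelineState
from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
from apache_beam.testing.test_pipeline import TestPipeline


def build_streaming_test_args(**extra_opts):
  # Streaming jobs normally stay RUNNING; the test framework cancels the job
  # once the matchers have been evaluated.
  test_pipeline = TestPipeline(is_integration_test=True)
  opts = {
      'on_success_matcher': all_of(
          PipelineStateMatcher(PipelineState.RUNNING)),
  }
  opts.update(extra_opts)
  return test_pipeline.get_full_options_as_args(**opts)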
Example #45
class BigQueryWriteIntegrationTests(unittest.TestCase):
    BIG_QUERY_DATASET_ID = 'python_write_to_table_'

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')

        self.bigquery_client = BigQueryWrapper()
        self.dataset_id = '%s%s%d' % (self.BIG_QUERY_DATASET_ID,
                                      str(int(time.time())),
                                      random.randint(0, 10000))
        self.bigquery_client.get_or_create_dataset(self.project,
                                                   self.dataset_id)
        _LOGGER.info("Created dataset %s in project %s", self.dataset_id,
                     self.project)

    def tearDown(self):
        request = bigquery.BigqueryDatasetsDeleteRequest(
            projectId=self.project,
            datasetId=self.dataset_id,
            deleteContents=True)
        try:
            _LOGGER.info("Deleting dataset %s in project %s", self.dataset_id,
                         self.project)
            self.bigquery_client.client.datasets.Delete(request)
        except HttpError:
            _LOGGER.debug('Failed to clean up dataset %s in project %s',
                          self.dataset_id, self.project)

    def create_table(self, table_name):
        table_schema = bigquery.TableSchema()
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'int64'
        table_field.type = 'INT64'
        table_field.mode = 'REQUIRED'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'bytes'
        table_field.type = 'BYTES'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'date'
        table_field.type = 'DATE'
        table_schema.fields.append(table_field)
        table_field = bigquery.TableFieldSchema()
        table_field.name = 'time'
        table_field.type = 'TIME'
        table_schema.fields.append(table_field)
        table = bigquery.Table(tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=table_name),
                               schema=table_schema)
        request = bigquery.BigqueryTablesInsertRequest(
            projectId=self.project, datasetId=self.dataset_id, table=table)
        self.bigquery_client.client.tables.Insert(request)

    @pytest.mark.it_postcommit
    def test_big_query_write(self):
        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [
            {
                'number': 1,
                'str': 'abc'
            },
            {
                'number': 2,
                'str': 'def'
            },
            {
                'number': 3,
                'str': u'你好'
            },
            {
                'number': 4,
                'str': u'привет'
            },
        ]
        table_schema = {
            "fields": [{
                "name": "number",
                "type": "INTEGER"
            }, {
                "name": "str",
                "type": "STRING"
            }]
        }

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT number, str FROM %s" % table_id,
                data=[(1, 'abc'), (2, 'def'), (3, u'你好'), (4, u'привет')])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    @pytest.mark.it_postcommit
    def test_big_query_write_schema_autodetect(self):
        if self.runner_name == 'TestDataflowRunner':
            self.skipTest(
                'DataflowRunner does not support schema autodetection')

        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [
            {
                'number': 1,
                'str': 'abc'
            },
            {
                'number': 2,
                'str': 'def'
            },
        ]

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT number, str FROM %s" % table_id,
                data=[(1, 'abc'), (2, 'def')])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                 temp_file_format=FileFormat.JSON))

    @pytest.mark.it_postcommit
    def test_big_query_write_new_types(self):
        table_name = 'python_new_types_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        row_data = {
            'float': 0.33,
            'numeric': Decimal('10'),
            'bytes': base64.b64encode(b'\xab\xac').decode('utf-8'),
            'date': '3000-12-31',
            'time': '23:59:59',
            'datetime': '2018-12-31T12:44:31',
            'timestamp': '2018-12-31 12:44:31.744957 UTC',
            'geo': 'POINT(30 10)'
        }

        input_data = [row_data]
        # add rows with only one key value pair and None values for all other keys
        for key, value in row_data.items():
            input_data.append({key: value})

        table_schema = {
            "fields": [{
                "name": "float",
                "type": "FLOAT"
            }, {
                "name": "numeric",
                "type": "NUMERIC"
            }, {
                "name": "bytes",
                "type": "BYTES"
            }, {
                "name": "date",
                "type": "DATE"
            }, {
                "name": "time",
                "type": "TIME"
            }, {
                "name": "datetime",
                "type": "DATETIME"
            }, {
                "name": "timestamp",
                "type": "TIMESTAMP"
            }, {
                "name": "geo",
                "type": "GEOGRAPHY"
            }]
        }

        expected_row = (
            0.33,
            Decimal('10'),
            b'\xab\xac',
            datetime.date(3000, 12, 31),
            datetime.time(23, 59, 59),
            datetime.datetime(2018, 12, 31, 12, 44, 31),
            datetime.datetime(
                2018, 12, 31, 12, 44, 31, 744957, tzinfo=pytz.utc),
            'POINT(30 10)',
        )

        expected_data = [expected_row]

        # add rows with only one key value pair and None values for all other keys
        for i, value in enumerate(expected_row):
            row = [None] * len(expected_row)
            row[i] = value
            expected_data.append(tuple(row))

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query='SELECT float, numeric, bytes, date, time, datetime,'
                'timestamp, geo FROM %s' % table_id,
                data=expected_data)
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

    @pytest.mark.it_postcommit
    def test_big_query_write_without_schema(self):
        table_name = 'python_no_schema_table'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            'int64': 1,
            'bytes': b'xyw',
            'date': '2011-01-01',
            'time': '23:59:59.999999'
        }, {
            'int64': 2,
            'bytes': b'abc',
            'date': '2000-01-01',
            'time': '00:00:00'
        }, {
            'int64': 3,
            'bytes': b'\xe4\xbd\xa0\xe5\xa5\xbd',
            'date': '3000-12-31',
            'time': '23:59:59'
        }, {
            'int64': 4,
            'bytes': b'\xab\xac\xad',
            'date': '2000-01-01',
            'time': '00:00:00'
        }]
        # bigquery io expects bytes to be base64 encoded values
        for row in input_data:
            row['bytes'] = base64.b64encode(row['bytes'])

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT int64, bytes, date, time FROM %s" % table_id,
                data=[
                    (1, b'xyw', datetime.date(2011, 1, 1),
                     datetime.time(23, 59, 59, 999999)),
                    (2, b'abc', datetime.date(2000, 1, 1),
                     datetime.time(0, 0, 0)),
                    (3, b'\xe4\xbd\xa0\xe5\xa5\xbd',
                     datetime.date(3000, 12, 31), datetime.time(23, 59, 59)),
                    (4, b'\xab\xac\xad', datetime.date(2000, 1, 1),
                     datetime.time(0, 0, 0)),
                ])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 temp_file_format=FileFormat.JSON))

    @pytest.mark.it_postcommit
    def test_big_query_write_insert_errors_reporting(self):
        """
    Test that errors returned by beam.io.WriteToBigQuery
    contain both the failed rows amd the reason for it failing.
    """
        table_name = 'python_write_table'
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        input_data = [{
            'number': 1,
            'str': 'some_string',
        }, {
            'number': 2
        }, {
            'number': 3,
            'str': 'some_string',
            'additional_field_str': 'some_string',
        }]

        table_schema = {
            "fields": [{
                "name": "number",
                "type": "INTEGER",
                'mode': 'REQUIRED'
            }, {
                "name": "str",
                "type": "STRING",
                'mode': 'REQUIRED'
            }]
        }

        bq_result_errors = [
            ({"number": 2},
             [{
                 "reason": "invalid",
                 "location": "",
                 "debugInfo": "",
                 "message":
                     "Missing required field: Msg_0_CLOUD_QUERY_TABLE.str."
             }]),
            ({
                "number": 3,
                "str": "some_string",
                "additional_field_str": "some_string"
            },
             [{
                 "reason": "invalid",
                 "location": "additional_field_str",
                 "debugInfo": "",
                 "message": "no such field: additional_field_str."
             }]),
        ]

        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT number, str FROM %s" % table_id,
                data=[(1, 'some_string')]),
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*pipeline_verifiers))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            errors = (
                p | 'create' >> beam.Create(input_data)
                | 'write' >> beam.io.WriteToBigQuery(
                    table_id,
                    schema=table_schema,
                    method='STREAMING_INSERTS',
                    insert_retry_strategy='RETRY_NEVER',
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
            )

            assert_that(
                errors[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
                | 'ParseErrors' >> beam.Map(lambda err: (err[1], err[2])),
                equal_to(bq_result_errors))

    @pytest.mark.it_postcommit
    @parameterized.expand([
        param(file_format=FileFormat.AVRO),
        param(file_format=FileFormat.JSON),
        param(file_format=None),
    ])
    @mock.patch("apache_beam.io.gcp.bigquery_file_loads._MAXIMUM_SOURCE_URIS",
                new=1)
    def test_big_query_write_temp_table_append_schema_update(
            self, file_format):
        """
    Test that nested schema update options and schema relaxation
    are respected when appending to an existing table via temporary tables.

    _MAXIMUM_SOURCE_URIS and max_file_size are both set to 1 to force multiple
    load jobs and usage of temporary tables.
    """
        table_name = 'python_append_schema_update'
        self.create_table(table_name)
        table_id = '{}.{}'.format(self.dataset_id, table_name)

        # bytes, date, time fields are optional and omitted in the test
        # only required and new columns are specified
        table_schema = {
            "fields": [{
                "name": "int64",
                "type": "INT64",
                "mode": "NULLABLE",
            }, {
                "name": "bool",
                "type": "BOOL",
            }, {
                "name":
                "nested_field",
                "type":
                "RECORD",
                "mode":
                "REPEATED",
                "fields": [
                    {
                        "name": "fruit",
                        "type": "STRING",
                        "mode": "NULLABLE"
                    },
                ]
            }]
        }
        input_data = [{
            "int64": 1,
            "bool": True,
            "nested_field": [{
                "fruit": "Apple"
            }]
        }, {
            "bool": False,
            "nested_field": [{
                "fruit": "Mango"
            }]
        }, {
            "int64": None,
            "bool": True,
            "nested_field": [{
                "fruit": "Banana"
            }]
        }]
        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=BigqueryFullResultMatcher(
                project=self.project,
                query="""
            SELECT bytes, date, time, int64, bool, fruit
            FROM {},
            UNNEST(nested_field) as nested_field
            ORDER BY fruit
            """.format(table_id),
                data=[(None, None, None, 1, True, "Apple"),
                      (None, None, None, None, True, "Banana"),
                      (None, None, None, None, False, "Mango")]))

        with beam.Pipeline(argv=args) as p:
            # pylint: disable=expression-not-assigned
            (p | 'create' >> beam.Create(input_data)
             | 'write' >> beam.io.WriteToBigQuery(
                 table_id,
                 schema=table_schema,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                 max_file_size=1,  # bytes
                 method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
                 additional_bq_parameters={
                     'schemaUpdateOptions':
                     ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION']
                 },
                 temp_file_format=file_format))
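
The failed-insert handling in test_big_query_write_insert_errors_reporting
above can be distilled to the following sketch: the result of the write is
indexed by BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS, and each failed element is
a (destination, row, errors) tuple. The table name is a placeholder.

import apache_beam as beam
from apache_beam.io.gcp.bigquery import BigQueryWriteFn


def write_and_collect_failures(rows, table, argv=None):
  with beam.Pipeline(argv=argv) as p:
    result = (p
              | 'CreateRows' >> beam.Create(rows)
              | 'Write' >> beam.io.WriteToBigQuery(
                  table,
                  schema='number:INTEGER,str:STRING',
                  method='STREAMING_INSERTS',
                  insert_retry_strategy='RETRY_NEVER'))
    # Keep the rejected row and the error details returned by BigQuery.
    _ = (result[BigQueryWriteFn.FAILED_ROWS_WITH_ERRORS]
         | 'KeepRowAndErrors' >> beam.Map(lambda err: (err[1], err[2])))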
Example #46
class CoGroupByKeyTest(unittest.TestCase):

  def parseTestPipelineOptions(self, options):
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))
    self.coInputOptions = json.loads(
        self.pipeline.get_option('co_input_options'))

  class _Ungroup(beam.DoFn):
    def process(self, element):
      values = element[1]
      inputs = values.get(INPUT_TAG)
      co_inputs = values.get(CO_INPUT_TAG)
      for i in inputs:
        yield i
      for i in co_inputs:
        yield i

  def testCoGroupByKey(self):
    with self.pipeline as p:
      pc1 = (p
             | 'Read ' + INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.inputOptions)))
             | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
            )

      pc2 = (p
             | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.coInputOptions)))
             | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
                 lambda x: (x, x))
            )
      # pylint: disable=expression-not-assigned
      ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
       | 'CoGroupByKey: ' >> beam.CoGroupByKey()
       | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
       | 'Measure time' >> beam.ParDo(MeasureTime())
      )

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()

      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)
Example #47
class FlightDelaysTest(unittest.TestCase):
    EXPECTED = {
        '2012-12-23': [
            ('AA', 20.082559339525282, 12.825593395252838),
            ('EV', 10.01901901901902, 4.431431431431432),
            ('HA', -1.0829015544041452, 0.010362694300518135),
            ('UA', 19.142555438225976, 11.07180570221753),
            ('MQ', 8.902255639097744, 3.6676691729323307),
            ('OO', 31.148883374689827, 31.90818858560794),
            ('US', 3.092541436464088, -2.350828729281768),
            ('WN', 12.074298711144806, 6.717968157695224),
            ('AS', 5.0456273764258555, 1.0722433460076046),
            ('B6', 20.646569646569645, 16.405405405405407),
            ('DL', 5.2559923298178335, -3.214765100671141),
            ('F9', 23.823529411764707, 25.455882352941178),
            ('FL', 4.492877492877493, -0.8005698005698005),
            ('VX', 62.755102040816325, 62.61224489795919),
            ('YV', 16.155844155844157, 13.376623376623376),
        ],
        '2012-12-24': [
            ('AS', 0.5917602996254682, -2.2659176029962547),
            ('B6', 8.070993914807302, 2.73630831643002),
            ('DL', 3.7171824973319105, -2.2358591248665953),
            ('F9', 14.111940298507463, 15.888059701492537),
            ('FL', 2.4210526315789473, 2.242690058479532),
            ('VX', 3.841666666666667, -2.4166666666666665),
            ('YV', 0.32, 0.78),
            ('MQ', 15.869642857142857, 9.992857142857142),
            ('OO', 11.048517520215633, 10.138814016172507),
            ('US', 1.369281045751634, -1.4101307189542485),
            ('WN', 7.515952597994531, 0.7028258887876025),
            ('AA', 7.049086757990867, -1.5970319634703196),
            ('EV', 7.297101449275362, 2.2693236714975846),
            ('HA', -2.6785714285714284, -2.4744897959183674),
            ('UA', 10.935406698564593, -1.3337320574162679),
        ],
        '2012-12-25': [
            ('AS', 3.4816326530612245, 0.27346938775510204),
            ('B6', 9.10590631364562, 3.989816700610998),
            ('DL', 2.3022170361726952, -3.6709451575262544),
            ('F9', 19.38255033557047, 21.845637583892618),
            ('FL', 1.3982300884955752, 0.9380530973451328),
            ('VX', 23.62878787878788, 23.636363636363637),
            ('YV', 11.256302521008404, 11.659663865546218),
            ('MQ', 32.6, 44.28666666666667),
            ('OO', 16.2275960170697, 17.11948790896159),
            ('US', 2.7953216374269005, 0.2236842105263158),
            ('WN', 14.405783582089553, 10.111940298507463),
            ('AA', 23.551581843191197, 35.62585969738652),
            ('EV', 17.368638239339752, 16.43191196698762),
            ('HA', -4.725806451612903, -3.9946236559139785),
            ('UA', 16.663145539906104, 10.772300469483568),
        ],
    }

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.outdir = (self.test_pipeline.get_option('temp_location') +
                       '/flight_delays_it-' + str(uuid.uuid4()))
        self.output_path = os.path.join(self.outdir, 'output.csv')

    def tearDown(self):
        FileSystems.delete([self.outdir + '/'])

    @pytest.mark.it_postcommit
    def test_flight_delays(self):
        flight_delays.run_flight_delay_pipeline(self.test_pipeline,
                                                start_date='2012-12-23',
                                                end_date='2012-12-25',
                                                output=self.output_path)

        def read_csv(path):
            with FileSystems.open(path) as fp:
                return pd.read_csv(fp)

        # Parse result file and compare.
        for date, expectation in self.EXPECTED.items():
            result_df = pd.concat(
                read_csv(metadata.path) for metadata in FileSystems.match(
                    [f'{self.output_path}-{date}*'])[0].metadata_list)
            result_df = result_df.sort_values('airline').reset_index(drop=True)

            expected_df = pd.DataFrame(
                expectation,
                columns=['airline', 'departure_delay', 'arrival_delay'])
            expected_df = expected_df.sort_values('airline').reset_index(
                drop=True)

            try:
                pd.testing.assert_frame_equal(result_df, expected_df)
            except AssertionError as e:
                raise AssertionError(f"date={date!r} result DataFrame:\n\n"
                                     f"{result_df}\n\n"
                                     "Differs from Expectation:\n\n"
                                     f"{expected_df}") from e
Example #48
class BigQueryQueryToTableIT(unittest.TestCase):
  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')

    self.bigquery_client = BigQueryWrapper()
    self.dataset_id = '%s%s%d' % (BIG_QUERY_DATASET_ID, str(int(time.time())),
                                  random.randint(0, 10000))
    self.bigquery_client.get_or_create_dataset(self.project, self.dataset_id)
    self.output_table = "%s.output_table" % (self.dataset_id)

  def tearDown(self):
    request = bigquery.BigqueryDatasetsDeleteRequest(
        projectId=self.project, datasetId=self.dataset_id,
        deleteContents=True)
    try:
      self.bigquery_client.client.datasets.Delete(request)
    except HttpError:
      logging.debug('Failed to clean up dataset %s', self.dataset_id)

  def _setup_new_types_env(self):
    table_schema = bigquery.TableSchema()
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'bytes'
    table_field.type = 'BYTES'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'date'
    table_field.type = 'DATE'
    table_schema.fields.append(table_field)
    table_field = bigquery.TableFieldSchema()
    table_field.name = 'time'
    table_field.type = 'TIME'
    table_schema.fields.append(table_field)
    table = bigquery.Table(
        tableReference=bigquery.TableReference(
            projectId=self.project,
            datasetId=self.dataset_id,
            tableId=NEW_TYPES_INPUT_TABLE),
        schema=table_schema)
    request = bigquery.BigqueryTablesInsertRequest(
        projectId=self.project, datasetId=self.dataset_id, table=table)
    self.bigquery_client.client.tables.Insert(request)
    table_data = [
        {'bytes': b'xyw=', 'date': '2011-01-01', 'time': '23:59:59.999999'},
        {'bytes': b'abc=', 'date': '2000-01-01', 'time': '00:00:00'},
        {'bytes': b'dec=', 'date': '3000-12-31', 'time': '23:59:59.990000'}
    ]
    self.bigquery_client.insert_rows(
        self.project, self.dataset_id, NEW_TYPES_INPUT_TABLE, table_data)

  @attr('IT')
  def test_big_query_legacy_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': LEGACY_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': False,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_standard_sql(self):
    verify_query = DIALECT_OUTPUT_VERIFY_QUERY % self.output_table
    expected_checksum = test_utils.compute_hash(DIALECT_OUTPUT_EXPECTED)
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    extra_opts = {'query': STANDARD_QUERY,
                  'output': self.output_table,
                  'output_schema': DIALECT_OUTPUT_SCHEMA,
                  'use_standard_sql': True,
                  'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)

  @attr('IT')
  def test_big_query_new_types(self):
    expected_checksum = test_utils.compute_hash(NEW_TYPES_OUTPUT_EXPECTED)
    verify_query = NEW_TYPES_OUTPUT_VERIFY_QUERY % self.output_table
    pipeline_verifiers = [PipelineStateMatcher(), BigqueryMatcher(
        project=self.project,
        query=verify_query,
        checksum=expected_checksum)]
    self._setup_new_types_env()
    extra_opts = {
        'query': NEW_TYPES_QUERY % (self.dataset_id, NEW_TYPES_INPUT_TABLE),
        'output': self.output_table,
        'output_schema': NEW_TYPES_OUTPUT_SCHEMA,
        'use_standard_sql': False,
        'on_success_matcher': all_of(*pipeline_verifiers)}
    options = self.test_pipeline.get_full_options_as_args(**extra_opts)
    big_query_query_to_table_pipeline.run_bq_pipeline(options)
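
The checksum-based verification above hashes the rows returned by the verify
query and compares them against a precomputed SHA-1. A small helper along
those lines, assuming the expected rows are already available as strings:

from apache_beam.io.gcp.tests.bigquery_matcher import BigqueryMatcher
from apache_beam.testing import test_utils


def checksum_matcher(project, query, expected_rows):
  # expected_rows is a list of stringified rows; compute_hash returns a SHA-1
  # over the sorted rows.
  return BigqueryMatcher(
      project=project,
      query=query,
      checksum=test_utils.compute_hash(expected_rows))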
Example #49
class GroupByKeyTest(unittest.TestCase):
    def parseTestPipelineOptions(self):
        return {
            'numRecords': self.input_options.get('num_records'),
            'keySizeBytes': self.input_options.get('key_size'),
            'valueSizeBytes': self.input_options.get('value_size'),
            'bundleSizeDistribution': {
                'type': self.input_options.get(
                    'bundle_size_distribution_type', 'const'),
                'param': self.input_options.get(
                    'bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': self.input_options.get(
                'force_initial_num_bundles', 0)
        }

    def setUp(self):
        self.pipeline = TestPipeline()
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))

        self.metrics_monitor = self.pipeline.get_option('publish_to_big_query')
        metrics_project_id = self.pipeline.get_option('project')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')
        metrics_dataset = self.pipeline.get_option('metrics_dataset')

        check = all(
            [metrics_project_id, self.metrics_namespace, metrics_dataset])
        if not self.metrics_monitor:
            logging.info('Metrics will not be collected')
        elif check:
            self.metrics_monitor = MetricsMonitor(
                project_name=metrics_project_id,
                table=self.metrics_namespace,
                dataset=metrics_dataset,
            )
        else:
            raise ValueError(
                'One or more of parameters for collecting metrics '
                'are empty.')

    def testGroupByKey(self):
        with self.pipeline as p:
            # pylint: disable=expression-not-assigned
            (p
             | beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions()))
             | 'Measure time: Start' >> beam.ParDo(
                 MeasureTime(self.metrics_namespace))
             | 'GroupByKey' >> beam.GroupByKey()
             | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v)
                                                      for v in elm[1]])
             | 'Measure time: End' >> beam.ParDo(
                 MeasureTime(self.metrics_namespace)))

            result = p.run()
            result.wait_until_finish()
            if self.metrics_monitor is not None:
                self.metrics_monitor.send_metrics(result)
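
The GroupByKey/Ungroup pair above has this minimal in-memory equivalent:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to


def run_groupbykey_demo():
  with beam.Pipeline() as p:
    ungrouped = (p
                 | beam.Create([('a', 1), ('a', 2), ('b', 3)])
                 | beam.GroupByKey()
                 # Re-emit one (key, value) pair per grouped value, as the
                 # 'Ungroup' step above does.
                 | beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]]))
    assert_that(ungrouped, equal_to([('a', 1), ('a', 2), ('b', 3)]))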
Example #50
class PubSubIntegrationTest(unittest.TestCase):

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = {
      # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
      # label_ids, nor writing timestamp attributes. Once these features exist,
      # TestDirectRunner and TestDataflowRunner should behave identically.
      'TestDirectRunner': [
          PubsubMessage('data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          }),
      ],
      'TestDataflowRunner': [
          # Use ID_LABEL attribute to deduplicate messages with the same ID.
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          PubsubMessage('data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage('data002', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
          })
      ],
  }
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
      'TestDataflowRunner': [
          PubsubMessage('data001-seen', {'processed': 'IT'}),
          PubsubMessage('data002-seen', {
              TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
              'processed': 'IT',
          }),
      ],
  }

  def setUp(self):
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
    self.output_topic = self.pub_client.create_topic(
        self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, INPUT_SUB + self.uuid),
        self.input_topic.name)
    self.output_sub = self.sub_client.create_subscription(
        self.sub_client.subscription_path(self.project, OUTPUT_SUB + self.uuid),
        self.output_topic.name)

  def tearDown(self):
    test_utils.cleanup_subscriptions(self.sub_client,
                                     [self.input_sub, self.output_sub])
    test_utils.cleanup_topics(self.pub_client,
                              [self.input_topic, self.output_topic])

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output. These
    # verifications run on a (remote) worker.

    # Expect the state to be RUNNING since a streaming pipeline is usually
    # never DONE. The test runner will cancel the pipeline after verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {'input_subscription': self.input_sub.name,
                  'output_topic': self.output_topic.name,
                  'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
                  'on_success_matcher': all_of(state_verifier,
                                               pubsub_msg_verifier)}

    # Generate input data and inject to PubSub.
    for msg in self.INPUT_MESSAGES[self.runner_name]:
      self.pub_client.publish(self.input_topic.name, msg.data, **msg.attributes)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    self._test_streaming(with_attributes=True)
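
Publishing the input messages above boils down to plain google.cloud.pubsub
publish calls in which the attributes carry the deduplication id and the
event-time timestamp. A sketch with placeholder names:

from google.cloud import pubsub


def publish_test_messages(project, topic_name):
  pub_client = pubsub.PublisherClient()
  topic_path = pub_client.topic_path(project, topic_name)
  # Duplicates with the same 'id' attribute are deduplicated when the reading
  # pipeline is configured with id_label='id'.
  pub_client.publish(topic_path, b'data001', id='foo')
  pub_client.publish(topic_path, b'data001', id='foo')
  # The 'timestamp' attribute becomes the element's event time when the
  # pipeline is configured with timestamp_attribute='timestamp'.
  pub_client.publish(
      topic_path, b'data002', timestamp='2018-07-11T02:02:50.149000Z')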
Example #51
class PubSubBigQueryIT(unittest.TestCase):

    INPUT_TOPIC = 'psit_topic_output'
    INPUT_SUB = 'psit_subscription_input'

    BIG_QUERY_DATASET_ID = 'python_pubsub_bq_'
    SCHEMA = {
        'fields': [{
            'name': 'number',
            'type': 'INTEGER',
            'mode': 'NULLABLE'
        }]
    }

    _SIZE = 4

    WAIT_UNTIL_FINISH_DURATION = 15 * 60 * 1000

    def setUp(self):
        # Set up PubSub
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')
        self.uuid = str(uuid.uuid4())
        from google.cloud import pubsub
        self.pub_client = pubsub.PublisherClient()
        self.input_topic = self.pub_client.create_topic(
            self.pub_client.topic_path(self.project,
                                       self.INPUT_TOPIC + self.uuid))
        self.sub_client = pubsub.SubscriberClient()
        self.input_sub = self.sub_client.create_subscription(
            self.sub_client.subscription_path(self.project,
                                              self.INPUT_SUB + self.uuid),
            self.input_topic.name)

        # Set up BQ
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   self.BIG_QUERY_DATASET_ID)
        self.output_table = "%s.output_table" % (self.dataset_ref.dataset_id)

    def tearDown(self):
        # Tear down PubSub
        test_utils.cleanup_topics(self.pub_client, [self.input_topic])
        test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
        # Tear down BigQuery
        utils.delete_bq_dataset(self.project, self.dataset_ref)

    def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
        l = [i for i in range(self._SIZE)]

        matchers = [
            PipelineStateMatcher(PipelineState.RUNNING),
            BigqueryFullResultStreamingMatcher(
                project=self.project,
                query="SELECT number FROM %s" % self.output_table,
                data=[(i, ) for i in l])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*matchers),
            wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
            experiments='use_beam_bq_sink',
            streaming=True)

        def add_schema_info(element):
            yield {'number': element}

        messages = [str(i).encode('utf-8') for i in l]
        for message in messages:
            self.pub_client.publish(self.input_topic.name, message)

        with beam.Pipeline(argv=args) as p:
            rows = (p
                    | ReadFromPubSub(subscription=self.input_sub.name)
                    | beam.ParDo(add_schema_info))
            _ = rows | WriteToBigQuery(
                self.output_table,
                schema=self.SCHEMA,
                method=method,
                triggering_frequency=triggering_frequency)

    @attr('IT')
    def test_streaming_inserts(self):
        self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.STREAMING_INSERTS)

    @attr('IT')
    def test_file_loads(self):
        if isinstance(self.test_pipeline.runner, TestDataflowRunner):
            self.skipTest('https://issuetracker.google.com/issues/118375066')
        self._run_pubsub_bq_pipeline(WriteToBigQuery.Method.FILE_LOADS,
                                     triggering_frequency=20)
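
The pipeline under test reduces to this streaming sketch; the subscription and
table names are placeholders.

import apache_beam as beam
from apache_beam.io.gcp.pubsub import ReadFromPubSub
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions


def run_pubsub_to_bigquery(subscription, table, argv=None):
  options = PipelineOptions(argv)
  options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=options) as p:
    _ = (p
         | 'Read' >> ReadFromPubSub(subscription=subscription)
         | 'AddSchema' >> beam.Map(
             lambda data: {'number': int(data.decode('utf-8'))})
         | 'Write' >> beam.io.WriteToBigQuery(
             table,
             schema='number:INTEGER',
             method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS))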
Example #52
class CoGroupByKeyTest(unittest.TestCase):

  def parseTestPipelineOptions(self, options):
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.co_input_options = json.loads(
        self.pipeline.get_option('co_input_options'))

    metrics_project_id = self.pipeline.get_option('project')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')
    metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_monitor = None
    check = all(
        [metrics_project_id, self.metrics_namespace, metrics_dataset])
    if check:
      measured_values = [{'name': RUNTIME_LABEL,
                          'type': 'FLOAT',
                          'mode': 'REQUIRED'}]
      self.metrics_monitor = MetricsMonitor(
          project_name=metrics_project_id,
          table=self.metrics_namespace,
          dataset=metrics_dataset,
          schema_map=measured_values
      )
    else:
      logging.error('One or more of parameters for collecting metrics '
                    'are empty. Metrics will not be collected')

  class _Ungroup(beam.DoFn):
    def process(self, element):
      values = element[1]
      inputs = values.get(INPUT_TAG)
      co_inputs = values.get(CO_INPUT_TAG)
      for i in inputs:
        yield i
      for i in co_inputs:
        yield i

  def testCoGroupByKey(self):
    with self.pipeline as p:
      pc1 = (p
             | 'Read ' + INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.input_options)))
             | 'Make ' + INPUT_TAG + ' iterable' >> beam.Map(lambda x: (x, x))
             | 'Measure time: Start pc1' >> beam.ParDo(
                 MeasureTime(self.metrics_namespace))
            )

      pc2 = (p
             | 'Read ' + CO_INPUT_TAG >> beam.io.Read(
                 synthetic_pipeline.SyntheticSource(
                     self.parseTestPipelineOptions(self.co_input_options)))
             | 'Make ' + CO_INPUT_TAG + ' iterable' >> beam.Map(
                 lambda x: (x, x))
             | 'Measure time: Start pc2' >> beam.ParDo(
                 MeasureTime(self.metrics_namespace))
            )
      # pylint: disable=expression-not-assigned
      ({INPUT_TAG: pc1, CO_INPUT_TAG: pc2}
       | 'CoGroupByKey: ' >> beam.CoGroupByKey()
       | 'Consume Joined Collections' >> beam.ParDo(self._Ungroup())
       | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      )

      result = p.run()
      result.wait_until_finish()
      if self.metrics_monitor is not None:
        self.metrics_monitor.send_metrics(result)
Example #53
class GameStatsIT(unittest.TestCase):

    # Input events containing user, team, score, processing time, window start.
    INPUT_EVENT = 'user1,teamA,10,%d,2015-11-02 09:09:28.224'
    INPUT_TOPIC = 'game_stats_it_input_topic'
    INPUT_SUB = 'game_stats_it_input_subscription'

    # SHA-1 hash generated from sorted rows reading from BigQuery table
    DEFAULT_EXPECTED_CHECKSUM = '5288ccaab77d347c8460d77c15a0db234ef5eb4f'
    OUTPUT_DATASET = 'game_stats_it_dataset'
    OUTPUT_TABLE_SESSIONS = 'game_stats_sessions'
    OUTPUT_TABLE_TEAMS = 'game_stats_teams'
    DEFAULT_INPUT_COUNT = 500

    WAIT_UNTIL_FINISH_DURATION = 12 * 60 * 1000  # in milliseconds

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')
        _unique_id = str(uuid.uuid4())

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pub_client = pubsub.PublisherClient()
        self.input_topic = self.pub_client.create_topic(
            self.pub_client.topic_path(self.project,
                                       self.INPUT_TOPIC + _unique_id))

        self.sub_client = pubsub.SubscriberClient()
        self.input_sub = self.sub_client.create_subscription(
            self.sub_client.subscription_path(self.project,
                                              self.INPUT_SUB + _unique_id),
            self.input_topic.name)

        # Set up BigQuery environment
        self.dataset_ref = utils.create_bq_dataset(self.project,
                                                   self.OUTPUT_DATASET)

        self._test_timestamp = int(time.time() * 1000)

    def _inject_pubsub_game_events(self, topic, message_count):
        """Inject game events as test data to PubSub."""

        logging.debug('Injecting %d game events to topic %s', message_count,
                      topic.name)

        for _ in range(message_count):
            self.pub_client.publish(topic.name,
                                    (self.INPUT_EVENT %
                                     self._test_timestamp).encode('utf-8'))

    def _cleanup_pubsub(self):
        test_utils.cleanup_subscriptions(self.sub_client, [self.input_sub])
        test_utils.cleanup_topics(self.pub_client, [self.input_topic])

    @pytest.mark.it_postcommit
    def test_game_stats_it(self):
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

        success_condition = 'mean_duration=300 LIMIT 1'
        sessions_query = ('SELECT mean_duration FROM `%s.%s.%s` '
                          'WHERE %s' %
                          (self.project, self.dataset_ref.dataset_id,
                           self.OUTPUT_TABLE_SESSIONS, success_condition))
        bq_sessions_verifier = BigqueryMatcher(self.project, sessions_query,
                                               self.DEFAULT_EXPECTED_CHECKSUM)

        # TODO(mariagh): Add teams table verifier once game_stats.py is fixed.

        extra_opts = {
            'subscription': self.input_sub.name,
            'dataset': self.dataset_ref.dataset_id,
            'topic': self.input_topic.name,
            'fixed_window_duration': 1,
            'user_activity_window_duration': 1,
            'wait_until_finish_duration': self.WAIT_UNTIL_FINISH_DURATION,
            'on_success_matcher': all_of(state_verifier, bq_sessions_verifier)
        }

        # Register cleanup before pipeline execution.
        # Note that actual execution happens in reverse order.
        self.addCleanup(self._cleanup_pubsub)
        self.addCleanup(utils.delete_bq_dataset, self.project,
                        self.dataset_ref)

        # Generate input data and inject to PubSub.
        self._inject_pubsub_game_events(self.input_topic,
                                        self.DEFAULT_INPUT_COUNT)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        game_stats.run(
            self.test_pipeline.get_full_options_as_args(**extra_opts),
            save_main_session=False)
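
For context on get_full_options_as_args: it merges the flags passed on the command line via --test-pipeline-options with the extra_opts dict and returns a single argv-style list for the pipeline's main function. Below is a rough sketch of the idea only, not the actual Beam implementation (the real method also serializes matcher objects such as on_success_matcher before appending them):

# Rough sketch only: turn an options dict into argv-style flags.
# TestPipeline.get_full_options_as_args additionally handles matcher
# values and merges in the options given via --test-pipeline-options.
def options_to_args(extra_opts):
  args = []
  for name, value in extra_opts.items():
    args.append('--%s=%s' % (name, value))
  return args


print(options_to_args({'dataset': 'game_stats_it_dataset',
                       'fixed_window_duration': 1}))
# ['--dataset=game_stats_it_dataset', '--fixed_window_duration=1']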
Example #54
class ParDoTest(unittest.TestCase):
  def parseTestPipelineOptions(self):
    return {'numRecords': self.inputOptions.get('num_records'),
            'keySizeBytes': self.inputOptions.get('key_size'),
            'valueSizeBytes': self.inputOptions.get('value_size'),
            'bundleSizeDistribution': {
                'type': self.inputOptions.get(
                    'bundle_size_distribution_type', 'const'
                ),
                'param': self.inputOptions.get(
                    'bundle_size_distribution_param', 0
                )
            },
            'forceNumInitialBundles': self.inputOptions.get(
                'force_initial_num_bundles', 0
            )
           }

  def setUp(self):
    self.pipeline = TestPipeline(is_integration_test=True)
    self.output = self.pipeline.get_option('output')
    self.iterations = self.pipeline.get_option('number_of_counter_operations')
    self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

  class _MeasureTime(beam.DoFn):
    def __init__(self):
      self.runtime_start = Metrics.distribution('pardo', 'runtime.start')
      self.runtime_end = Metrics.distribution('pardo', 'runtime.end')

    def start_bundle(self):
      self.runtime_start.update(time.time())

    def finish_bundle(self):
      self.runtime_end.update(time.time())

    def process(self, element):
      yield element

  class _GetElement(beam.DoFn):
    def __init__(self):
      self.counter = Metrics.counter('pardo', 'total_bytes.count')

    def process(self, element):
      _, value = element
      for i in range(len(value)):
        self.counter.inc(i)
      yield element

  def testParDo(self):
    if self.iterations is None:
      num_runs = 1
    else:
      num_runs = int(self.iterations)

    with self.pipeline as p:
      pc = (p
            | 'Read synthetic' >> beam.io.Read(
                synthetic_pipeline.SyntheticSource(
                    self.parseTestPipelineOptions()
                ))
            | 'Measure time' >> beam.ParDo(self._MeasureTime())
           )

      for i in range(num_runs):
        label = 'Step: %d' % i
        pc = (pc
              | label >> beam.ParDo(self._GetElement()))

      if self.output is not None:
        # pylint: disable=expression-not-assigned
        (pc
         | "Write" >> beam.io.WriteToText(self.output)
        )

      result = p.run()
      result.wait_until_finish()
      metrics = result.metrics().query()
      for counter in metrics['counters']:
        logging.info("Counter: %s", counter)

      for dist in metrics['distributions']:
        logging.info("Distribution: %s", dist)