Example #1
 def test_write_metrics(self):
     MetricsEnvironment.process_wide_container().reset()
     write_fn = bigtableio._BigTableWriteFn(self._PROJECT_ID,
                                            self._INSTANCE_ID,
                                            self._TABLE_ID)
     write_fn.table = self.table
     write_fn.start_bundle()
     number_of_rows = 2
     error = Status()
     error.message = 'Entity already exists.'
     error.code = ALREADY_EXISTS
     success = Status()
     success.message = 'Success'
     success.code = OK
     rows_response = [error, success] * number_of_rows
     with patch.object(Table, 'mutate_rows', return_value=rows_response):
         direct_rows = [
             self.generate_row(i) for i in range(number_of_rows * 2)
         ]
         for direct_row in direct_rows:
             write_fn.process(direct_row)
         try:
             write_fn.finish_bundle()
         except:  # pylint: disable=bare-except
             # Currently we fail the bundle when there are any failures.
             # TODO(BEAM-13849): remove after bigtableio can selectively retry.
             pass
         self.verify_write_call_metric(
             self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID,
             ServiceCallMetric.bigtable_error_code_to_grpc_status_string(
                 ALREADY_EXISTS), 2)
         self.verify_write_call_metric(
             self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID,
             ServiceCallMetric.bigtable_error_code_to_grpc_status_string(
                 OK), 2)
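The test above relies on a generate_row helper that is not shown here. A minimal sketch of what such a helper might look like, assuming the google-cloud-bigtable DirectRow API and a hypothetical column family 'cf1':

from google.cloud.bigtable.row import DirectRow

def generate_row(self, index):
    # Hypothetical helper: build a DirectRow with a single cell so that
    # _BigTableWriteFn.process() has a mutation to send.
    row = DirectRow(row_key=('row-%d' % index).encode('utf-8'))
    row.set_cell('cf1', b'field1', ('value-%d' % index).encode('utf-8'))
    return row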
Example #2
    def test_uploader_monitoring_info(self):
        # Clear the process wide metric container.
        MetricsEnvironment.process_wide_container().reset()

        file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
        file_size = 5 * 1024 * 1024 + 100
        random_file = self._insert_random_file(self.client, file_name,
                                               file_size)
        f = self.gcs.open(file_name, 'w')

        resource = resource_identifiers.GoogleCloudStorageBucket(
            random_file.bucket)
        labels = {
            monitoring_infos.SERVICE_LABEL: 'Storage',
            monitoring_infos.METHOD_LABEL: 'Objects.insert',
            monitoring_infos.RESOURCE_LABEL: resource,
            monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
            monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
            monitoring_infos.STATUS_LABEL: 'ok'
        }

        f.close()
        metric_name = MetricName(None,
                                 None,
                                 urn=monitoring_infos.API_REQUEST_COUNT_URN,
                                 labels=labels)
        metric_value = MetricsEnvironment.process_wide_container().get_counter(
            metric_name).get_cumulative()

        self.assertEqual(metric_value, 1)
  def run_pipeline(self, pipeline):
    """Execute the entire pipeline and returns an DirectPipelineResult."""

    # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
    # problems with resolving imports when they are at the top.
    # pylint: disable=wrong-import-position
    from apache_beam.pipeline import PipelineVisitor
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry
    from apache_beam.testing.test_stream import TestStream

    # Performing configured PTransform overrides.
    pipeline.replace_all(_get_transform_overrides(pipeline.options))

    # If the TestStream I/O is used, use a mock test clock.
    class _TestStreamUsageVisitor(PipelineVisitor):
      """Visitor determining whether a Pipeline uses a TestStream."""

      def __init__(self):
        self.uses_test_stream = False

      def visit_transform(self, applied_ptransform):
        if isinstance(applied_ptransform.transform, TestStream):
          self.uses_test_stream = True

    visitor = _TestStreamUsageVisitor()
    pipeline.visit(visitor)
    clock = TestClock() if visitor.uses_test_stream else RealClock()

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(self.consumer_tracking_visitor)

    evaluation_context = EvaluationContext(
        pipeline._options,
        BundleFactory(stacked=pipeline._options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.consumer_tracking_visitor.root_transforms,
        self.consumer_tracking_visitor.value_to_consumers,
        self.consumer_tracking_visitor.step_names,
        self.consumer_tracking_visitor.views,
        clock)

    executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)
    # DirectRunner does not support injecting
    # PipelineOptions values at runtime
    RuntimeValueProvider.set_runtime_options({})
    # Start the executor. This is a non-blocking call; it will start the
    # execution in background threads and return.
    executor.start(self.consumer_tracking_visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    return result
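For context, a minimal sketch of the user-facing path that ends up in run_pipeline above; the DirectRunner is the default local runner, so a plain local run exercises this code:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Exiting the 'with' block runs the pipeline on the DirectRunner, which
# internally calls run_pipeline() as shown above.
with beam.Pipeline(runner='DirectRunner', options=PipelineOptions()) as p:
    _ = (p
         | beam.Create([1, 2, 3])
         | beam.Map(lambda x: x * 2))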
  def test_create_process_wide(self):
    sampler = statesampler.StateSampler('', counters.CounterFactory())
    statesampler.set_current_tracker(sampler)
    state1 = sampler.scoped_state(
        'mystep', 'myState', metrics_container=MetricsContainer('mystep'))

    try:
      sampler.start()
      with state1:
        urn = "my:custom:urn"
        labels = {'key': 'value'}
        counter = InternalMetrics.counter(
            urn=urn, labels=labels, process_wide=True)
        # Test that if process_wide is set, that it will be set
        # on the process_wide container.
        counter.inc(10)
        self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))

        del counter

        metric_name = MetricName(None, None, urn=urn, labels=labels)
        # Expect a value set on the current container.
        self.assertEqual(
            MetricsEnvironment.process_wide_container().get_counter(
                metric_name).get_cumulative(),
            10)
        # Expect no value set on the current container.
        self.assertEqual(
            MetricsEnvironment.current_container().get_counter(
                metric_name).get_cumulative(),
            0)
    finally:
      sampler.stop()
Example #5
  def test_create_counter_distribution(self):
    MetricsEnvironment.set_current_container(MetricsContainer('mystep'))
    counter_ns = 'aCounterNamespace'
    distro_ns = 'aDistributionNamespace'
    gauge_ns = 'aGaugeNamespace'
    name = 'a_name'
    counter = Metrics.counter(counter_ns, name)
    distro = Metrics.distribution(distro_ns, name)
    gauge = Metrics.gauge(gauge_ns, name)
    counter.inc(10)
    counter.dec(3)
    distro.update(10)
    distro.update(2)
    gauge.set(10)
    self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))
    self.assertTrue(isinstance(distro, Metrics.DelegatingDistribution))
    self.assertTrue(isinstance(gauge, Metrics.DelegatingGauge))

    del distro
    del counter
    del gauge

    container = MetricsEnvironment.current_container()
    self.assertEqual(
        container.counters[MetricName(counter_ns, name)].get_cumulative(),
        7)
    self.assertEqual(
        container.distributions[MetricName(distro_ns, name)].get_cumulative(),
        DistributionData(12, 2, 2, 10))
    self.assertEqual(
        container.gauges[MetricName(gauge_ns, name)].get_cumulative().value,
        10)
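The example reads cumulative values straight from the MetricsContainer. In user code the same metrics are usually queried from the pipeline result instead; a hedged sketch, assuming result is the PipelineResult of a run that updated the counter ('aCounterNamespace', 'a_name') inside a DoFn:

from apache_beam.metrics.metric import MetricsFilter

# 'result' is assumed to be a PipelineResult from an earlier p.run().
query_result = result.metrics().query(
    MetricsFilter().with_namespace('aCounterNamespace').with_name('a_name'))
for counter in query_result['counters']:
    print(counter.key.metric.name, counter.result)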
Example #6
    def test_metrics_error_call(self):
        if 'DirectRunner' not in self.runner_name:
            raise unittest.SkipTest('This test only runs with DirectRunner.')

        MetricsEnvironment.process_wide_container().reset()
        _prefix = 'test_write_batches'
        mutations = [
            WriteMutation.insert('Albums', ('AlbumId', 'Name'),
                                 [(_prefix + '3', _prefix + 'inset-3')]),
            WriteMutation.insert('Albums', ('AlbumId', 'Name'),
                                 [(_prefix + '3', _prefix + 'inset-3')]),
        ]

        with self.assertRaises(Exception):
            p = beam.Pipeline(argv=self.args)
            _ = (p | beam.Create(mutations)
                 | WriteToSpanner(project_id=self.project,
                                  instance_id=self.instance,
                                  database_id=self.TEST_DATABASE))

            res = p.run()
            res.wait_until_finish()

        self.verify_write_call_metric(self.project, self.TEST_DATABASE,
                                      'Albums', '400', 1)
Example #7
    def run_pipeline(self, pipeline, options):
        """Execute the entire pipeline and returns an DirectPipelineResult."""

        # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
        # problems with resolving imports when they are at the top.
        # pylint: disable=wrong-import-position
        from apache_beam.pipeline import PipelineVisitor
        from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
          ConsumerTrackingPipelineVisitor
        from apache_beam.runners.direct.evaluation_context import EvaluationContext
        from apache_beam.runners.direct.executor import Executor
        from apache_beam.runners.direct.transform_evaluator import \
          TransformEvaluatorRegistry
        from apache_beam.testing.test_stream import TestStream

        # Performing configured PTransform overrides.
        pipeline.replace_all(_get_transform_overrides(options))

        # If the TestStream I/O is used, use a mock test clock.
        class _TestStreamUsageVisitor(PipelineVisitor):
            """Visitor determining whether a Pipeline uses a TestStream."""
            def __init__(self):
                self.uses_test_stream = False

            def visit_transform(self, applied_ptransform):
                if isinstance(applied_ptransform.transform, TestStream):
                    self.uses_test_stream = True

        visitor = _TestStreamUsageVisitor()
        pipeline.visit(visitor)
        clock = TestClock() if visitor.uses_test_stream else RealClock()

        # TODO(BEAM-4274): Circular import runners-metrics. Requires refactoring.
        from apache_beam.metrics.execution import MetricsEnvironment
        MetricsEnvironment.set_metrics_supported(True)
        logging.info('Running pipeline with DirectRunner.')
        self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
        pipeline.visit(self.consumer_tracking_visitor)

        evaluation_context = EvaluationContext(
            options,
            BundleFactory(stacked=options.view_as(
                DirectOptions).direct_runner_use_stacked_bundle),
            self.consumer_tracking_visitor.root_transforms,
            self.consumer_tracking_visitor.value_to_consumers,
            self.consumer_tracking_visitor.step_names,
            self.consumer_tracking_visitor.views, clock)

        executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                            TransformEvaluatorRegistry(evaluation_context),
                            evaluation_context)
        # DirectRunner does not support injecting
        # PipelineOptions values at runtime
        RuntimeValueProvider.set_runtime_options({})
        # Start the executor. This is a non-blocking call; it will start the
        # execution in background threads and return.
        executor.start(self.consumer_tracking_visitor.root_transforms)
        result = DirectPipelineResult(executor, evaluation_context)

        return result
Example #8
 def test_write_metrics(self):
     MetricsEnvironment.process_wide_container().reset()
     write_fn = bigtableio._BigTableWriteFn(self._PROJECT_ID,
                                            self._INSTANCE_ID,
                                            self._TABLE_ID)
     write_fn.table = self.table
     write_fn.start_bundle()
     number_of_rows = 2
     error = Status()
     error.message = 'Entity already exists.'
     error.code = ALREADY_EXISTS
     success = Status()
     success.message = 'Success'
     success.code = OK
     rows_response = [error, success] * number_of_rows
     with patch.object(Table, 'mutate_rows', return_value=rows_response):
         direct_rows = [
             self.generate_row(i) for i in range(number_of_rows * 2)
         ]
         for direct_row in direct_rows:
             write_fn.process(direct_row)
         write_fn.finish_bundle()
         self.verify_write_call_metric(
             self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID,
             ServiceCallMetric.bigtable_error_code_to_grpc_status_string(
                 ALREADY_EXISTS), 2)
         self.verify_write_call_metric(
             self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID,
             ServiceCallMetric.bigtable_error_code_to_grpc_status_string(
                 OK), 2)
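The asserted counters are produced on the write path through ServiceCallMetric. A rough, simplified sketch of recording one such API call (labels trimmed; the real code also attaches the resource, project, instance and table labels checked by verify_write_call_metric):

from apache_beam.internal.metrics.metric import ServiceCallMetric
from apache_beam.metrics import monitoring_infos

base_labels = {
    monitoring_infos.SERVICE_LABEL: 'BigTable',
    monitoring_infos.METHOD_LABEL: 'google.bigtable.v2.MutateRows',
}
service_call_metric = ServiceCallMetric(
    monitoring_infos.API_REQUEST_COUNT_URN, base_labels)
# Each call() increments the process-wide API request counter, with the
# (canonicalized) status attached as the STATUS_LABEL.
service_call_metric.call('ok')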
Example #9
    def test_create_counter_distribution(self):
        MetricsEnvironment.set_current_container(MetricsContainer('mystep'))
        counter_ns = 'aCounterNamespace'
        distro_ns = 'aDistributionNamespace'
        gauge_ns = 'aGaugeNamespace'
        name = 'a_name'
        counter = Metrics.counter(counter_ns, name)
        distro = Metrics.distribution(distro_ns, name)
        gauge = Metrics.gauge(gauge_ns, name)
        counter.inc(10)
        counter.dec(3)
        distro.update(10)
        distro.update(2)
        gauge.set(10)
        self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))
        self.assertTrue(isinstance(distro, Metrics.DelegatingDistribution))
        self.assertTrue(isinstance(gauge, Metrics.DelegatingGauge))

        del distro
        del counter
        del gauge

        container = MetricsEnvironment.current_container()
        self.assertEqual(
            container.counters[MetricName(counter_ns, name)].get_cumulative(),
            7)
        self.assertEqual(
            container.distributions[MetricName(distro_ns,
                                               name)].get_cumulative(),
            DistributionData(12, 2, 2, 10))
        self.assertEqual(
            container.gauges[MetricName(gauge_ns,
                                        name)].get_cumulative().value, 10)
Example #10
  def run(self, pipeline):
    """Execute the entire pipeline and returns an DirectPipelineResult."""

    # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
    # problems with resolving imports when they are at the top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(group_by_key_input_visitor())
    pipeline.visit(self.consumer_tracking_visitor)

    evaluation_context = EvaluationContext(
        pipeline.options,
        BundleFactory(stacked=pipeline.options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.consumer_tracking_visitor.root_transforms,
        self.consumer_tracking_visitor.value_to_consumers,
        self.consumer_tracking_visitor.step_names,
        self.consumer_tracking_visitor.views)

    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)
    # Start the executor. This is a non-blocking call; it will start the
    # execution in background threads and return.

    if pipeline.options:
      RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})
    executor.start(self.consumer_tracking_visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if self._cache:
      # We are running in eager mode, block until the pipeline execution
      # completes in order to have full results in the cache.
      result.wait_until_finish()
      self._cache.finalize()

      # Unset runtime options after the pipeline finishes.
      # TODO: Move this to a post finish hook and clean for all cases.
      if pipeline.options:
        RuntimeValueProvider.unset_runtime_options(pipeline.options._options_id)

    return result
  def run(self, pipeline):
    """Execute the entire pipeline and returns an DirectPipelineResult."""

    # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
    # problems with resolving imports when they are at the top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(self.visitor)

    evaluation_context = EvaluationContext(
        pipeline.options,
        BundleFactory(stacked=pipeline.options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.visitor.root_transforms,
        self.visitor.value_to_consumers,
        self.visitor.step_names,
        self.visitor.views)

    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(self.visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)
    # Start the executor. This is a non-blocking call; it will start the
    # execution in background threads and return.

    if pipeline.options:
      RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})
    executor.start(self.visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if self._cache:
      # We are running in eager mode, block until the pipeline execution
      # completes in order to have full results in the cache.
      result.wait_until_finish()
      self._cache.finalize()

      # Unset runtime options after the pipeline finishes.
      # TODO: Move this to a post finish hook and clean for all cases.
      if pipeline.options:
        RuntimeValueProvider.unset_runtime_options(pipeline.options._options_id)

    return result
Example #12
  def run_pipeline(self, pipeline):
    """Execute the entire pipeline and returns an DirectPipelineResult."""

    # Performing configured PTransform overrides.
    pipeline.replace_all(self._ptransform_overrides)

    # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
    # problems with resolving imports when they are at the top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(self.consumer_tracking_visitor)

    clock = TestClock() if self._use_test_clock else RealClock()
    evaluation_context = EvaluationContext(
        pipeline._options,
        BundleFactory(stacked=pipeline._options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.consumer_tracking_visitor.root_transforms,
        self.consumer_tracking_visitor.value_to_consumers,
        self.consumer_tracking_visitor.step_names,
        self.consumer_tracking_visitor.views,
        clock)

    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)
    # DirectRunner does not support injecting
    # PipelineOptions values at runtime
    RuntimeValueProvider.set_runtime_options({})
    # Start the executor. This is a non-blocking call; it will start the
    # execution in background threads and return.
    executor.start(self.consumer_tracking_visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if self._cache:
      # We are running in eager mode, block until the pipeline execution
      # completes in order to have full results in the cache.
      result.wait_until_finish()
      self._cache.finalize()

    return result
Example #13
    def run(self, pipeline):
        MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
        # List of map tasks. Each map task is a list of
        # (stage_name, operation_specs.WorkerOperation) instructions.
        self.map_tasks = []

        # Map of pvalues to
        # (map_task_index, producer_operation_index, producer_output_index)
        self.outputs = {}

        # Unique mappings of PCollections to strings.
        self.side_input_labels = collections.defaultdict(
            lambda: str(len(self.side_input_labels)))

        # Mapping of map task indices to all map tasks that must precede them.
        self.dependencies = collections.defaultdict(set)

        # Visit the graph, building up the map_tasks and their metadata.
        super(MapTaskExecutorRunner, self).run(pipeline)

        # Now run the tasks in topological order.
        def compute_depth_map(deps):
            memoized = {}

            def compute_depth(x):
                if x not in memoized:
                    memoized[x] = 1 + max([-1] +
                                          [compute_depth(y) for y in deps[x]])
                return memoized[x]

            return {x: compute_depth(x) for x in deps.keys()}

        map_task_depths = compute_depth_map(self.dependencies)
        ordered_map_tasks = sorted(
            (map_task_depths.get(ix, -1), map_task)
            for ix, map_task in enumerate(self.map_tasks))

        profile_options = pipeline.options.view_as(
            pipeline_options.ProfilingOptions)
        if profile_options.profile_cpu:
            with profiler.Profile(
                    profile_id='worker-runner',
                    profile_location=profile_options.profile_location,
                    log_results=True,
                    file_copy_fn=_dependency_file_copy):
                self.execute_map_tasks(ordered_map_tasks)
        else:
            self.execute_map_tasks(ordered_map_tasks)

        return WorkerRunnerResult(PipelineState.UNKNOWN)
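The topological ordering above hinges on the recursive depth computation in compute_depth_map. A standalone sketch of the same idea on a small, made-up dependency map:

import collections

def compute_depth_map(deps):
    # A task's depth is one more than the deepest of its prerequisites
    # (the max over an empty set is taken as -1, so roots get depth 0).
    memoized = {}

    def compute_depth(x):
        if x not in memoized:
            memoized[x] = 1 + max([-1] + [compute_depth(y) for y in deps[x]])
        return memoized[x]

    return {x: compute_depth(x) for x in deps}

deps = collections.defaultdict(set, {0: set(), 1: {0}, 2: {0, 1}})
print(compute_depth_map(deps))  # {0: 0, 1: 1, 2: 2}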
Example #14
 def run_pipeline(self, pipeline, options):
   MetricsEnvironment.set_metrics_supported(False)
   RuntimeValueProvider.set_runtime_options({})
   # This is sometimes needed if type checking is disabled
   # to enforce that the inputs (and outputs) of GroupByKey operations
   # are known to be KVs.
   from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
   pipeline.visit(DataflowRunner.group_by_key_input_visitor())
   self._bundle_repeat = self._bundle_repeat or options.view_as(
       pipeline_options.DirectOptions).direct_runner_bundle_repeat
   self._profiler_factory = profiler.Profile.factory_from_options(
       options.view_as(pipeline_options.ProfilingOptions))
   return self.run_via_runner_api(pipeline.to_runner_api(
       default_environment=self._default_environment))
Example #15
 def run_pipeline(self, pipeline, options):
   MetricsEnvironment.set_metrics_supported(False)
   RuntimeValueProvider.set_runtime_options({})
   # This is sometimes needed if type checking is disabled
   # to enforce that the inputs (and outputs) of GroupByKey operations
   # are known to be KVs.
   from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
   pipeline.visit(DataflowRunner.group_by_key_input_visitor())
   self._bundle_repeat = self._bundle_repeat or options.view_as(
       pipeline_options.DirectOptions).direct_runner_bundle_repeat
   self._profiler_factory = profiler.Profile.factory_from_options(
       options.view_as(pipeline_options.ProfilingOptions))
   return self.run_via_runner_api(pipeline.to_runner_api(
       default_environment=self._default_environment))
Example #16
    def test_downloader_fail_to_get_project_number(self, mock_get):
        # Raise an error when listing the GCS bucket so that the project number
        # fails to be retrieved.
        mock_get.side_effect = HttpError({'status': 403}, None, None)
        # Clear the process wide metric container.
        MetricsEnvironment.process_wide_container().reset()

        file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
        file_size = 5 * 1024 * 1024 + 100
        random_file = self._insert_random_file(self.client, file_name,
                                               file_size)
        self.gcs.open(file_name, 'r')

        resource = resource_identifiers.GoogleCloudStorageBucket(
            random_file.bucket)
        labels = {
            monitoring_infos.SERVICE_LABEL: 'Storage',
            monitoring_infos.METHOD_LABEL: 'Objects.get',
            monitoring_infos.RESOURCE_LABEL: resource,
            monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
            monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
            monitoring_infos.STATUS_LABEL: 'ok'
        }

        metric_name = MetricName(None,
                                 None,
                                 urn=monitoring_infos.API_REQUEST_COUNT_URN,
                                 labels=labels)
        metric_value = MetricsEnvironment.process_wide_container().get_counter(
            metric_name).get_cumulative()

        self.assertEqual(metric_value, 0)

        labels_without_project_id = {
            monitoring_infos.SERVICE_LABEL: 'Storage',
            monitoring_infos.METHOD_LABEL: 'Objects.get',
            monitoring_infos.RESOURCE_LABEL: resource,
            monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
            monitoring_infos.STATUS_LABEL: 'ok'
        }
        metric_name = MetricName(None,
                                 None,
                                 urn=monitoring_infos.API_REQUEST_COUNT_URN,
                                 labels=labels_without_project_id)
        metric_value = MetricsEnvironment.process_wide_container().get_counter(
            metric_name).get_cumulative()

        self.assertEqual(metric_value, 2)
    def verify_read_call_metric(self, project_id, namespace, status, count):
        """Check if a metric was recorded for the Datastore IO read API call."""
        process_wide_monitoring_infos = list(
            MetricsEnvironment.process_wide_container(
            ).to_runner_api_monitoring_infos(None).values())
        resource = resource_identifiers.DatastoreNamespace(
            project_id, namespace)
        labels = {
            monitoring_infos.SERVICE_LABEL: 'Datastore',
            monitoring_infos.METHOD_LABEL: 'BatchDatastoreRead',
            monitoring_infos.RESOURCE_LABEL: resource,
            monitoring_infos.DATASTORE_NAMESPACE_LABEL: namespace,
            monitoring_infos.DATASTORE_PROJECT_ID_LABEL: project_id,
            monitoring_infos.STATUS_LABEL: status
        }
        expected_mi = monitoring_infos.int64_counter(
            monitoring_infos.API_REQUEST_COUNT_URN, count, labels=labels)
        expected_mi.ClearField("start_time")

        found = False
        for actual_mi in process_wide_monitoring_infos:
            actual_mi.ClearField("start_time")
            if expected_mi == actual_mi:
                found = True
                break
        self.assertTrue(
            found, "Did not find read call metric with status: %s" % status)
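An equivalent, lighter-weight check (the pattern used by several other examples here) reads the counter straight from the process-wide container instead of scanning monitoring infos; a sketch assuming the same labels dict and expected count:

from apache_beam.metrics.metricbase import MetricName

metric_name = MetricName(
    None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
metric_value = MetricsEnvironment.process_wide_container().get_counter(
    metric_name).get_cumulative()
assert metric_value == count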
Example #18
    def test_metrics(self):
        sampler = statesampler.StateSampler('', counters.CounterFactory())
        statesampler.set_current_tracker(sampler)
        state1 = sampler.scoped_state(
            'mystep', 'myState', metrics_container=MetricsContainer('mystep'))

        try:
            sampler.start()
            with state1:
                counter = MetricTests.base_metric_group.counter("my_counter")
                meter = MetricTests.base_metric_group.meter("my_meter")
                distribution = MetricTests.base_metric_group.distribution("my_distribution")
                container = MetricsEnvironment.current_container()

                self.assertEqual(0, counter.get_count())
                self.assertEqual(0, meter.get_count())
                self.assertEqual(
                    DistributionData(
                        0, 0, 0, 0), container.get_distribution(
                        MetricName(
                            '[]', 'my_distribution')).get_cumulative())
                counter.inc(-2)
                meter.mark_event(3)
                distribution.update(10)
                distribution.update(2)
                self.assertEqual(-2, counter.get_count())
                self.assertEqual(3, meter.get_count())
                self.assertEqual(
                    DistributionData(
                        12, 2, 2, 10), container.get_distribution(
                        MetricName(
                            '[]', 'my_distribution')).get_cumulative())
        finally:
            sampler.stop()
Example #19
 def test_write_mutations_metric_on_failure(self):
   MetricsEnvironment.process_wide_container().reset()
   mock_batch = MagicMock()
   mock_batch.commit.side_effect = [
       exceptions.DeadlineExceeded("Deadline Exceeded"), []
   ]
   mock_throttler = MagicMock()
   rpc_stats_callback = MagicMock()
   mock_throttler.throttle_request.return_value = False
   mutate = datastoreio._Mutate.DatastoreMutateFn("my_project")
   mutate._batch = mock_batch
   mutate._batch_elements = []
   mutate._client = MagicMock()
   mutate.write_mutations(mock_throttler, rpc_stats_callback, throttle_delay=0)
   self.verify_write_call_metric("my_project", "", "deadline_exceeded", 1)
   self.verify_write_call_metric("my_project", "", "ok", 1)
Example #20
  def test_create_counter_distribution(self):
    sampler = statesampler.StateSampler('', counters.CounterFactory())
    statesampler.set_current_tracker(sampler)
    state1 = sampler.scoped_state('mystep', 'myState',
                                  metrics_container=MetricsContainer('mystep'))
    sampler.start()
    with state1:
      counter_ns = 'aCounterNamespace'
      distro_ns = 'aDistributionNamespace'
      name = 'a_name'
      counter = Metrics.counter(counter_ns, name)
      distro = Metrics.distribution(distro_ns, name)
      counter.inc(10)
      counter.dec(3)
      distro.update(10)
      distro.update(2)
      self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))
      self.assertTrue(isinstance(distro, Metrics.DelegatingDistribution))

      del distro
      del counter

      container = MetricsEnvironment.current_container()
      self.assertEqual(
          container.counters[MetricName(counter_ns, name)].get_cumulative(),
          7)
      self.assertEqual(
          container.distributions[MetricName(distro_ns, name)].get_cumulative(),
          DistributionData(12, 2, 2, 10))
    sampler.stop()
Example #21
  def verify_write_call_metric(
      self, project_id, dataset_id, table_id, status, count):
    """Check if an metric was recorded for the BQ IO write API call."""
    process_wide_monitoring_infos = list(
        MetricsEnvironment.process_wide_container().
        to_runner_api_monitoring_infos(None).values())
    resource = resource_identifiers.BigQueryTable(
        project_id, dataset_id, table_id)
    labels = {
        # TODO(ajamato): Add Ptransform label.
        monitoring_infos.SERVICE_LABEL: 'BigQuery',
        # Refer to any method that writes elements to BigQuery in batches
        # as "BigQueryBatchWrite", e.g. the storage API's insertAll or any
        # future APIs introduced for batch writes.
        monitoring_infos.METHOD_LABEL: 'BigQueryBatchWrite',
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.BIGQUERY_PROJECT_ID_LABEL: project_id,
        monitoring_infos.BIGQUERY_DATASET_LABEL: dataset_id,
        monitoring_infos.BIGQUERY_TABLE_LABEL: table_id,
        monitoring_infos.STATUS_LABEL: status,
    }
    expected_mi = monitoring_infos.int64_counter(
        monitoring_infos.API_REQUEST_COUNT_URN, count, labels=labels)
    expected_mi.ClearField("start_time")

    found = False
    for actual_mi in process_wide_monitoring_infos:
      actual_mi.ClearField("start_time")
      if expected_mi == actual_mi:
        found = True
        break
    self.assertTrue(
        found, "Did not find write call metric with status: %s" % status)
    def test_sql_metrics_ok_call(self):
        if 'DirectRunner' not in self.runner_name:
            raise unittest.SkipTest('This test only runs with DirectRunner.')

        MetricsEnvironment.process_wide_container().reset()

        with beam.Pipeline(argv=self.args) as p:
            r = p | ReadFromSpanner(self.project,
                                    self.instance,
                                    self.TEST_DATABASE,
                                    sql="select * from Users",
                                    query_name='query-1')

        assert_that(r, equal_to(self._data))
        self.verify_sql_read_call_metric(self.project, self.TEST_DATABASE,
                                         'query-1', 'ok', 1)
Example #23
  def test_create_counter_distribution(self):
    sampler = statesampler.StateSampler('', counters.CounterFactory())
    statesampler.set_current_tracker(sampler)
    state1 = sampler.scoped_state('mystep', 'myState',
                                  metrics_container=MetricsContainer('mystep'))
    sampler.start()
    with state1:
      counter_ns = 'aCounterNamespace'
      distro_ns = 'aDistributionNamespace'
      name = 'a_name'
      counter = Metrics.counter(counter_ns, name)
      distro = Metrics.distribution(distro_ns, name)
      counter.inc(10)
      counter.dec(3)
      distro.update(10)
      distro.update(2)
      self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))
      self.assertTrue(isinstance(distro, Metrics.DelegatingDistribution))

      del distro
      del counter

      container = MetricsEnvironment.current_container()
      self.assertEqual(
          container.counters[MetricName(counter_ns, name)].get_cumulative(),
          7)
      self.assertEqual(
          container.distributions[MetricName(distro_ns, name)].get_cumulative(),
          DistributionData(12, 2, 2, 10))
    sampler.stop()
    def test_table_metrics_ok_call(self):
        if 'DirectRunner' not in self.runner_name:
            raise unittest.SkipTest('This test only runs with DirectRunner.')

        MetricsEnvironment.process_wide_container().reset()

        with beam.Pipeline(argv=self.args) as p:
            r = p | ReadFromSpanner(self.project,
                                    self.instance,
                                    self.TEST_DATABASE,
                                    table="Users",
                                    columns=["UserId", "Key"])

        assert_that(r, equal_to(self._data))
        self.verify_table_read_call_metric(self.project, self.TEST_DATABASE,
                                           'Users', 'ok', 1)
Example #25
    def verify_write_call_metric(self, project_id, instance_id, table_id,
                                 status, count):
        """Check if a metric was recorded for the Datastore IO write API call."""
        process_wide_monitoring_infos = list(
            MetricsEnvironment.process_wide_container(
            ).to_runner_api_monitoring_infos(None).values())
        resource = resource_identifiers.BigtableTable(project_id, instance_id,
                                                      table_id)
        labels = {
            monitoring_infos.SERVICE_LABEL: 'BigTable',
            monitoring_infos.METHOD_LABEL: 'google.bigtable.v2.MutateRows',
            monitoring_infos.RESOURCE_LABEL: resource,
            monitoring_infos.BIGTABLE_PROJECT_ID_LABEL: project_id,
            monitoring_infos.INSTANCE_ID_LABEL: instance_id,
            monitoring_infos.TABLE_ID_LABEL: table_id,
            monitoring_infos.STATUS_LABEL: status
        }
        expected_mi = monitoring_infos.int64_counter(
            monitoring_infos.API_REQUEST_COUNT_URN, count, labels=labels)
        expected_mi.ClearField("start_time")

        found = False
        for actual_mi in process_wide_monitoring_infos:
            actual_mi.ClearField("start_time")
            if expected_mi == actual_mi:
                found = True
                break
        self.assertTrue(
            found, "Did not find write call metric with status: %s" % status)
  def run_pipeline(self, pipeline):
    MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
    # List of map tasks. Each map task is a list of
    # (stage_name, operation_specs.WorkerOperation) instructions.
    self.map_tasks = []

    # Map of pvalues to
    # (map_task_index, producer_operation_index, producer_output_index)
    self.outputs = {}

    # Unique mappings of PCollections to strings.
    self.side_input_labels = collections.defaultdict(
        lambda: str(len(self.side_input_labels)))

    # Mapping of map task indices to all map tasks that must precede them.
    self.dependencies = collections.defaultdict(set)

    # Visit the graph, building up the map_tasks and their metadata.
    super(MapTaskExecutorRunner, self).run_pipeline(pipeline)

    # Now run the tasks in topological order.
    def compute_depth_map(deps):
      memoized = {}

      def compute_depth(x):
        if x not in memoized:
          memoized[x] = 1 + max([-1] + [compute_depth(y) for y in deps[x]])
        return memoized[x]

      return {x: compute_depth(x) for x in deps.keys()}

    map_task_depths = compute_depth_map(self.dependencies)
    ordered_map_tasks = sorted((map_task_depths.get(ix, -1), map_task)
                               for ix, map_task in enumerate(self.map_tasks))

    profile_options = pipeline.options.view_as(
        pipeline_options.ProfilingOptions)
    if profile_options.profile_cpu:
      with profiler.Profile(
          profile_id='worker-runner',
          profile_location=profile_options.profile_location,
          log_results=True, file_copy_fn=_dependency_file_copy):
        self.execute_map_tasks(ordered_map_tasks)
    else:
      self.execute_map_tasks(ordered_map_tasks)

    return WorkerRunnerResult(PipelineState.UNKNOWN)
Example #27
    def get_count(self) -> int:
        """
        Get number of events marked on the meter.

        .. versionadded:: 1.11.0
        """
        from apache_beam.metrics.execution import MetricsEnvironment
        container = MetricsEnvironment.current_container()
        return container.get_counter(self._inner_counter.metric_name).get_cumulative()
Example #28
    def test_harness_monitoring_infos_and_metadata(self):
        # Clear the process wide metric container.
        MetricsEnvironment.process_wide_container().reset()
        # Create a process_wide metric.
        urn = 'my.custom.urn'
        labels = {'key': 'value'}
        InternalMetrics.counter(urn=urn, labels=labels,
                                process_wide=True).inc(10)

        harness_monitoring_infos_request = beam_fn_api_pb2.InstructionRequest(
            instruction_id="monitoring_infos",
            harness_monitoring_infos=beam_fn_api_pb2.
            HarnessMonitoringInfosRequest())

        responses = self.get_responses([harness_monitoring_infos_request])

        expected_monitoring_info = monitoring_infos.int64_counter(
            urn, 10, labels=labels)
        monitoring_data = (responses['monitoring_infos'].
                           harness_monitoring_infos.monitoring_data)

        # Request the full MonitoringInfo metadata for the returned short_ids.
        short_ids = list(monitoring_data.keys())
        monitoring_infos_metadata_request = beam_fn_api_pb2.InstructionRequest(
            instruction_id="monitoring_infos_metadata",
            monitoring_infos=beam_fn_api_pb2.MonitoringInfosMetadataRequest(
                monitoring_info_id=short_ids))

        responses = self.get_responses([monitoring_infos_metadata_request])

        # Request the full MonitoringInfo metadata to be returned now.
        expected_monitoring_info.ClearField("payload")

        # Verify that one of the returned monitoring infos is our expected
        # monitoring info.
        short_id_to_mi = (responses['monitoring_infos_metadata'].
                          monitoring_infos.monitoring_info)
        found = False
        for mi in short_id_to_mi.values():
            # Clear the timestamp before comparing
            mi.ClearField("start_time")
            if mi == expected_monitoring_info:
                found = True
        self.assertTrue(found, str(responses['monitoring_infos_metadata']))
Example #29
    def get_count(self):
        """
        Returns the current count.

        .. versionadded:: 1.11.0
        """
        from apache_beam.metrics.execution import MetricsEnvironment
        container = MetricsEnvironment.current_container()
        return container.get_counter(
            self._inner_counter.metric_name).get_cumulative()
    def test_sql_metrics_error_call(self):
        if 'DirectRunner' not in self.runner_name:
            raise unittest.SkipTest('This test only runs with DirectRunner.')

        MetricsEnvironment.process_wide_container().reset()

        with self.assertRaises(Exception):
            p = beam.Pipeline(argv=self.args)
            _ = p | ReadFromSpanner(self.project,
                                    self.instance,
                                    self.TEST_DATABASE,
                                    sql="select * from NonExistent",
                                    query_name='query-2')

            res = p.run()
            res.wait_until_finish()

            self.verify_sql_read_call_metric(self.project, self.TEST_DATABASE,
                                             'query-2', '400', 1)
Example #31
  def test_insert_rows_sets_metric_on_failure(self):
    MetricsEnvironment.process_wide_container().reset()
    client = mock.Mock()
    client.insert_rows_json = mock.Mock(
        # Fail a few times, then succeed.
        side_effect=[
            DeadlineExceeded("Deadline Exceeded"),
            InternalServerError("Internal Error"),
            [],
        ])
    wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)
    wrapper.insert_rows("my_project", "my_dataset", "my_table", [])

    # Expect two failing calls, then a success (i.e. two retries).
    self.verify_write_call_metric(
        "my_project", "my_dataset", "my_table", "deadline_exceeded", 1)
    self.verify_write_call_metric(
        "my_project", "my_dataset", "my_table", "internal", 1)
    self.verify_write_call_metric(
        "my_project", "my_dataset", "my_table", "ok", 1)
 def test_QueryFn_metric_on_failure(self):
     MetricsEnvironment.process_wide_container().reset()
     with patch.object(helper, 'get_client',
                       return_value=self._mock_client):
         self._mock_query.project = self._PROJECT
         self._mock_query.namespace = self._NAMESPACE
         _query_fn = ReadFromDatastore._QueryFn()
         client_query = self._mock_query._to_client_query()
         # Test with exception
         client_query.fetch.side_effect = [
             exceptions.DeadlineExceeded("Deadline exceed")
         ]
         list(_query_fn.process(self._mock_query))
         self.verify_read_call_metric(self._PROJECT, self._NAMESPACE,
                                      "deadline_exceeded", 1)
         # Test success
         client_query.fetch.side_effect = [[]]
         list(_query_fn.process(self._mock_query))
         self.verify_read_call_metric(self._PROJECT, self._NAMESPACE, "ok",
                                      1)
Example #33
  def test_table_metrics_error_call(self):
    if 'DirectRunner' not in self.runner_name:
      raise unittest.SkipTest('This test only runs with DirectRunner.')

    MetricsEnvironment.process_wide_container().reset()

    with self.assertRaises(Exception):
      p = beam.Pipeline(argv=self.args)
      _ = p | ReadFromSpanner(
          self.project,
          self.instance,
          self.TEST_DATABASE,
          table="INVALID_TABLE",
          columns=["UserId", "Key"])

      res = p.run()
      res.wait_until_finish()

      self.verify_table_read_call_metric(
          self.project, self.TEST_DATABASE, 'INVALID_TABLE', '404', 1)
Example #34
    def test_scoped_container(self):
        c1 = MetricsContainer('mystep')
        c2 = MetricsContainer('myinternalstep')
        with ScopedMetricsContainer(c1):
            self.assertEqual(c1, MetricsEnvironment.current_container())
            counter = Metrics.counter('ns', 'name')
            counter.inc(2)

            with ScopedMetricsContainer(c2):
                self.assertEqual(c2, MetricsEnvironment.current_container())
                counter = Metrics.counter('ns', 'name')
                counter.inc(3)
                self.assertEqual(list(c2.get_cumulative().counters.items()),
                                 [(MetricKey('myinternalstep',
                                             MetricName('ns', 'name')), 3)])

            self.assertEqual(c1, MetricsEnvironment.current_container())
            counter = Metrics.counter('ns', 'name')
            counter.inc(4)
            self.assertEqual(
                list(c1.get_cumulative().counters.items()),
                [(MetricKey('mystep', MetricName('ns', 'name')), 6)])
  def test_scoped_container(self):
    c1 = MetricsContainer('mystep')
    c2 = MetricsContainer('myinternalstep')
    with ScopedMetricsContainer(c1):
      self.assertEqual(c1, MetricsEnvironment.current_container())
      counter = Metrics.counter('ns', 'name')
      counter.inc(2)

      with ScopedMetricsContainer(c2):
        self.assertEqual(c2, MetricsEnvironment.current_container())
        counter = Metrics.counter('ns', 'name')
        counter.inc(3)
        self.assertEqual(
            list(c2.get_cumulative().counters.items()),
            [(MetricKey('myinternalstep', MetricName('ns', 'name')), 3)])

      self.assertEqual(c1, MetricsEnvironment.current_container())
      counter = Metrics.counter('ns', 'name')
      counter.inc(4)
      self.assertEqual(
          list(c1.get_cumulative().counters.items()),
          [(MetricKey('mystep', MetricName('ns', 'name')), 6)])
Example #36
 def _request_harness_monitoring_infos(self, request):
   # type: (beam_fn_api_pb2.InstructionRequest) -> None
   process_wide_monitoring_infos = MetricsEnvironment.process_wide_container(
   ).to_runner_api_monitoring_infos(None).values()
   self._execute(
       lambda: beam_fn_api_pb2.InstructionResponse(
           instruction_id=request.instruction_id,
           harness_monitoring_infos=(
               beam_fn_api_pb2.HarnessMonitoringInfosResponse(
                   monitoring_data={
                       SHORT_ID_CACHE.get_short_id(info): info.payload
                       for info in process_wide_monitoring_infos
                   }))),
       request)
Example #37
  def verify_write_call_metric(self, project, database, table, status, count):
    resource = resource_identifiers.SpannerTable(project, database, table)
    labels = {
        monitoring_infos.SERVICE_LABEL: 'Spanner',
        monitoring_infos.METHOD_LABEL: 'Write',
        monitoring_infos.SPANNER_PROJECT_ID: project,
        monitoring_infos.SPANNER_DATABASE_ID: database,
        monitoring_infos.RESOURCE_LABEL: resource,
        monitoring_infos.SPANNER_TABLE_ID: table,
        monitoring_infos.STATUS_LABEL: status
    }
    metric_name = MetricName(
        None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
    metric_value = MetricsEnvironment.process_wide_container().get_counter(
        metric_name).get_cumulative()

    self.assertEqual(metric_value, count)
  def test_uses_right_container(self):
    c1 = MetricsContainer('step1')
    c2 = MetricsContainer('step2')
    counter = Metrics.counter('ns', 'name')
    MetricsEnvironment.set_current_container(c1)
    counter.inc()
    MetricsEnvironment.set_current_container(c2)
    counter.inc(3)
    MetricsEnvironment.unset_current_container()

    self.assertEqual(
        list(c1.get_cumulative().counters.items()),
        [(MetricKey('step1', MetricName('ns', 'name')), 1)])

    self.assertEqual(
        list(c2.get_cumulative().counters.items()),
        [(MetricKey('step2', MetricName('ns', 'name')), 3)])
Example #39
 def inc(self, n=1):
   container = MetricsEnvironment.current_container()
   if container is not None:
     container.get_counter(self.metric_name).inc(n)
 def test_no_container(self):
   self.assertEqual(MetricsEnvironment.current_container(),
                    None)
Example #41
 def run_pipeline(self, pipeline):
   MetricsEnvironment.set_metrics_supported(False)
   return self.run_via_runner_api(pipeline.to_runner_api())
 def run(self, pipeline):
   MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
   if pipeline._verify_runner_api_compatible():
     return self.run_via_runner_api(pipeline.to_runner_api())
   else:
     return super(FnApiRunner, self).run(pipeline)
Example #43
 def set(self, value):
   container = MetricsEnvironment.current_container()
   if container is not None:
     container.get_gauge(self.metric_name).set(value)
Example #44
 def update(self, value):
   container = MetricsEnvironment.current_container()
   if container is not None:
     container.get_distribution(self.metric_name).update(value)
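A minimal sketch tying these delegating updates back to a container (metric names are illustrative):

from apache_beam.metrics.execution import MetricsContainer, MetricsEnvironment
from apache_beam.metrics.metric import Metrics
from apache_beam.metrics.metricbase import MetricName

container = MetricsContainer('mystep')
MetricsEnvironment.set_current_container(container)
try:
    Metrics.counter('ns', 'my_counter').inc(5)          # routed through inc()
    Metrics.distribution('ns', 'my_distro').update(12)  # routed through update()
    Metrics.gauge('ns', 'my_gauge').set(7)              # routed through set()
finally:
    MetricsEnvironment.unset_current_container()

print(container.get_counter(MetricName('ns', 'my_counter')).get_cumulative())  # 5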