def test_write_metrics(self):
  MetricsEnvironment.process_wide_container().reset()
  write_fn = bigtableio._BigTableWriteFn(
      self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID)
  write_fn.table = self.table
  write_fn.start_bundle()
  number_of_rows = 2
  error = Status()
  error.message = 'Entity already exists.'
  error.code = ALREADY_EXISTS
  success = Status()
  success.message = 'Success'
  success.code = OK
  rows_response = [error, success] * number_of_rows
  with patch.object(Table, 'mutate_rows', return_value=rows_response):
    direct_rows = [self.generate_row(i) for i in range(number_of_rows * 2)]
    for direct_row in direct_rows:
      write_fn.process(direct_row)
    try:
      write_fn.finish_bundle()
    except:  # pylint: disable=bare-except
      # Currently we fail the bundle when there are any failures.
      # TODO(BEAM-13849): remove after bigtableio can selectively retry.
      pass
  self.verify_write_call_metric(
      self._PROJECT_ID,
      self._INSTANCE_ID,
      self._TABLE_ID,
      ServiceCallMetric.bigtable_error_code_to_grpc_status_string(
          ALREADY_EXISTS),
      2)
  self.verify_write_call_metric(
      self._PROJECT_ID,
      self._INSTANCE_ID,
      self._TABLE_ID,
      ServiceCallMetric.bigtable_error_code_to_grpc_status_string(OK),
      2)
def test_uploader_monitoring_info(self):
  # Clear the process wide metric container.
  MetricsEnvironment.process_wide_container().reset()

  file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
  file_size = 5 * 1024 * 1024 + 100
  random_file = self._insert_random_file(self.client, file_name, file_size)
  f = self.gcs.open(file_name, 'w')

  resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
  labels = {
      monitoring_infos.SERVICE_LABEL: 'Storage',
      monitoring_infos.METHOD_LABEL: 'Objects.insert',
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
      monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
      monitoring_infos.STATUS_LABEL: 'ok'
  }

  f.close()
  metric_name = MetricName(
      None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
  metric_value = MetricsEnvironment.process_wide_container().get_counter(
      metric_name).get_cumulative()

  self.assertEqual(metric_value, 1)
def run_pipeline(self, pipeline):
  """Execute the entire pipeline and return a DirectPipelineResult."""

  # TODO: Move imports to top. Pipeline <-> Runner dependency causes problems
  # with resolving imports when they are at top.
  # pylint: disable=wrong-import-position
  from apache_beam.pipeline import PipelineVisitor
  from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
  from apache_beam.runners.direct.evaluation_context import EvaluationContext
  from apache_beam.runners.direct.executor import Executor
  from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry
  from apache_beam.testing.test_stream import TestStream

  # Performing configured PTransform overrides.
  pipeline.replace_all(_get_transform_overrides(pipeline.options))

  # If the TestStream I/O is used, use a mock test clock.
  class _TestStreamUsageVisitor(PipelineVisitor):
    """Visitor determining whether a Pipeline uses a TestStream."""
    def __init__(self):
      self.uses_test_stream = False

    def visit_transform(self, applied_ptransform):
      if isinstance(applied_ptransform.transform, TestStream):
        self.uses_test_stream = True

  visitor = _TestStreamUsageVisitor()
  pipeline.visit(visitor)
  clock = TestClock() if visitor.uses_test_stream else RealClock()

  MetricsEnvironment.set_metrics_supported(True)
  logging.info('Running pipeline with DirectRunner.')
  self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
  pipeline.visit(self.consumer_tracking_visitor)

  evaluation_context = EvaluationContext(
      pipeline._options,
      BundleFactory(
          stacked=pipeline._options.view_as(
              DirectOptions).direct_runner_use_stacked_bundle),
      self.consumer_tracking_visitor.root_transforms,
      self.consumer_tracking_visitor.value_to_consumers,
      self.consumer_tracking_visitor.step_names,
      self.consumer_tracking_visitor.views,
      clock)

  executor = Executor(
      self.consumer_tracking_visitor.value_to_consumers,
      TransformEvaluatorRegistry(evaluation_context),
      evaluation_context)
  # DirectRunner does not support injecting
  # PipelineOptions values at runtime.
  RuntimeValueProvider.set_runtime_options({})
  # Start the executor. This is a non-blocking call, it will start the
  # execution in background threads and return.
  executor.start(self.consumer_tracking_visitor.root_transforms)
  result = DirectPipelineResult(executor, evaluation_context)

  return result
def test_create_process_wide(self):
  sampler = statesampler.StateSampler('', counters.CounterFactory())
  statesampler.set_current_tracker(sampler)
  state1 = sampler.scoped_state(
      'mystep', 'myState', metrics_container=MetricsContainer('mystep'))
  try:
    sampler.start()
    with state1:
      urn = "my:custom:urn"
      labels = {'key': 'value'}
      counter = InternalMetrics.counter(
          urn=urn, labels=labels, process_wide=True)
      # Test that if process_wide is set, that it will be set
      # on the process_wide container.
      counter.inc(10)
      self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))

      del counter

      metric_name = MetricName(None, None, urn=urn, labels=labels)
      # Expect a value set on the process_wide container.
      self.assertEqual(
          MetricsEnvironment.process_wide_container().get_counter(
              metric_name).get_cumulative(),
          10)
      # Expect no value set on the current container.
      self.assertEqual(
          MetricsEnvironment.current_container().get_counter(
              metric_name).get_cumulative(),
          0)
  finally:
    sampler.stop()
def test_create_counter_distribution(self):
  MetricsEnvironment.set_current_container(MetricsContainer('mystep'))
  counter_ns = 'aCounterNamespace'
  distro_ns = 'aDistributionNamespace'
  gauge_ns = 'aGaugeNamespace'
  name = 'a_name'
  counter = Metrics.counter(counter_ns, name)
  distro = Metrics.distribution(distro_ns, name)
  gauge = Metrics.gauge(gauge_ns, name)
  counter.inc(10)
  counter.dec(3)
  distro.update(10)
  distro.update(2)
  gauge.set(10)
  self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))
  self.assertTrue(isinstance(distro, Metrics.DelegatingDistribution))
  self.assertTrue(isinstance(gauge, Metrics.DelegatingGauge))

  del distro
  del counter
  del gauge

  container = MetricsEnvironment.current_container()
  self.assertEqual(
      container.counters[MetricName(counter_ns, name)].get_cumulative(), 7)
  self.assertEqual(
      container.distributions[MetricName(distro_ns, name)].get_cumulative(),
      DistributionData(12, 2, 2, 10))
  self.assertEqual(
      container.gauges[MetricName(gauge_ns, name)].get_cumulative().value, 10)
def test_metrics_error_call(self):
  if 'DirectRunner' not in self.runner_name:
    raise unittest.SkipTest('This test only runs with DirectRunner.')

  MetricsEnvironment.process_wide_container().reset()
  _prefix = 'test_write_batches'
  mutations = [
      WriteMutation.insert(
          'Albums', ('AlbumId', 'Name'),
          [(_prefix + '3', _prefix + 'inset-3')]),
      WriteMutation.insert(
          'Albums', ('AlbumId', 'Name'),
          [(_prefix + '3', _prefix + 'inset-3')]),
  ]
  with self.assertRaises(Exception):
    p = beam.Pipeline(argv=self.args)
    _ = (
        p
        | beam.Create(mutations)
        | WriteToSpanner(
            project_id=self.project,
            instance_id=self.instance,
            database_id=self.TEST_DATABASE))
    res = p.run()
    res.wait_until_finish()

  self.verify_write_call_metric(
      self.project, self.TEST_DATABASE, 'Albums', '400', 1)
def run_pipeline(self, pipeline, options):
  """Execute the entire pipeline and return a DirectPipelineResult."""

  # TODO: Move imports to top. Pipeline <-> Runner dependency causes problems
  # with resolving imports when they are at top.
  # pylint: disable=wrong-import-position
  from apache_beam.pipeline import PipelineVisitor
  from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
  from apache_beam.runners.direct.evaluation_context import EvaluationContext
  from apache_beam.runners.direct.executor import Executor
  from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry
  from apache_beam.testing.test_stream import TestStream

  # Performing configured PTransform overrides.
  pipeline.replace_all(_get_transform_overrides(options))

  # If the TestStream I/O is used, use a mock test clock.
  class _TestStreamUsageVisitor(PipelineVisitor):
    """Visitor determining whether a Pipeline uses a TestStream."""
    def __init__(self):
      self.uses_test_stream = False

    def visit_transform(self, applied_ptransform):
      if isinstance(applied_ptransform.transform, TestStream):
        self.uses_test_stream = True

  visitor = _TestStreamUsageVisitor()
  pipeline.visit(visitor)
  clock = TestClock() if visitor.uses_test_stream else RealClock()

  # TODO(BEAM-4274): Circular import runners-metrics. Requires refactoring.
  from apache_beam.metrics.execution import MetricsEnvironment
  MetricsEnvironment.set_metrics_supported(True)
  logging.info('Running pipeline with DirectRunner.')
  self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
  pipeline.visit(self.consumer_tracking_visitor)

  evaluation_context = EvaluationContext(
      options,
      BundleFactory(
          stacked=options.view_as(
              DirectOptions).direct_runner_use_stacked_bundle),
      self.consumer_tracking_visitor.root_transforms,
      self.consumer_tracking_visitor.value_to_consumers,
      self.consumer_tracking_visitor.step_names,
      self.consumer_tracking_visitor.views,
      clock)

  executor = Executor(
      self.consumer_tracking_visitor.value_to_consumers,
      TransformEvaluatorRegistry(evaluation_context),
      evaluation_context)
  # DirectRunner does not support injecting
  # PipelineOptions values at runtime.
  RuntimeValueProvider.set_runtime_options({})
  # Start the executor. This is a non-blocking call, it will start the
  # execution in background threads and return.
  executor.start(self.consumer_tracking_visitor.root_transforms)
  result = DirectPipelineResult(executor, evaluation_context)

  return result
def test_write_metrics(self):
  MetricsEnvironment.process_wide_container().reset()
  write_fn = bigtableio._BigTableWriteFn(
      self._PROJECT_ID, self._INSTANCE_ID, self._TABLE_ID)
  write_fn.table = self.table
  write_fn.start_bundle()
  number_of_rows = 2
  error = Status()
  error.message = 'Entity already exists.'
  error.code = ALREADY_EXISTS
  success = Status()
  success.message = 'Success'
  success.code = OK
  rows_response = [error, success] * number_of_rows
  with patch.object(Table, 'mutate_rows', return_value=rows_response):
    direct_rows = [self.generate_row(i) for i in range(number_of_rows * 2)]
    for direct_row in direct_rows:
      write_fn.process(direct_row)
    write_fn.finish_bundle()
  self.verify_write_call_metric(
      self._PROJECT_ID,
      self._INSTANCE_ID,
      self._TABLE_ID,
      ServiceCallMetric.bigtable_error_code_to_grpc_status_string(
          ALREADY_EXISTS),
      2)
  self.verify_write_call_metric(
      self._PROJECT_ID,
      self._INSTANCE_ID,
      self._TABLE_ID,
      ServiceCallMetric.bigtable_error_code_to_grpc_status_string(OK),
      2)
def run(self, pipeline):
  """Execute the entire pipeline and return a DirectPipelineResult."""

  # TODO: Move imports to top. Pipeline <-> Runner dependency causes problems
  # with resolving imports when they are at top.
  # pylint: disable=wrong-import-position
  from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
  from apache_beam.runners.direct.evaluation_context import EvaluationContext
  from apache_beam.runners.direct.executor import Executor
  from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

  MetricsEnvironment.set_metrics_supported(True)
  logging.info('Running pipeline with DirectRunner.')
  self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
  pipeline.visit(group_by_key_input_visitor())
  pipeline.visit(self.consumer_tracking_visitor)

  evaluation_context = EvaluationContext(
      pipeline.options,
      BundleFactory(
          stacked=pipeline.options.view_as(
              DirectOptions).direct_runner_use_stacked_bundle),
      self.consumer_tracking_visitor.root_transforms,
      self.consumer_tracking_visitor.value_to_consumers,
      self.consumer_tracking_visitor.step_names,
      self.consumer_tracking_visitor.views)
  evaluation_context.use_pvalue_cache(self._cache)

  executor = Executor(
      self.consumer_tracking_visitor.value_to_consumers,
      TransformEvaluatorRegistry(evaluation_context),
      evaluation_context)
  # Start the executor. This is a non-blocking call, it will start the
  # execution in background threads and return.
  if pipeline.options:
    RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})
  executor.start(self.consumer_tracking_visitor.root_transforms)
  result = DirectPipelineResult(executor, evaluation_context)

  if self._cache:
    # We are running in eager mode, block until the pipeline execution
    # completes in order to have full results in the cache.
    result.wait_until_finish()
    self._cache.finalize()

    # Unset runtime options after the pipeline finishes.
    # TODO: Move this to a post finish hook and clean for all cases.
    if pipeline.options:
      RuntimeValueProvider.unset_runtime_options(pipeline.options._options_id)

  return result
def run(self, pipeline):
  """Execute the entire pipeline and return a DirectPipelineResult."""

  # TODO: Move imports to top. Pipeline <-> Runner dependency causes problems
  # with resolving imports when they are at top.
  # pylint: disable=wrong-import-position
  from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
  from apache_beam.runners.direct.evaluation_context import EvaluationContext
  from apache_beam.runners.direct.executor import Executor
  from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

  MetricsEnvironment.set_metrics_supported(True)
  logging.info('Running pipeline with DirectRunner.')
  self.visitor = ConsumerTrackingPipelineVisitor()
  pipeline.visit(self.visitor)

  evaluation_context = EvaluationContext(
      pipeline.options,
      BundleFactory(
          stacked=pipeline.options.view_as(
              DirectOptions).direct_runner_use_stacked_bundle),
      self.visitor.root_transforms,
      self.visitor.value_to_consumers,
      self.visitor.step_names,
      self.visitor.views)
  evaluation_context.use_pvalue_cache(self._cache)

  executor = Executor(
      self.visitor.value_to_consumers,
      TransformEvaluatorRegistry(evaluation_context),
      evaluation_context)
  # Start the executor. This is a non-blocking call, it will start the
  # execution in background threads and return.
  if pipeline.options:
    RuntimeValueProvider.set_runtime_options(pipeline.options._options_id, {})
  executor.start(self.visitor.root_transforms)
  result = DirectPipelineResult(executor, evaluation_context)

  if self._cache:
    # We are running in eager mode, block until the pipeline execution
    # completes in order to have full results in the cache.
    result.wait_until_finish()
    self._cache.finalize()

    # Unset runtime options after the pipeline finishes.
    # TODO: Move this to a post finish hook and clean for all cases.
    if pipeline.options:
      RuntimeValueProvider.unset_runtime_options(pipeline.options._options_id)

  return result
def run_pipeline(self, pipeline):
  """Execute the entire pipeline and return a DirectPipelineResult."""

  # Performing configured PTransform overrides.
  pipeline.replace_all(self._ptransform_overrides)

  # TODO: Move imports to top. Pipeline <-> Runner dependency causes problems
  # with resolving imports when they are at top.
  # pylint: disable=wrong-import-position
  from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
  from apache_beam.runners.direct.evaluation_context import EvaluationContext
  from apache_beam.runners.direct.executor import Executor
  from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

  MetricsEnvironment.set_metrics_supported(True)
  logging.info('Running pipeline with DirectRunner.')
  self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
  pipeline.visit(self.consumer_tracking_visitor)

  clock = TestClock() if self._use_test_clock else RealClock()
  evaluation_context = EvaluationContext(
      pipeline._options,
      BundleFactory(
          stacked=pipeline._options.view_as(
              DirectOptions).direct_runner_use_stacked_bundle),
      self.consumer_tracking_visitor.root_transforms,
      self.consumer_tracking_visitor.value_to_consumers,
      self.consumer_tracking_visitor.step_names,
      self.consumer_tracking_visitor.views,
      clock)
  evaluation_context.use_pvalue_cache(self._cache)

  executor = Executor(
      self.consumer_tracking_visitor.value_to_consumers,
      TransformEvaluatorRegistry(evaluation_context),
      evaluation_context)
  # DirectRunner does not support injecting
  # PipelineOptions values at runtime.
  RuntimeValueProvider.set_runtime_options({})
  # Start the executor. This is a non-blocking call, it will start the
  # execution in background threads and return.
  executor.start(self.consumer_tracking_visitor.root_transforms)
  result = DirectPipelineResult(executor, evaluation_context)

  if self._cache:
    # We are running in eager mode, block until the pipeline execution
    # completes in order to have full results in the cache.
    result.wait_until_finish()
    self._cache.finalize()

  return result
def run(self, pipeline):
  MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
  # List of map tasks. Each map task is a list of
  # (stage_name, operation_specs.WorkerOperation) instructions.
  self.map_tasks = []

  # Map of pvalues to
  # (map_task_index, producer_operation_index, producer_output_index)
  self.outputs = {}

  # Unique mappings of PCollections to strings.
  self.side_input_labels = collections.defaultdict(
      lambda: str(len(self.side_input_labels)))

  # Mapping of map task indices to all map tasks that must precede them.
  self.dependencies = collections.defaultdict(set)

  # Visit the graph, building up the map_tasks and their metadata.
  super(MapTaskExecutorRunner, self).run(pipeline)

  # Now run the tasks in topological order.
  def compute_depth_map(deps):
    memoized = {}

    def compute_depth(x):
      if x not in memoized:
        memoized[x] = 1 + max([-1] + [compute_depth(y) for y in deps[x]])
      return memoized[x]

    return {x: compute_depth(x) for x in deps.keys()}

  map_task_depths = compute_depth_map(self.dependencies)
  ordered_map_tasks = sorted(
      (map_task_depths.get(ix, -1), map_task)
      for ix, map_task in enumerate(self.map_tasks))

  profile_options = pipeline.options.view_as(
      pipeline_options.ProfilingOptions)
  if profile_options.profile_cpu:
    with profiler.Profile(
        profile_id='worker-runner',
        profile_location=profile_options.profile_location,
        log_results=True,
        file_copy_fn=_dependency_file_copy):
      self.execute_map_tasks(ordered_map_tasks)
  else:
    self.execute_map_tasks(ordered_map_tasks)

  return WorkerRunnerResult(PipelineState.UNKNOWN)
def run_pipeline(self, pipeline, options):
  MetricsEnvironment.set_metrics_supported(False)
  RuntimeValueProvider.set_runtime_options({})

  # This is sometimes needed if type checking is disabled
  # to enforce that the inputs (and outputs) of GroupByKey operations
  # are known to be KVs.
  from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
  pipeline.visit(DataflowRunner.group_by_key_input_visitor())
  self._bundle_repeat = self._bundle_repeat or options.view_as(
      pipeline_options.DirectOptions).direct_runner_bundle_repeat
  self._profiler_factory = profiler.Profile.factory_from_options(
      options.view_as(pipeline_options.ProfilingOptions))

  return self.run_via_runner_api(
      pipeline.to_runner_api(default_environment=self._default_environment))
def test_downloader_fail_to_get_project_number(self, mock_get):
  # Raising an error when listing GCS Bucket so that project number fails to
  # be retrieved.
  mock_get.side_effect = HttpError({'status': 403}, None, None)
  # Clear the process wide metric container.
  MetricsEnvironment.process_wide_container().reset()

  file_name = 'gs://gcsio-metrics-test/dummy_mode_file'
  file_size = 5 * 1024 * 1024 + 100
  random_file = self._insert_random_file(self.client, file_name, file_size)
  self.gcs.open(file_name, 'r')

  resource = resource_identifiers.GoogleCloudStorageBucket(random_file.bucket)
  labels = {
      monitoring_infos.SERVICE_LABEL: 'Storage',
      monitoring_infos.METHOD_LABEL: 'Objects.get',
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
      monitoring_infos.GCS_PROJECT_ID_LABEL: str(DEFAULT_PROJECT_NUMBER),
      monitoring_infos.STATUS_LABEL: 'ok'
  }
  metric_name = MetricName(
      None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
  metric_value = MetricsEnvironment.process_wide_container().get_counter(
      metric_name).get_cumulative()

  self.assertEqual(metric_value, 0)

  labels_without_project_id = {
      monitoring_infos.SERVICE_LABEL: 'Storage',
      monitoring_infos.METHOD_LABEL: 'Objects.get',
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.GCS_BUCKET_LABEL: random_file.bucket,
      monitoring_infos.STATUS_LABEL: 'ok'
  }
  metric_name = MetricName(
      None,
      None,
      urn=monitoring_infos.API_REQUEST_COUNT_URN,
      labels=labels_without_project_id)
  metric_value = MetricsEnvironment.process_wide_container().get_counter(
      metric_name).get_cumulative()

  self.assertEqual(metric_value, 2)
def verify_read_call_metric(self, project_id, namespace, status, count):
  """Check if a metric was recorded for the Datastore IO read API call."""
  process_wide_monitoring_infos = list(
      MetricsEnvironment.process_wide_container(
      ).to_runner_api_monitoring_infos(None).values())
  resource = resource_identifiers.DatastoreNamespace(project_id, namespace)
  labels = {
      monitoring_infos.SERVICE_LABEL: 'Datastore',
      monitoring_infos.METHOD_LABEL: 'BatchDatastoreRead',
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.DATASTORE_NAMESPACE_LABEL: namespace,
      monitoring_infos.DATASTORE_PROJECT_ID_LABEL: project_id,
      monitoring_infos.STATUS_LABEL: status
  }
  expected_mi = monitoring_infos.int64_counter(
      monitoring_infos.API_REQUEST_COUNT_URN, count, labels=labels)
  expected_mi.ClearField("start_time")

  found = False
  for actual_mi in process_wide_monitoring_infos:
    actual_mi.ClearField("start_time")
    if expected_mi == actual_mi:
      found = True
      break
  self.assertTrue(
      found, "Did not find read call metric with status: %s" % status)
def test_metrics(self):
  sampler = statesampler.StateSampler('', counters.CounterFactory())
  statesampler.set_current_tracker(sampler)
  state1 = sampler.scoped_state(
      'mystep', 'myState', metrics_container=MetricsContainer('mystep'))
  try:
    sampler.start()
    with state1:
      counter = MetricTests.base_metric_group.counter("my_counter")
      meter = MetricTests.base_metric_group.meter("my_meter")
      distribution = MetricTests.base_metric_group.distribution(
          "my_distribution")
      container = MetricsEnvironment.current_container()

      self.assertEqual(0, counter.get_count())
      self.assertEqual(0, meter.get_count())
      self.assertEqual(
          DistributionData(0, 0, 0, 0),
          container.get_distribution(
              MetricName('[]', 'my_distribution')).get_cumulative())

      counter.inc(-2)
      meter.mark_event(3)
      distribution.update(10)
      distribution.update(2)
      self.assertEqual(-2, counter.get_count())
      self.assertEqual(3, meter.get_count())
      self.assertEqual(
          DistributionData(12, 2, 2, 10),
          container.get_distribution(
              MetricName('[]', 'my_distribution')).get_cumulative())
  finally:
    sampler.stop()
def test_write_mutations_metric_on_failure(self):
  MetricsEnvironment.process_wide_container().reset()
  mock_batch = MagicMock()
  mock_batch.commit.side_effect = [
      exceptions.DeadlineExceeded("Deadline Exceeded"), []
  ]
  mock_throttler = MagicMock()
  rpc_stats_callback = MagicMock()
  mock_throttler.throttle_request.return_value = False
  mutate = datastoreio._Mutate.DatastoreMutateFn("my_project")
  mutate._batch = mock_batch
  mutate._batch_elements = []
  mutate._client = MagicMock()
  mutate.write_mutations(mock_throttler, rpc_stats_callback, throttle_delay=0)

  self.verify_write_call_metric("my_project", "", "deadline_exceeded", 1)
  self.verify_write_call_metric("my_project", "", "ok", 1)
def test_create_counter_distribution(self):
  sampler = statesampler.StateSampler('', counters.CounterFactory())
  statesampler.set_current_tracker(sampler)
  state1 = sampler.scoped_state(
      'mystep', 'myState', metrics_container=MetricsContainer('mystep'))
  sampler.start()
  with state1:
    counter_ns = 'aCounterNamespace'
    distro_ns = 'aDistributionNamespace'
    name = 'a_name'
    counter = Metrics.counter(counter_ns, name)
    distro = Metrics.distribution(distro_ns, name)
    counter.inc(10)
    counter.dec(3)
    distro.update(10)
    distro.update(2)
    self.assertTrue(isinstance(counter, Metrics.DelegatingCounter))
    self.assertTrue(isinstance(distro, Metrics.DelegatingDistribution))

    del distro
    del counter

    container = MetricsEnvironment.current_container()
    self.assertEqual(
        container.counters[MetricName(counter_ns, name)].get_cumulative(), 7)
    self.assertEqual(
        container.distributions[MetricName(distro_ns, name)].get_cumulative(),
        DistributionData(12, 2, 2, 10))
  sampler.stop()
def verify_write_call_metric(
    self, project_id, dataset_id, table_id, status, count):
  """Check if a metric was recorded for the BQ IO write API call."""
  process_wide_monitoring_infos = list(
      MetricsEnvironment.process_wide_container(
      ).to_runner_api_monitoring_infos(None).values())
  resource = resource_identifiers.BigQueryTable(
      project_id, dataset_id, table_id)
  labels = {
      # TODO(ajamato): Add Ptransform label.
      monitoring_infos.SERVICE_LABEL: 'BigQuery',
      # Refer to any method which writes elements to BigQuery in batches
      # as "BigQueryBatchWrite". I.e. storage API's insertAll, or future
      # APIs introduced.
      monitoring_infos.METHOD_LABEL: 'BigQueryBatchWrite',
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.BIGQUERY_PROJECT_ID_LABEL: project_id,
      monitoring_infos.BIGQUERY_DATASET_LABEL: dataset_id,
      monitoring_infos.BIGQUERY_TABLE_LABEL: table_id,
      monitoring_infos.STATUS_LABEL: status,
  }
  expected_mi = monitoring_infos.int64_counter(
      monitoring_infos.API_REQUEST_COUNT_URN, count, labels=labels)
  expected_mi.ClearField("start_time")

  found = False
  for actual_mi in process_wide_monitoring_infos:
    actual_mi.ClearField("start_time")
    if expected_mi == actual_mi:
      found = True
      break
  self.assertTrue(
      found, "Did not find write call metric with status: %s" % status)
def test_sql_metrics_ok_call(self):
  if 'DirectRunner' not in self.runner_name:
    raise unittest.SkipTest('This test only runs with DirectRunner.')

  MetricsEnvironment.process_wide_container().reset()
  with beam.Pipeline(argv=self.args) as p:
    r = p | ReadFromSpanner(
        self.project,
        self.instance,
        self.TEST_DATABASE,
        sql="select * from Users",
        query_name='query-1')
    assert_that(r, equal_to(self._data))

  self.verify_sql_read_call_metric(
      self.project, self.TEST_DATABASE, 'query-1', 'ok', 1)
def test_table_metrics_ok_call(self):
  if 'DirectRunner' not in self.runner_name:
    raise unittest.SkipTest('This test only runs with DirectRunner.')

  MetricsEnvironment.process_wide_container().reset()
  with beam.Pipeline(argv=self.args) as p:
    r = p | ReadFromSpanner(
        self.project,
        self.instance,
        self.TEST_DATABASE,
        table="Users",
        columns=["UserId", "Key"])
    assert_that(r, equal_to(self._data))

  self.verify_table_read_call_metric(
      self.project, self.TEST_DATABASE, 'Users', 'ok', 1)
def verify_write_call_metric(
    self, project_id, instance_id, table_id, status, count):
  """Check if a metric was recorded for the Bigtable IO write API call."""
  process_wide_monitoring_infos = list(
      MetricsEnvironment.process_wide_container(
      ).to_runner_api_monitoring_infos(None).values())
  resource = resource_identifiers.BigtableTable(
      project_id, instance_id, table_id)
  labels = {
      monitoring_infos.SERVICE_LABEL: 'BigTable',
      monitoring_infos.METHOD_LABEL: 'google.bigtable.v2.MutateRows',
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.BIGTABLE_PROJECT_ID_LABEL: project_id,
      monitoring_infos.INSTANCE_ID_LABEL: instance_id,
      monitoring_infos.TABLE_ID_LABEL: table_id,
      monitoring_infos.STATUS_LABEL: status
  }
  expected_mi = monitoring_infos.int64_counter(
      monitoring_infos.API_REQUEST_COUNT_URN, count, labels=labels)
  expected_mi.ClearField("start_time")

  found = False
  for actual_mi in process_wide_monitoring_infos:
    actual_mi.ClearField("start_time")
    if expected_mi == actual_mi:
      found = True
      break
  self.assertTrue(
      found, "Did not find write call metric with status: %s" % status)
def run_pipeline(self, pipeline):
  MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
  # List of map tasks. Each map task is a list of
  # (stage_name, operation_specs.WorkerOperation) instructions.
  self.map_tasks = []

  # Map of pvalues to
  # (map_task_index, producer_operation_index, producer_output_index)
  self.outputs = {}

  # Unique mappings of PCollections to strings.
  self.side_input_labels = collections.defaultdict(
      lambda: str(len(self.side_input_labels)))

  # Mapping of map task indices to all map tasks that must precede them.
  self.dependencies = collections.defaultdict(set)

  # Visit the graph, building up the map_tasks and their metadata.
  super(MapTaskExecutorRunner, self).run_pipeline(pipeline)

  # Now run the tasks in topological order.
  def compute_depth_map(deps):
    memoized = {}

    def compute_depth(x):
      if x not in memoized:
        memoized[x] = 1 + max([-1] + [compute_depth(y) for y in deps[x]])
      return memoized[x]

    return {x: compute_depth(x) for x in deps.keys()}

  map_task_depths = compute_depth_map(self.dependencies)
  ordered_map_tasks = sorted(
      (map_task_depths.get(ix, -1), map_task)
      for ix, map_task in enumerate(self.map_tasks))

  profile_options = pipeline.options.view_as(
      pipeline_options.ProfilingOptions)
  if profile_options.profile_cpu:
    with profiler.Profile(
        profile_id='worker-runner',
        profile_location=profile_options.profile_location,
        log_results=True,
        file_copy_fn=_dependency_file_copy):
      self.execute_map_tasks(ordered_map_tasks)
  else:
    self.execute_map_tasks(ordered_map_tasks)

  return WorkerRunnerResult(PipelineState.UNKNOWN)
def get_count(self) -> int:
  """
  Get number of events marked on the meter.

  .. versionadded:: 1.11.0
  """
  from apache_beam.metrics.execution import MetricsEnvironment
  container = MetricsEnvironment.current_container()
  return container.get_counter(
      self._inner_counter.metric_name).get_cumulative()
def test_harness_monitoring_infos_and_metadata(self):
  # Clear the process wide metric container.
  MetricsEnvironment.process_wide_container().reset()
  # Create a process_wide metric.
  urn = 'my.custom.urn'
  labels = {'key': 'value'}
  InternalMetrics.counter(urn=urn, labels=labels, process_wide=True).inc(10)

  harness_monitoring_infos_request = beam_fn_api_pb2.InstructionRequest(
      instruction_id="monitoring_infos",
      harness_monitoring_infos=beam_fn_api_pb2.
      HarnessMonitoringInfosRequest())

  responses = self.get_responses([harness_monitoring_infos_request])
  expected_monitoring_info = monitoring_infos.int64_counter(
      urn, 10, labels=labels)
  monitoring_data = (
      responses['monitoring_infos'].harness_monitoring_infos.monitoring_data)

  # Request the full MonitoringInfo metadata for the returned short_ids.
  short_ids = list(monitoring_data.keys())
  monitoring_infos_metadata_request = beam_fn_api_pb2.InstructionRequest(
      instruction_id="monitoring_infos_metadata",
      monitoring_infos=beam_fn_api_pb2.MonitoringInfosMetadataRequest(
          monitoring_info_id=short_ids))

  responses = self.get_responses([monitoring_infos_metadata_request])

  # Request the full MonitoringInfo metadata to be returned now.
  expected_monitoring_info.ClearField("payload")

  # Verify that one of the returned monitoring infos is our expected
  # monitoring info.
  short_id_to_mi = (
      responses['monitoring_infos_metadata'].monitoring_infos.monitoring_info)
  found = False
  for mi in short_id_to_mi.values():
    # Clear the timestamp before comparing.
    mi.ClearField("start_time")
    if mi == expected_monitoring_info:
      found = True
  self.assertTrue(found, str(responses['monitoring_infos_metadata']))
def get_count(self):
  """
  Returns the current count.

  .. versionadded:: 1.11.0
  """
  from apache_beam.metrics.execution import MetricsEnvironment
  container = MetricsEnvironment.current_container()
  return container.get_counter(
      self._inner_counter.metric_name).get_cumulative()
def test_sql_metrics_error_call(self):
  if 'DirectRunner' not in self.runner_name:
    raise unittest.SkipTest('This test only runs with DirectRunner.')

  MetricsEnvironment.process_wide_container().reset()
  with self.assertRaises(Exception):
    p = beam.Pipeline(argv=self.args)
    _ = p | ReadFromSpanner(
        self.project,
        self.instance,
        self.TEST_DATABASE,
        sql="select * from NonExistent",
        query_name='query-2')
    res = p.run()
    res.wait_until_finish()

  self.verify_sql_read_call_metric(
      self.project, self.TEST_DATABASE, 'query-2', '400', 1)
def test_insert_rows_sets_metric_on_failure(self):
  MetricsEnvironment.process_wide_container().reset()
  client = mock.Mock()
  client.insert_rows_json = mock.Mock(
      # Fail a few times, then succeed.
      side_effect=[
          DeadlineExceeded("Deadline Exceeded"),
          InternalServerError("Internal Error"),
          [],
      ])
  wrapper = beam.io.gcp.bigquery_tools.BigQueryWrapper(client)
  wrapper.insert_rows("my_project", "my_dataset", "my_table", [])

  # Expect two failing calls, then a success (i.e. two retries).
  self.verify_write_call_metric(
      "my_project", "my_dataset", "my_table", "deadline_exceeded", 1)
  self.verify_write_call_metric(
      "my_project", "my_dataset", "my_table", "internal", 1)
  self.verify_write_call_metric(
      "my_project", "my_dataset", "my_table", "ok", 1)
def test_QueryFn_metric_on_failure(self):
  MetricsEnvironment.process_wide_container().reset()
  with patch.object(helper, 'get_client', return_value=self._mock_client):
    self._mock_query.project = self._PROJECT
    self._mock_query.namespace = self._NAMESPACE
    _query_fn = ReadFromDatastore._QueryFn()
    client_query = self._mock_query._to_client_query()
    # Test with exception.
    client_query.fetch.side_effect = [
        exceptions.DeadlineExceeded("Deadline exceed")
    ]
    list(_query_fn.process(self._mock_query))
    self.verify_read_call_metric(
        self._PROJECT, self._NAMESPACE, "deadline_exceeded", 1)
    # Test success.
    client_query.fetch.side_effect = [[]]
    list(_query_fn.process(self._mock_query))
    self.verify_read_call_metric(self._PROJECT, self._NAMESPACE, "ok", 1)
def test_table_metrics_error_call(self):
  if 'DirectRunner' not in self.runner_name:
    raise unittest.SkipTest('This test only runs with DirectRunner.')

  MetricsEnvironment.process_wide_container().reset()
  with self.assertRaises(Exception):
    p = beam.Pipeline(argv=self.args)
    _ = p | ReadFromSpanner(
        self.project,
        self.instance,
        self.TEST_DATABASE,
        table="INVALID_TABLE",
        columns=["UserId", "Key"])
    res = p.run()
    res.wait_until_finish()

  self.verify_table_read_call_metric(
      self.project, self.TEST_DATABASE, 'INVALID_TABLE', '404', 1)
def test_scoped_container(self):
  c1 = MetricsContainer('mystep')
  c2 = MetricsContainer('myinternalstep')
  with ScopedMetricsContainer(c1):
    self.assertEqual(c1, MetricsEnvironment.current_container())
    counter = Metrics.counter('ns', 'name')
    counter.inc(2)

    with ScopedMetricsContainer(c2):
      self.assertEqual(c2, MetricsEnvironment.current_container())
      counter = Metrics.counter('ns', 'name')
      counter.inc(3)
      self.assertEqual(
          list(c2.get_cumulative().counters.items()),
          [(MetricKey('myinternalstep', MetricName('ns', 'name')), 3)])

    self.assertEqual(c1, MetricsEnvironment.current_container())
    counter = Metrics.counter('ns', 'name')
    counter.inc(4)
    self.assertEqual(
        list(c1.get_cumulative().counters.items()),
        [(MetricKey('mystep', MetricName('ns', 'name')), 6)])
def test_scoped_container(self):
  c1 = MetricsContainer('mystep')
  c2 = MetricsContainer('myinternalstep')
  with ScopedMetricsContainer(c1):
    self.assertEqual(c1, MetricsEnvironment.current_container())
    counter = Metrics.counter('ns', 'name')
    counter.inc(2)

    with ScopedMetricsContainer(c2):
      self.assertEqual(c2, MetricsEnvironment.current_container())
      counter = Metrics.counter('ns', 'name')
      counter.inc(3)
      self.assertEqual(
          c2.get_cumulative().counters.items(),
          [(MetricKey('myinternalstep', MetricName('ns', 'name')), 3)])

    self.assertEqual(c1, MetricsEnvironment.current_container())
    counter = Metrics.counter('ns', 'name')
    counter.inc(4)
    self.assertEqual(
        c1.get_cumulative().counters.items(),
        [(MetricKey('mystep', MetricName('ns', 'name')), 6)])
def _request_harness_monitoring_infos(self, request):
  # type: (beam_fn_api_pb2.InstructionRequest) -> None
  process_wide_monitoring_infos = MetricsEnvironment.process_wide_container(
  ).to_runner_api_monitoring_infos(None).values()
  self._execute(
      lambda: beam_fn_api_pb2.InstructionResponse(
          instruction_id=request.instruction_id,
          harness_monitoring_infos=(
              beam_fn_api_pb2.HarnessMonitoringInfosResponse(
                  monitoring_data={
                      SHORT_ID_CACHE.get_short_id(info): info.payload
                      for info in process_wide_monitoring_infos
                  }))),
      request)
def verify_write_call_metric(self, project, database, table, status, count):
  resource = resource_identifiers.SpannerTable(project, database, table)
  labels = {
      monitoring_infos.SERVICE_LABEL: 'Spanner',
      monitoring_infos.METHOD_LABEL: 'Write',
      monitoring_infos.SPANNER_PROJECT_ID: project,
      monitoring_infos.SPANNER_DATABASE_ID: database,
      monitoring_infos.RESOURCE_LABEL: resource,
      monitoring_infos.SPANNER_TABLE_ID: table,
      monitoring_infos.STATUS_LABEL: status
  }
  metric_name = MetricName(
      None, None, urn=monitoring_infos.API_REQUEST_COUNT_URN, labels=labels)
  metric_value = MetricsEnvironment.process_wide_container().get_counter(
      metric_name).get_cumulative()
  self.assertEqual(metric_value, count)
def test_uses_right_container(self):
  c1 = MetricsContainer('step1')
  c2 = MetricsContainer('step2')
  counter = Metrics.counter('ns', 'name')
  MetricsEnvironment.set_current_container(c1)
  counter.inc()
  MetricsEnvironment.set_current_container(c2)
  counter.inc(3)
  MetricsEnvironment.unset_current_container()

  self.assertEqual(
      c1.get_cumulative().counters.items(),
      [(MetricKey('step1', MetricName('ns', 'name')), 1)])
  self.assertEqual(
      c2.get_cumulative().counters.items(),
      [(MetricKey('step2', MetricName('ns', 'name')), 3)])
def inc(self, n=1):
  container = MetricsEnvironment.current_container()
  if container is not None:
    container.get_counter(self.metric_name).inc(n)
def test_no_container(self):
  self.assertEqual(MetricsEnvironment.current_container(), None)
def run_pipeline(self, pipeline):
  MetricsEnvironment.set_metrics_supported(False)
  return self.run_via_runner_api(pipeline.to_runner_api())
def run(self, pipeline):
  MetricsEnvironment.set_metrics_supported(self.has_metrics_support())
  if pipeline._verify_runner_api_compatible():
    return self.run_via_runner_api(pipeline.to_runner_api())
  else:
    return super(FnApiRunner, self).run(pipeline)
def set(self, value):
  container = MetricsEnvironment.current_container()
  if container is not None:
    container.get_gauge(self.metric_name).set(value)
def update(self, value):
  container = MetricsEnvironment.current_container()
  if container is not None:
    container.get_distribution(self.metric_name).update(value)