def test_query_structured_metrics(self):
    """Query structured metrics and check the counter and distribution."""
    client, job_result = self.setup_mock_client_result(
        self.STRUCTURED_COUNTER_LIST)
    dm = dataflow_metrics.DataflowMetrics(client, job_result)
    # Replace step-name translation so every step is reported as 'split'.
    dm._translate_step_name = types.MethodType(lambda self, x: 'split', dm)

    actual = dm.query()

    namespace = '__main__.WordExtractingDoFn'
    counter_key = MetricKey('split', MetricName(namespace, 'word_lengths'))
    self.assertEqual(
        actual['counters'],
        [MetricResult(counter_key, 109475, 109475)])

    distribution_key = MetricKey(
        'split', MetricName(namespace, 'word_length_dist'))
    self.assertEqual(
        actual['distributions'],
        [
            MetricResult(
                distribution_key,
                DistributionResult(DistributionData(18, 2, 2, 16)),
                DistributionResult(DistributionData(18, 2, 2, 16))),
        ])
def test_metric_filter_step_matching(self):
    """Check which step filters match which metric-key step names."""
    metric_name = MetricName('ns1', 'name1')

    # One filter, several keys.
    step_filter = MetricsFilter().with_step('Step1')
    key_cases = (
        ('Step1', True),
        ('Step10', False),
        ('Step10/Step1', True),
    )
    for key_step, should_match in key_cases:
        check = self.assertTrue if should_match else self.assertFalse
        check(
            MetricResults.matches(
                step_filter, MetricKey(key_step, metric_name)))

    # One nested key, several filters.
    nested_key = MetricKey('Top1/Outer1/Inner1', metric_name)
    filter_cases = (
        ('Top1/Outer1/Inner1', True),
        ('Top1/Outer1', True),
        ('Outer1/Inner1', True),
        ('Top1/Inner1', False),
    )
    for filter_step, should_match in filter_cases:
        check = self.assertTrue if should_match else self.assertFalse
        check(
            MetricResults.matches(
                MetricsFilter().with_step(filter_step), nested_key))
def test_apply_physical_no_filter(self):
    """Physical updates add up as attempted values; an empty commit keeps them."""
    metrics = DirectMetrics()

    def expected_counters():
        return [
            MetricResult(MetricKey('step1', self.name1), 0, 5),
            MetricResult(MetricKey('step1', self.name3), 0, 12),
            MetricResult(MetricKey('step2', self.name1), 0, 7),
        ]

    def check_counters():
        results = metrics.query()
        hc.assert_that(
            results['counters'],
            hc.contains_inanyorder(*expected_counters()))

    counter_batches = (
        {
            MetricKey('step1', self.name1): 5,
            MetricKey('step1', self.name3): 8,
        },
        {
            MetricKey('step2', self.name1): 7,
            MetricKey('step1', self.name3): 4,
        },
    )
    for batch in counter_batches:
        metrics.update_physical(object(), MetricUpdates(counters=batch))
    check_counters()

    # Committing an empty update must leave the query result unchanged.
    metrics.commit_physical(object(), MetricUpdates())
    check_counters()
def test_direct_runner_metrics(self):
    """Run a ParDo on the DirectRunner and check its counters, distribution and gauge."""

    class MyDoFn(beam.DoFn):
        def start_bundle(self):
            Metrics.counter(self.__class__, 'bundles').inc()

        def finish_bundle(self):
            Metrics.counter(self.__class__, 'finished_bundles').inc()

        def process(self, element):
            Metrics.gauge(self.__class__, 'latest_element').set(element)
            Metrics.counter(self.__class__, 'elements').inc()
            Metrics.distribution(
                self.__class__, 'element_dist').update(element)
            return [element]

    p = Pipeline(DirectRunner())
    pcoll = (
        p
        | beam.Create([1, 2, 3, 4, 5])
        | 'Do' >> beam.ParDo(MyDoFn()))
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()

    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    def do_key(metric_name):
        return MetricKey('Do', MetricName(namespace, metric_name))

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(do_key('elements'), 5, 5),
            MetricResult(do_key('bundles'), 1, 1),
            MetricResult(do_key('finished_bundles'), 1, 1)))

    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                do_key('element_dist'),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))

    # Only the first gauge entry is inspected.
    gauge_result = metrics['gauges'][0]
    hc.assert_that(gauge_result.key, hc.equal_to(do_key('latest_element')))
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
def test_metric_filter_name_matching(self):
    """A name filter matches with and without a namespace restriction."""
    filters = (
        MetricsFilter().with_name('name1').with_namespace('ns1'),
        MetricsFilter().with_name('name1'),
    )
    for metrics_filter in filters:
        key = MetricKey('step1', MetricName('ns1', 'name1'))
        self.assertTrue(MetricResults.matches(metrics_filter, key))
def test_basic_metric_name(self):
    """MetricName and MetricKey expose their fields and compare by value."""
    metric_name = MetricName('namespace1', 'name1')
    self.assertEqual(metric_name.namespace, 'namespace1')
    self.assertEqual(metric_name.name, 'name1')
    self.assertEqual(metric_name, MetricName('namespace1', 'name1'))

    metric_key = MetricKey('step1', metric_name)
    self.assertEqual(metric_key.step, 'step1')
    self.assertEqual(metric_key.metric.namespace, 'namespace1')
    self.assertEqual(metric_key.metric.name, 'name1')
    equivalent_key = MetricKey('step1', MetricName('namespace1', 'name1'))
    self.assertEqual(metric_key, equivalent_key)
def test_equality_for_key_with_labels(self):
    """Keys with equal labels compare equal and hash equal."""
    # NOTE(review): the labels here are set literals, not dicts - confirm
    # this is the intended shape for MetricKey labels.
    shared_labels = {'label1', 'value1'}

    def make_key(labels):
        return MetricKey('step', MetricName('namespace', 'name'), labels=labels)

    test_object = make_key(shared_labels)
    equal_keys = (
        make_key({'label1', 'value1'}),  # equal labels, distinct object
        make_key(shared_labels),  # the very same labels object
    )

    for other in equal_keys:
        self.assertEqual(test_object, other)
    for other in equal_keys:
        self.assertEqual(hash(test_object), hash(other))
def query(self, filter=None):
    """Return committed/attempted results for counters and distributions.

    Only entries whose key satisfies ``self.matches(filter, key)`` are kept.
    The result is a dict with the keys 'counters' and 'distributions'.
    """

    def collect(cells):
        return [
            MetricResult(
                MetricKey(key.step, key.metric),
                cell.extract_committed(),
                cell.extract_latest_attempted())
            for key, cell in cells.items()
            if self.matches(filter, key)
        ]

    counter_results = collect(self._counters)
    distribution_results = collect(self._distributions)
    return {
        'counters': counter_results,
        'distributions': distribution_results,
    }
def test_direct_runner_metrics(self):
    """Run a ParDo on the DirectRunner and check its counters and distribution."""
    from apache_beam.metrics.metric import Metrics

    class MyDoFn(beam.DoFn):
        def start_bundle(self):
            Metrics.counter(self.__class__, 'bundles').inc()

        def finish_bundle(self):
            Metrics.counter(self.__class__, 'finished_bundles').inc()

        def process(self, element):
            Metrics.counter(self.__class__, 'elements').inc()
            Metrics.distribution(
                self.__class__, 'element_dist').update(element)
            return [element]

    runner = DirectRunner()
    options = PipelineOptions(self.default_properties)
    p = Pipeline(runner, options=options)
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5]) | 'Do' >> beam.ParDo(MyDoFn()))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()

    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    def do_key(metric_name):
        return MetricKey('Do', MetricName(namespace, metric_name))

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(do_key('elements'), 5, 5),
            MetricResult(do_key('bundles'), 1, 1),
            MetricResult(do_key('finished_bundles'), 1, 1)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                do_key('element_dist'),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))
def test_equality_for_key_with_no_labels(self):
    """Label-less keys are equal only if step, namespace and name all agree."""
    test_object = MetricKey('step', MetricName('namespace', 'name'))
    same = MetricKey('step', MetricName('namespace', 'name'))
    self.assertEqual(test_object, same)
    self.assertEqual(hash(test_object), hash(same))

    different_keys = (
        MetricKey('step_diff', MetricName('namespace', 'name')),
        MetricKey('step', MetricName('namespace_diff', 'name')),
        MetricKey('step', MetricName('namespace', 'name_diff')),
    )
    for other in different_keys:
        self.assertNotEqual(test_object, other)
    for other in different_keys:
        self.assertNotEqual(hash(test_object), hash(other))
def test_inequality_for_key_with_labels(self):
    """Keys whose labels differ (or are absent) compare and hash differently."""
    # NOTE(review): the labels here are set literals, not dicts - confirm
    # this is the intended shape for MetricKey labels.
    test_labels = {'label1', 'value1'}
    test_object = MetricKey(
        'step', MetricName('namespace', 'name'), labels=test_labels)

    different_keys = (
        # No labels at all.
        MetricKey('step', MetricName('namespace', 'name')),
        # First label element changed.
        MetricKey(
            'step',
            MetricName('namespace', 'name'),
            labels={'l1_diff', 'value1'}),
        # Second label element changed.
        MetricKey(
            'step',
            MetricName('namespace', 'name'),
            labels={'label1', 'v1_diff'}),
    )
    for other in different_keys:
        self.assertNotEqual(test_object, other)
    for other in different_keys:
        self.assertNotEqual(hash(test_object), hash(other))
def test_uses_right_container(self):
    """A counter reports into whichever container is current when it is incremented."""
    first = MetricsContainer('step1')
    second = MetricsContainer('step2')
    counter = Metrics.counter('ns', 'name')

    MetricsEnvironment.set_current_container(first)
    counter.inc()
    MetricsEnvironment.set_current_container(second)
    counter.inc(3)
    MetricsEnvironment.unset_current_container()

    expectations = ((first, 'step1', 1), (second, 'step2', 3))
    for container, step, total in expectations:
        recorded = list(container.get_cumulative().counters.items())
        self.assertEqual(
            recorded, [(MetricKey(step, MetricName('ns', 'name')), total)])
def _create_metric_key(monitoring_info):
    """Build a MetricKey from a MonitoringInfo.

    Raises:
      ValueError: if no step name can be obtained from ``monitoring_info``.
    """
    step_name = monitoring_infos.get_step_name(monitoring_info)
    if step_name:
        namespace, name = monitoring_infos.parse_namespace_and_name(
            monitoring_info)
        return MetricKey(step_name, MetricName(namespace, name))
    raise ValueError(
        'Failed to deduce step_name from MonitoringInfo: {}'.format(
            monitoring_info))
def _get_metric_key(self, metric):
    """Build the MetricKey (step, namespace, name, labels) for a queried metric.

    A step that cannot be found is left as the empty string, and a missing
    namespace falls back to "dataflow/v1b3".
    """
    name = metric.name.name  # The name is always taken as-is.
    properties = metric.name.context.additionalProperties

    step = ""
    try:
        # A ValueError here means either the step name could not be
        # translated (badly formed job graph, or a name that is not the
        # internal step name), or there is no step to extract, which should
        # only happen for unstructured names.
        step = _get_match(properties, lambda x: x.key == STEP_LABEL).value
        step = self._translate_step_name(step)
    except ValueError:
        pass

    try:
        namespace = _get_match(
            properties, lambda x: x.key == 'namespace').value
    except ValueError:
        # No namespace property: use the default.
        namespace = "dataflow/v1b3"

    # Copy the structured-name properties through untouched as labels; this
    # includes the unmodified step name as received.
    labels = {
        kv.key: kv.value
        for kv in properties
        if kv.key in STRUCTURED_NAME_LABELS
    }
    return MetricKey(step, MetricName(namespace, name), labels=labels)
def _get_metric_key(self, metric):
    """Build the MetricKey for a queried metric.

    Structured metrics carry exactly one 'step' and one 'namespace'
    property; otherwise the key is parsed from the metric name itself.
    """
    try:
        # A ValueError in this block means either the step name could not
        # be translated (badly formed job graph, or a name that is not the
        # internal step name), or there was not exactly one 'step' /
        # 'namespace' property, which should only happen for unstructured
        # names.
        properties = metric.name.context.additionalProperties
        (step,) = [p.value for p in properties if p.key == 'step']
        step = self._translate_step_name(step)
        (namespace,) = [p.value for p in properties if p.key == 'namespace']
        name = metric.name.name
    except ValueError:
        # Unstructured names look like "step/namespace/name", and the step
        # itself may contain slashes, so split on the last two slashes only.
        step, namespace, name = metric.name.name.rsplit('/', 2)
    return MetricKey(step, MetricName(namespace, name))
def assert_counter_exists(metrics, namespace, name, step):
    """Assert that exactly one counter in ``metrics`` has the given key."""
    metric_key = MetricKey(step, MetricName(namespace, name))
    found = sum(1 for m in metrics['counters'] if m.key == metric_key)
    self.assertEqual(
        1, found, "Did not find exactly 1 metric for %s." % metric_key)
def test_query_counters(self):
    """Query the default mock result and compare the counters, sorted by name."""
    client, job_result = self.setup_mock_client_result()
    dm = dataflow_metrics.DataflowMetrics(client, job_result)
    actual = dm.query()

    namespace = '__main__.WordExtractingDoFn'
    expected_counters = [
        MetricResult(
            MetricKey('split', MetricName(namespace, 'empty_lines')),
            1080,
            1080),
        MetricResult(
            MetricKey('longstepname/split', MetricName(namespace, 'words')),
            26181,
            26185),
    ]

    def by_metric_name(result):
        return result.key.metric.name

    self.assertEqual(
        sorted(actual['counters'], key=by_metric_name),
        sorted(expected_counters, key=by_metric_name))
def test_pardo_metrics(self):
    """Two chained ParDos each report their own counter update."""

    class MyDoFn(beam.DoFn):
        def start_bundle(self):
            self.count = Metrics.counter(self.__class__, 'elements')

        def process(self, element):
            self.count.inc(element)
            return [element]

    class MyOtherDoFn(beam.DoFn):
        def start_bundle(self):
            self.count = Metrics.counter(self.__class__, 'elementsplusone')

        def process(self, element):
            self.count.inc(element + 1)
            return [element]

    # NOTE(review): the whole remainder is kept inside the ``with`` block so
    # that ``assert_that`` is attached before the pipeline context exits -
    # confirm against the original layout.
    with self.create_pipeline() as p:
        res = (
            p
            | beam.Create([1, 2, 3])
            | 'mydofn' >> beam.ParDo(MyDoFn())
            | 'myotherdofn' >> beam.ParDo(MyOtherDoFn()))
        p.run()
        if not MetricsEnvironment.METRICS_SUPPORTED:
            self.skipTest('Metrics are not supported.')

        updates = [
            (key, val)
            for container in p.runner.metrics_containers()
            for key, val in container.get_updates().counters.items()
        ]
        counter_values = [val for _, val in updates]
        counter_keys = [key for key, _ in updates]

        assert_that(res, equal_to([1, 2, 3]))
        self.assertEqual(counter_values, [6, 9])
        self.assertEqual(
            counter_keys,
            [
                MetricKey(
                    'mydofn',
                    MetricName(__name__ + '.MyDoFn', 'elements')),
                MetricKey(
                    'myotherdofn',
                    MetricName(__name__ + '.MyOtherDoFn', 'elementsplusone')),
            ])
def query(self, filter=None):
    """Return committed/attempted results for counters, distributions and gauges.

    Only entries whose key satisfies ``self.matches(filter, key)`` are kept.
    The result dict is keyed by ``self.COUNTERS``, ``self.DISTRIBUTIONS``
    and ``self.GAUGES``.
    """

    def collect(cells):
        return [
            MetricResult(
                MetricKey(key.step, key.metric),
                cell.extract_committed(),
                cell.extract_latest_attempted())
            for key, cell in cells.items()
            if self.matches(filter, key)
        ]

    counter_results = collect(self._counters)
    distribution_results = collect(self._distributions)
    gauge_results = collect(self._gauges)
    return {
        self.COUNTERS: counter_results,
        self.DISTRIBUTIONS: distribution_results,
        self.GAUGES: gauge_results,
    }
def test_query_counters(self):
    """Query a counters-only mock result with every step translated to 'split'."""
    client, job_result = self.setup_mock_client_result(self.ONLY_COUNTERS_LIST)
    dm = dataflow_metrics.DataflowMetrics(client, job_result)
    # Replace step-name translation so every step is reported as 'split'.
    dm._translate_step_name = types.MethodType(lambda self, x: 'split', dm)
    actual = dm.query()

    namespace = '__main__.WordExtractingDoFn'
    expected_counters = [
        MetricResult(
            MetricKey('split', MetricName(namespace, 'empty_lines')),
            1080,
            1080),
        MetricResult(
            MetricKey('split', MetricName(namespace, 'words')),
            26181,
            26185),
    ]

    def by_metric_name(result):
        return result.key.metric.name

    self.assertEqual(
        sorted(actual['counters'], key=by_metric_name),
        sorted(expected_counters, key=by_metric_name))
def test_commit_logical_no_filter(self):
    """Logical commits add up as committed values; attempted values stay empty."""
    metrics = DirectMetrics()

    first_update = MetricUpdates(
        counters={
            MetricKey('step1', self.name1): 5,
            MetricKey('step1', self.name2): 8,
        },
        distributions={
            MetricKey('step1', self.name1): DistributionData(8, 2, 3, 5),
        })
    metrics.commit_logical(self.bundle1, first_update)

    second_update = MetricUpdates(
        counters={
            MetricKey('step2', self.name1): 7,
            MetricKey('step1', self.name2): 4,
        },
        distributions={
            MetricKey('step1', self.name1): DistributionData(4, 1, 4, 4),
        })
    metrics.commit_logical(self.bundle1, second_update)

    results = metrics.query()

    expected_counters = [
        MetricResult(MetricKey('step1', self.name2), 12, 0),
        MetricResult(MetricKey('step2', self.name1), 7, 0),
        MetricResult(MetricKey('step1', self.name1), 5, 0),
    ]
    hc.assert_that(
        results['counters'], hc.contains_inanyorder(*expected_counters))

    expected_distribution = MetricResult(
        MetricKey('step1', self.name1),
        DistributionResult(DistributionData(12, 3, 3, 5)),
        DistributionResult(DistributionData(0, 0, None, None)))
    hc.assert_that(
        results['distributions'],
        hc.contains_inanyorder(expected_distribution))
def test_scoped_container(self):
    """Nested scoped containers each collect their own increments."""
    outer = MetricsContainer('mystep')
    inner = MetricsContainer('myinternalstep')

    # NOTE(review): nesting below is inferred from the expected totals
    # (outer ends at 2 + 4 = 6, inner at 3) - confirm against the original.
    with ScopedMetricsContainer(outer):
        self.assertEqual(outer, MetricsEnvironment.current_container())
        Metrics.counter('ns', 'name').inc(2)

        with ScopedMetricsContainer(inner):
            self.assertEqual(inner, MetricsEnvironment.current_container())
            Metrics.counter('ns', 'name').inc(3)
            inner_counters = list(inner.get_cumulative().counters.items())
            self.assertEqual(
                inner_counters,
                [(MetricKey('myinternalstep', MetricName('ns', 'name')), 3)])

        # Leaving the inner scope makes the outer container current again.
        self.assertEqual(outer, MetricsEnvironment.current_container())
        Metrics.counter('ns', 'name').inc(4)
        outer_counters = list(outer.get_cumulative().counters.items())
        self.assertEqual(
            outer_counters,
            [(MetricKey('mystep', MetricName('ns', 'name')), 6)])
def _get_metric_key(self, metric):
    """Build the MetricKey for a queried metric.

    Returns:
      The MetricKey, or None when extracting or translating the step or
      namespace raises ValueError.
    """
    try:
        # A ValueError in this block means either the step name could not
        # be translated (badly formed job graph, or a name that is not the
        # internal step name), or the step / namespace could not be
        # extracted, which should only happen for unstructured names.
        properties = metric.name.context.additionalProperties
        step = self._translate_step_name(
            _get_match(properties, lambda x: x.key == 'step').value)
        namespace = _get_match(
            properties, lambda x: x.key == 'namespace').value
        name = metric.name.name
    except ValueError:
        return None
    else:
        return MetricKey(step, MetricName(namespace, name))
def _create_metric_result(data_dict):
    """Build a MetricResult from a plain dict description.

    Reads 'namespace' and 'name' (required), 'step' (default '') and
    'labels' (default empty dict). 'attempted' and 'committed' are optional;
    each may hold either a 'counter' value or a 'distribution' dict with
    'sum', 'count', 'min' and 'max'. Anything absent is passed on as None.
    """

    def read_value(kind):
        if kind not in data_dict:
            return None
        payload = data_dict[kind]
        if 'counter' in payload:
            return payload['counter']
        if 'distribution' in payload:
            dist = payload['distribution']
            return DistributionResult(
                DistributionData(
                    dist['sum'], dist['count'], dist['min'], dist['max']))
        return None

    step = data_dict.get('step', '')
    labels = data_dict.get('labels', dict())
    attempted = read_value('attempted')
    committed = read_value('committed')

    metric_key = MetricKey(
        step, MetricName(data_dict['namespace'], data_dict['name']), labels)
    return MetricResult(metric_key, committed, attempted)
def _populate_metric_results(self, response): """Take a list of metrics, and convert it to a list of MetricResult.""" user_metrics = [ metric for metric in response.metrics if metric.name.origin == 'user' ] # Get the tentative/committed versions of every metric together. metrics_by_name = defaultdict(lambda: {}) for metric in user_metrics: tentative = [ prop for prop in metric.name.context.additionalProperties if prop.key == 'tentative' and prop.value == 'true' ] key = 'tentative' if tentative else 'committed' metrics_by_name[metric.name.name][key] = metric # Now we create the MetricResult elements. result = [] for name, metric in metrics_by_name.iteritems(): if (name.endswith('(DIST)') or name.endswith('[MIN]') or name.endswith('[MAX]') or name.endswith('[MEAN]') or name.endswith('[COUNT]')): warn( 'Distribution metrics will be ignored in the MetricsResult.query' 'method. You can see them in the Dataflow User Interface.') # Distributions are not yet fully supported in this runner continue [step, namespace, name] = name.split('/') key = MetricKey(step, MetricName(namespace, name)) attempted = metric['tentative'].scalar.integer_value committed = metric['committed'].scalar.integer_value result.append( MetricResult(key, attempted=attempted, committed=committed)) return result
def _to_metric_key(self, monitoring_info):
    """Build a MetricKey from the 'PTRANSFORM' label and the parsed namespace/name."""
    # For now every metric is assumed to carry a PTRANSFORM label.
    ptransform_id = monitoring_info.labels['PTRANSFORM']
    namespace, name = monitoring_infos.parse_namespace_and_name(monitoring_info)
    metric_name = metrics.metricbase.MetricName(namespace, name)
    return MetricKey(ptransform_id, metric_name)
def test_apply_physical_logical(self):
    """Follow attempted and committed values through physical and logical updates."""
    metrics = DirectMetrics()

    s1n1 = MetricKey('step1', self.name1)
    s1n2 = MetricKey('step1', self.name2)
    s2n1 = MetricKey('step2', self.name1)
    s2n2 = MetricKey('step2', self.name2)
    s2n3 = MetricKey('step2', self.name3)
    zero = DistributionResult(DistributionData(0, 0, None, None))

    def dist(total, count, low, high):
        return DistributionResult(DistributionData(total, count, low, high))

    def check(expected_counters, expected_distributions):
        results = metrics.query()
        hc.assert_that(
            results['counters'],
            hc.contains_inanyorder(*expected_counters))
        hc.assert_that(
            results['distributions'],
            hc.contains_inanyorder(*expected_distributions))

    # Step 1: a physical update only shows up as attempted values.
    metrics.update_physical(
        object(),
        MetricUpdates(
            counters={s1n1: 7, s1n2: 5, s2n1: 1},
            distributions={
                s1n1: DistributionData(3, 1, 3, 3),
                s2n3: DistributionData(8, 2, 4, 4),
            }))
    check(
        [
            MetricResult(s1n1, 0, 7),
            MetricResult(s1n2, 0, 5),
            MetricResult(s2n1, 0, 1),
        ],
        [
            MetricResult(s1n1, zero, dist(3, 1, 3, 3)),
            MetricResult(s2n3, zero, dist(8, 2, 4, 4)),
        ])

    # Step 2: a physical commit is merged into the attempted values.
    metrics.commit_physical(
        object(),
        MetricUpdates(
            counters={s1n1: -3, s2n1: -5},
            distributions={
                s1n1: DistributionData(8, 4, 1, 5),
                s2n2: DistributionData(8, 8, 1, 1),
            }))
    check(
        [
            MetricResult(s1n1, 0, 4),
            MetricResult(s1n2, 0, 5),
            MetricResult(s2n1, 0, -4),
        ],
        [
            MetricResult(s1n1, zero, dist(11, 5, 1, 5)),
            MetricResult(s2n3, zero, dist(8, 2, 4, 4)),
            MetricResult(s2n2, zero, dist(8, 8, 1, 1)),
        ])

    # Step 3: a logical commit fills in the committed values.
    metrics.commit_logical(
        object(),
        MetricUpdates(
            counters={s1n1: 3, s1n2: 5, s2n1: -3},
            distributions={
                s1n1: DistributionData(11, 5, 1, 5),
                s2n2: DistributionData(8, 8, 1, 1),
                s2n3: DistributionData(4, 1, 4, 4),
            }))
    check(
        [
            MetricResult(s1n1, 3, 4),
            MetricResult(s1n2, 5, 5),
            MetricResult(s2n1, -3, -4),
        ],
        [
            MetricResult(s1n1, dist(11, 5, 1, 5), dist(11, 5, 1, 5)),
            MetricResult(s2n3, dist(4, 1, 4, 4), dist(8, 2, 4, 4)),
            MetricResult(s2n2, dist(8, 8, 1, 1), dist(8, 8, 1, 1)),
        ])