def test_query_structured_metrics(self):
    """Query structured counters and check counters and distributions.

    The mock client is set up from ``self.STRUCTURED_COUNTER_LIST`` and the
    step-name translation is replaced so every metric is keyed by 'split'.
    """
    client, job_result = self.setup_mock_client_result(
        self.STRUCTURED_COUNTER_LIST)
    dm = dataflow_metrics.DataflowMetrics(client, job_result)
    # Pin the step name so the expected keys below do not depend on the
    # real translation logic.
    dm._translate_step_name = types.MethodType(lambda self, x: 'split', dm)
    results = dm.query()

    namespace = '__main__.WordExtractingDoFn'
    counter_key = MetricKey('split', MetricName(namespace, 'word_lengths'))
    self.assertEqual(
        results['counters'], [MetricResult(counter_key, 109475, 109475)])

    dist_key = MetricKey('split', MetricName(namespace, 'word_length_dist'))
    committed_dist = DistributionResult(DistributionData(18, 2, 2, 16))
    attempted_dist = DistributionResult(DistributionData(18, 2, 2, 16))
    self.assertEqual(
        results['distributions'],
        [MetricResult(dist_key, committed_dist, attempted_dist)])
  def test_direct_runner_metrics(self):
    """Run a five-element pipeline on the DirectRunner and check its metrics.

    The DoFn reports three counters, one distribution and one gauge; the
    queried results are compared against the values expected for the
    input [1, 2, 3, 4, 5].
    """

    class MyDoFn(beam.DoFn):
      # Each hook reports its own metric, namespaced by the DoFn class.
      def start_bundle(self):
        Metrics.counter(self.__class__, 'bundles').inc()

      def finish_bundle(self):
        Metrics.counter(self.__class__, 'finished_bundles').inc()

      def process(self, element):
        Metrics.gauge(self.__class__, 'latest_element').set(element)
        Metrics.counter(self.__class__, 'elements').inc()
        Metrics.distribution(self.__class__, 'element_dist').update(element)
        return [element]

    pipeline = Pipeline(DirectRunner())
    created = pipeline | beam.Create([1, 2, 3, 4, 5])
    pcoll = created | 'Do' >> beam.ParDo(MyDoFn())
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = pipeline.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    def key_for(name):
      # All metrics are reported by the step labelled 'Do'.
      return MetricKey('Do', MetricName(namespace, name))

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(key_for('elements'), 5, 5),
            MetricResult(key_for('bundles'), 1, 1),
            MetricResult(key_for('finished_bundles'), 1, 1)))

    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                key_for('element_dist'),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))

    latest = metrics['gauges'][0]
    hc.assert_that(latest.key, hc.equal_to(key_for('latest_element')))
    hc.assert_that(latest.committed.value, hc.equal_to(5))
    hc.assert_that(latest.attempted.value, hc.equal_to(5))
Beispiel #3
0
    def query(self, filter=None):
        """Return the counter and distribution results that pass ``filter``.

        Args:
          filter: passed unchanged to ``self.matches`` together with each
            metric key; only keys for which it returns a true value are kept.

        Returns:
          A dict with the keys 'counters' and 'distributions', each mapping
          to a list of MetricResult built from the cell's
          ``extract_committed()`` and ``extract_latest_attempted()`` values.
        """
        def _results(cells):
            collected = []
            for key, cell in cells.items():
                if self.matches(filter, key):
                    collected.append(
                        MetricResult(MetricKey(key.step, key.metric),
                                     cell.extract_committed(),
                                     cell.extract_latest_attempted()))
            return collected

        return {
            'counters': _results(self._counters),
            'distributions': _results(self._distributions),
        }
Beispiel #4
0
  def test_direct_runner_metrics(self):
    """Run a five-element pipeline on the DirectRunner and check its metrics.

    The DoFn reports three counters and one distribution; the queried
    results are compared against the values expected for [1, 2, 3, 4, 5].
    """
    from apache_beam.metrics.metric import Metrics

    class MyDoFn(beam.DoFn):
      # Each hook reports its own metric, namespaced by the DoFn class.
      def start_bundle(self):
        Metrics.counter(self.__class__, 'bundles').inc()

      def finish_bundle(self):
        Metrics.counter(self.__class__, 'finished_bundles').inc()

      def process(self, element):
        Metrics.counter(self.__class__, 'elements').inc()
        Metrics.distribution(self.__class__, 'element_dist').update(element)
        return [element]

    pipeline = Pipeline(
        DirectRunner(), options=PipelineOptions(self.default_properties))
    source = pipeline | ptransform.Create([1, 2, 3, 4, 5])
    # The resulting PCollection is not needed; only the metrics are checked.
    _ = source | 'Do' >> beam.ParDo(MyDoFn())
    result = pipeline.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__, MyDoFn.__name__)

    def key_for(name):
      # All metrics are reported by the step labelled 'Do'.
      return MetricKey('Do', MetricName(namespace, name))

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(key_for('elements'), 5, 5),
            MetricResult(key_for('bundles'), 1, 1),
            MetricResult(key_for('finished_bundles'), 1, 1)))
    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                key_for('element_dist'),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))
Beispiel #5
0
 def _combine(committed, attempted, filter):
   all_keys = set(committed.keys()) | set(attempted.keys())
   return [
       MetricResult(key, committed.get(key), attempted.get(key))
       for key in all_keys
       if metric.MetricResults.matches(filter, key)
   ]
Beispiel #6
0
 def test_query_counters(self):
     """Query the default mock job and compare its counters by metric name."""
     client, job_result = self.setup_mock_client_result()
     dm = dataflow_metrics.DataflowMetrics(client, job_result)
     results = dm.query()

     namespace = '__main__.WordExtractingDoFn'
     expected = [
         MetricResult(
             MetricKey('split', MetricName(namespace, 'empty_lines')),
             1080, 1080),
         MetricResult(
             MetricKey('longstepname/split', MetricName(namespace, 'words')),
             26181, 26185),
     ]

     def by_metric_name(metric_result):
         # Sort both sides the same way so result order does not matter.
         return metric_result.key.metric.name

     self.assertEqual(
         sorted(results['counters'], key=by_metric_name),
         sorted(expected, key=by_metric_name))
    def query(self, filter=None):
        """Return counter, distribution and gauge results that pass ``filter``.

        Args:
          filter: passed unchanged to ``self.matches`` together with each
            metric key; only keys for which it returns a true value are kept.

        Returns:
          A dict keyed by ``self.COUNTERS``, ``self.DISTRIBUTIONS`` and
          ``self.GAUGES``. Each value is a list of MetricResult in which the
          stored value is used for both the committed and attempted fields.
        """
        def _select(cells):
            selected = []
            for key, value in cells.items():
                if self.matches(filter, key):
                    selected.append(MetricResult(key, value, value))
            return selected

        counters = _select(self._counters)
        distributions = _select(self._distributions)
        gauges = _select(self._gauges)
        return {
            self.COUNTERS: counters,
            self.DISTRIBUTIONS: distributions,
            self.GAUGES: gauges,
        }
 def test_query_counters(self):
     """Query a counters-only mock job and compare counters by metric name.

     The mock client is set up from ``self.ONLY_COUNTERS_LIST`` and the
     step-name translation is replaced so every metric is keyed by 'split'.
     """
     client, job_result = self.setup_mock_client_result(
         self.ONLY_COUNTERS_LIST)
     dm = dataflow_metrics.DataflowMetrics(client, job_result)
     # Pin the step name so the expected keys below do not depend on the
     # real translation logic.
     dm._translate_step_name = types.MethodType(lambda self, x: 'split', dm)
     results = dm.query()

     namespace = '__main__.WordExtractingDoFn'
     expected = [
         MetricResult(
             MetricKey('split', MetricName(namespace, 'empty_lines')),
             1080, 1080),
         MetricResult(
             MetricKey('split', MetricName(namespace, 'words')),
             26181, 26185),
     ]

     def by_metric_name(metric_result):
         # Sort both sides the same way so result order does not matter.
         return metric_result.key.metric.name

     self.assertEqual(
         sorted(results['counters'], key=by_metric_name),
         sorted(expected, key=by_metric_name))
Beispiel #9
0
  def query(self, filter=None):
    """Return counter, distribution and gauge results that pass ``filter``.

    Args:
      filter: passed unchanged to ``self.matches`` together with each
        metric key; only keys for which it returns a true value are kept.

    Returns:
      A dict keyed by ``self.COUNTERS``, ``self.DISTRIBUTIONS`` and
      ``self.GAUGES``. Each value is a list of MetricResult built from the
      cell's ``extract_committed()`` and ``extract_latest_attempted()``.
    """
    def _results(cells):
      collected = []
      for key, cell in cells.items():
        if self.matches(filter, key):
          collected.append(
              MetricResult(MetricKey(key.step, key.metric),
                           cell.extract_committed(),
                           cell.extract_latest_attempted()))
      return collected

    counters = _results(self._counters)
    distributions = _results(self._distributions)
    gauges = _results(self._gauges)
    return {
        self.COUNTERS: counters,
        self.DISTRIBUTIONS: distributions,
        self.GAUGES: gauges,
    }
Beispiel #10
0
    def test_apply_physical_no_filter(self):
        """Physical updates show up as attempted values in unfiltered queries.

        Two physical updates are applied and the counters are checked; an
        empty physical commit follows and the same counters are expected.
        """
        metrics = DirectMetrics()
        first_update = MetricUpdates(
            counters={
                MetricKey('step1', self.name1): 5,
                MetricKey('step1', self.name3): 8,
            })
        second_update = MetricUpdates(
            counters={
                MetricKey('step2', self.name1): 7,
                MetricKey('step1', self.name3): 4,
            })
        metrics.update_physical(object(), first_update)
        metrics.update_physical(object(), second_update)

        def assert_attempted_counters():
            # Committed stays 0; attempted holds the sums of both updates.
            results = metrics.query()
            hc.assert_that(
                results['counters'],
                hc.contains_inanyorder(
                    MetricResult(MetricKey('step1', self.name1), 0, 5),
                    MetricResult(MetricKey('step1', self.name3), 0, 12),
                    MetricResult(MetricKey('step2', self.name1), 0, 7)))

        assert_attempted_counters()

        metrics.commit_physical(object(), MetricUpdates())
        assert_attempted_counters()
Beispiel #11
0
  def test_commit_logical_no_filter(self):
    """Logical commits show up as committed values in unfiltered queries.

    Two logical commits are applied for ``self.bundle1``; the counters and
    the distribution are then expected to carry the combined committed
    values while the attempted side stays at its zero value.
    """
    metrics = DirectMetrics()
    first_commit = MetricUpdates(
        counters={
            MetricKey('step1', self.name1): 5,
            MetricKey('step1', self.name2): 8,
        },
        distributions={
            MetricKey('step1', self.name1): DistributionData(8, 2, 3, 5),
        })
    second_commit = MetricUpdates(
        counters={
            MetricKey('step2', self.name1): 7,
            MetricKey('step1', self.name2): 4,
        },
        distributions={
            MetricKey('step1', self.name1): DistributionData(4, 1, 4, 4),
        })
    for updates in (first_commit, second_commit):
      metrics.commit_logical(self.bundle1, updates)

    results = metrics.query()
    expected_counters = [
        MetricResult(MetricKey('step1', self.name2), 12, 0),
        MetricResult(MetricKey('step2', self.name1), 7, 0),
        MetricResult(MetricKey('step1', self.name1), 5, 0),
    ]
    hc.assert_that(
        results['counters'], hc.contains_inanyorder(*expected_counters))

    expected_distribution = MetricResult(
        MetricKey('step1', self.name1),
        DistributionResult(DistributionData(12, 3, 3, 5)),
        DistributionResult(DistributionData(0, 0, None, None)))
    hc.assert_that(
        results['distributions'],
        hc.contains_inanyorder(expected_distribution))
Beispiel #12
0
    def _populate_metrics(self, response, result, user_metrics=False):
        """Move metrics from response to results as MetricResults."""
        if user_metrics:
            metrics = [
                metric for metric in response.metrics
                if metric.name.origin == 'user'
            ]
        else:
            metrics = [
                metric for metric in response.metrics
                if metric.name.origin == 'dataflow/v1b3'
            ]

        # Get the tentative/committed versions of every metric together.
        metrics_by_name = defaultdict(lambda: {})
        for metric in metrics:
            if (metric.name.name.endswith('[MIN]')
                    or metric.name.name.endswith('[MAX]')
                    or metric.name.name.endswith('[MEAN]')
                    or metric.name.name.endswith('[COUNT]')):
                # The Dataflow Service presents distribution metrics in two ways:
                # One way is as a single distribution object with all its fields, and
                # another way is as four different scalar metrics labeled as [MIN],
                # [MAX], [COUNT], [MEAN].
                # TODO(pabloem) remove these when distributions are not being broken up
                #  in the service.
                # The second way is only useful for the UI, and should be ignored.
                continue
            is_tentative = [
                prop for prop in metric.name.context.additionalProperties
                if prop.key == 'tentative' and prop.value == 'true'
            ]
            tentative_or_committed = 'tentative' if is_tentative else 'committed'

            metric_key = self._get_metric_key(metric)
            if metric_key is None:
                continue
            metrics_by_name[metric_key][tentative_or_committed] = metric

        # Now we create the MetricResult elements.
        for metric_key, metric in iteritems(metrics_by_name):
            attempted = self._get_metric_value(metric['tentative'])
            committed = self._get_metric_value(metric['committed'])
            if attempted is None or committed is None:
                continue
            result.append(
                MetricResult(metric_key,
                             attempted=attempted,
                             committed=committed))
Beispiel #13
0
    def _populate_metric_results(self, response):
        """Take a list of metrics, and convert it to a list of MetricResult."""
        user_metrics = [
            metric for metric in response.metrics
            if metric.name.origin == 'user'
        ]

        # Get the tentative/committed versions of every metric together.
        metrics_by_name = defaultdict(lambda: {})
        for metric in user_metrics:
            if (metric.name.name.endswith('(DIST)')
                    or metric.name.name.endswith('[MIN]')
                    or metric.name.name.endswith('[MAX]')
                    or metric.name.name.endswith('[MEAN]')
                    or metric.name.name.endswith('[COUNT]')):
                warn(
                    'Distribution metrics will be ignored in the MetricsResult.query'
                    'method. You can see them in the Dataflow User Interface.')
                # Distributions are not yet fully supported in this runner
                continue
            is_tentative = [
                prop for prop in metric.name.context.additionalProperties
                if prop.key == 'tentative' and prop.value == 'true'
            ]
            tentative_or_committed = 'tentative' if is_tentative else 'committed'

            metric_key = self._get_metric_key(metric)
            metrics_by_name[metric_key][tentative_or_committed] = metric

        # Now we create the MetricResult elements.
        result = []
        for metric_key, metric in metrics_by_name.iteritems():
            if (metric['tentative'].scalar is None
                    or metric['committed'].scalar is None):
                continue
            attempted = metric['tentative'].scalar.integer_value
            committed = metric['committed'].scalar.integer_value
            result.append(
                MetricResult(metric_key,
                             attempted=attempted,
                             committed=committed))

        return result
def _create_metric_result(data_dict):
    """Build a MetricResult from a dict describing one metric.

    ``data_dict`` must provide 'namespace' and 'name'; 'step' defaults to ''
    and 'labels' to an empty dict. The optional 'attempted' and 'committed'
    entries are read as either a 'counter' value or a 'distribution' dict
    with 'sum', 'count', 'min' and 'max'; a missing or unrecognised entry
    becomes None in the result.
    """
    step = data_dict.get('step', '')
    labels = data_dict.get('labels', {})

    values = {}
    for phase in ('attempted', 'committed'):
        if phase not in data_dict:
            continue
        payload = data_dict[phase]
        if 'counter' in payload:
            values[phase] = payload['counter']
        elif 'distribution' in payload:
            dist = payload['distribution']
            values[phase] = DistributionResult(
                DistributionData(
                    dist['sum'],
                    dist['count'],
                    dist['min'],
                    dist['max'],
                ))

    attempted = values.get('attempted')
    committed = values.get('committed')

    metric_key = MetricKey(
        step, MetricName(data_dict['namespace'], data_dict['name']), labels)
    return MetricResult(metric_key, committed, attempted)
Beispiel #15
0
    def test_apply_physical_logical(self):
        """Check query results after physical updates and both commit kinds.

        Three stages are applied in order (update_physical, commit_physical,
        commit_logical) and after each one the counters and distributions
        returned by an unfiltered query are compared with expected values.
        """
        metrics = DirectMetrics()
        dist_zero = DistributionData(0, 0, None, None)

        def counter(step, name, committed, attempted):
            return MetricResult(MetricKey(step, name), committed, attempted)

        def distribution(step, name, committed, attempted):
            return MetricResult(MetricKey(step, name),
                                DistributionResult(committed),
                                DistributionResult(attempted))

        def check(expected_counters, expected_distributions):
            results = metrics.query()
            hc.assert_that(
                results['counters'],
                hc.contains_inanyorder(*expected_counters))
            hc.assert_that(
                results['distributions'],
                hc.contains_inanyorder(*expected_distributions))

        # Stage 1: a physical update only affects the attempted values.
        metrics.update_physical(
            object(),
            MetricUpdates(
                counters={
                    MetricKey('step1', self.name1): 7,
                    MetricKey('step1', self.name2): 5,
                    MetricKey('step2', self.name1): 1,
                },
                distributions={
                    MetricKey('step1', self.name1):
                        DistributionData(3, 1, 3, 3),
                    MetricKey('step2', self.name3):
                        DistributionData(8, 2, 4, 4),
                }))
        check(
            [
                counter('step1', self.name1, 0, 7),
                counter('step1', self.name2, 0, 5),
                counter('step2', self.name1, 0, 1),
            ],
            [
                distribution('step1', self.name1, dist_zero,
                             DistributionData(3, 1, 3, 3)),
                distribution('step2', self.name3, dist_zero,
                             DistributionData(8, 2, 4, 4)),
            ])

        # Stage 2: a physical commit is folded into the attempted values.
        metrics.commit_physical(
            object(),
            MetricUpdates(
                counters={
                    MetricKey('step1', self.name1): -3,
                    MetricKey('step2', self.name1): -5,
                },
                distributions={
                    MetricKey('step1', self.name1):
                        DistributionData(8, 4, 1, 5),
                    MetricKey('step2', self.name2):
                        DistributionData(8, 8, 1, 1),
                }))
        check(
            [
                counter('step1', self.name1, 0, 4),
                counter('step1', self.name2, 0, 5),
                counter('step2', self.name1, 0, -4),
            ],
            [
                distribution('step1', self.name1, dist_zero,
                             DistributionData(11, 5, 1, 5)),
                distribution('step2', self.name3, dist_zero,
                             DistributionData(8, 2, 4, 4)),
                distribution('step2', self.name2, dist_zero,
                             DistributionData(8, 8, 1, 1)),
            ])

        # Stage 3: a logical commit populates the committed values.
        metrics.commit_logical(
            object(),
            MetricUpdates(
                counters={
                    MetricKey('step1', self.name1): 3,
                    MetricKey('step1', self.name2): 5,
                    MetricKey('step2', self.name1): -3,
                },
                distributions={
                    MetricKey('step1', self.name1):
                        DistributionData(11, 5, 1, 5),
                    MetricKey('step2', self.name2):
                        DistributionData(8, 8, 1, 1),
                    MetricKey('step2', self.name3):
                        DistributionData(4, 1, 4, 4),
                }))
        check(
            [
                counter('step1', self.name1, 3, 4),
                counter('step1', self.name2, 5, 5),
                counter('step2', self.name1, -3, -4),
            ],
            [
                distribution('step1', self.name1,
                             DistributionData(11, 5, 1, 5),
                             DistributionData(11, 5, 1, 5)),
                distribution('step2', self.name3,
                             DistributionData(4, 1, 4, 4),
                             DistributionData(8, 2, 4, 4)),
                distribution('step2', self.name2,
                             DistributionData(8, 8, 1, 1),
                             DistributionData(8, 8, 1, 1)),
            ])