def add_computed_metrics(self, metrics_dict, job_status_dict,
                         find_memory_metrics=True):
    """Computes additional metrics and adds them to `metrics_dict`.

    Args:
      metrics_dict (dict): Keys are strings and values are MetricPoints.
      job_status_dict (dict): Should contain `job_status`, `start_time`,
        and `stop_time` as keys.
      find_memory_metrics (bool, optional): If True, query Cloud Monitoring
        to find memory usage metrics and add them to `metrics_dict`.
    """
    start_time = job_status_dict['start_time']
    stop_time = job_status_dict['stop_time']
    metrics_dict['total_wall_time'] = metrics.MetricPoint(
        stop_time - start_time, stop_time)

    # Compute time_to_accuracy if requested in the config.
    tta_config = self.metric_collection_config.get('time_to_accuracy')
    if tta_config:
      if ('accuracy_tag' not in tta_config or
          'accuracy_threshold' not in tta_config):
        raise ValueError('Invalid `time_to_accuracy` portion of config. '
                         'See README for how to set up the config.')
      tag = tta_config['accuracy_tag']
      threshold = tta_config['accuracy_threshold']
      try:
        metrics_dict['time_to_accuracy'] = metrics.time_to_accuracy(
            metrics_dict, tag, threshold)
      except ValueError as e:
        raise ValueError('Error computing time to accuracy: {}'.format(e))

    if find_memory_metrics:
      metrics.compute_memory_metrics(metrics_dict, self.project,
                                     self.debug_info.job_name)
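
A minimal call site might look like the following sketch. The `collector` instance, status value, and timestamps are hypothetical; the `job_status_dict` keys are the ones the docstring above requires.

# Hypothetical job status record; the three keys match the docstring above.
job_status_dict = {
    'job_status': 'SUCCESS',  # assumed status string
    'start_time': 1000.0,
    'stop_time': 1600.0,
}

metrics_dict = {}  # MetricPoints gathered earlier would go here.
collector.add_computed_metrics(metrics_dict, job_status_dict,
                               find_memory_metrics=False)
# metrics_dict['total_wall_time'] is now MetricPoint(600.0, 1600.0).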
Example #2
    def get_metrics_from_events_dir(self, job_status_dict):
        """Retrieves and aggregates metrics from Tensorboard Summary file.

    Args:
      job_status_dict (dict): Should contain `job_status`, `start_time`,
        and `stop_time` as keys.

    Returns:
      final_metrics (dict): Key is metric name and value is a MetricPoint
        containing the aggregated value for that metric.
    """
        tags_to_ignore = set(
            self.metric_collection_config.get('tags_to_ignore', []))
        raw_metrics = metrics.read_metrics_from_events_dir(
            self.events_dir, tags_to_ignore)

        if not raw_metrics:
            self.logger.warning("No metrics found in {}".format(
                self.events_dir))
            return {}

        default_aggregation_strategies = self.metric_collection_config.get(
            'default_aggregation_strategies')
        metric_to_aggregation_strategies = self.metric_collection_config.get(
            'metric_to_aggregation_strategies')
        try:
            final_metrics = metrics.aggregate_metrics(
                raw_metrics, default_aggregation_strategies,
                metric_to_aggregation_strategies)
        except ValueError as e:
            raise ValueError("Error during metric aggregation: {}".format(e))

        start_time = job_status_dict['start_time']
        stop_time = job_status_dict['stop_time']
        final_metrics['total_wall_time'] = metrics.MetricPoint(
            stop_time - start_time, stop_time)

        # Compute time_to_accuracy if requested in the config.
        tta_config = self.metric_collection_config.get('time_to_accuracy')
        if tta_config:
            if ('accuracy_tag' not in tta_config or
                    'accuracy_threshold' not in tta_config):
                raise ValueError(
                    'Invalid `time_to_accuracy` portion of config. '
                    'See README for how to set up the config.')
            tag = tta_config['accuracy_tag']
            threshold = tta_config['accuracy_threshold']
            try:
                final_metrics['time_to_accuracy'] = metrics.time_to_accuracy(
                    raw_metrics, tag, threshold)
            except ValueError as e:
                raise ValueError(
                    'Error computing time to accuracy: {}'.format(e))

        return final_metrics
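
Both methods read their settings from `self.metric_collection_config`. A sketch of the relevant portion of that config follows; the key names come from the code above, but the tag names, strategy names, and threshold are illustrative assumptions, not values taken from the README.

# Hypothetical metric collection config; values are assumed for illustration.
metric_collection_config = {
    'tags_to_ignore': ['learning_rate'],
    'default_aggregation_strategies': ['final'],
    'metric_to_aggregation_strategies': {
        'accuracy': ['final', 'max'],
    },
    'time_to_accuracy': {
        'accuracy_tag': 'accuracy',
        'accuracy_threshold': 0.75,
    },
}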
Example #3
    def test_time_to_accuracy(self):
        raw_metrics = {
            'accuracy': [
                metrics.MetricPoint(metric_value=.2, wall_time=0),
                metrics.MetricPoint(metric_value=.4, wall_time=10),
                metrics.MetricPoint(metric_value=.6, wall_time=20),
            ],
            'other_metric': [
                metrics.MetricPoint(metric_value=1, wall_time=15)
            ],
        }

        time_to_accuracy = metrics.time_to_accuracy(raw_metrics,
                                                    tag='accuracy',
                                                    threshold=.4)
        self.assertEqual(time_to_accuracy,
                         metrics.MetricPoint(metric_value=10, wall_time=10))
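
The test above pins the expected behavior: the result's `metric_value` is the elapsed wall time from the first recorded point to the first point at or above the threshold, stamped with the wall time of the crossing. A minimal sketch of `time_to_accuracy` consistent with that test follows; it is an assumption, not necessarily the library's actual implementation.

import collections

# Sketch assumes MetricPoint is a (metric_value, wall_time) pair, as the
# test above suggests.
MetricPoint = collections.namedtuple('MetricPoint',
                                     ['metric_value', 'wall_time'])


def time_to_accuracy(raw_metrics, tag, threshold):
    """Returns elapsed wall time until `tag` first reaches `threshold`."""
    points = raw_metrics.get(tag)
    if not points:
        raise ValueError('No metrics found for tag: {}'.format(tag))
    start_wall_time = points[0].wall_time
    for point in points:
        if point.metric_value >= threshold:
            # Elapsed time since the first recorded point, stamped with
            # the wall time at which the threshold was crossed.
            return MetricPoint(metric_value=point.wall_time - start_wall_time,
                               wall_time=point.wall_time)
    raise ValueError('No value for tag {} reached threshold {}'.format(
        tag, threshold))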