def add_computed_metrics(self, metrics_dict, job_status_dict, find_memory_metrics=True):
    """Augments `metrics_dict` in place with derived metrics.

    Adds `total_wall_time` from the job's start/stop timestamps, optionally
    computes `time_to_accuracy` when the config requests it, and optionally
    pulls memory-usage metrics from Cloud Monitoring.

    Args:
      metrics_dict (dict): Keys are strings and values are MetricPoints.
      job_status_dict (dict): Should contain `job_status`, `start_time`,
        and `stop_time` as keys.
      find_memory_metrics (bool, optional): If True, query Cloud Monitoring
        to find memory usage metrics and add them to `metrics_dict`.
    """
    start = job_status_dict['start_time']
    stop = job_status_dict['stop_time']
    metrics_dict['total_wall_time'] = metrics.MetricPoint(stop - start, stop)

    # Only compute time_to_accuracy when the config asks for it.
    tta_config = self.metric_collection_config.get('time_to_accuracy')
    if tta_config:
        has_required_keys = (
            'accuracy_tag' in tta_config and 'accuracy_threshold' in tta_config)
        if not has_required_keys:
            raise ValueError('Invalid `time_to_accuracy` portion of config. '
                             'See README for how to set up the config.')
        # NOTE(review): this passes `metrics_dict` (single MetricPoints per
        # key) to metrics.time_to_accuracy, whereas the events-dir path
        # passes raw per-tag lists — confirm both shapes are accepted.
        try:
            metrics_dict['time_to_accuracy'] = metrics.time_to_accuracy(
                metrics_dict,
                tta_config['accuracy_tag'],
                tta_config['accuracy_threshold'])
        except ValueError as e:
            raise ValueError('Error computing time to accuracy: {}'.format(e))

    if find_memory_metrics:
        metrics.compute_memory_metrics(
            metrics_dict, self.project, self.debug_info.job_name)
def get_metrics_from_events_dir(self, job_status_dict):
    """Retrieves and aggregates metrics from Tensorboard Summary file.

    Args:
      job_status_dict (dict): Should contain `job_status`, `start_time`,
        and `stop_time` as keys.

    Returns:
      final_metrics (dict): Key is metric name and value is a MetricPoint
        containing the aggregated value for that metric.
    """
    ignored_tags = set(self.metric_collection_config.get('tags_to_ignore', []))
    raw_metrics = metrics.read_metrics_from_events_dir(
        self.events_dir, ignored_tags)

    # Nothing to aggregate; warn and bail out early.
    if not raw_metrics:
        self.logger.warning("No metrics found in {}".format(self.events_dir))
        return {}

    default_strategies = self.metric_collection_config.get(
        'default_aggregation_strategies')
    per_metric_strategies = self.metric_collection_config.get(
        'metric_to_aggregation_strategies')
    try:
        final_metrics = metrics.aggregate_metrics(
            raw_metrics, default_strategies, per_metric_strategies)
    except ValueError as e:
        raise ValueError("Error during metric aggregation: {}".format(e))

    # Total wall time is derived from the job's recorded start/stop times.
    start = job_status_dict['start_time']
    stop = job_status_dict['stop_time']
    final_metrics['total_wall_time'] = metrics.MetricPoint(stop - start, stop)

    # Compute time_to_accuracy only when the config requests it.
    tta_config = self.metric_collection_config.get('time_to_accuracy')
    if tta_config:
        has_required_keys = (
            'accuracy_tag' in tta_config and 'accuracy_threshold' in tta_config)
        if not has_required_keys:
            raise ValueError(
                'Invalid `time_to_accuracy` portion of config. '
                'See README for how to set up the config.')
        try:
            final_metrics['time_to_accuracy'] = metrics.time_to_accuracy(
                raw_metrics,
                tta_config['accuracy_tag'],
                tta_config['accuracy_threshold'])
        except ValueError as e:
            raise ValueError(
                'Error computing time to accuracy: {}'.format(e))

    return final_metrics
def test_time_to_accuracy(self):
    """time_to_accuracy returns the first point at/above the threshold."""
    # An unrelated tag is included to confirm only 'accuracy' is consulted.
    series = {
        'accuracy': [
            metrics.MetricPoint(metric_value=value, wall_time=ts)
            for value, ts in ((.2, 0), (.4, 10), (.6, 20))
        ],
        'other_metric': [metrics.MetricPoint(metric_value=1, wall_time=15)],
    }
    result = metrics.time_to_accuracy(series, tag='accuracy', threshold=.4)
    self.assertEqual(
        result, metrics.MetricPoint(metric_value=10, wall_time=10))